In [1]:

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Read the raw records from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='TEST.csv')

In [2]:
# Display the loaded frame to check column names and sample values.
df

Unnamed: 0,windspeed,rainfall,area,yield,percentageofarea,darea,yielda,fgprice,dctrice
0,0,1.535972,2870.00,3.37,0.247070,709.091324,0.832626398,20.79,11135504
1,0,2.141071,491.00,4.36,0.244428,120.014173,1.065706303,20.79,2412279.37
2,7.116629346,5.809375,1330.88,4.42,0.312983,416.543187,1.383386094,20.79,10868283.84
3,67.08946747,3.908854,5767.00,4.65,0.567852,3274.800267,2.640510013,20.79,163091064.2
4,0,0.805469,4710.00,4.14,0.130472,614.521934,0.540153038,20.79,6260539.25
...,...,...,...,...,...,...,...,...,...
21016,0,0.522989,2936.00,0,0.264956,777.910078,0,14.32,0
21017,0,0.110000,156.00,2.85,0.101081,15.768650,0.288081111,14.32,59014.04
21018,10.51271508,0.225000,6376.00,6.88,0.131057,835.618326,0.901670967,14.32,9788183.48
21019,2.823089172,0.379861,2065.00,3.14,0.115129,237.741383,0.361505057,14.32,1116516.69


In [3]:
# Split the dataset into features and target variable.
# The target is selected by name rather than by dropping every other column, so
# an unexpected extra column in the CSV can never end up inside y. It is kept as
# a one-column DataFrame because later cells index it as y_train['dctrice'].
# 'fgprice' is excluded from the features together with the target.
X = df.drop(columns=['dctrice', 'fgprice'])
y = df[['dctrice']]


In [4]:
# Check that features and target have the same number of rows.
X.shape, y.shape

((21021, 7), (21021, 1))

In [5]:
# Stage 1: hold out 20% of the rows; the remaining 80% becomes the training set
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Stage 2: cut the held-out part in half -> validation and test sets of equal size
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    random_state=42,
    test_size=0.5,
)

In [6]:

# Row/column counts of the training split right after train_test_split.
X_train.shape, y_train.shape

((16816, 7), (16816, 1))

In [7]:
# Print dtypes and non-null counts of the training target and features;
# object-dtype columns here still need numeric conversion.
y_train.info()
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16816 entries, 13628 to 15795
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dctrice  16816 non-null  object
dtypes: object(1)
memory usage: 262.8+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16816 entries, 13628 to 15795
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   windspeed         16816 non-null  object 
 1   rainfall          16816 non-null  float64
 2   area              16816 non-null  float64
 3   yield             16705 non-null  object 
 4   percentageofarea  16816 non-null  float64
 5   darea             16816 non-null  float64
 6   yielda            16816 non-null  object 
dtypes: float64(4), object(3)
memory usage: 1.0+ MB


In [8]:
# Coerce every split to numeric dtypes; entries that cannot be parsed become NaN
# so the null-removal cells further down can drop them.
# The validation split is included so that all three splits are cleaned the
# same way (its object-dtype columns would otherwise stay un-coerced).
X_train, X_val, X_test = (
    frame.apply(pd.to_numeric, errors='coerce')
    for frame in (X_train, X_val, X_test)
)
y_train, y_val, y_test = (
    frame.apply(pd.to_numeric, errors='coerce')
    for frame in (y_train, y_val, y_test)
)

In [9]:
# Numeric coercion must not change the number of rows or columns.
X_train.shape, y_train.shape

((16816, 7), (16816, 1))

In [10]:
# Keep only the training rows whose windspeed is non-zero.
# Only the training split is filtered here; validation and test are left as-is.
# NOTE(review): confirm that evaluating on unfiltered rows is intended.
mask = X_train['windspeed'].ne(0)

# Filter features and target with the same mask so their rows stay aligned
X_train, y_train = X_train.loc[mask], y_train.loc[mask]

In [11]:
# Training rows remaining after the windspeed filter.
X_train.shape, y_train.shape

((7867, 7), (7867, 1))

In [12]:
# Keep only the training rows whose target (dctrice) is non-zero.
# Only the training split is filtered here; validation and test are left as-is.
# NOTE(review): confirm that evaluating on unfiltered rows is intended.
mask = y_train['dctrice'].ne(0)

# Filter features and target with the same mask so their rows stay aligned
X_train, y_train = X_train.loc[mask], y_train.loc[mask]

In [13]:
# Display the filtered training target.
y_train

Unnamed: 0,dctrice
10790,457824.98
17614,3613394.58
6235,49964491.13
18061,20553456.71
15375,784230.30
...,...
16023,200217.97
14423,1004840.07
16850,907034.19
11284,8478698.45


In [14]:
# Null Remover (training split)
# Drop every row that has a NaN in the features OR in the target: the target was
# coerced with errors='coerce' as well, so it can also contain NaN, and a NaN
# target would make the later fit fail.
nan_rows = X_train.index[X_train.isna().any(axis=1) | y_train.isna().any(axis=1)]
X_train = X_train.drop(nan_rows)
y_train = y_train.drop(nan_rows)

# Sanity check: both index arrays are empty when no NaN is left in X_train
print(np.where(pd.isnull(X_train)))

(array([], dtype=int64), array([], dtype=int64))


In [15]:
# Null Remover (validation split)
# Drop every row that has a NaN in the features OR in the target; a missing
# target would otherwise break the validation metrics.
nan_rows = X_val.index[X_val.isna().any(axis=1) | y_val.isna().any(axis=1)]
X_val = X_val.drop(nan_rows)
y_val = y_val.drop(nan_rows)

# Sanity check: both index arrays are empty when no NaN is left in X_val
print(np.where(pd.isnull(X_val)))

(array([], dtype=int64), array([], dtype=int64))


In [16]:
# Final training-set size after zero filtering and NaN removal.
X_train.shape,y_train.shape

((6647, 7), (6647, 1))

In [17]:
# Null Remover (test split)
# Drop every row that has a NaN in the features OR in the target: the target was
# coerced with errors='coerce' as well, so it can also contain NaN, which the
# test metrics cannot handle.
nan_rows = X_test.index[X_test.isna().any(axis=1) | y_test.isna().any(axis=1)]
X_test = X_test.drop(nan_rows)
y_test = y_test.drop(nan_rows)

In [18]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from joblib import load, dump
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR

# Load the base models.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only load
# artifacts that come from a trusted source.
svr_pipeline = load('SVR.joblib')
rf = load('RFR.joblib')

# Meta-learner: a shallow random forest trained on the base models' predictions
final_estimator = RandomForestRegressor(
    n_estimators=100,
    max_depth=4,
    min_samples_split=8,
    min_samples_leaf=2,
    max_leaf_nodes=100,
    random_state=42,
)

# final_estimator = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# Create a StackingRegressor.
# With the default cv setting, fit() trains clones of the base estimators on
# X_train, so the loaded models contribute their configuration rather than
# their previously fitted state.
stacking_regressor = StackingRegressor(
    estimators=[('svr', svr_pipeline), ('rf', rf)],
    final_estimator=final_estimator
)

# y_train is a one-column DataFrame; scikit-learn expects a 1-D target and emits
# a DataConversionWarning for column vectors, so flatten it explicitly.
stacking_regressor.fit(X_train, y_train.values.ravel())
predictions = stacking_regressor.predict(X_test)

# Evaluate the model on the test split
r2 = stacking_regressor.score(X_test, y_test)
print(f'R-squared (Test): {r2}')

mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error (Test): {mae}')

# MAE expressed as a percentage of the mean test target
mae_percentage_test = (mae / y_test.values.mean()) * 100
print(f'Mean Absolute Error (Test, p.p.): {mae_percentage_test}%')

# Evaluate the model on the validation split
predictionsVal = stacking_regressor.predict(X_val)

maev = mean_absolute_error(y_val, predictionsVal)
print(f'Mean Absolute Error (Validation): {maev}')

# r2 is reused: after this cell it holds the validation score
r2 = stacking_regressor.score(X_val, y_val)
print(f'R-squared (Validation): {r2}')


  y = column_or_1d(y, warn=True)


R-squared (Test): 0.9498775470922874
Mean Absolute Error (Test): 2325897.5300200013
Mean Absolute Error (Test, p.p.): 27.555945933780418%
Mean Absolute Error (Validation): 2265458.415095535
R-squared (Validation): 0.9600975951809042


In [19]:
# Disabled residual plot (residuals vs. actual test values), kept for reference.
# NOTE(review): y_test is a one-column DataFrame while predictions is a 1-D
# array; flatten y_test (e.g. y_test.values.ravel()) before subtracting if this
# cell is re-enabled — TODO confirm.
# import matplotlib.pyplot as plt
# import numpy as np

# residuals = y_test - predictions
# plt.scatter(y_test, residuals)
# plt.xlabel('Actual Values')
# plt.ylabel('Residuals')
# plt.axhline(y=0, color='r', linestyle='-')
# plt.title('Residuals vs. Actual Values')
# plt.show()

In [20]:
# Print dtype and non-null count of the training target after cleaning.
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6647 entries, 10790 to 15795
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   dctrice  6647 non-null   float64
dtypes: float64(1)
memory usage: 103.9 KB


In [21]:
# Print every test prediction formatted with two decimal places
print([f'{pred:.2f}' for pred in predictions])

['1010028.16', '1010028.16', '1010028.16', '1010028.16', '4547770.19', '5890253.20', '1162167.82', '5155678.46', '4547770.19', '4625752.26', '1010028.16', '4853356.47', '1186355.60', '26315395.03', '4853356.47', '1160165.59', '1010028.16', '1010028.16', '136477697.56', '143382134.62', '4977323.19', '1010028.16', '1010028.16', '4625752.26', '1010028.16', '1010028.16', '1010028.16', '1186355.60', '28241636.53', '71511814.17', '4625752.26', '1010028.16', '4547770.19', '1010028.16', '1010028.16', '1010028.16', '1186355.60', '1010028.16', '4547770.19', '1010028.16', '1010028.16', '22959571.09', '1010028.16', '1010028.16', '1340523.08', '18295689.68', '1010028.16', '1010028.16', '1186355.60', '1010028.16', '1010028.16', '1010028.16', '1010028.16', '1010028.16', '1010028.16', '4547770.19', '1186355.60', '1296001.62', '6559991.22', '58427226.72', '5757337.43', '1717568.28', '6089674.18', '4547770.19', '1010028.16', '2242742.24', '5155678.46', '1010028.16', '1010028.16', '1010028.16', '1010028.

In [22]:
from sklearn.pipeline import make_pipeline
from joblib import dump

# Wrap the stacking regressor in a single-step Pipeline so the saved artifact
# is a Pipeline object
pipeline = make_pipeline(stacking_regressor)

# Fit the pipeline; this fits stacking_regressor again on the same training data.
# y_train is a one-column DataFrame, so it is flattened to the 1-D shape
# scikit-learn expects (avoids the DataConversionWarning).
pipeline.fit(X_train, y_train.values.ravel())

# Save the pipeline
dump(pipeline, 'STACKED-SVR_RF.joblib')

  y = column_or_1d(y, warn=True)


['STACKED-SVR_RF.joblib']