Data Preprocessing

In [None]:
import pandas as pd

In [None]:
# Crop table, read from a path relative to the working directory; the preview
# shows the year, state, crop, land_area and yield columns.
data1 = pd.read_csv(filepath_or_buffer='dataset.csv')
data1.head(n=5)

Unnamed: 0,year,state,crop,land_area,yield
0,2015,Abia,sorghum,,
1,2015,Adamawa,sorghum,203.4,1.07
2,2015,Akwa-Ibom,sorghum,,
3,2015,Anambra,sorghum,,
4,2015,Bauchi,sorghum,290.8,1.04


In [None]:
# Weather table, read from a path relative to the working directory; it shares
# the year and state columns with the crop table.
data2 = pd.read_csv(filepath_or_buffer='weather.csv')
data2.head(n=5)

Unnamed: 0,year,state,humidity,soil_surface,max_temp,min_temp,soil_moisture,root_zone,max_speed,min_speed,radiation,rainfall
0,2015,Abia,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
1,2016,Abia,85.5,0.75,34.23,15.16,0.89,0.76,5.8,0.02,35.75,1966.99
2,2017,Abia,86.75,0.8,33.05,17.32,0.94,0.81,5.14,0.0,35.75,2046.09
3,2018,Abia,85.81,0.78,33.61,12.67,0.92,0.8,4.95,0.02,35.75,1814.06
4,2019,Abia,86.12,0.77,32.62,17.01,0.91,0.77,4.48,0.02,35.75,1930.08


In [None]:
# Inner-join each crop row with the weather row that has the same year and state.
merged_data = data1.merge(data2, how='inner', on=['year', 'state'])
merged_data.head(n=5)

Unnamed: 0,year,state,crop,land_area,yield,humidity,soil_surface,max_temp,min_temp,soil_moisture,root_zone,max_speed,min_speed,radiation,rainfall
0,2015,Abia,sorghum,,,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
1,2015,Abia,maize,79.62,1.16,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
2,2015,Abia,rice,14.59,2.4,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
3,2015,Abia,ginger,,,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
4,2015,Abia,yam,175.2,3.96,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65


In [None]:
# Keep only the rows in which every column holds a value (any NaN drops the row).
merged_data = merged_data.dropna(axis=0, how='any')
merged_data

Unnamed: 0,year,state,crop,land_area,yield,humidity,soil_surface,max_temp,min_temp,soil_moisture,root_zone,max_speed,min_speed,radiation,rainfall
1,2015,Abia,maize,79.62,1.160,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
2,2015,Abia,rice,14.59,2.400,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
4,2015,Abia,yam,175.20,3.960,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
5,2015,Abia,groundnut,7.20,1.344,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
7,2015,Abia,cassava,196.86,9.520,85.12,0.69,32.78,13.13,0.85,0.68,6.15,0.07,35.77,1439.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2467,2019,Zamfara,benniseed,7.45,1.570,49.44,0.38,41.55,10.62,0.42,0.40,8.27,0.01,35.13,669.73
2468,2019,Zamfara,cotton,17.61,0.410,49.44,0.38,41.55,10.62,0.42,0.40,8.27,0.01,35.13,669.73
2469,2019,Zamfara,cassava,130.17,1.790,49.44,0.38,41.55,10.62,0.42,0.40,8.27,0.01,35.13,669.73
2470,2019,Zamfara,tomatoe,39.61,5.140,49.44,0.38,41.55,10.62,0.42,0.40,8.27,0.01,35.13,669.73


Feature Selection

In [None]:
#Load libraries
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Build label -> integer code lookups for state and crop, numbering the labels
# in order of first appearance in merged_data.
state_labels = merged_data['state'].unique()
crop_labels = merged_data['crop'].unique()
state_mapping = dict(zip(state_labels, range(len(state_labels))))
crop_mapping = dict(zip(crop_labels, range(len(crop_labels))))

In [None]:
# Integer-encode the state and crop columns using the lookup dictionaries.
# `assign` builds a new frame instead of writing columns into the `dropna()`
# result, which can trigger pandas' SettingWithCopyWarning.
# The `.get(value, value)` fallback passes through any value that is not a key
# of the lookup, so re-running this cell leaves already-encoded integers alone
# instead of mapping them to NaN.
# NOTE(review): the codes are arbitrary ordinals for nominal categories; a
# linear model such as Lasso will treat them as ordered -- consider one-hot
# encoding if that matters for the analysis.
merged_data = merged_data.assign(
    state=merged_data['state'].map(lambda value: state_mapping.get(value, value)),
    crop=merged_data['crop'].map(lambda value: crop_mapping.get(value, value)),
)

In [None]:
# Target is the yield column; the predictors are every other column except year.
y = merged_data['yield']
X = merged_data.drop(columns=['yield', 'year'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Baseline: random forest trained on every feature and scored on the held-out split.
# random_state fixes the forest's internal sampling so the reported MSE / R2 are
# reproducible and differences between feature subsets are not just run-to-run noise.
model_full = RandomForestRegressor(random_state=42)
model_full.fit(X_train, y_train)
y_pred_full = model_full.predict(X_test)
mse_full = mean_squared_error(y_test, y_pred_full)
r2_full = r2_score(y_test, y_pred_full)

In [None]:
# Feature Importance using Random Forest
# Fit a forest on the training split and rank the features by its
# feature_importances_ values, highest first.
# random_state keeps the ranking (and therefore the later top-5 selection)
# stable across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_
important_features = pd.Series(feature_importances, index=X.columns).sort_values(ascending=False)
print(important_features)

land_area        0.285005
crop             0.179553
max_temp         0.083195
root_zone        0.072139
humidity         0.071012
min_temp         0.058181
max_speed        0.056000
soil_surface     0.055505
radiation        0.041390
rainfall         0.030635
state            0.025621
min_speed        0.022831
soil_moisture    0.018932
dtype: float64


In [None]:
# Train and evaluate the model with features selected by Random Forest feature importance
# important_features is sorted in descending order, so the first five index
# labels are the five highest-importance features.
selected_features_rf = important_features.index[:5]  # select top 5 features
X_train_rf = X_train[selected_features_rf]
X_test_rf = X_test[selected_features_rf]
# random_state makes the score reproducible so it can be compared with the other subsets.
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train_rf, y_train)
y_pred_rf = model_rf.predict(X_test_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

In [None]:
# Recursive Feature Elimination (RFE)
# RFE is fitted on the training split with a random forest as the ranking
# estimator and asked to keep five features.
# Seeding the estimator keeps the selected set stable across runs.
estimator = RandomForestRegressor(random_state=42)
rfe = RFE(estimator, n_features_to_select=5)
rfe.fit(X_train, y_train)
# Boolean mask indexed by feature name; printing only the True entries lists the kept features.
selected_features = pd.Series(rfe.support_, index=X.columns)
print(selected_features[selected_features])

crop         True
land_area    True
humidity     True
max_temp     True
min_temp     True
dtype: bool


In [None]:
# Train and evaluate the model with features selected by RFE
selected_features_rfe = selected_features[selected_features].index.tolist()  # select features marked True by RFE
X_train_rfe = X_train[selected_features_rfe]
X_test_rfe = X_test[selected_features_rfe]
# random_state makes the score reproducible so it can be compared with the other subsets.
model_rfe = RandomForestRegressor(random_state=42)
model_rfe.fit(X_train_rfe, y_train)
y_pred_rfe = model_rfe.predict(X_test_rfe)
mse_rfe = mean_squared_error(y_test, y_pred_rfe)
r2_rfe = r2_score(y_test, y_pred_rfe)

In [None]:
# Lasso Regression
# The L1 penalty acts on coefficient magnitude, so which coefficients are driven
# to zero depends on each feature's units unless the features share a scale
# (the weather preview shows rainfall in the thousands and soil_moisture below 1).
# Standardise first; the scaler is fitted on the training split only, so no
# information from the test rows is used.
X_train_scaled = StandardScaler().fit_transform(X_train)
lasso = Lasso(alpha=0.1)  # adjust alpha value as needed
lasso.fit(X_train_scaled, y_train)
# Coefficients are per standardised feature, labelled by feature name and sorted
# by signed value (largest positive first).
lasso_coefficients = pd.Series(lasso.coef_, index=X.columns).sort_values(ascending=False)
print(lasso_coefficients)

max_speed        0.148822
min_temp         0.074965
crop             0.032258
humidity         0.025479
land_area        0.002975
state            0.000000
soil_surface     0.000000
max_temp        -0.000000
soil_moisture   -0.000000
root_zone       -0.000000
min_speed        0.000000
radiation       -0.000000
rainfall        -0.000240
dtype: float64


In [None]:
# Train and evaluate the model with features selected by Lasso Regression
selected_features_lasso = lasso_coefficients[lasso_coefficients != 0].index.tolist()  # select features with non-zero coefficients
# A large alpha can zero every coefficient; fail with a clear message instead of
# letting the forest error out on a frame with no columns.
if not selected_features_lasso:
    raise ValueError("Lasso zeroed every coefficient; lower alpha before training on the selected subset.")
X_train_lasso = X_train[selected_features_lasso]
X_test_lasso = X_test[selected_features_lasso]
# random_state makes the score reproducible so it can be compared with the other subsets.
model_lasso = RandomForestRegressor(random_state=42)
model_lasso.fit(X_train_lasso, y_train)
y_pred_lasso = model_lasso.predict(X_test_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

In [None]:
# Report held-out MSE and R2 for each feature subset, in the order the models were fitted.
metric_report = [
    ("Metrics for Full features:", mse_full, r2_full),
    ("Metrics for Random Forest:", mse_rf, r2_rf),
    ("Metrics for RFE:", mse_rfe, r2_rfe),
    ("Metrics for Lasso Regression:", mse_lasso, r2_lasso),
]
for heading, mse_value, r2_value in metric_report:
    print(heading)
    print("MSE:", mse_value)
    print("R2 Score:", r2_value)

Metrics for Full features:
MSE: 18.0046849859186
R2 Score: 0.12773585496767237
Metrics for Random Forest:
MSE: 22.707211241197495
R2 Score: -0.10008512866858443
Metrics for RFE:
MSE: 12.264918262531411
R2 Score: 0.4058075189582375
Metrics for Lasso Regression:
MSE: 20.96575172790001
R2 Score: -0.015717493541235195
