In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from lazypredict.Supervised import LazyRegressor
from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin
from sklearn.model_selection import learning_curve

In [2]:
#Loading the Dataset
Path_to_data = "climate-ds.csv"
data = pd.read_csv(Path_to_data, index_col=['Unnamed: 0'])
data.head(10)

Unnamed: 0,Area,Item,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,hg/ha_yield
0,Albania,Maize,1990,1485,121.0,16.37,36613
1,Albania,Potatoes,1990,1485,121.0,16.37,66667
2,Albania,"Rice, paddy",1990,1485,121.0,16.37,23333
3,Albania,Sorghum,1990,1485,121.0,16.37,12500
4,Albania,Soybeans,1990,1485,121.0,16.37,7000
5,Albania,Wheat,1990,1485,121.0,16.37,30197
6,Albania,Maize,1991,1485,121.0,15.36,29068
7,Albania,Potatoes,1991,1485,121.0,15.36,77818
8,Albania,"Rice, paddy",1991,1485,121.0,15.36,28538
9,Albania,Sorghum,1991,1485,121.0,15.36,6667


In [3]:
data = data[data['Area'] == 'India']

In [4]:
#Checking shape
rows, columns = data.shape
print('There are {} rows and {} columns.'.format(rows, columns))

There are 4048 rows and 7 columns.


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4048 entries, 10502 to 14549
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Area                           4048 non-null   object 
 1   Item                           4048 non-null   object 
 2   Year                           4048 non-null   int64  
 3   average_rain_fall_mm_per_year  4048 non-null   int64  
 4   pesticides_tonnes              4048 non-null   float64
 5   avg_temp                       4048 non-null   float64
 6   hg/ha_yield                    4048 non-null   int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 253.0+ KB


In [6]:
data.describe()

Unnamed: 0,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,hg/ha_yield
count,4048.0,4048.0,4048.0,4048.0,4048.0
mean,2001.43,1083.0,48459.04,26.01,80884.47
std,7.06,0.0,14997.35,0.91,95950.22
min,1990.0,1083.0,14485.33,23.26,6553.0
25%,1995.0,1083.0,37423.0,25.46,13704.75
50%,2001.0,1083.0,46195.0,25.98,28124.0
75%,2008.0,1083.0,61257.0,26.67,112471.0
max,2013.0,1083.0,75000.0,28.85,385818.0


In [7]:
data.isnull().sum()

Area                             0
Item                             0
Year                             0
average_rain_fall_mm_per_year    0
pesticides_tonnes                0
avg_temp                         0
hg/ha_yield                      0
dtype: int64

No Null values. Proceeding with understanding the unique values of the Item column.

In [8]:
data["Item"].value_counts()

Item
Cassava           506
Maize             506
Potatoes          506
Rice, paddy       506
Sorghum           506
Soybeans          506
Sweet potatoes    506
Wheat             506
Name: count, dtype: int64

Removing the 'Year' column

In [9]:
data = data.drop(["Year"], axis=1)  
data.head(10)

Unnamed: 0,Area,Item,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,hg/ha_yield
10502,India,Cassava,1083,75000.0,25.58,205381
10503,India,Cassava,1083,75000.0,26.88,205381
10504,India,Cassava,1083,75000.0,25.79,205381
10505,India,Cassava,1083,75000.0,24.1,205381
10506,India,Cassava,1083,75000.0,25.25,205381
10507,India,Cassava,1083,75000.0,25.44,205381
10508,India,Cassava,1083,75000.0,25.54,205381
10509,India,Cassava,1083,75000.0,26.36,205381
10510,India,Cassava,1083,75000.0,26.91,205381
10511,India,Cassava,1083,75000.0,25.16,205381


One-Hot Encoding

In [10]:
data = pd.get_dummies(data, columns=['Area', 'Item'])  # Convert categorical variables to dummy/indicator variables
data.head(10)

Unnamed: 0,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,hg/ha_yield,Area_India,Item_Cassava,Item_Maize,Item_Potatoes,"Item_Rice, paddy",Item_Sorghum,Item_Soybeans,Item_Sweet potatoes,Item_Wheat
10502,1083,75000.0,25.58,205381,True,True,False,False,False,False,False,False,False
10503,1083,75000.0,26.88,205381,True,True,False,False,False,False,False,False,False
10504,1083,75000.0,25.79,205381,True,True,False,False,False,False,False,False,False
10505,1083,75000.0,24.1,205381,True,True,False,False,False,False,False,False,False
10506,1083,75000.0,25.25,205381,True,True,False,False,False,False,False,False,False
10507,1083,75000.0,25.44,205381,True,True,False,False,False,False,False,False,False
10508,1083,75000.0,25.54,205381,True,True,False,False,False,False,False,False,False
10509,1083,75000.0,26.36,205381,True,True,False,False,False,False,False,False,False
10510,1083,75000.0,26.91,205381,True,True,False,False,False,False,False,False,False
10511,1083,75000.0,25.16,205381,True,True,False,False,False,False,False,False,False


Feature Scaling

In [11]:
X = data.loc[:, data.columns != 'hg/ha_yield']  # All columns except target 'hg/ha_yield'
Y = data['hg/ha_yield']  # Target variable

In [12]:
X.head(10)


Unnamed: 0,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area_India,Item_Cassava,Item_Maize,Item_Potatoes,"Item_Rice, paddy",Item_Sorghum,Item_Soybeans,Item_Sweet potatoes,Item_Wheat
10502,1083,75000.0,25.58,True,True,False,False,False,False,False,False,False
10503,1083,75000.0,26.88,True,True,False,False,False,False,False,False,False
10504,1083,75000.0,25.79,True,True,False,False,False,False,False,False,False
10505,1083,75000.0,24.1,True,True,False,False,False,False,False,False,False
10506,1083,75000.0,25.25,True,True,False,False,False,False,False,False,False
10507,1083,75000.0,25.44,True,True,False,False,False,False,False,False,False
10508,1083,75000.0,25.54,True,True,False,False,False,False,False,False,False
10509,1083,75000.0,26.36,True,True,False,False,False,False,False,False,False
10510,1083,75000.0,26.91,True,True,False,False,False,False,False,False,False
10511,1083,75000.0,25.16,True,True,False,False,False,False,False,False,False


In [13]:
Y.head(10)


10502    205381
10503    205381
10504    205381
10505    205381
10506    205381
10507    205381
10508    205381
10509    205381
10510    205381
10511    205381
Name: hg/ha_yield, dtype: int64

Normalization

In [14]:
x_max = X.max()
X = X.divide(x_max)
X.describe()

Unnamed: 0,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Area_India,Item_Cassava,Item_Maize,Item_Potatoes,"Item_Rice, paddy",Item_Sorghum,Item_Soybeans,Item_Sweet potatoes,Item_Wheat
count,4048.0,4048.0,4048.0,4048.0,4048.0,4048.0,4048.0,4048.0,4048.0,4048.0,4048.0,4048.0
unique,1.0,23.0,239.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
top,1.0,1.0,0.89,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
freq,4048.0,176.0,56.0,4048.0,3542.0,3542.0,3542.0,3542.0,3542.0,3542.0,3542.0,3542.0


In [15]:
print(X.shape)
print(Y.shape)

(4048, 12)
(4048,)


TRAIN-TEST SPLIT

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:
print(x_train.shape)  # Training features


(3238, 12)


In [18]:
print(y_train.shape)  # Training labels

(3238,)


FUNCTION TO CALCULATE THE RMSE

In [19]:
# Function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

RANDOM FOREST REGRESSOR TRAINING

In [20]:
print("Random Forest Regressor:")
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(x_train, y_train)
y_pred_rf = random_forest_model.predict(x_test)

Random Forest Regressor:


In [21]:
final_predictions = pd.DataFrame({
    'Actual Yield': y_test,
    'OUR Prediction': y_pred_rf,
    
})

print(final_predictions.head())

       Actual Yield  OUR Prediction
11661          9591         9591.00
10651         79663        79663.00
14410         25726        25726.00
14024        364770       364770.00
13707         20238        20238.00
