In [33]:
import pandas as pd

In [34]:
# Load the cleaned dataset from the sibling Data directory (path is relative
# to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="../Data/cleaned.csv")

In [35]:
# One-hot encode the categorical 'Crop' column: each crop becomes its own
# indicator column (Crop_<name>), because the model cannot consume raw text
# categories. The original 'Crop' column is replaced by these indicators.
df_encoded = pd.get_dummies(data=df, columns=['Crop'])

In [36]:
# Preview the first five rows of the raw (un-encoded) frame.
print(df.head(n=5))

     State Name      Crop   Area_ha    yield  ...   pH  Rainfall_mm  wind  radiation
0  Chhattisgarh      rice  548000.0   337.59  ...  6.5         1200   2.0         18
1  Chhattisgarh     maize    3000.0   666.67  ...  6.0          800   2.5         20
2  Chhattisgarh  chickpea   54000.0   500.00  ...  6.5          600   1.5         16
3  Chhattisgarh      rice  547000.0   747.71  ...  6.5         1200   2.0         18
4  Chhattisgarh     maize    3000.0  1000.00  ...  6.0          800   2.5         20

[5 rows x 13 columns]


In [37]:
# Preview the first five rows after one-hot encoding to confirm the new
# Crop_* indicator columns are present.
print(df_encoded.head(n=5))

     State Name   Area_ha    yield  ...  Crop_cotton  Crop_maize  Crop_rice
0  Chhattisgarh  548000.0   337.59  ...        False       False       True
1  Chhattisgarh    3000.0   666.67  ...        False        True      False
2  Chhattisgarh   54000.0   500.00  ...        False       False      False
3  Chhattisgarh  547000.0   747.71  ...        False       False       True
4  Chhattisgarh    3000.0  1000.00  ...        False        True      False

[5 rows x 16 columns]


In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Feature matrix: every encoded column except the target ('yield') and the
# 'State Name' text column.
X = df_encoded.drop(columns=['yield', 'State Name'])
# Target vector: the yield value to be predicted.
Y = df_encoded['yield']

In [40]:
from sklearn.ensemble import RandomForestRegressor

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Random forest with 100 trees, seeded so that repeated fits are reproducible.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, Y_train)

In [41]:
# Evaluate the fitted model on the held-out test split.

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

import numpy as np

y_pred = model.predict(X_test)

mae = mean_absolute_error(Y_test, y_pred)
# mean_squared_error returns the MSE (in squared target units). Its square
# root, the RMSE, is in the same units as the target and is the value this
# cell reports. Keep the two under distinct, accurately named variables.
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)

print(f"Mean Absolute error = {mae:.2f}")
# This figure is the square root of the MSE, so it is labelled as RMSE rather
# than "Mean Squared error".
print(f"Root Mean Squared error = {rmse:.2f}")
print(f"R2 Score = {r2:.2f}")

# NOTE(review): the recorded run of this cell shows R2 = 1.00. A near-perfect
# score like that can indicate target leakage (e.g. a feature from which the
# target can be derived directly) -- TODO confirm by inspecting X.columns.

Mean Absolute error = 0.35
Mean Squared error = 9.04
R2 Score = 1.00


In [22]:
# Report how many rows the loaded dataset contains.
print(f"Dataset size: {df.shape[0]} rows")


Dataset size: 50765 rows


In [42]:
# Persist the fitted model to disk so it can be reloaded without retraining.
import joblib

joblib.dump(value=model, filename='crop_yield_model.pkl')


['crop_yield_model.pkl']

In [43]:
# Store the feature column names, in training order, alongside the model so
# that inputs built later can be arranged to match the columns it was fit on.
input_columns = X.columns.to_list()
joblib.dump(value=input_columns, filename='model_input_columns.pkl')

['model_input_columns.pkl']

In [25]:
pip install streamlit joblib pandas plotly

Collecting plotly
  Downloading plotly-6.2.0-py3-none-any.whl.metadata (8.5 kB)
Downloading plotly-6.2.0-py3-none-any.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   --------------------------------------

In [44]:
import streamlit as st
# Reload the model from the file written earlier in this notebook and print
# its type as a sanity check that deserialization produced an estimator.
model = joblib.load(filename='crop_yield_model.pkl')
print("Type of model:", type(model))



Type of model: <class 'sklearn.ensemble._forest.RandomForestRegressor'>
