In [1]:
!pip install pandas scikit-learn joblib




In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the crop-production table from the CSV located next to the notebook.
df = pd.read_csv(filepath_or_buffer="India Agriculture Crop Production.csv")

# Show the first five rows as a quick sanity check of the columns.
df.head(5)

Unnamed: 0,State,District,Crop,Year,Season,Area,Area Units,Production,Production Units,Yield
0,Andaman and Nicobar Islands,NICOBARS,Arecanut,2001-02,Kharif,1254.0,Hectare,2061.0,Tonnes,1.643541
1,Andaman and Nicobar Islands,NICOBARS,Arecanut,2002-03,Whole Year,1258.0,Hectare,2083.0,Tonnes,1.655803
2,Andaman and Nicobar Islands,NICOBARS,Arecanut,2003-04,Whole Year,1261.0,Hectare,1525.0,Tonnes,1.209358
3,Andaman and Nicobar Islands,NORTH AND MIDDLE ANDAMAN,Arecanut,2001-02,Kharif,3100.0,Hectare,5239.0,Tonnes,1.69
4,Andaman and Nicobar Islands,SOUTH ANDAMANS,Arecanut,2002-03,Whole Year,3105.0,Hectare,5267.0,Tonnes,1.696296


In [4]:
# Narrow the table in one chain:
#   1. discard rows with no recorded Production,
#   2. keep only the state of Kerala,
#   3. keep only strictly positive Yield (rows with a missing Yield fail the
#      comparison and are dropped as well).
df = (
    df.dropna(subset=["Production"])
    .loc[lambda frame: frame["State"] == "Kerala"]
    .loc[lambda frame: frame["Yield"] > 0]
    .copy()
)

print("Final Shape:", df.shape)

Final Shape: (4626, 10)


In [5]:
# Modelling frame: the five predictors plus the Yield target.
# Year arrives as a season string such as "2003-04"; its first four characters
# (the starting year) are kept and cast to an integer -> 2003.
df_model = (
    df.loc[:, ["District", "Crop", "Season", "Year", "Area", "Yield"]]
    .assign(Year=lambda frame: frame["Year"].str.slice(stop=4).astype(int))
)

df_model.head()

Unnamed: 0,District,Crop,Season,Year,Area,Yield
17737,ALAPPUZHA,Arecanut,Whole Year,2001,2389.0,0.331519
17738,ALAPPUZHA,Arecanut,Whole Year,2002,2441.0,0.416223
17739,ALAPPUZHA,Arecanut,Whole Year,2003,2416.0,0.404801
17740,ERNAKULAM,Arecanut,Whole Year,2001,4662.0,0.773702
17741,ERNAKULAM,Arecanut,Whole Year,2002,4886.0,0.762996


In [6]:
# Exclusive upper bound on Yield for rows kept for modelling.
# NOTE(review): presumably this removes crops whose Production is not recorded
# in tonnes (see the "Production Units" column), which would put their Yield on
# a different scale — confirm the threshold against the data.
MAX_YIELD = 20

# .copy() gives the filtered result its own data, so the column assignments
# made on df_model afterwards never operate on a slice of the previous frame.
df_model = df_model[df_model["Yield"] < MAX_YIELD].copy()

print(df_model["Yield"].describe())

count    3795.000000
mean        3.006746
std         3.741608
min         0.001942
25%         0.520271
50%         1.903621
75%         3.000000
max        19.964999
Name: Yield, dtype: float64


In [7]:
# One encoder per categorical column; each is kept in its own variable so it
# can be persisted and reused to encode new inputs.
le_district = LabelEncoder()
le_crop = LabelEncoder()
le_season = LabelEncoder()

# Fit on the original string labels held in `df` (df_model was derived from df
# and still carries the same row index) instead of on df_model's own columns.
# df_model is overwritten with integer codes below, so reading from it would
# make a second run of this cell fit the encoders on codes rather than on the
# district/crop/season names, silently breaking the encoders.
for column, encoder in (
    ("District", le_district),
    ("Crop", le_crop),
    ("Season", le_season),
):
    raw_labels = df.loc[df_model.index, column]
    df_model[column] = encoder.fit_transform(raw_labels)

In [8]:
# Predictor matrix (column order matters: it is the order the model is fitted
# with) and the regression target.
X = df_model.loc[:, ["District", "Crop", "Season", "Year", "Area"]]
y = df_model.loc[:, "Yield"]

print("Feature Order:", X.columns)

Feature Order: Index(['District', 'Crop', 'Season', 'Year', 'Area'], dtype='object')


In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [10]:
# Hyper-parameters for the forest: 100 trees, depth capped at 15, a fixed seed
# for reproducibility, and all available CPU cores (n_jobs=-1).
forest_params = {
    "n_estimators": 100,
    "max_depth": 15,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**forest_params)

# Train on the training split only.
model.fit(X_train, y_train)

In [11]:
# Score the held-out rows and report the mean absolute error, which is in the
# same units as Yield.
predictions = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)

print("Kerala Model MAE:", mae)

Kerala Model MAE: 0.28880794823536304


In [12]:
# Persist the fitted model together with the three encoders, written in this
# order; the file names are what is read back at load time.
artifacts = {
    "kerala_yield_model.pkl": model,
    "le_district.pkl": le_district,
    "le_crop.pkl": le_crop,
    "le_season.pkl": le_season,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("Model and encoders saved successfully!")

Model and encoders saved successfully!


In [13]:
# Predict on test data
y_pred = model.predict(X_test)

# Calculate metrics.
# `loss` is the mean absolute error. `accuracy` holds R² (the coefficient of
# determination, where 1.0 is a perfect fit); it is a regression score, not a
# classification accuracy, and the names are kept only for continuity.
loss = mean_absolute_error(y_test, y_pred)
accuracy = r2_score(y_test, y_pred)

print("Test loss :", loss)
print("Test accuracy :", accuracy)

Test loss : 0.28880794823536304
Test accuracy : 0.9635110156575933
