In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

In [2]:
# Load the raw crop-yield table from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="crop_yield.csv")


In [3]:
# Plain-text preview of the first five rows.
print(df.head(n=5))

           Crop  Crop_Year       Season  State     Area  Production  \
0      Arecanut       1997  Whole Year   Assam  73814.0       56708   
1     Arhar/Tur       1997  Kharif       Assam   6637.0        4685   
2   Castor seed       1997  Kharif       Assam    796.0          22   
3      Coconut        1997  Whole Year   Assam  19656.0   126905000   
4  Cotton(lint)       1997  Kharif       Assam   1739.0         794   

   Annual_Rainfall  Fertilizer  Pesticide        Yield  
0           2051.4  7024878.38   22882.34     0.796087  
1           2051.4   631643.29    2057.47     0.710435  
2           2051.4    75755.32     246.76     0.238333  
3           2051.4  1870661.52    6093.36  5238.051739  
4           2051.4   165500.63     539.09     0.420909  


In [4]:
# List the column labels available in the raw table.
column_labels = df.columns
print(column_labels)

Index(['Crop', 'Crop_Year', 'Season', 'State', 'Area', 'Production',
       'Annual_Rainfall', 'Fertilizer', 'Pesticide', 'Yield'],
      dtype='object')


In [5]:
# Market price per crop, keyed by the text used in the Crop column.
# NOTE(review): 'Coconut ' carries a trailing space; the Crop value printed by
# df.head() appears to carry one too, so verify against the data before trimming.
market_price_dict = dict([
    ('Arecanut', 250000),
    ('Arhar/Tur', 6200),
    ('Castor seed', 5300),
    ('Coconut ', 11000),
    ('Cotton(lint)', 52000),
])

In [6]:
# Cost of cultivation per crop, keyed by the same crop names as market_price_dict
# (including the trailing space in 'Coconut ').
cost_cultivation_dict = dict([
    ('Arecanut', 244365),
    ('Arhar/Tur', 71370),
    ('Castor seed', 40000),
    ('Coconut ', 155836),
    ('Cotton(lint)', 82200),
])

In [7]:
# Look up price and cost by crop name; crops missing from the dicts map to NaN.
crop_names = df['Crop']
df['Market_Price'] = crop_names.map(market_price_dict)
df['Cost_of_Cultivation'] = crop_names.map(cost_cultivation_dict)

# Derive revenue, profit and ROI (as a percentage of cost) from the lookups.
cultivation_cost = df['Cost_of_Cultivation']
df['Revenue'] = df['Yield'] * df['Market_Price']
df['Profit'] = df['Revenue'] - cultivation_cost
df['ROI'] = (df['Profit'] / cultivation_cost) * 100

In [8]:
# Keep only the rows where ROI is not NaN; copy so the encoded columns added
# later are written to an independent frame rather than a slice of df.
roi_present = df['ROI'].notna()
df_clean = df[roi_present].copy()

In [9]:
# Integer-encode the crop names; le_crop retains the fitted name-to-code mapping.
le_crop = LabelEncoder()
crop_codes = le_crop.fit_transform(df_clean['Crop'])
df_clean['Crop_encoded'] = crop_codes

In [10]:
# Integer-encode the state names; le_state retains the fitted name-to-code mapping.
le_state = LabelEncoder()
state_codes = le_state.fit_transform(df_clean['State'])
df_clean['State_encoded'] = state_codes

In [11]:
# Integer-encode the season names; le_season retains the fitted name-to-code mapping.
le_season = LabelEncoder()
season_codes = le_season.fit_transform(df_clean['Season'])
df_clean['Season_encoded'] = season_codes

In [12]:
# Model inputs: the three encoded categoricals first, then the numeric columns.
# NOTE(review): ROI is computed from Revenue and the per-crop cost, and Revenue
# from Yield and the per-crop price, so Yield/Revenue together with Crop_encoded
# determine the target exactly. This looks like target leakage; confirm these
# columns are really available at prediction time.
categorical_features = ['Crop_encoded', 'State_encoded', 'Season_encoded']
numeric_features = ['Area', 'Annual_Rainfall', 'Fertilizer', 'Pesticide',
                    'Yield', 'Revenue']
features = categorical_features + numeric_features

In [13]:
# Feature matrix: every cleaned row, restricted to the model input columns.
X = df_clean.loc[:, features]

In [14]:
# Regression target: the ROI percentage for each cleaned row.
y = df_clean.loc[:, 'ROI']

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [17]:
# Score the held-out split: predict, then report mean absolute and mean squared error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("MAE:", mae)
print("MSE:", mse)

MAE: 47.912461699626434
MSE: 97815.94931480236


In [25]:
# Build one row per held-out sample with its actual and predicted ROI.
results_df = X_test.copy()
results_df['Actual_ROI'] = y_test.values
results_df['Predicted_ROI'] = y_pred

# Attach the readable Crop/State/Season labels by row index. X_test keeps the
# index labels of the df_clean rows it was drawn from, so an index join adds
# exactly one label row per prediction. Merging on the encoded columns instead
# matches every df_clean row that shares the same (crop, state, season)
# combination and duplicates each prediction once per match.
results_df = (
    results_df
    .join(df_clean[['Crop', 'State', 'Season']], how='left')
    .reset_index(drop=True)
)

# Guard against the row inflation the key-based merge used to cause.
assert len(results_df) == len(X_test), "label join must not add or drop rows"


In [26]:
# Export the actual-vs-predicted table; the row index is not written to the file.
results_df.to_csv(path_or_buf="predicted_results_with_names.csv", index=False)

In [27]:
# Save the fitted label encoders next to the model. The model was trained on
# the integer codes these encoders produced, so scoring new raw
# Crop/State/Season values needs the same fitted mappings.
joblib.dump(
    {'Crop': le_crop, 'State': le_state, 'Season': le_season},
    "roi_label_encoders.pkl",
)

# Save model (kept as the last expression so the cell still displays
# ['roi_predictor.pkl']).
joblib.dump(model, "roi_predictor.pkl")

['roi_predictor.pkl']