In [2]:
# Import necessary libraries
import pickle

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
# Step 1: Data Preprocessing
# Read the raw CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer="coconut_data.csv")

In [4]:
# Handle missing values: discard every row that has at least one
# missing entry (row-wise, "any" — the pandas defaults, spelled out).
data = data.dropna(axis=0, how="any")

In [5]:
# Display the (rows, columns) shape of `data` as it stands after the
# missing-value rows were dropped in the previous cell.
data.shape

(2562, 10)

In [6]:
# Columns that are label-encoded to integer codes in the next cell.
categorical_cols = [
    'Region',
    'District',
    'Variety',
    'Rainfed/Irrigated',
    'Soil Type',
    'Intercropping',
    'Pest/Disease Pressure',
]
# Column name -> fitted LabelEncoder; filled in by the encoding loop and
# saved later so the same mapping can be reused.
label_encoders = dict()

In [7]:
# Fit one LabelEncoder per categorical column, keep it for later reuse,
# and overwrite the column in `data` with its integer codes.
for col in categorical_cols:
    le = LabelEncoder().fit(data[col])
    label_encoders[col] = le
    data[col] = le.transform(data[col])

In [8]:
# Separate the target column (y) from the remaining columns (X).
target_col = 'Average Yield (Nuts/Tree/Year)'
y = data[target_col]
X = data.drop(columns=[target_col])

In [9]:
# Standardize every feature column: learn the per-column statistics
# from X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [10]:
# Split the dataset into training and testing sets.
# The split is done on the *unscaled* features and the scaler is then
# re-fitted on the training rows only. Fitting the scaler on the full
# dataset (as done in the earlier scaling cell) lets test-set statistics
# leak into preprocessing; re-fitting here replaces that scaler, so the
# object pickled later is the leakage-free one the model was trained with.
# The same rows end up in train/test as before because the row count and
# random_state are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training rows only
X_test = scaler.transform(X_test_raw)        # apply the training statistics to the test rows


In [11]:
# Step 2: Model Building and Hyperparameter Tuning
# Define the Random Forest model
# This is the base (untuned) estimator handed to GridSearchCV below;
# random_state=42 pins the forest's randomness so runs are repeatable.
rf = RandomForestRegressor(random_state=42)

In [12]:
# Hyperparameter grid explored by GridSearchCV:
# 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [13]:
# Exhaustively search param_grid with 5-fold cross-validation on the
# training split, scoring by R^2 and using all available cores.
# fit() returns the fitted search object, so it can be chained.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the estimator refitted with the best-scoring parameter combination.
best_rf = grid_search.best_estimator_

In [14]:
# Step 3: Model Evaluation
# Score the tuned model on the held-out test split.
y_pred = best_rf.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report the chosen hyperparameters and both metrics, one per line.
print(
    f"Best Parameters: {grid_search.best_params_}",
    f"Mean Squared Error: {mse}",
    f"R-Squared: {r2}",
    sep="\n",
)

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Mean Squared Error: 204.68312599523324
R-Squared: 0.5529679780232543


In [15]:
# Step 4: Save the model, scaler and label encoders as .pkl files
# (`pickle` is imported in the notebook's import cell.)
# One mapping of output filename -> object replaces three copy-pasted
# `with` blocks; files are written in the same order as before.
# NOTE: pickle files can execute arbitrary code when loaded — only load
# these artifacts from a trusted location.
artifacts = {
    'rf_model.pkl': best_rf,
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Model, Scaler, and Label Encoders saved successfully.")

Model, Scaler, and Label Encoders saved successfully.
