In [125]:
# Import necessary libraries
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import zscore
import joblib

In [126]:
# Read the raw dataset from the CSV file next to the notebook
dataset_path = 'coconut_data.csv'
data = pd.read_csv(dataset_path)

In [127]:
# Replace every object-typed (categorical) column with integer codes.
# One encoder is created per column and kept in `label_encoders`, keyed by
# column name, so the mapping can be inverted or reused later.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

In [128]:
# Outlier Detection and Removal
# Keep only rows whose Z-score is below 3 (in absolute value) for every
# genuinely numerical column.
#
# Label-encoded categorical columns are excluded on purpose: their integer
# codes are arbitrary labels, so a Z-score computed on them carries no meaning.
# Selecting by the generic 'number' dtype (instead of only float64/int64) also
# makes the set of checked columns independent of the platform's integer width.
numerical_features = [
    column
    for column in data.select_dtypes(include='number').columns
    if column not in label_encoders
]
data = data[(np.abs(zscore(data[numerical_features])) < 3).all(axis=1)]

In [129]:
# Separate the target column (y) from the remaining feature columns (X)
target_column = 'Average Yield (Nuts/Tree/Year)'
y = data[target_column]
X = data.drop(columns=[target_column])

In [130]:
# Feature Engineering: Add Polynomial Features for non-linear relationships
# degree=2 with interaction_only=True adds every pairwise product of the input
# columns (no squared terms); include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Rebuild a labelled DataFrame from the transformed array. The original row
# index is carried over explicitly: rows were removed earlier, so X (and y) no
# longer have a contiguous 0..n-1 index, and a default RangeIndex here would
# leave X_poly and y with mismatched labels for any index-aligned operation.
X_poly = pd.DataFrame(
    poly.fit_transform(X),
    columns=poly.get_feature_names_out(X.columns),
    index=X.index,
)

In [131]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility)
split_parts = train_test_split(
    X_poly,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [132]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [133]:
# Step 2: Define Models and Hyperparameter Tuning

# Models with expanded parameter tuning
# Random Forest Regressor: base estimator plus the grid of values to search
rf = RandomForestRegressor(random_state=42)
rf_params = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 20, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [134]:
# XGBoost Regressor: base estimator plus the grid of values to search
xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)
xgb_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[6, 10, 15],
)

In [135]:
# Gradient Boosting Regressor: base estimator plus the grid of values to search
gbr = GradientBoostingRegressor(random_state=42)
gbr_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 10],
)

In [136]:
# Step 3: Training Models with Cross-Validation

def evaluate_model(model, X_train, y_train):
    """Print the cross-validated RMSE of `model` on the given training data.

    Runs 5-fold cross-validation (shuffled, seed 42) scored with negative
    mean squared error, converts each fold's score to an RMSE, and prints
    the mean and standard deviation across folds. Returns nothing.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X_train : feature matrix used for cross-validation.
    y_train : target values matching `X_train`.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        cv=folds,
        scoring='neg_mean_squared_error',
    )
    # Scores are negated MSE values, so flip the sign before taking the root.
    rmse_scores = np.sqrt(-neg_mse_per_fold)
    print(f"Cross-Validated RMSE: {rmse_scores.mean():.4f} ± {rmse_scores.std():.4f}")

In [137]:
# Grid search for the Random Forest: 5-fold CV over rf_params, scored by
# negative MSE, using all available cores. The refitted best estimator is
# then re-scored with evaluate_model.
print("\nTraining Random Forest...")
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)
print("Best Random Forest Parameters:", rf_grid.best_params_)
best_rf = rf_grid.best_estimator_
evaluate_model(best_rf, X_train, y_train)


Training Random Forest...
Best Random Forest Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 500}
Cross-Validated RMSE: 13.1400 ± 0.2088


In [138]:
# Grid search for XGBoost: 5-fold CV over xgb_params, scored by negative MSE,
# using all available cores. The refitted best estimator is then re-scored
# with evaluate_model.
print("\nTraining XGBoost...")
xgb_grid = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)
print("Best XGBoost Parameters:", xgb_grid.best_params_)
best_xgb = xgb_grid.best_estimator_
evaluate_model(best_xgb, X_train, y_train)


Training XGBoost...




Best XGBoost Parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}
Cross-Validated RMSE: 12.6066 ± 0.1617


In [139]:
# Grid search for Gradient Boosting: 5-fold CV over gbr_params, scored by
# negative MSE, using all available cores. The refitted best estimator is
# then re-scored with evaluate_model.
print("\nTraining Gradient Boosting...")
gbr_grid = GridSearchCV(
    estimator=gbr,
    param_grid=gbr_params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)
print("Best Gradient Boosting Parameters:", gbr_grid.best_params_)
best_gbr = gbr_grid.best_estimator_
evaluate_model(best_gbr, X_train, y_train)


Training Gradient Boosting...
Best Gradient Boosting Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}
Cross-Validated RMSE: 12.4394 ± 0.1562


In [145]:
# Step 4: Ensemble and Final Model Selection
# Combine predictions from the best models by averaging them (a simple averaging ensemble, not stacking)

# Function to calculate evaluation metrics
def final_evaluation(y_true, y_pred):
    """Print MAE, MSE, RMSE and R^2 for a set of predictions.

    Parameters
    ----------
    y_true : ground-truth target values.
    y_pred : predicted values, in the same order as `y_true`.

    The four metrics are printed one per line; nothing is returned.
    """
    # Compute everything first, then report in a fixed order.
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R^2 Score: {r2}")

In [146]:
# Predict on the held-out test set with each tuned model, in the order
# Random Forest, XGBoost, Gradient Boosting
rf_predictions, xgb_predictions, gbr_predictions = (
    fitted_model.predict(X_test)
    for fitted_model in (best_rf, best_xgb, best_gbr)
)

In [147]:
# Averaging Ensemble: unweighted mean of the three models' predictions
prediction_total = rf_predictions + xgb_predictions
prediction_total = prediction_total + gbr_predictions
ensemble_predictions = prediction_total / 3

In [148]:
# Report the averaged ensemble's metrics on the held-out test set
print("\nEvaluating Ensemble Model on Test Data:")
final_evaluation(
    y_true=y_test,
    y_pred=ensemble_predictions,
)


Evaluating Ensemble Model on Test Data:
MAE: 11.013752411357265
MSE: 164.1146916277879
RMSE: 12.810725647979035
R^2 Score: 0.6097582729139697


In [149]:
# Step 5: Save the Ensemble Model (save the components individually for later ensemble)
# The output directory is created up front: opening a file inside a missing
# directory raises FileNotFoundError, which would otherwise only surface here,
# after all of the grid searches above have already been run.
models_dir = Path('./models')
models_dir.mkdir(parents=True, exist_ok=True)

# File name -> fitted estimator. The averaging ensemble has no object of its
# own, so each component model is pickled separately.
fitted_models = {
    'random_forest_model.pkl': best_rf,
    'xgboost_model.pkl': best_xgb,
    'gradient_boosting_model.pkl': best_gbr,
}
for file_name, fitted_model in fitted_models.items():
    with open(models_dir / file_name, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

print("\nIndividual models saved as 'random_forest_model.pkl', 'xgboost_model.pkl', and 'gradient_boosting_model.pkl'")


Individual models saved as 'random_forest_model.pkl', 'xgboost_model.pkl', and 'gradient_boosting_model.pkl'


In [150]:
# Save the fitted preprocessing objects (scaler, label encoders, polynomial
# feature transformer).
# The output directory is created first so the cell does not fail with
# FileNotFoundError when ./helper does not exist yet.
helper_dir = Path('./helper')
helper_dir.mkdir(parents=True, exist_ok=True)

# File name -> fitted preprocessing object
helper_artifacts = {
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
    'polynomial_features.pkl': poly,
}
for file_name, artifact in helper_artifacts.items():
    with open(helper_dir / file_name, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("polynomial_features , Scaler, and Label Encoders saved successfully.")

polynomial_features , Scaler, and Label Encoders saved successfully.


In [151]:
# Display the fitted PolynomialFeatures transformer as the cell output.
poly

In [152]:
# Display the dict of fitted LabelEncoder objects, keyed by the column each one encoded.
label_encoders

{'Region': LabelEncoder(),
 'District': LabelEncoder(),
 'Variety': LabelEncoder(),
 'Rainfed/Irrigated': LabelEncoder(),
 'Soil Type': LabelEncoder(),
 'Intercropping': LabelEncoder(),
 'Pest/Disease Pressure': LabelEncoder()}

In [115]:
# Inspect the feature names the scaler recorded when it was fitted.
# NOTE(review): this cell's execution count (115) is lower than that of the cells above it,
# so the output shown comes from an earlier run — re-run after a kernel restart to confirm.
scaler.feature_names_in_

array(['Region', 'District', 'Variety', 'Age (Years)',
       'Rainfed/Irrigated', 'Soil Type', 'Intercropping',
       'Pest/Disease Pressure', 'Fertilizer Use (kg/tree/year)',
       'Region District', 'Region Variety', 'Region Age (Years)',
       'Region Rainfed/Irrigated', 'Region Soil Type',
       'Region Intercropping', 'Region Pest/Disease Pressure',
       'Region Fertilizer Use (kg/tree/year)', 'District Variety',
       'District Age (Years)', 'District Rainfed/Irrigated',
       'District Soil Type', 'District Intercropping',
       'District Pest/Disease Pressure',
       'District Fertilizer Use (kg/tree/year)', 'Variety Age (Years)',
       'Variety Rainfed/Irrigated', 'Variety Soil Type',
       'Variety Intercropping', 'Variety Pest/Disease Pressure',
       'Variety Fertilizer Use (kg/tree/year)',
       'Age (Years) Rainfed/Irrigated', 'Age (Years) Soil Type',
       'Age (Years) Intercropping', 'Age (Years) Pest/Disease Pressure',
       'Age (Years) Fertilizer Use

In [117]:
# Requires `scaler` to have been fitted already (on input that carried column names)
feature_names = scaler.feature_names_in_

# List the recorded feature names, one per line
for column_name in feature_names:
    print(column_name)

Region
District
Variety
Age (Years)
Rainfed/Irrigated
Soil Type
Intercropping
Pest/Disease Pressure
Fertilizer Use (kg/tree/year)
Region District
Region Variety
Region Age (Years)
Region Rainfed/Irrigated
Region Soil Type
Region Intercropping
Region Pest/Disease Pressure
Region Fertilizer Use (kg/tree/year)
District Variety
District Age (Years)
District Rainfed/Irrigated
District Soil Type
District Intercropping
District Pest/Disease Pressure
District Fertilizer Use (kg/tree/year)
Variety Age (Years)
Variety Rainfed/Irrigated
Variety Soil Type
Variety Intercropping
Variety Pest/Disease Pressure
Variety Fertilizer Use (kg/tree/year)
Age (Years) Rainfed/Irrigated
Age (Years) Soil Type
Age (Years) Intercropping
Age (Years) Pest/Disease Pressure
Age (Years) Fertilizer Use (kg/tree/year)
Rainfed/Irrigated Soil Type
Rainfed/Irrigated Intercropping
Rainfed/Irrigated Pest/Disease Pressure
Rainfed/Irrigated Fertilizer Use (kg/tree/year)
Soil Type Intercropping
Soil Type Pest/Disease Pressure
So

In [121]:
# Show the (rows, columns) shape of the training feature matrix.
# NOTE(review): this cell's execution count (121) is lower than that of the training cells above,
# so the output shown comes from an earlier run — re-run after a kernel restart to confirm.
X_train.shape

(9593, 45)