# Evaluation and Validation Notebook

In [1]:
# Import libraries
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the feature-engineered dataset and derive the evaluation feature
# matrix (X_test) and target vector (y_test) from it.
data_path = './data/processed/feature_engineered_data.csv'
df = pd.read_csv(data_path)

# NOTE(review): 'yield_per_area' is used both as an input feature and as the
# target below, which looks like target leakage. It is kept because the saved
# model presumably expects exactly these columns -- confirm against the
# training notebook before removing it.
# NOTE(review): every row of the CSV is treated as "test" data; no train/test
# split is visible in this notebook -- verify these rows were held out.
feature_columns = [
    'population_density',
    'yield_per_area',
    'pest_intensity',
    'tempChange(C)',
    'rainfall',
]
target_column = 'yield_per_area'

X_test = df[feature_columns]
y_test = df[target_column]

In [3]:
# Restore the fitted estimator from disk (assumed to have been saved by the
# previous notebook) and generate predictions for the evaluation features.
import joblib

# NOTE: joblib files are pickle-based; only load model files from a trusted source.
model_path = './models/random_forest_model.pkl'
model = joblib.load(model_path)

# One prediction per row of X_test.
y_pred = model.predict(X_test)

In [4]:
# 1. Evaluation Metrics
# Compare the model's predictions (y_pred) with the true targets (y_test)
# using RMSE, MAE and R^2, then print the three scores.
#
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the old call raises
# a TypeError on current releases. This form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model Evaluation Metrics on Test Data:")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2 Score: {r2}")

Model Evaluation Metrics on Test Data:
RMSE: 92.93526177241449
MAE: 1.3573058483523146
R2 Score: 0.5943871580715364




In [5]:
# 2. Cross-Validation for Robustness
# Run 5-fold cross-validation scored by negated MSE.
# NOTE: cross_val_score fits fresh clones of `model` on folds of X_test/y_test,
# so this measures how the estimator performs when refit on this data, not the
# accuracy of the already-trained model loaded from disk.
cv_scores = cross_val_score(
    model,
    X_test,
    y_test,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Scores are negated MSE values: flip the sign of the mean, then take the
# square root. This is the root of the mean fold MSE, not the mean fold RMSE.
mean_cv_mse = -cv_scores.mean()
cv_rmse = mean_cv_mse ** 0.5
print(f"Cross-validated RMSE: {cv_rmse}")

Cross-validated RMSE: 143.4265280493462


In [2]:
# 3. Error Analysis
# Signed prediction errors (true minus predicted); also reused by the
# residual plot further down.
errors = y_test - y_pred

# Histogram of the errors with a KDE overlay, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(errors, kde=True, color='purple', ax=ax)
ax.set_title('Distribution of Prediction Errors')
ax.set_xlabel('Prediction Error')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# 4. Residual Analysis
# Scatter the prediction errors against the true target values; the dashed
# line at zero marks a perfect prediction.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=errors, color='red', ax=ax)
ax.axhline(0, linestyle='--', color='gray')
ax.set_title('Residual Analysis')
ax.set_xlabel('True Yield per Area')
ax.set_ylabel('Prediction Error (Residuals)')
plt.show()
