# Crop Yield Prediction in Tanzania
### Machine Learning Project

**Objective:** Predict crop yield (kg per acre) for Tanzanian farmers using Linear Regression and Decision Tree models.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')

## 2. Load and Explore Data

In [None]:
# Read the crop records from the CSV file expected in the working directory
df = pd.read_csv(filepath_or_buffer='crop_data.csv')

# Preview the top of the table; in a notebook the trailing expression
# is what gets rendered as the cell output
print('First 5 rows of the dataset:')
df.head(n=5)

In [None]:
# Report the table dimensions
n_rows, n_cols = df.shape
print(f'Dataset has {n_rows} rows and {n_cols} columns')
print()

# Report the dtype pandas inferred for each column
print('Data types:')
print(df.dtypes)

In [None]:
# Summary statistics: with its default arguments, describe() reports
# count, mean, std, min, quartiles and max for the numeric columns.
# Kept as the bare final expression so the notebook renders the table.
df.describe()

In [None]:
# Count the NaN entries in every column before any cleaning is applied
missing_per_column = df.isna().sum()
print('Missing values in each column:')
print(missing_per_column)

## 3. Data Preprocessing

In [None]:
# Mean-impute the three numeric feature columns, one column at a time.
# NOTE(review): each mean is computed over the full dataset, i.e. before the
# train/test split performed later in this notebook, so rows that end up in
# the test set contribute to the fill values.
for column in ('rainfall_mm', 'temperature_c', 'fertilizer_kg'):
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

# Re-print the per-column NaN counts to confirm the result of the imputation
print('Missing values after cleaning:')
print(df.isna().sum())

In [None]:
# Replace each categorical column with integer codes from its own LabelEncoder.
# The fitted encoders are kept in `label_encoders`, keyed by column name,
# because they are pickled at the end of the notebook.
categorical_columns = ['region', 'crop_type', 'soil_type']

# Fit one encoder per column first (dicts preserve insertion order) ...
label_encoders = {
    name: LabelEncoder().fit(df[name]) for name in categorical_columns
}

# ... then overwrite each column with its codes and list the learned classes
for name, encoder in label_encoders.items():
    df[name] = encoder.transform(df[name])
    print(f'{name}: {list(encoder.classes_)}')

print()
print('Data after encoding:')
df.head()

In [None]:
# Separate the prediction target from the feature matrix:
# y is the yield column, X is every remaining column of df
target_column = 'yield_kg_per_acre'
y = df[target_column]
X = df.drop(columns=[target_column])

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')

In [None]:
# Hold out 20% of the rows for testing (80% train);
# the fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f'Training set: {len(X_train)} samples')
print(f'Testing set: {len(X_test)} samples')

## 4. Model Training

### 4.1 Linear Regression

In [None]:
# Fit a Linear Regression model on the training split
# (fit() returns the estimator itself, so construction and fitting are chained)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test rows
lr_predictions = lr_model.predict(X_test)

print('Linear Regression model trained successfully!')

### 4.2 Decision Tree

In [None]:
# Fit a Decision Tree regressor on the training split; random_state is fixed
# so repeated runs build the same tree
# (fit() returns the estimator itself, so construction and fitting are chained)
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the target for the held-out test rows
dt_predictions = dt_model.predict(X_test)

print('Decision Tree model trained successfully!')

## 5. Model Evaluation and Comparison

In [None]:
# Score both models on the test split: MAE (in the target's units) and R2
lr_r2 = r2_score(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)
dt_r2 = r2_score(y_test, dt_predictions)
dt_mae = mean_absolute_error(y_test, dt_predictions)

# Print a small side-by-side report
banner = '=' * 50
print(banner)
print('MODEL COMPARISON RESULTS')
print(banner)
report_rows = (
    ('Linear Regression', lr_mae, lr_r2),
    ('Decision Tree', dt_mae, dt_r2),
)
for model_label, mae, r2 in report_rows:
    print(f'\n{model_label}:')
    print(f'  MAE  = {mae:.2f} kg/acre')
    print(f'  R2   = {r2:.4f}')
print()

# Pick the winner by R2: Linear Regression only when its R2 is strictly
# higher, so a tie goes to the Decision Tree
if lr_r2 > dt_r2:
    best_name, best_model = 'Linear Regression', lr_model
else:
    best_name, best_model = 'Decision Tree', dt_model
print(f'>>> {best_name} is the BETTER model!')

## 6. Visualizations

In [None]:
# Chart 1: Model Comparison - R2 Score and MAE
# Two side-by-side bar charts (R2 on the left, MAE on the right), each bar
# annotated with its value, saved to model_comparison.png.
models = ['Linear Regression', 'Decision Tree']
colors = ['#3498db', '#2ecc71']
r2_scores = [lr_r2, dt_r2]
mae_scores = [lr_mae, dt_mae]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (axes, values, title, y-label, value format) for each panel
panels = [
    (axes[0], r2_scores, 'R2 Score Comparison', 'R2 Score', '%.4f'),
    (axes[1], mae_scores, 'MAE Comparison (lower is better)',
     'Mean Absolute Error', '%.2f'),
]

for ax, values, title, ylabel, fmt in panels:
    bars = ax.bar(models, values, color=colors)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    # bar_label places each label a fixed number of points beyond the bar end,
    # so the spacing no longer depends on the data scale (the previous
    # hard-coded offsets of 0.01 and 2 data units did) and labels of negative
    # bars, e.g. a negative R2, are drawn outside the bar instead of inside it.
    ax.bar_label(bars, fmt=fmt, padding=3, fontweight='bold')
    # Extra vertical margin so the labels are not clipped at the axes edge
    ax.margins(y=0.1)

plt.tight_layout()
plt.savefig('model_comparison.png')
plt.show()

In [None]:
# Chart 2: Actual vs Predicted values
# One scatter panel per model; points on the dashed red diagonal are
# predictions that equal the actual value.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# The reference diagonal spans the observed range of the test targets
y_low, y_high = y_test.min(), y_test.max()

panels = (
    ('Linear Regression', lr_predictions, '#3498db'),
    ('Decision Tree', dt_predictions, '#2ecc71'),
)
for ax, (model_label, predicted, point_color) in zip(axes, panels):
    ax.scatter(y_test, predicted, alpha=0.5, color=point_color)
    ax.plot([y_low, y_high], [y_low, y_high], 'r--')
    ax.set_xlabel('Actual Yield')
    ax.set_ylabel('Predicted Yield')
    ax.set_title(f'{model_label}: Actual vs Predicted')

plt.tight_layout()
plt.savefig('actual_vs_predicted.png')
plt.show()

In [None]:
# Chart 3: Feature Importance from Decision Tree
# Horizontal bar chart of the fitted tree's feature_importances_,
# saved to feature_importance.png.
feature_names = X.columns
importances = dt_model.feature_importances_

# Ascending order, so the most important feature ends up as the top bar
sorted_idx = np.argsort(importances)
bar_positions = np.arange(len(sorted_idx))
ranked_labels = feature_names[sorted_idx].tolist()

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, importances[sorted_idx], color='#e74c3c')
ax.set_yticks(bar_positions)
ax.set_yticklabels(ranked_labels)
ax.set_xlabel('Feature Importance')
ax.set_title('Decision Tree - Feature Importance')
fig.tight_layout()
fig.savefig('feature_importance.png')
plt.show()

In [None]:
# Chart 4: Distribution of Crop Yield
# 30-bin histogram of the target column over the whole dataset,
# saved to yield_distribution.png.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(
    df['yield_kg_per_acre'],
    bins=30,
    color='#9b59b6',
    edgecolor='black',
)
ax.set(
    xlabel='Yield (kg per acre)',
    ylabel='Frequency',
    title='Distribution of Crop Yield',
)
fig.tight_layout()
fig.savefig('yield_distribution.png')
plt.show()

## 7. Save the Best Model

In [None]:
# Pickle the selected model and the fitted label encoders into the working
# directory (the encoders are needed for the app)
artifacts = (
    ('model.pkl', best_model),
    ('encoders.pkl', label_encoders),
)
for filename, payload in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)

print(f'Best model ({best_name}) saved as model.pkl')
print('Label encoders saved as encoders.pkl')
print('\nProject complete!')