# 🌾 Crop Yield Prediction using Machine Learning

This notebook implements Random Forest, XGBoost, and LSTM models for crop yield prediction.

## Features:
- Comparison of multiple ML algorithms
- Feature engineering for agricultural data
- Hyperparameter tuning
- Model performance evaluation
- Feature importance analysis

In [None]:
# Import required libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

# Deep Learning (for LSTM)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Utils
import joblib
import json
from datetime import datetime
from tqdm import tqdm

# Plot appearance: seaborn-flavoured matplotlib theme, a 12x8 default
# figure size, and the "husl" colour palette for seaborn plots
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_palette("husl")

# Confirm the imports worked and report the versions of the heavy dependencies
print("📚 Libraries imported successfully!")
print(f"XGBoost version: {xgb.__version__}")
print(f"TensorFlow version: {tf.__version__}")

In [None]:
# Configuration
N_SAMPLES = 5000  # Number of synthetic data samples
TEST_SIZE = 0.2  # presumably the held-out fraction for the train/test split -- confirm at the call site
RANDOM_STATE = 42  # presumably the shared seed for the models/splits defined later -- confirm at the call sites

# Paths
# The project root defaults to the original author's checkout. Set the
# GREENCAST_BASE_PATH environment variable to run the notebook on another
# machine without editing this cell.
BASE_PATH = Path(
    os.environ.get(
        "GREENCAST_BASE_PATH",
        "/Users/debabratapattnayak/web-dev/greencast",
    )
)
DATA_PATH = BASE_PATH / "ml_models" / "yield_prediction"
MODELS_PATH = BASE_PATH / "ml_models" / "trained_models"
RESULTS_PATH = BASE_PATH / "ml_models" / "results"

# Create directories (including missing parents; no error if they already exist)
DATA_PATH.mkdir(parents=True, exist_ok=True)
MODELS_PATH.mkdir(parents=True, exist_ok=True)
RESULTS_PATH.mkdir(parents=True, exist_ok=True)

print(f"📁 Data will be saved to: {DATA_PATH}")
print(f"📁 Models will be saved to: {MODELS_PATH}")
print(f"📁 Results will be saved to: {RESULTS_PATH}")

## 📊 Data Generation

Create synthetic agricultural data with realistic relationships between features and yield.

In [None]:
def create_agricultural_data(n_samples=5000, random_state=42):
    """Create synthetic agricultural data for demonstration.

    Draws one categorical feature (``crop_type``) and twelve normally
    distributed numeric features, clips most of them to bounded ranges, and
    derives ``yield_tons_per_hectare`` as a per-crop base multiplier scaled
    by soil pH, rainfall, temperature, fertilizer and sunlight factors plus
    multiplicative Gaussian noise. The yield is finally clipped to
    [0.5, 15].

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed passed to ``np.random.seed``. The default (42)
            reproduces the dataset this function has always generated.

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows: the 13 feature
        columns followed by ``yield_tons_per_hectare``.

    Note:
        This seeds NumPy's *global* random generator as a side effect, so
        later ``np.random`` calls in the same session are affected too.
    """

    print(f"📊 Creating agricultural data ({n_samples} samples)...")

    np.random.seed(random_state)

    # Crop types
    crop_types = ['Wheat', 'Corn', 'Rice', 'Soybean', 'Barley', 'Cotton']

    # Generate synthetic data. The order of these draws determines the
    # random stream, so keep it stable to keep the dataset reproducible.
    data = {
        'crop_type': np.random.choice(crop_types, n_samples),
        'soil_ph': np.random.normal(6.5, 0.8, n_samples),
        'rainfall_mm': np.random.normal(800, 200, n_samples),
        'humidity_percent': np.random.normal(65, 15, n_samples),
        'temperature_celsius': np.random.normal(22, 8, n_samples),
        'fertilizer_kg_per_hectare': np.random.normal(150, 50, n_samples),
        'irrigation_hours': np.random.normal(100, 30, n_samples),
        'sunlight_hours': np.random.normal(8, 2, n_samples),
        'soil_nitrogen': np.random.normal(25, 8, n_samples),
        'soil_phosphorus': np.random.normal(15, 5, n_samples),
        'soil_potassium': np.random.normal(200, 50, n_samples),
        'elevation_meters': np.random.normal(300, 150, n_samples),
        'field_size_hectares': np.random.normal(5, 2, n_samples),
    }

    df = pd.DataFrame(data)

    # Ensure realistic ranges: (lower, upper) per column, None = unbounded.
    clip_bounds = {
        'soil_ph': (4.0, 9.0),
        'rainfall_mm': (200, 2000),
        'humidity_percent': (30, 95),
        'temperature_celsius': (5, 40),
        'fertilizer_kg_per_hectare': (50, 300),
        'irrigation_hours': (20, 200),
        'sunlight_hours': (4, 12),
        'field_size_hectares': (1, 20),
        # A normal draw can fall below zero, which is meaningless for a
        # nutrient level, so floor these at zero.
        'soil_nitrogen': (0, None),
        'soil_phosphorus': (0, None),
        'soil_potassium': (0, None),
        # 'elevation_meters' is intentionally left unclipped.
    }
    for column, (lower, upper) in clip_bounds.items():
        df[column] = df[column].clip(lower=lower, upper=upper)

    # Create realistic yield based on features: start from a per-crop base
    # multiplier (kept as a local Series rather than a temporary column).
    crop_multipliers = {'Wheat': 3.5, 'Corn': 8.0, 'Rice': 4.5, 'Soybean': 2.8, 'Barley': 3.2, 'Cotton': 1.2}
    crop_multiplier = df['crop_type'].map(crop_multipliers)

    # Each factor is 1.0 at the feature's distribution mean and scales the
    # yield up or down as the feature moves away from it.
    df['yield_tons_per_hectare'] = (
        crop_multiplier *
        (1 + (df['soil_ph'] - 6.5) * 0.1) *
        (1 + (df['rainfall_mm'] - 800) / 1000) *
        (1 + (df['temperature_celsius'] - 22) * 0.02) *
        (1 + (df['fertilizer_kg_per_hectare'] - 150) / 500) *
        (1 + (df['sunlight_hours'] - 8) * 0.05) *
        np.random.normal(1, 0.15, n_samples)  # Add noise
    )

    # Ensure positive, bounded yields
    df['yield_tons_per_hectare'] = df['yield_tons_per_hectare'].clip(lower=0.5, upper=15)

    print("✅ Data created successfully!")
    print(f"📊 Shape: {df.shape}")
    print(f"🌾 Crop types: {df['crop_type'].unique()}")
    print(f"📈 Yield range: {df['yield_tons_per_hectare'].min():.2f} - {df['yield_tons_per_hectare'].max():.2f} tons/hectare")

    return df

# Generate the synthetic dataset using the configured sample count
df = create_agricultural_data(n_samples=N_SAMPLES)

# Write it to CSV in the data directory, leaving out the row index
df.to_csv(path_or_buf=DATA_PATH / "agricultural_data.csv", index=False)
print(f"💾 Dataset saved to: {DATA_PATH / 'agricultural_data.csv'}")

# Preview the first five rows (rendered as the notebook cell's output)
print("\n📊 Dataset Overview:")
df.head(5)