In [1]:
!pip install optuna
!pip install xgboost
!pip install imblearn
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

class CropPredictor:
    def __init__(self):
        self.label_encoder_soil = LabelEncoder()
        self.label_encoder_period = LabelEncoder()
        self.label_encoder_crop = LabelEncoder()
        self.scaler = StandardScaler()
        self.model = None
        self.feature_names = None

    def preprocess_data(self, data, is_training=True):
        # Create a copy of the data
        processed_data = data.copy()

        if is_training:
            # Fit and transform for training data
            processed_data['Soil Type'] = self.label_encoder_soil.fit_transform(processed_data['Soil Type'])
            processed_data['Period of Month'] = self.label_encoder_period.fit_transform(processed_data['Period of Month'])
        else:
            # Transform only for prediction data
            processed_data['Soil Type'] = self.label_encoder_soil.transform(processed_data['Soil Type'])
            processed_data['Period of Month'] = self.label_encoder_period.transform(processed_data['Period of Month'])

        # Feature Engineering
        processed_data['NPK_Ratio'] = processed_data['Nitrogen'] / (processed_data['Phosphorus'] + processed_data['Potassium'])
        processed_data['Temp_Humidity_Index'] = processed_data['Temperature'] * processed_data['Humidity'] / 100
        processed_data['Soil_Moisture_Index'] = np.log1p(processed_data['Temperature'] * processed_data['Humidity'])
        processed_data['NPK_TH_Index'] = processed_data['NPK_Ratio'] * processed_data['Temp_Humidity_Index']

        return processed_data

    def objective(self, trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=50),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'subsample': trial.suggest_float('subsample', 0.6, 0.9),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
            'gamma': trial.suggest_float('gamma', 0.01, 0.5, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1.0, log=True)
        }

        model = xgb.XGBClassifier(**params, objective='multi:softprob', eval_metric='mlogloss', random_state=42)

        cv_scores = []
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        for train_idx, val_idx in cv.split(self.X_train, self.y_train):
            X_fold_train, X_fold_val = self.X_train[train_idx], self.X_train[val_idx]
            y_fold_train, y_fold_val = self.y_train[train_idx], self.y_train[val_idx]

            model.fit(X_fold_train, y_fold_train)
            preds = model.predict(X_fold_val)
            cv_scores.append(accuracy_score(y_fold_val, preds))

        return np.mean(cv_scores)

    def train(self, data_path):
        print("Loading and preprocessing data...")
        data = pd.read_csv(data_path)

        # Encode target variable
        y = self.label_encoder_crop.fit_transform(data['Crop'])

        # Preprocess features
        X = self.preprocess_data(data.drop('Crop', axis=1), is_training=True)
        self.feature_names = X.columns.tolist()

        # Convert to numpy arrays
        X = X.values

        print("Applying SMOTE for class balancing...")
        smote = SMOTE(sampling_strategy='auto', random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X, y)

        self.X_train, X_test, self.y_train, y_test = train_test_split(
            X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
        )

        # Scale features
        self.X_train = self.scaler.fit_transform(self.X_train)
        X_test = self.scaler.transform(X_test)

        print("Optimizing hyperparameters...")
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=30, timeout=600)

        best_params = study.best_params
        print(f"Best parameters: {best_params}")
        print(f"Best CV accuracy: {study.best_value:.4f}")

        print("Training final model...")
        self.model = xgb.XGBClassifier(
            **best_params,
            objective='multi:softprob',
            eval_metric='mlogloss',
            random_state=42
        )
        self.model.fit(self.X_train, self.y_train)

        preds = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, preds)
        print(f"\nFinal Model Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, preds))

        print("Saving model and preprocessors...")
        with open('crop_prediction_model.pkl', 'wb') as f:
            pickle.dump({
                'model': self.model,
                'label_encoder_crop': self.label_encoder_crop,
                'label_encoder_soil': self.label_encoder_soil,
                'label_encoder_period': self.label_encoder_period,
                'scaler': self.scaler,
                'feature_names': self.feature_names
            }, f)

        return accuracy

    def predict_crop(self, input_data):
        # Convert input to DataFrame if it's a dictionary
        if isinstance(input_data, dict):
            input_data = pd.DataFrame([input_data])

        # Preprocess the input data
        processed_data = self.preprocess_data(input_data, is_training=False)

        # Ensure column order matches training data
        processed_data = processed_data[self.feature_names]

        # Scale the features
        scaled_data = self.scaler.transform(processed_data)

        # Make prediction
        prediction = self.model.predict(scaled_data)
        probabilities = self.model.predict_proba(scaled_data)

        predicted_crop = self.label_encoder_crop.inverse_transform(prediction)
        return predicted_crop[0], probabilities[0]

def main():
    predictor = CropPredictor()
    accuracy = predictor.train('updated_crop_data_cleaned.csv')

    # Sample inputs
    sample_inputs = [
        {
            'Nitrogen': 90,
            'Phosphorus': 42,
            'Potassium': 43,
            'Temperature': 25,
            'Humidity': 82,
            'pH': 6.5,
            'Soil Type': 'Clayey',
            'Period of Month': 'Kharif'
        },
        {
            'Nitrogen': 120,
            'Phosphorus': 35,
            'Potassium': 30,
            'Temperature': 28,
            'Humidity': 75,
            'pH': 7.0,
            'Soil Type': 'Loamy',
            'Period of Month': 'Rabi'
        }
    ]

    print("\nMaking predictions for sample inputs:")
    for i, sample in enumerate(sample_inputs, 1):
        predicted_crop, probabilities = predictor.predict_crop(sample)
        print(f"\nSample {i}:")
        print(f"Input Parameters: {sample}")
        print(f"Predicted Crop: {predicted_crop}")
        print(f"Confidence: {max(probabilities)*100:.2f}%")

        top_3_indices = np.argsort(probabilities)[-3:][::-1]
        print("\nTop 3 Crop Recommendations:")
        for idx in top_3_indices:
            crop_name = predictor.label_encoder_crop.inverse_transform([idx])[0]
            print(f"{crop_name}: {probabilities[idx]*100:.2f}%")

if __name__ == "__main__":
    main()


Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

[I 2025-02-19 03:47:20,532] A new study created in memory with name: no-name-afe061d9-4716-4751-acb3-22abee43f831


Optimizing hyperparameters...


[I 2025-02-19 03:53:07,901] Trial 0 finished with value: 0.5424138631758261 and parameters: {'n_estimators': 250, 'max_depth': 4, 'learning_rate': 0.04614351715100795, 'subsample': 0.6814878649620414, 'colsample_bytree': 0.6272369503240611, 'gamma': 0.02971621495434198, 'reg_lambda': 0.002162021428336622, 'reg_alpha': 0.23462801694990357}. Best is trial 0 with value: 0.5424138631758261.
[I 2025-02-19 04:04:22,882] Trial 1 finished with value: 0.7568738913029235 and parameters: {'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.0896345752793483, 'subsample': 0.700453763579332, 'colsample_bytree': 0.8328211994431439, 'gamma': 0.05347100476941819, 'reg_lambda': 0.03770223711662937, 'reg_alpha': 0.3800997235144518}. Best is trial 1 with value: 0.7568738913029235.


Best parameters: {'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.0896345752793483, 'subsample': 0.700453763579332, 'colsample_bytree': 0.8328211994431439, 'gamma': 0.05347100476941819, 'reg_lambda': 0.03770223711662937, 'reg_alpha': 0.3800997235144518}
Best CV accuracy: 0.7569
Training final model...

Final Model Accuracy: 0.7829

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       219
           1       1.00      1.00      1.00       220
           2       0.65      0.68      0.67       220
           3       0.65      0.65      0.65       220
           4       0.99      0.99      0.99       219
           5       0.95      0.98      0.96       220
           6       0.65      0.59      0.61       220
           7       0.61      0.61      0.61       220
           8       0.99      1.00      0.99       220
           9       0.64      0.59      0.61       219
          10       0.97      1.00      0

