In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input mount, one per line.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import required libraries


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option("display.max_columns", None)
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the crop recommendation CSV from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/crop-recommendation-dataset/Crop_recommendation.csv")

# Show the (rows, columns) shape as the cell output.
df.shape

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Count how many rows carry each value of the 'label' column.
df["label"].value_counts()

In [None]:
# Summary statistics of the dataset's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Every column whose dtype is not object is treated as a numeric feature.
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']

# Size the subplot grid from the number of features instead of a fixed 60x3 layout,
# so the figure is no taller than it needs to be.
n_cols = 3
n_rows = max(1, -(-len(numeric_features) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes_flat = axes.ravel()

for ax, col in zip(axes_flat, numeric_features):
    # histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
    sns.histplot(x=df[col], kde=True, stat='density', color='indianred', ax=ax)
    ax.set_xlabel(col, weight='bold')

# Hide the grid cells that have no feature to show.
for ax in axes_flat[len(numeric_features):]:
    ax.set_visible(False)

# Lay the figure out once, after all panels are drawn.
fig.tight_layout()

In [None]:
# Bar chart of the number of rows per 'label' value, drawn on a wide figure.
fig, ax = plt.subplots(figsize=(25, 4))
sns.countplot(x=df["label"], ax=ax)
plt.show()

In [None]:
# Global plot styling for the figures that follow.
plt.style.use('ggplot')
sns.set_palette("hls", 8)

# For every column except the last one, draw a histogram (with KDE), a violin plot
# and a box plot side by side.
for i in df.columns[:-1]:
    fig, (hist_ax, violin_ax, box_ax) = plt.subplots(1, 3, figsize=(18, 4))
    sns.histplot(data=df, x=i, kde=True, bins=40, ax=hist_ax)
    sns.violinplot(data=df, x=i, ax=violin_ax)
    sns.boxplot(data=df, x=i, ax=box_ax)
    fig.suptitle(f'Visualizing {i}', size=20)

In [None]:
# Preview the first rows of df again.
df.head()

In [None]:
# Mean of every column per 'label' value, with 'label' kept as a regular column.
df_grouped = df.groupby('label', as_index=False).mean()
df_grouped

In [None]:
# For each column of df_grouped after the first, print the five rows with the largest value.
for i in df_grouped.columns[1:]:
    print(f'-------------------------------')
    print(f'Top 5 Most {i} requiring crops:')
    print(f'--------------------------------')
    highest_five = df_grouped.sort_values(by=i, ascending=False).head(5)
    for j, k in highest_five[['label', i]].to_numpy():
        print(f'{j} --> {k}')


In [None]:
# For each column of df_grouped after the first, print the five rows with the smallest value.
for i in df_grouped.columns[1:]:
    print(f'-------------------------------')
    print(f'Top 5 Least {i} requiring crops:')
    print(f'--------------------------------')
    lowest_five = df_grouped.sort_values(by=i, ascending=True).head(5)
    for j, k in lowest_five[['label', i]].to_numpy():
        print(f'{j} --> {k}')

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns except 'label'.
feature_corr = df.drop(columns='label').corr()
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(feature_corr, annot=True, ax=ax)
plt.show()

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Rebind df to a deep copy of itself before the modelling steps.
df = df.copy(deep=True)

# Preview the first two rows.
df.head(2)

In [None]:
# Target is the 'label' column; features are every other column.
y = df['label']
X = df.drop(columns='label')

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Display the target series.
y

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label-to-integer mapping from y, then encode y with the fitted encoder.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

## Split the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=12)

## Scale the dataset

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the training rows only, so no test-set statistics leak into the scaling.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)

# Apply the training statistics to the test rows and keep the column names, so that
# estimators fitted on X_train see the same feature names at predict time.
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

X_train.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score

# Baseline: logistic regression with default settings, fitted on the training split.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Score the held-out predictions; precision uses the "weighted" average across classes.
acc = accuracy_score(y_test, y_pred)
p_score = precision_score(y_test, y_pred, average="weighted")

print("Accuracy Score: ", acc)
print("Precision Score: ", p_score)

In [None]:
def evaluate_model(true, predicted, average="weighted"):
    """Return accuracy and precision for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Predicted labels, aligned with ``true``.
    average : str, default="weighted"
        Averaging mode forwarded to ``precision_score``. The default matches the
        other precision computations in this notebook and works for multiclass
        targets; scikit-learn's own default (``"binary"``) raises ``ValueError``
        on multiclass labels. Pass ``average="binary"`` for the previous behaviour.

    Returns
    -------
    tuple
        ``(accuracy, precision)``.
    """
    acc = accuracy_score(true, predicted)
    p_score = precision_score(true, predicted, average=average)

    return acc, p_score

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, classification_report, roc_auc_score
from sklearn.pipeline import Pipeline

def fit_classification_models(X, y, test_size=0.2, random_state=42):
    """
    Fit and score XGBoost, RandomForest, and ExtraTrees classifiers on one train/test split.

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        The input data.

    y : array-like of shape (n_samples,)
        The target variable. It is label-encoded inside the function.

    test_size : float, default=0.2
        The proportion of the dataset to include in the test split.

    random_state : int, default=42
        Seed used for the dataset split and for every classifier, so repeated
        calls with the same inputs give the same metrics.

    Returns:
    --------
    results : dict
        Maps each classifier name to ``{'Accuracy': ..., 'Precision': ...}``,
        where precision is the weighted average across classes, computed on the
        held-out test split.
    """
    # Encode the target labels as integers.
    y = LabelEncoder().fit_transform(y)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Seed every estimator: all three are randomized, and leaving them unseeded
    # makes the comparison table change from run to run.
    classifiers = {
        'XGBoost': XGBClassifier(random_state=random_state),
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'ExtraTreesClassifier': ExtraTreesClassifier(random_state=random_state),
    }

    # Fit each classifier behind a StandardScaler and score it on the test split.
    results = {}

    for classifier_name, classifier in classifiers.items():
        pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', classifier)])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        p_score = precision_score(y_test, y_pred, average="weighted")
        results[classifier_name] = {'Accuracy': score, 'Precision': p_score}

    return results


In [None]:
# Compare the classifiers using the training portion only: the function makes its own
# internal train/test split of the rows it is given.
results = fit_classification_models(X_train, y_train)

In [None]:
# Build a table from the results: one row per classifier, one column per metric.
# A dedicated name is used so the dataset held in `df` is not overwritten.
results_df = pd.DataFrame(results).transpose()

# Sort the table by precision in descending order (best model first)
df_sorted = results_df.sort_values(by='Precision', ascending=False)

# Display the sorted DataFrame
df_sorted

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, accuracy_score

def rf_objective(trial):
    """Optuna objective: mean 10-fold CV accuracy of a RandomForestClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to suggest ``n_estimators``, ``max_depth`` and
        ``min_samples_split``.

    Returns
    -------
    float
        Mean accuracy over the 10 cross-validation folds.

    Notes
    -----
    Cross-validation runs on the notebook-level ``X_train`` / ``y_train`` only.
    Tuning on the full ``X`` / ``y`` would include the held-out test rows and make
    the later evaluation on ``X_test`` optimistic.
    """
    # Use the trial object to suggest values for the RandomForestClassifier hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 100, 500, step=100)
    max_depth = trial.suggest_int("max_depth", 5, 15)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

    # Define the model with the suggested hyperparameters
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42,
    )

    # 10-fold cross-validation on the training split, scored by accuracy.
    # Replace "accuracy" with another scorer name to optimize a different metric.
    scores = cross_val_score(
        rf,
        X_train,
        y_train,
        cv=KFold(n_splits=10, shuffle=True, random_state=42),
        scoring="accuracy",
    )

    # Return the mean of 10 scores
    return scores.mean()

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Create the study with a seeded TPE sampler (TPE is Optuna's default sampler), so the
# sequence of suggested hyperparameters, and therefore best_params, is repeatable.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize the study, use more trials to obtain a better result, use fewer trials to be more cost-efficient
study.optimize(rf_objective, n_trials=10)  # Use more trials for better results

# Print the result
best_params = study.best_params
best_score = study.best_value
print(f"Best score: {best_score}\n")
print(f"Optimized parameters: {best_params}\n")

In [None]:
# Train the final forest with the hyperparameters selected by the Optuna study
# (best_params holds n_estimators, max_depth and min_samples_split), rather than
# hand-copied values that can drift from the study result. The seed matches the
# one used inside the tuning objective.
rfmodel = RandomForestClassifier(
        **best_params,
        n_jobs=-1,
        random_state=42)

rfmodel.fit(X_train, y_train)

In [None]:
# Predict the encoded labels for the held-out test rows with the final forest.
y_preds = rfmodel.predict(X_test)

In [None]:
# Report accuracy, then weighted precision, of the final model on the test split.
final_accuracy = accuracy_score(y_test, y_preds)
print("Score of the model is: ", final_accuracy)
final_precision = precision_score(y_test, y_preds, average='weighted')
print("Precision of the model is: ", final_precision)

In [None]:
## Save the model

import joblib

# The forest was trained on standardized features and integer-encoded labels, so the
# fitted scaler and label encoder are saved next to it. Without them the model cannot
# be applied to raw inputs, and its integer predictions cannot be mapped back to labels.
# The model file name and format are unchanged.
joblib.dump(rfmodel, 'crop_model.joblib', compress=8)
joblib.dump(scaler, 'crop_scaler.joblib', compress=8)
joblib.dump(label_encoder, 'crop_label_encoder.joblib', compress=8)