Finding the most suitable model

In [None]:
import pickle

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


# Load the prepared dataset. Only the first 6000 rows are used for this
# experiment.
path = 'data/Final_Dataset_after_temperature.csv'
df = pd.read_csv(path)
df = df.head(6000)
# The yield column is removed so that it is not fed to the model as a feature.
df = df.drop(columns=['Yield_ton_per_hec'])

target_variable = "Crop"  # this is our target variable

# Encodes the target labels as integer class ids. The fitted encoder is kept
# (and pickled later in this script) so ids can be mapped back to labels.
label_encoder = LabelEncoder()

df[target_variable] = label_encoder.fit_transform(df[target_variable])

# Split the data into train and test. The seed is fixed so that the split, the
# reported accuracy and the saved model are reproducible from run to run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# Input features of the training split (target column removed).
train_df_1 = train_df.drop(columns=[target_variable])

# identify the categorical and numeric value columns
categorical_columns = train_df_1.select_dtypes(include=['object']).columns
numeric_columns = train_df_1.select_dtypes(exclude=['object']).columns

# Handling the missing values, encoding and scaling the input features.
# Imputation is part of each pipeline so that:
#   * fill values (column means) are learned from the training split only,
#   * the same fill values are applied to the test split, and
#   * the pickled preprocessor imputes new data in the same way.
# Numeric columns are filled with the training mean; categorical columns are
# filled with the constant 'Unknown'.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    # with_mean=False: the scaler is created without mean-centring here.
    ('scaler', StandardScaler(with_mean=False)),
])

# creating a preprocessor for the input features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ])

# now we transform the input features accordingly: fit on train only, then
# apply the fitted transformation to the test features.
input_feature_train_arr = preprocessor.fit_transform(train_df_1)
input_feature_test_arr = preprocessor.transform(test_df.drop(columns=[target_variable]))

# NOTE: everything between the triple quotes below is a bare string literal,
# so none of it is executed. It holds a disabled experiment that grid-searches
# several classifiers (3-fold CV), scores each on the test split with
# accuracy_score, and refits the one with the highest accuracy.
'''

# Define classification models
models = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(),
    "CatBoostClassifier": CatBoostClassifier(verbose=0),  # Set verbose to avoid printing logs
    "AdaBoostClassifier": AdaBoostClassifier(),
    "KNeighborsClassifier": KNeighborsClassifier(),
}

# Define hyperparameters for each classification model
params = {
    "LogisticRegression": {},
    "DecisionTreeClassifier": {
        'criterion': ['gini', 'entropy'],
    },
    "RandomForestClassifier": {
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "XGBClassifier": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "CatBoostClassifier": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    "AdaBoostClassifier": {
        'learning_rate': [0.1, 0.01, 0.5, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "KNeighborsClassifier": {
        'n_neighbors': [3, 5, 7],
    },
}

# Define a function to evaluate classification models
def evaluate_classification_models(models, X_train, y_train, X_test, y_test, params):
    model_report = {}
    for model_name, model in models.items():
        print(model_name)
        param = params[model_name]
        gs = GridSearchCV(model, param, cv=3)
        gs.fit(X_train, y_train)
        model.set_params(**gs.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        model_report[model_name] = accuracy
    return model_report

# Evaluate classification models
model_report = evaluate_classification_models(models, input_feature_train_arr, train_df[target_variable], input_feature_test_arr, test_df[target_variable], params)

# Find the best model
best_model_name = max(model_report, key=model_report.get)
best_model_accuracy = model_report[best_model_name]

best_model = models[best_model_name]
best_model.fit(input_feature_train_arr, train_df[target_variable])
'''

# Tune the CatBoost hyper-parameters with a 3-fold cross-validated grid search
# over tree depth, learning rate and number of boosting iterations.
param = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50, 100],
}
gs = GridSearchCV(CatBoostClassifier(verbose=0), param, cv=3)
gs.fit(input_feature_train_arr, train_df[target_variable])

# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so the fitted estimator is reused instead of
# training the same configuration a second time.
model = gs.best_estimator_

# Score the tuned model on the held-out test split.
y_pred = model.predict(input_feature_test_arr)
accuracy = accuracy_score(test_df[target_variable], y_pred)

# Persist everything needed at inference time: the model, the target label
# encoder (to map class ids back to labels) and the fitted preprocessor.
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('label_encoder.pkl', 'wb') as label_encoder_file:
    pickle.dump(label_encoder, label_encoder_file)

with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

print(f"Accuracy of the best model: {accuracy}")
