### 1. Importing Dependancies

In [47]:
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import (
                                StratifiedKFold,
                                cross_validate,
                                GridSearchCV
                                
                                )
from sklearn.metrics import confusion_matrix

warnings.filterwarnings('ignore')

now we are trying to train multiple models with the base model we currently have to understand the performance

### 2. Loading the Data

In [48]:
X_train = np.load('artifacts\X_train.npz') ['arr_0']
Y_train = np.load('artifacts\Y_train.npz')  ['arr_0']
X_test = np.load('artifacts\X_test.npz')  ['arr_0']
Y_test = np.load('artifacts\Y_test.npz')  ['arr_0']

### 3. Define Multi Models

In [53]:
# what are the hyper parameters for these models
# this is a model dictionary that upper created by defining the models

lr_param_grid = {
            'max_iter' : [1000, 5000, 10000]                        #logistic regression parameter grid
                }

dt_param_grid = {
            'max_depth' : [8, 12, 16, 20],
            'criterion' : ["gini", "entropy", "log_loss"]           #when depth increases model training time will be also increases ()
                }                                                   # decision tree parameter grid

rf_param_grid = {
            'n_estimators' : [50, 100, 150, 200],                   #this means how many trees in the random forrest/random numbers we dont know how many would we need
            'max_depth' : [8, 12, 16, 20],
            'criterion' : ["gini", "entropy", "log_loss"]
}

param_grids = {
                'Logistic Regression' : lr_param_grid,
                'Decision Tree' : dt_param_grid,
                'Random Forrest' : rf_param_grid,
}

### 4. Define Multi Models

In [54]:
models = {
            'Logistic Regression' : LogisticRegression(),
            'Decision Tree' : DecisionTreeClassifier() ,
            'Random Forrest' : RandomForestClassifier()
        }

up there is usually works like this,

ex:
df_param_grid :
- 8 x 'gini' | 8 x 'entropy' | 8 x 'log_loss'
- 12 x 'gini' | 12 x 'entropy' | 12 x 'log_loss'
- 16 x 'gini' | 16 x 'entropy' | 16 x 'log_loss'
- 20 x 'gini' | 20 x 'entropy' | 20 x 'log_loss'

this is called cross-validation
Overall we call as 'Grid search' for this operation

We are going to train,

- 3 x Linear regression Models
- 12 x Decision tree Models [max_depth : size x criterion : size]
- 48 x Random Forrest Models [n_estimators : size x max_depth : size x criterion : size]

### 4. Configure K-Fold CV

In [51]:
# cv = cross-validation

cv = StratifiedKFold(
                        n_splits=6,
                        random_state=42,
                        shuffle=True #by shuffling you make the operation more random, that means better performance
)

Now we are trying to do is that run the cross validation between these models

### 5. Multi-Model Training

In [None]:
grid_search_results = {}
for model_name, model in models.items():
    print(f"\n--- Tuning {model_name} ---")
    
    param_grid = param_grids[model_name]
    
    grid_search = GridSearchCV(
                                estimator=model,
                                param_grid=param_grid,
                                cv=cv,
                                scoring='f1',
                                verbose=1,
                                return_train_score=False
                                )
    
    print(f"Fitting gridSearchCV for {model_name}")
    
    grid_search.fit(X_train, Y_train)
    
    grid_search_results[model_name] = grid_search
    
    print(f"{model_name} gridSearchCV completed...")
    print(f"Best parameters : {grid_search.best_params_}")
    print(f"Best CV Score : {grid_search.best_score_}")  # you can easily get the best parameters and CV scores by this


--- Tuning Logistic Regression ---
Fitting gridSearchCV for Logistic Regression
Fitting 6 folds for each of 3 candidates, totalling 18 fits
Logistic Regression gridSearchCV completed...
Best parameters : {'max_iter': 1000}
Best CV Score : 0.731872171052841

--- Tuning Decision Tree ---
Fitting gridSearchCV for Decision Tree
Fitting 6 folds for each of 12 candidates, totalling 72 fits
Decision Tree gridSearchCV completed...
Best parameters : {'criterion': 'entropy', 'max_depth': 20}
Best CV Score : 0.8352759689512504

--- Tuning Random Forrest ---
Fitting gridSearchCV for Random Forrest
Fitting 6 folds for each of 48 candidates, totalling 288 fits
Random Forrest gridSearchCV completed...
Best parameters : {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 200}
Best CV Score : 0.8940327293097221


- now we have the understanding of what are the parameteres.
- But the main problem with grid search is that its very time consuming.

In [56]:
grid_search_results

{'Logistic Regression': GridSearchCV(cv=StratifiedKFold(n_splits=6, random_state=42, shuffle=True),
              estimator=LogisticRegression(),
              param_grid={'max_iter': [1000, 5000, 10000]}, scoring='f1',
              verbose=1),
 'Decision Tree': GridSearchCV(cv=StratifiedKFold(n_splits=6, random_state=42, shuffle=True),
              estimator=DecisionTreeClassifier(),
              param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                          'max_depth': [8, 12, 16, 20]},
              scoring='f1', verbose=1),
 'Random Forrest': GridSearchCV(cv=StratifiedKFold(n_splits=6, random_state=42, shuffle=True),
              estimator=RandomForestClassifier(),
              param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                          'max_depth': [8, 12, 16, 20],
                          'n_estimators': [50, 100, 150, 200]},
              scoring='f1', verbose=1)}