### 1. Import Dependecies

In [21]:
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import (
                                    StratifiedKFold, 
                                    GridSearchCV
                                    )
from sklearn.metrics import confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
warnings.filterwarnings('ignore')

### 2. Load the data

In [14]:
X_train = np.load('artifacts/X_train.npz')['arr_0']
Y_train = np.load('artifacts/Y_train.npz')['arr_0']
X_test = np.load('artifacts/X_test.npz')['arr_0']
Y_test = np.load('artifacts/Y_test.npz')['arr_0']

### 3. Define Paramters

In [15]:
lr_param_grid = {
    'model__max_iter' : [1000, 5000],
    'model__C': [0.1, 1.0, 10.0]
}

dt_param_grid = {
    'model__max_depth' : [5, 8, 10],
    'model__min_samples_leaf': [2, 5, 10],
    'model__criterion' : ["gini", "entropy"]
}

rf_param_grid = {
    'model__n_estimators' : [100, 200],
    'model__max_depth' : [8, 10, 12],
    'model__min_samples_leaf': [2, 5, 10],
    'model__class_weight': ['balanced', 'balanced_subsample', None]
}

param_grids = {
    'Logistic Regression': lr_param_grid,
    'Decision Tree': dt_param_grid,
    'Random Forest': rf_param_grid
}

### 4. Define Multi Models

In [16]:
models = {
        'Logistic Regression' : LogisticRegression(),
        'Decision Tree' :DecisionTreeClassifier(),
        'Random Forest' : RandomForestClassifier()
        }

### 5. Configure K-Fold CV

In [17]:
cv = StratifiedKFold(
                    n_splits=6,
                    random_state=42,
                    shuffle=True
                    )

### 6. Multi Model Training

In [18]:
grid_search_results={}
for model_name, model in models.items():

    print(f"\n--- Tuning {model_name} ---")

    # Create Pipeline: SMOTE -> Model
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    param_grid = param_grids[model_name]

    grid_search = GridSearchCV(
                                estimator=pipeline,
                                param_grid=param_grid,
                                cv=cv, scoring='f1',
                                verbose=1, return_train_score=False
                                )
    
    print(f"Fitting gridSearchCV for {model_name}")

    grid_search.fit(X_train, Y_train)

    grid_search_results[model_name] = grid_search
    
    print(f"{model_name} gridSearchCV completed ...")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_}")


--- Tuning Logistic Regression ---
Fitting gridSearchCV for Logistic Regression
Fitting 6 folds for each of 6 candidates, totalling 36 fits


  File "c:\Users\www\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\www\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\www\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\www\anaconda3\Lib\subprocess.py", lin

Logistic Regression gridSearchCV completed ...
Best parameters: {'model__C': 1.0, 'model__max_iter': 1000}
Best CV score: 0.6296484628846933

--- Tuning Decision Tree ---
Fitting gridSearchCV for Decision Tree
Fitting 6 folds for each of 18 candidates, totalling 108 fits
Decision Tree gridSearchCV completed ...
Best parameters: {'model__criterion': 'gini', 'model__max_depth': 8, 'model__min_samples_leaf': 10}
Best CV score: 0.6111379034131776

--- Tuning Random Forest ---
Fitting gridSearchCV for Random Forest
Fitting 6 folds for each of 54 candidates, totalling 324 fits
Random Forest gridSearchCV completed ...
Best parameters: {'model__class_weight': None, 'model__max_depth': 8, 'model__min_samples_leaf': 5, 'model__n_estimators': 100}
Best CV score: 0.6413950365891932


In [19]:
grid_search_results

{'Logistic Regression': GridSearchCV(cv=StratifiedKFold(n_splits=6, random_state=42, shuffle=True),
              estimator=Pipeline(steps=[('smote', SMOTE(random_state=42)),
                                        ('model', LogisticRegression())]),
              param_grid={'model__C': [0.1, 1.0, 10.0],
                          'model__max_iter': [1000, 5000]},
              scoring='f1', verbose=1),
 'Decision Tree': GridSearchCV(cv=StratifiedKFold(n_splits=6, random_state=42, shuffle=True),
              estimator=Pipeline(steps=[('smote', SMOTE(random_state=42)),
                                        ('model', DecisionTreeClassifier())]),
              param_grid={'model__criterion': ['gini', 'entropy'],
                          'model__max_depth': [5, 8, 10],
                          'model__min_samples_leaf': [2, 5, 10]},
              scoring='f1', verbose=1),
 'Random Forest': GridSearchCV(cv=StratifiedKFold(n_splits=6, random_state=42, shuffle=True),
              estimat

In [22]:
# Assume X_test, Y_test are your test features and labels
for model_name, grid in grid_search_results.items():
    # Get the best estimator from GridSearchCV
    best_model = grid.best_estimator_
    
    # Make predictions on the test set
    y_pred = best_model.predict(X_test)
    
    # Compute metrics
    accuracy = accuracy_score(Y_test, y_pred)
    precision = precision_score(Y_test, y_pred)
    recall = recall_score(Y_test, y_pred)
    f1 = f1_score(Y_test, y_pred)
    
    print(f"\n--- {model_name} Test Set Metrics ---")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(Y_test, y_pred))



--- Logistic Regression Test Set Metrics ---
Accuracy:  0.7356
Precision: 0.5017
Recall:    0.7727
F1-score:  0.6084

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1033
           1       0.50      0.77      0.61       374

    accuracy                           0.74      1407
   macro avg       0.70      0.75      0.70      1407
weighted avg       0.79      0.74      0.75      1407


--- Decision Tree Test Set Metrics ---
Accuracy:  0.7342
Precision: 0.5000
Recall:    0.6845
F1-score:  0.5779

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.75      0.81      1033
           1       0.50      0.68      0.58       374

    accuracy                           0.73      1407
   macro avg       0.68      0.72      0.69      1407
weighted avg       0.77      0.73      0.75      1407


--- Random Forest Test Set Metrics ---
Accuracy:  0.7520
Precision: