# Evaluation

## 1. Import libraries

In [1]:
# --- Imports ---
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, classification_report, precision_recall_curve, average_precision_score, roc_auc_score, roc_curve

## 2. Load datasets and models

In [2]:
import joblib

# Load the scaler 
scaler = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_data/scaler.pkl')


# --- Load datasets ---
X_train = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_data/X_train.pkl')
X_test = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_data/X_test.pkl')
y_train = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_data/y_train.pkl')
y_test = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_data/y_test.pkl')


# --- Load trained models ---
lr = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_models/LogisticRegression.pkl')
svc = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_models/SVC.pkl')
dt = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_models/DecisionTreeClassifier.pkl')
rf = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_models/RandomForestClassifier.pkl')
xgb = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_models/XGBClassifier.pkl')
cat = joblib.load('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/saved_models/CatBoostClassifier.pkl')


models = [lr, svc, dt, rf, xgb, cat]

## 3. Stratified Cross-Validation (on Training Set)

In [3]:
# --- Set up stratified cross-validaion ---
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 

results = []

for model in models:
    print(f"\nCross-Validation Scores for {model.__class__.__name__}")

    # Evaluate model with corss-validation
    acc = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
    precision = cross_val_score(model, X_train, y_train, cv=skf, scoring='precision')
    recall = cross_val_score(model, X_train, y_train, cv=skf, scoring='recall')
    f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring="f1")
    auc_score = cross_val_score(model, X_train, y_train, cv=skf, scoring="roc_auc")
    avg_precision = cross_val_score(model, X_train, y_train, cv=skf, scoring="average_precision")


    results.append({
    'Model': model.__class__.__name__,
    'Accuracy': acc.mean(),
    'Precision': precision.mean(),
    'Recall': recall.mean(),
    'F1 Score': f1.mean(),
    'ROC AUC': auc_score.mean(),
    'PR AUC': avg_precision.mean()
    })



Cross-Validation Scores for LogisticRegression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Cross-Validation Scores for SVC

Cross-Validation Scores for DecisionTreeClassifier

Cross-Validation Scores for RandomForestClassifier

Cross-Validation Scores for XGBClassifier

Cross-Validation Scores for CatBoostClassifier
Learning rate set to 0.021794
0:	learn: 0.6813792	total: 54.6ms	remaining: 54.6s
1:	learn: 0.6714005	total: 56.8ms	remaining: 28.4s
2:	learn: 0.6601656	total: 59.2ms	remaining: 19.7s
3:	learn: 0.6495432	total: 61.3ms	remaining: 15.3s
4:	learn: 0.6388611	total: 63.3ms	remaining: 12.6s
5:	learn: 0.6299869	total: 65.3ms	remaining: 10.8s
6:	learn: 0.6200284	total: 67.8ms	remaining: 9.61s
7:	learn: 0.6113740	total: 70.1ms	remaining: 8.69s
8:	learn: 0.6025625	total: 72.3ms	remaining: 7.97s
9:	learn: 0.5950604	total: 74.5ms	remaining: 7.38s
10:	learn: 0.5877254	total: 76.6ms	remaining: 6.89s
11:	learn: 0.5810801	total: 78.9ms	remaining: 6.5s
12:	learn: 0.5743199	total: 81ms	remaining: 6.15s
13:	learn: 0.5683312	total: 83.2ms	remaining: 5.86s
14:	learn: 0.5621776	total:

Stratified cross-validation is essential when you’re working with imbalanced classes, like in the Telco Customer Churn dataset where many more customers may have stayed than churned.

## 4. Using Metrics to Evaluate on the Training Set

### 4.1. Displaying results as a table

In [4]:
results_df = pd.DataFrame(results)

display(results_df.round(4))

# Round numeric metrics
results_df[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC']] = results_df[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC']].round(4)

# Style only the numeric columns (exclude Confusion Matrix)
styled_df = results_df.style\
    .background_gradient(cmap="Wistia", subset=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC'])\
    .highlight_max(axis=0, color='#ffd1a9', subset=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC'])\
    .format(precision=4, subset=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC'])\
    .set_properties(**{'color': 'black'}, subset=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC'])


styled_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,PR AUC
0,LogisticRegression,0.7712,0.7535,0.806,0.7787,0.8535,0.8345
1,SVC,0.7661,0.7398,0.821,0.7782,0.8506,0.8327
2,DecisionTreeClassifier,0.8034,0.802,0.806,0.8039,0.8044,0.7444
3,RandomForestClassifier,0.8459,0.8495,0.8409,0.8451,0.9203,0.9093
4,XGBClassifier,0.8477,0.8441,0.8528,0.8484,0.9253,0.9307
5,CatBoostClassifier,0.8517,0.8475,0.8578,0.8526,0.9301,0.934


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,PR AUC
0,LogisticRegression,0.7712,0.7535,0.806,0.7787,0.8535,0.8345
1,SVC,0.7661,0.7398,0.821,0.7782,0.8506,0.8327
2,DecisionTreeClassifier,0.8034,0.802,0.806,0.8039,0.8044,0.7444
3,RandomForestClassifier,0.8459,0.8495,0.8409,0.8451,0.9203,0.9093
4,XGBClassifier,0.8477,0.8441,0.8528,0.8484,0.9253,0.9307
5,CatBoostClassifier,0.8517,0.8475,0.8578,0.8526,0.9301,0.934


- Accuracy: Proportion of correctly classified instances out of all instances.

- Precision: Of all positive predictions, how many were actually correct?

- Recall: Of all actual positives, how many were correctly identified?

- F1 Score: Harmonic mean of Precision and Recall – balances both.

- ROC AUC: Measures the model’s ability to distinguish between classes. Higher = better.

- PR AUC: Area under the Precision-Recall curve. Important for imbalanced data.

##### Model-by-Model Analysis:
**1. Logistic Regression**
    - F1: 0.778 – decent balance between precision and recall.
    - ROC AUC: 0.85 – good, but not the best.
    - PR AUC: 0.83 – performs reasonably well with positive classes.
- Takeaway: Good baseline model.

**2. SVC (Support Vector Classifier)**
- Slightly lower precision than Logistic Regression but higher recall.
    - F1: 0.778 – almost the same as Logistic Regression.
    - ROC AUC and PR AUC both around 0.85 and 0.83 respectively.
- Takeaway: Similar performance to Logistic Regression, but slightly better at identifying positives (higher recall).

**3. Decision Tree**
    - Accuracy: 0.80 – better than Logistic and SVC.
    - Precision & Recall both ~0.80 – balanced.
    - ROC AUC & PR AUC are lowest here.
- Takeaway: Despite good accuracy, it may not generalize well (lower AUCs suggest overfitting).

**4. Random Forest**
- Big performance jump:
    - Accuracy: 0.85
    - F1: 0.85
    - ROC AUC: 0.92
    - PR AUC: 0.91
- Takeaway: Very strong model. Good generalization. High recall (0.84) and precision (0.85).

**5. XGBoost (XGBClassifier)**
- Slightly better than Random Forest:
    - Accuracy: 0.85
    - Recall: 0.85 (best recall so far)
    - ROC AUC: 0.93
    - PR AUC: 0.93 (highest)
- Takeaway: Excellent model, especially if recall is more important (i.e., catching as many positives as possible).

**6. CatBoost**
- Best performing model overall:
    - Accuracy: 0.85+
    - Precision: 0.85
    - Recall: 0.86
    - F1: 0.85
    - ROC AUC: 0.93
    - PR AUC: 0.93
- Takeaway: Excellent at both identifying positives and avoiding false positives.

The CatBoostClassifier, XGBClassifier, and RandomForestClassifier show strong ROC AUC & PR AUC.

##### Why Not Just Use Accuracy?

- Let’s say you’re predicting customer churn, and only 20% of customers churn, while 80% do not. If a model simply predicts “no churn” every time, it will still have 80% accuracy — but it’s useless.

- So accuracy doesn’t tell the full story for imbalanced data.

##### The Metrics That Actually Matter:

**1. F1 Score:**
- Why? It balances Precision and Recall.
- Churn prediction is often a high-stakes task: You don’t want to annoy loyal customers by falsely targeting them (precision), and you don’t want to miss actual churners (recall).
- F1 score ensures both are balanced, especially when both false positives and false negatives are costly.
- Use F1 when you need a balance between false positives and false negatives — which you do here.

**2. ROC AUC (Receiver Operating Characteristic – Area Under Curve):**
- Shows how well the model distinguishes between classes across all possible thresholds.
- AUC of 0.5 = no skill; closer to 1 = better.
- ROC AUC is useful for overall model comparison, and it’s threshold-independent.
- Use ROC AUC to understand how well your model separates churners from non-churners regardless of cutoff.

**3. PR AUC (Precision-Recall AUC):**
- Like ROC AUC, but focuses more on the minority (positive) class, which is churn.
- Very useful when positive class is rare and you care more about it.
- Use PR AUC to get a better feel for model performance on actual churners — especially when the dataset is highly imbalanced.

**Why Not Just Precision or Recall Alone?**
- Precision only would mean a model that plays it safe — predicts churn only when very sure, but misses many actual churners.
- Recall only would mean the model flags almost everyone as a churner, catching all churners but irritating lots of loyal customers.
- F1 gives you the middle ground.

##### Main Business Goal Is To:
- minimize lost customers → high recall
- target real churners only → high precision
- balance both → F1
- compare models overall → ROC AUC
- focus on the churners → PR AUC



**"I want to minimize lost customers and target real churners."**

This means:
- Minimize False Negatives
    - You want to catch as many actual churners as possible.
    - This means prioritizing high recall.

- Target only actual churners
    - You don't want to waste resources on customers who aren't going to churn.
    - This means prioritizing high precision.


| Priority | Metric        | Why                                     |
| -------- | ------------- | --------------------------------------- |
| 🎯 High  | **F1 Score**  | Balances precision and recall           |
| 🎯 High  | **Precision** | Avoid wasting resources on non-churners |
| 🎯 High  | **Recall**    | Don’t miss actual churners              |
| Medium   | PR AUC        | Focused on performance for churners     |
| Medium   | ROC AUC       | General performance indicator           |
| Low      | Accuracy      | Misleading on imbalanced data           |


## 5. Model Tuning (Hyperparameter Optimization)

### 5.1. Random Search

In [5]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
from scipy.stats import uniform
from scipy.stats import randint

#### 5.1.1. Defining the Hyperparameter Spaces For Each Model

In [10]:
# --- Define hyperparameter space for the Logistic Regression Model ---
lr_param_dist = {
    'C': loguniform(1e-4, 1e4),
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['saga'],
    'max_iter': [1000]
}

# --- Define hyperparameter space for the SVC Model ---
svc_param_dist = {
    'C': loguniform(1e-3, 1e3),
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

# --- Define hyperparameter space for the Decision Tree Model ---
dt_param_dist = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': [None, 'sqrt', 'log2']
}

# --- Define hyperparameter space for the Random Forest Model ---
rf_param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(5, 30),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False]
}

# --- Define hyperparameter space for the XGBoost Model ---
xgb_param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 15),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# --- Define hyperparameter space for the CatBoost Model ---
cat_param_dist = {
    'iterations': randint(100, 500),
    'depth': randint(4, 10),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'l2_leaf_reg': randint(1, 10)
}

#### 5.1.2. Conducting the Random Search for All Models

In [11]:
# param_dists = [lr_param_dist, svc_param_dist, dt_param_dist, rf_param_dist, xgb_param_dist, cat_param_dist]
# mod_names = ['lr', 'svc', 'dt', 'rf', 'xgb', 'cat']

param_dists = [lr_param_dist, dt_param_dist, rf_param_dist, xgb_param_dist, cat_param_dist]
mod_names = ['lr', 'dt', 'rf', 'xgb', 'cat']
mods = [lr, dt, rf, xgb, cat]

# Dictionary to store the results
random_searches = {}
best_models = {}

for model, params, name in zip(mods, param_dists, mod_names):
    print(f"Running RandomizedSearchCV for {name.upper()}...")

    search = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=50,
    cv=skf,
    scoring='f1',
    verbose=2,
    n_jobs=-1,
    random_state=42
   )

    search.fit(X_train, y_train)

    # --- Store the fitted search object and best model ---
    random_searches[f"{name}_search"] = search
    best_models[f"best_{name}_model_1"] = search.best_estimator_

    print(f"Best F1 Score for {name.upper()}: {search.best_score_:.4f}")

Running RandomizedSearchCV for LR...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END C=0.0029341230215132346, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.0029341230215132346, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.0029341230215132346, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.0029341230215132346, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.0029341230215132346, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=6.155564318973028, max_iter=1000, penalty=elasticnet, solver=saga; total time=   0.0s
[CV] END C=6.155564318973028, max_iter=1000, penalty=elasticnet, solver=saga; total time=   0.0s
[CV] END C=0.36869056408461526, max_iter=1000, penalty=elasticnet, solver=saga; total time=   0.0s
[CV] END C=6.155564318973028, max_iter=1000, penalty=elasticnet, solver=saga; total time=   0.0s
[CV] END C=0.36869056408461526, max_



[CV] END C=0.02734702913886812, max_iter=1000, penalty=l1, solver=saga; total time=   3.1s
[CV] END C=0.0070331617412763316, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.13442649050633507, max_iter=1000, penalty=l2, solver=saga; total time=   2.9s
[CV] END C=0.13442649050633507, max_iter=1000, penalty=l2, solver=saga; total time=   2.9s
[CV] END C=4890.758234270084, max_iter=1000, penalty=l1, solver=saga; total time=   0.6s
[CV] END C=2.077480993119357, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=2.077480993119357, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.13442649050633507, max_iter=1000, penalty=l2, solver=saga; total time=   2.9s
[CV] END C=2.077480993119357, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=2.077480993119357, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.0714540111723761, max_iter=1000, penalty=none, solver=saga; total time=   0



[CV] END C=0.003697114486625516, max_iter=1000, penalty=l1, solver=saga; total time=   1.8s
[CV] END C=7.182472659393428, max_iter=1000, penalty=l1, solver=saga; total time=   0.6s
[CV] END C=7.182472659393428, max_iter=1000, penalty=l1, solver=saga; total time=   0.6s
[CV] END C=0.0013408920002835425, max_iter=1000, penalty=elasticnet, solver=saga; total time=   0.0s
[CV] END C=0.0013408920002835425, max_iter=1000, penalty=elasticnet, solver=saga; total time=   0.0s
[CV] END C=0.0013408920002835425, max_iter=1000, penalty=elasticnet, solver=saga; total time=   0.0s
[CV] END C=0.0013408920002835425, max_iter=1000, penalty=elasticnet, solver=saga; total time=   0.0s
[CV] END C=0.0013408920002835425, max_iter=1000, penalty=elasticnet, solver=saga; total time=   0.0s
[CV] END C=7.182472659393428, max_iter=1000, penalty=l1, solver=saga; total time=   0.6s
[CV] END C=0.040956812548496736, max_iter=1000, penalty=l2, solver=saga; total time=   2.9s
[CV] END C=0.040956812548496736, max_iter=10



[CV] END C=0.0020995949160761756, max_iter=1000, penalty=l1, solver=saga; total time=   0.2s
[CV] END C=7854.083114461339, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.040956812548496736, max_iter=1000, penalty=l2, solver=saga; total time=   2.9s
[CV] END C=7854.083114461339, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=7854.083114461339, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=7854.083114461339, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=7854.083114461339, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.14422885156020646, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.14422885156020646, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.14422885156020646, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.14422885156020646, max_iter=1000, penalty=none, solver=saga; total t



[CV] END C=0.12865252594826798, max_iter=1000, penalty=l2, solver=saga; total time=   2.2s
[CV] END C=0.12865252594826798, max_iter=1000, penalty=l2, solver=saga; total time=   2.1s
[CV] END C=2.077480993119357, max_iter=1000, penalty=none, solver=saga; total time=   0.0s
[CV] END C=0.13442649050633507, max_iter=1000, penalty=l2, solver=saga; total time=   2.1s
[CV] END C=0.13442649050633507, max_iter=1000, penalty=l2, solver=saga; total time=   2.0s
[CV] END C=0.02734702913886812, max_iter=1000, penalty=l1, solver=saga; total time=   2.2s
[CV] END C=0.02734702913886812, max_iter=1000, penalty=l1, solver=saga; total time=   2.2s




[CV] END C=0.040956812548496736, max_iter=1000, penalty=l2, solver=saga; total time=   1.5s
[CV] END C=0.040956812548496736, max_iter=1000, penalty=l2, solver=saga; total time=   1.4s


135 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/.venv/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_paramet

[CV] END C=0.12865252594826798, max_iter=1000, penalty=l2, solver=saga; total time=   1.2s
[CV] END C=0.12865252594826798, max_iter=1000, penalty=l2, solver=saga; total time=   1.2s
Best F1 Score for LR: 0.7819
Running RandomizedSearchCV for DT...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END criterion=log_loss, max_depth=17, max_features=log2, min_samples_leaf=8, min_samples_split=8; total time=   0.0s
[CV] END criterion=log_loss, max_depth=17, max_features=log2, min_samples_leaf=8, min_samples_split=8; total time=   0.0s
[CV] END criterion=log_loss, max_depth=17, max_features=log2, min_samples_leaf=8, min_samples_split=8; total time=   0.0s
[CV] END criterion=log_loss, max_depth=17, max_features=log2, min_samples_leaf=8, min_samples_split=8; total time=   0.0s
[CV] END criterion=log_loss, max_depth=4, max_features=sqrt, min_samples_leaf=6, min_samples_split=3; total time=   0.0s
[CV] END criterion=log_loss, max_depth=17, max_features=log2, min_samples_leaf=8,

In [12]:
for name, model in best_models.items():
    print(f"{name}")
    print(model)
    print("-" * 60)

best_lr_model_1
LogisticRegression(C=np.float64(0.09915644566638401), max_iter=1000,
                   penalty='l1', random_state=42, solver='saga')
------------------------------------------------------------
best_dt_model_1
DecisionTreeClassifier(criterion='log_loss', max_depth=12, min_samples_leaf=19,
                       min_samples_split=8, random_state=42)
------------------------------------------------------------
best_rf_model_1
RandomForestClassifier(criterion='entropy', max_depth=21, min_samples_leaf=2,
                       min_samples_split=3, n_estimators=319, random_state=42)
------------------------------------------------------------
best_xgb_model_1
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=N

In [13]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# Store metrics
best_models_results = []

for name, model in best_models.items():
    y_pred = model.predict(X_train)

    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_train)[:, 1]
        roc_auc = roc_auc_score(y_train, y_prob)
        pr_auc = average_precision_score(y_train, y_prob)
    else:
        roc_auc = None
        pr_auc = None

    best_models_results.append({
        "Model": name.upper(),
        "Accuracy": accuracy_score(y_train, y_pred),
        "Precision": precision_score(y_train, y_pred),
        "Recall": recall_score(y_train, y_pred),
        "F1 Score": f1_score(y_train, y_pred),
        "ROC AUC": roc_auc,
        "PR AUC": pr_auc
    })

In [None]:
# Create and display the DataFrame
best_models_results_df = pd.DataFrame(best_models_results)
best_models_results_df.set_index("Model", inplace=True)
display(best_models_results_df.round(4))



# Round numeric metrics
best_models_results_df[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC']] = best_models_results_df[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC']].round(4)

# Style only the numeric columns (exclude Confusion Matrix)
styled_best_result_df = best_models_results_df.style\
    .background_gradient(cmap="Wistia", subset=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC'])\
    .highlight_max(axis=0, color='#ffd1a9', subset=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC'])\
    .format(precision=4, subset=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC'])\
    .set_properties(**{'color': 'black'}, subset=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC'])


styled_best_result_df

                  Accuracy  Precision    Recall  F1 Score   ROC AUC    PR AUC
Model                                                                        
BEST_LR_MODEL_1   0.776425   0.756285  0.815717  0.784878  0.854705  0.836452
BEST_DT_MODEL_1   0.847814   0.836997  0.863863  0.850218  0.931873  0.929501
BEST_RF_MODEL_1   0.927919   0.921734  0.935252  0.928444  0.985614  0.985860
BEST_XGB_MODEL_1  0.936635   0.925108  0.950194  0.937483  0.983864  0.983108
BEST_CAT_MODEL_1  0.919757   0.913351  0.927504  0.920373  0.976747  0.975462


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC,PR AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BEST_LR_MODEL_1,0.7764,0.7563,0.8157,0.7849,0.8547,0.8365
BEST_DT_MODEL_1,0.8478,0.837,0.8639,0.8502,0.9319,0.9295
BEST_RF_MODEL_1,0.9279,0.9217,0.9353,0.9284,0.9856,0.9859
BEST_XGB_MODEL_1,0.9366,0.9251,0.9502,0.9375,0.9839,0.9831
BEST_CAT_MODEL_1,0.9198,0.9134,0.9275,0.9204,0.9767,0.9755


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC,PR AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BEST_LR_MODEL_1,0.7764,0.7563,0.8157,0.7849,0.8547,0.8365
BEST_DT_MODEL_1,0.8478,0.837,0.8639,0.8502,0.9319,0.9295
BEST_RF_MODEL_1,0.9279,0.9217,0.9353,0.9284,0.9856,0.9859
BEST_XGB_MODEL_1,0.9366,0.9251,0.9502,0.9375,0.9839,0.9831
BEST_CAT_MODEL_1,0.9198,0.9134,0.9275,0.9204,0.9767,0.9755


##### Model-by-Model Analysis (After Hyperparameter Tuning):
**1. Logistic Regression (BEST_LR_MODEL_1)**
- Accuracy: 0.776
- F1 Score: 0.785 — a modest improvement over the untuned version
- Precision: 0.756 | Recall: 0.816 — good at catching churners
- ROC AUC: 0.8547 | PR AUC: 0.8365
- **Takeway: Improved compared to before. Still a good baseline, interpretable and simple. But weaker than tree-based models in almost every metric.**

**2. Decision Tree (BEST_DT_MODEL_1)**
- Accuracy: 0.848
- F1 Score: 0.850 — balanced and consistent
- Precision: 0.837 | Recall: 0.864
- ROC AUC: 0.9319 | PR AUC: 0.9295
- **Takeaway: Huge boost from tuning! Much more stable now. Great balance between false positives and false negatives. But Decision Trees still tend to overfit — watch for that.**

**3. Random Forest (BEST_RF_MODEL_1)**
- Accuracy: 0.928
- F1 Score: 0.928 — excellent!
- Precision: 0.922 | Recall: 0.935
- ROC AUC: 0.9856 | PR AUC: 0.9859
- **Takeaway: Outstanding performer. High precision and high recall — it catches almost all churners with few false positives. Very robust. Strong candidate for test evaluation.**

**4. XGBoost (BEST_XGB_MODEL_1)**
- Accuracy: 0.937
- F1 Score: 0.937 — best among all!
- Precision: 0.925 | Recall: 0.950 — highest recall overall
- ROC AUC: 0.9839 | PR AUC: 0.9831
- **Takeaway: Top performer. XGBoost edges out Random Forest by a small margin. Best if recall is critical — excellent for churn where missing a churner is costly.**

**5. CatBoost (BEST_CAT_MODEL_1)**
- Accuracy: 0.920
- F1 Score: 0.920
- Precision: 0.913 | Recall: 0.928
- ROC AUC: 0.9767 | PR AUC: 0.9755
- **Takeaway: Also a top-tier model. Slightly behind XGBoost and Random Forest, but CatBoost works well with categorical features and usually needs less preprocessing. Great for production.**

**NOTE: Removed SVC because the training time was too long.**

**Why SVC Is So Slow**
1. SVC is not scalable:
    - SVC (with the default kernel = 'rbf') scales very poorly with the number of training samples (time complexity is ~O(n³) for training).
    - On large datasets, even a single fit can be extremely slow.

2. Too many parameter combinations:
Even with RandomizedSearchCV, 50 iterations can be a lot if each training run is slow.

3. Cross-validation overhead:
- You’re using cv=skf (e.g., 5-fold StratifiedKFold). That means each of the 50 param sets is being trained 5 times → 250 total fits.

## 6. Final Evaluation After Random Search on Test Set (Unseen Data)

I will be using the XGBoost Model Because:
- High recall (catches more churners)
- High F1 (great balance of precision & recall)
- Excellent AUCs (generalizes well)
- Confidence in real-world performance.

In [None]:
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"Classification Report for {name.upper()}:")
    print(classification_report(y_test, y_pred))
    print("-" * 60)

In [None]:
for model in models:
    model_name = model.__class__.__name__

    # Make predictions on the test set
    y_pred = model.predict(X_test) 
    y_proba = model.predict_proba(X_test)[:, 1]


    # Final evauation metrics
    final_accuracy = accuracy_score(y_test, y_pred)
    final_precision = precision_score(y_test, y_pred)
    final_recall = recall_score(y_test, y_pred)
    final_f1 = f1_score(y_test, y_pred)
    final_roc_auc = roc_auc_score(y_test, y_proba)
    final_avg_precision = average_precision_score(y_test, y_proba)

    results.append({
    'Model': model.__class__.__name__,
    'Accuracy': final_accuracy,
    'Precision': final_precision,
    'Recall': final_recall,
    'F1 Score': final_f1,
    'ROC AUC': final_roc_auc,
    'PR AUC': final_avg_precision
    })

## 7. Visualising Results of The Test Set

### 7.1. Confusion Matrices

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Define palette
colour_palette = LinearSegmentedColormap.from_list(
    "pink_orange",
    ["#fcd5ce", "#f9dcc4", "#f8c8dc", "#fac898", "#ffb997"]
)

# Plot confusion matrices
for result in results:
    cm = result['Confusion Matrix']
    model_name = result['Model']

    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap=colour_palette,
                xticklabels=['Not Churn', 'Churn'],
                yticklabels=['Not Churn', 'Churn'],
                linewidths=0.5, linecolor='white', cbar=False)

    plt.title(f'Confusion Matrix: {model_name}', fontsize=12, fontweight='bold')
    plt.xlabel('Predicted Label', fontsize=10)
    plt.ylabel('True Label', fontsize=10)
    plt.xticks(fontsize=9)
    plt.yticks(fontsize=9, rotation=0)
    plt.tight_layout()
    plt.show()

### 7.2. Visualizing FP vs FN per Model

In [None]:
conf_matrices = {
    'LogisticRegression': [[1097, 452], [126, 435]],
    'SVC': [[1063, 486], [114, 447]],
    'DecisionTreeClassifier': [[1238, 311], [237, 324]],  
    'RandomForestClassifier': [[1326, 223], [256, 305]],  
    'XGBClassifier': [[1272, 277], [216, 345]], 
    'CatBoostClassifier': [[1297, 252], [204, 357]] 
}

data = []

for model_name, matrix in conf_matrices.items():
    tn, fp = matrix[0]
    fn, tp = matrix[1]
    data.append({
        'Model': model_name,
        'False Positives': fp,
        'False Negatives': fn
    })

fp_fn_df = pd.DataFrame(data)

In [None]:
# Melt the dataframe for grouped bar chart
df_melted = fp_fn_df.melt(id_vars='Model', value_vars=['False Positives', 'False Negatives'],
                          var_name='Error Type', value_name='Count')

# Define soft pink and orange
custom_palette = ['#FFB6B9', '#FFDAC1']  # soft pink and peach

plt.figure(figsize=(10, 6))
sns.barplot(data=df_melted, x='Model', y='Count', hue='Error Type', palette=custom_palette)

plt.title('False Positives vs False Negatives per Model', fontsize=14)
plt.xticks(rotation=20)
plt.tight_layout()
plt.legend(title='Error Type')
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()

**Observation:**

- The Logistic Regression and SVC models are wrongly predicting that loyal customers will churn.
- High FP (light pink) → Model wrongly thinks loyal customers will churn.
- High FN (peach) → Model fails to catch actual churners — costly mistake for a business.

### 7.3. Classification Report

In [None]:
for model in models:
    print(f"Model: {model.__class__.__name__}")
    print(classification_report(y_test, y_pred))
    print("-" * 60)

### 7.4. Visualising the ROC Curve

In [None]:
# Soft pinks and oranges
soft_colours = [
    '#fbb1bd',  # pastel pink
    '#ffd1a9',  # pastel orange
    '#fcd5ce',  # soft peach
    '#fae1dd',  # dusty rose
    '#ffe5ec',  # soft blush
    '#ffb997'   # melon orange
]

In [None]:
plt.figure(figsize=(8, 6))

for idx, model in enumerate(models):
    try:
        # Get probability scores
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_test)[:, 1]
        else:
            y_proba = model.decision_function(X_test)
    except: 
        continue # Skip if neither method is available

    # ROC Curves
    fpr, tpr = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)

    colour = soft_colours[idex % len(soft_colours)]
    plt.plot(fpr, tpr, label=f"{model.__class__.__name__} (AUC = {auc:.2f})")

    
    # Plot formatting
    plt.plot([0, 1], [0, 1], 'k--', lw=1)
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('ROC Curves', fontsize=14, weight='bold')
    plt.legend(loc='lower right')
    plt.grid(True, linestyle='--', alpha=0.4)
    plt.tight_layout()
    plt.show()

### 7.5. Visualising the Precision-Recall Curve

In [None]:
# Generate the curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
avg_precision = average_precision_score(y_test, y_proba)

In [None]:
# Plot the PR Curve

plt.figure(figsize=(8, 6)
           )
for idx, model in enumerate(models):
    try:
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_test)[:, 1]
        else:
            y_proba = model.decision_function(X_test)
    except:
        continue

    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    avg_precision = average_precision_score(y_test, y_proba)

    colours = soft_colours[idx % len(soft_colours)]
    plt.plot(recall, precision, label=f"{model.__class__.__name__} (AP = {avg_precision:.2f})")


plt.xlabel("Recall", fontsize=12)
plt.ylabel("Precision", fontsize=12)
plt.title("Precision-Recall Curves", fontsize=14, weight='bold')
plt.grid(True, linestyle='--', alpha=0.4)
plt.legend(loc="best")
plt.tight_layout()
plt.show()