# Evaluation

## 1. Import libraries

In [1]:
# --- Imports ---
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

## 2. Load datasets and models

In [2]:
# Every persisted artifact lives under one project root. Keeping the root in a
# single constant means relocating the project requires editing one line
# instead of eleven. (`joblib` is already imported in the imports cell.)
PROJECT_DIR = '/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction'
DATA_DIR = f'{PROJECT_DIR}/saved_data'
MODEL_DIR = f'{PROJECT_DIR}/saved_models'

# NOTE: joblib.load unpickles arbitrary Python objects, which can execute code.
# Only load .pkl files that were produced by this project.

# Load the scaler
scaler = joblib.load(f'{DATA_DIR}/scaler.pkl')


# --- Load datasets ---
X_train = joblib.load(f'{DATA_DIR}/X_train.pkl')
X_test = joblib.load(f'{DATA_DIR}/X_test.pkl')
y_train = joblib.load(f'{DATA_DIR}/y_train.pkl')
y_test = joblib.load(f'{DATA_DIR}/y_test.pkl')


# --- Load trained models ---
lr = joblib.load(f'{MODEL_DIR}/LogisticRegression.pkl')
svc = joblib.load(f'{MODEL_DIR}/SVC.pkl')
dt = joblib.load(f'{MODEL_DIR}/DecisionTreeClassifier.pkl')
rf = joblib.load(f'{MODEL_DIR}/RandomForestClassifier.pkl')
xgb = joblib.load(f'{MODEL_DIR}/XGBClassifier.pkl')
cat = joblib.load(f'{MODEL_DIR}/CatBoostClassifier.pkl')


# Evaluation order used by every loop further down the notebook.
models = [lr, svc, dt, rf, xgb, cat]

## 3. Stratified Cross-Validation (on Training Set)

In [3]:
# --- Set up stratified cross-validation ---
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Table column name -> scikit-learn scorer string.
cv_scoring = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1',
    'ROC AUC': 'roc_auc',
    'PR AUC': 'average_precision',
}

results = []

for model in models:
    model_name = model.__class__.__name__
    print(f"\nCross-Validation Scores for {model_name}")

    # Evaluate all metrics in a single cross-validation pass. One
    # cross_validate call fits the model once per fold (5 fits) and scores
    # every metric on that fit, instead of refitting for each metric
    # separately (6 metrics x 5 folds = 30 fits).
    cv_scores = cross_validate(
        model, X_train, y_train, cv=skf, scoring=list(cv_scoring.values())
    )

    # With a list of scorers, per-fold scores are returned under 'test_<scorer>'.
    row = {'Model': model_name}
    for column, scorer in cv_scoring.items():
        row[column] = cv_scores[f'test_{scorer}'].mean()
    results.append(row)



Cross-Validation Scores for LogisticRegression


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Cross-Validation Scores for SVC

Cross-Validation Scores for DecisionTreeClassifier

Cross-Validation Scores for RandomForestClassifier

Cross-Validation Scores for XGBClassifier

Cross-Validation Scores for CatBoostClassifier
Learning rate set to 0.021794
0:	learn: 0.6813792	total: 54.4ms	remaining: 54.4s
1:	learn: 0.6714005	total: 56.7ms	remaining: 28.3s
2:	learn: 0.6601656	total: 59.7ms	remaining: 19.8s
3:	learn: 0.6495432	total: 61.9ms	remaining: 15.4s
4:	learn: 0.6388611	total: 63.9ms	remaining: 12.7s
5:	learn: 0.6299869	total: 65.9ms	remaining: 10.9s
6:	learn: 0.6200284	total: 68ms	remaining: 9.65s
7:	learn: 0.6113740	total: 70ms	remaining: 8.69s
8:	learn: 0.6025625	total: 72.2ms	remaining: 7.95s
9:	learn: 0.5950604	total: 74.4ms	remaining: 7.36s
10:	learn: 0.5877254	total: 76.4ms	remaining: 6.87s
11:	learn: 0.5810801	total: 78.5ms	remaining: 6.46s
12:	learn: 0.5743199	total: 80.5ms	remaining: 6.11s
13:	learn: 0.5683312	total: 82.5ms	remaining: 5.81s
14:	learn: 0.5621776	total: 

Stratified cross-validation is essential when you’re working with imbalanced classes, like in the Telco Customer Churn dataset where many more customers may have stayed than churned.

## 4. Using Metrics to Evaluate on the Training Set

### 4.1. Displaying results as a table

In [19]:
results_df = pd.DataFrame(results)

print(results_df)

# The numeric metric columns; 'Model' is left out of rounding and styling.
metric_cols = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC']

# Round numeric metrics
results_df[metric_cols] = results_df[metric_cols].round(4)

# Gradient shading, per-column maximum highlight, 4-decimal formatting and
# black text, all restricted to the metric columns.
styled_df = (
    results_df.style
    .background_gradient(cmap="Wistia", subset=metric_cols)
    .highlight_max(axis=0, color='#ffd1a9', subset=metric_cols)
    .format(precision=4, subset=metric_cols)
    .set_properties(subset=metric_cols, **{'color': 'black'})
)


styled_df

                    Model  Accuracy  Precision    Recall  F1 Score   ROC AUC  \
0      LogisticRegression  0.771169   0.753536  0.806032  0.778670  0.853481   
1                     SVC  0.766051   0.739773  0.820974  0.778158  0.850555   
2  DecisionTreeClassifier  0.803404   0.801958  0.806033  0.803895  0.804386   
3  RandomForestClassifier  0.845878   0.849451  0.840896  0.845070  0.920330   
4           XGBClassifier  0.847676   0.844131  0.852792  0.848438  0.925265   
5      CatBoostClassifier  0.851688   0.847465  0.857775  0.852564  0.930086   

     PR AUC  
0  0.834519  
1  0.832692  
2  0.744433  
3  0.909266  
4  0.930683  
5  0.933987  


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,PR AUC
0,LogisticRegression,0.7712,0.7535,0.806,0.7787,0.8535,0.8345
1,SVC,0.7661,0.7398,0.821,0.7782,0.8506,0.8327
2,DecisionTreeClassifier,0.8034,0.802,0.806,0.8039,0.8044,0.7444
3,RandomForestClassifier,0.8459,0.8495,0.8409,0.8451,0.9203,0.9093
4,XGBClassifier,0.8477,0.8441,0.8528,0.8484,0.9253,0.9307
5,CatBoostClassifier,0.8517,0.8475,0.8578,0.8526,0.9301,0.934


In [None]:
# Rebuild the summary table from whatever `results` currently holds and print
# it as plain text.
# NOTE(review): these are the same two statements that open the styled-table
# cell above, so this cell looks like a leftover. Its saved output shows a
# 'Confusion Matrix' column that no cell in this notebook adds to `results`,
# so the output was presumably captured from an earlier version of the
# notebook; confirm before relying on those numbers.
results_df = pd.DataFrame(results)
print(results_df)

                    Model  Accuracy  Precision    Recall  F1 Score  \
0      LogisticRegression  0.726066   0.490417  0.775401  0.600829   
1                     SVC  0.715640   0.479100  0.796791  0.598394   
2  DecisionTreeClassifier  0.740284   0.510236  0.577540  0.541806   
3  RandomForestClassifier  0.772986   0.577652  0.543672  0.560147   
4           XGBClassifier  0.766351   0.554662  0.614973  0.583263   
5      CatBoostClassifier  0.783886   0.586207  0.636364  0.610256   

            Confusion Matrix  
0  [[1097, 452], [126, 435]]  
1  [[1063, 486], [114, 447]]  
2  [[1238, 311], [237, 324]]  
3  [[1326, 223], [256, 305]]  
4  [[1272, 277], [216, 345]]  
5  [[1297, 252], [204, 357]]  


**Here's how you can interpret what you’ve got:**

- Best Accuracy: CatBoostClassifier (0.7839)
- Best F1 Score: Also CatBoostClassifier (0.6103), which suggests it's handling precision–recall trade-offs well.
- Highest Recall: Again CatBoostClassifier (0.6364), which means it’s catching more positives (likely churners, if that’s your target).
- If you’re optimizing for detecting churners (i.e. minimizing false negatives), recall and F1-score are most important — so CatBoostClassifier seems like your strongest performer.

## 5. Model Tuning (Hyperparameter Optimization)

## 6. Final Evaluation on Test Set (Unseen Data)

In [None]:
# Start from an empty list. The cross-validation rows collected earlier have
# already been tabulated; appending test-set rows to the same list would mix
# the two kinds of result under duplicate model names and would break the
# confusion-matrix plots, which expect every row to carry a confusion matrix.
# Resetting also makes this cell safe to re-run.
results = []

for model in models:
    model_name = model.__class__.__name__

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # ROC AUC / PR AUC need a continuous score. Use class-1 probabilities when
    # the model exposes them, otherwise fall back to the decision function
    # (same rule the ROC and PR plotting cells use).
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = model.decision_function(X_test)

    # Final evaluation metrics
    final_accuracy = accuracy_score(y_test, y_pred)
    final_precision = precision_score(y_test, y_pred)
    final_recall = recall_score(y_test, y_pred)
    final_f1 = f1_score(y_test, y_pred)
    final_roc_auc = roc_auc_score(y_test, y_proba)
    final_avg_precision = average_precision_score(y_test, y_proba)

    results.append({
        'Model': model_name,
        'Accuracy': final_accuracy,
        'Precision': final_precision,
        'Recall': final_recall,
        'F1 Score': final_f1,
        'ROC AUC': final_roc_auc,
        'PR AUC': final_avg_precision,
        # Read by the confusion-matrix plotting cell.
        'Confusion Matrix': confusion_matrix(y_test, y_pred),
    })

## 7. Visualising Results of The Test Set

### 7.1. Confusion Matrices

In [None]:
# Define palette
colour_palette = LinearSegmentedColormap.from_list(
    "pink_orange",
    ["#fcd5ce", "#f9dcc4", "#f8c8dc", "#fac898", "#ffb997"]
)

# Tick labels, in confusion_matrix's sorted-label order.
# Assumes the target is encoded 0 = not churn, 1 = churn -- TODO confirm.
class_labels = ['Not Churn', 'Churn']

# Plot confusion matrices
# Each matrix is computed here from the model's own test-set predictions, so
# the cell does not depend on `results` rows carrying a 'Confusion Matrix' key
# (the cross-validation rows do not have one).
for model in models:
    model_name = model.__class__.__name__
    cm = confusion_matrix(y_test, model.predict(X_test))

    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap=colour_palette,
                xticklabels=class_labels,
                yticklabels=class_labels,
                linewidths=0.5, linecolor='white', cbar=False, ax=ax)

    ax.set_title(f'Confusion Matrix: {model_name}', fontsize=12, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontsize=10)
    ax.set_ylabel('True Label', fontsize=10)
    plt.xticks(fontsize=9)
    plt.yticks(fontsize=9, rotation=0)
    fig.tight_layout()
    plt.show()
    # Release the figure; one is created per model.
    plt.close(fig)

### 7.2. Visualizing FP vs FN per Model

In [None]:
# Confusion matrix per model, computed from the loaded models' test-set
# predictions. Hard-coding the counts would silently go stale whenever a model
# or the test split changes.
conf_matrices = {
    model.__class__.__name__: confusion_matrix(y_test, model.predict(X_test))
    for model in models
}

data = []

for model_name, matrix in conf_matrices.items():
    # For a binary problem the 2x2 matrix flattens to (tn, fp, fn, tp).
    tn, fp, fn, tp = matrix.ravel()
    data.append({
        'Model': model_name,
        'False Positives': int(fp),
        'False Negatives': int(fn)
    })

fp_fn_df = pd.DataFrame(data)

In [None]:
# Reshape to long form (one row per model and error type) for a grouped bar chart
df_melted = pd.melt(
    fp_fn_df,
    id_vars='Model',
    value_vars=['False Positives', 'False Negatives'],
    var_name='Error Type',
    value_name='Count',
)

# Two bar colours: soft pink, then peach
custom_palette = ['#FFB6B9', '#FFDAC1']

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df_melted,
    x='Model',
    y='Count',
    hue='Error Type',
    palette=custom_palette,
    ax=ax,
)

ax.set_title('False Positives vs False Negatives per Model', fontsize=14)
plt.xticks(rotation=20)
plt.tight_layout()
ax.legend(title='Error Type')
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()

**Observation:**

- The Logistic Regression and SVC models are wrongly predicting that loyal customers will churn.
- High FP (light pink) → Model wrongly thinks loyal customers will churn.
- High FN (peach) → Model fails to catch actual churners — costly mistake for a business.

### 7.3. Classification Report

In [None]:
for model in models:
    print(f"Model: {model.__class__.__name__}")
    # Predict with the current model. Without this line every report would be
    # built from whatever `y_pred` an earlier cell left behind, i.e. the same
    # predictions printed under six different model names.
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("-" * 60)

### 7.4. Visualising the ROC Curve

In [None]:
# Soft pinks and oranges
soft_colours = [
    '#fbb1bd',  # pastel pink
    '#ffd1a9',  # pastel orange
    '#fcd5ce',  # soft peach
    '#fae1dd',  # dusty rose
    '#ffe5ec',  # soft blush
    '#ffb997'   # melon orange
]

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

for idx, model in enumerate(models):
    model_name = model.__class__.__name__

    # Get a continuous score for the positive class
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_proba = model.decision_function(X_test)
    else:
        # Skip if neither method is available, but say so instead of
        # silently dropping the model from the plot.
        print(f"Skipping {model_name}: no predict_proba or decision_function")
        continue

    # ROC curve (roc_curve returns fpr, tpr and thresholds; thresholds unused)
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)

    colour = soft_colours[idx % len(soft_colours)]
    ax.plot(fpr, tpr, color=colour, label=f"{model_name} (AUC = {auc:.2f})")

# Plot formatting -- done once, after the loop, so every model's curve lands
# on the same axes and the figure is shown a single time.
ax.plot([0, 1], [0, 1], 'k--', lw=1)  # chance-level diagonal
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves', fontsize=14, weight='bold')
ax.legend(loc='lower right')
ax.grid(True, linestyle='--', alpha=0.4)
fig.tight_layout()
plt.show()

### 7.5. Visualising the Precision-Recall Curve

In [None]:
# Generate the curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
avg_precision = average_precision_score(y_test, y_proba)

In [None]:
# Plot the PR Curve

fig, ax = plt.subplots(figsize=(8, 6))

for idx, model in enumerate(models):
    model_name = model.__class__.__name__

    # Get a continuous score for the positive class
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_proba = model.decision_function(X_test)
    else:
        # Skip models that expose neither method, but report it.
        print(f"Skipping {model_name}: no predict_proba or decision_function")
        continue

    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    avg_precision = average_precision_score(y_test, y_proba)

    # Pass the palette colour to the plot call; it was previously computed
    # but never used, so the curves fell back to matplotlib's default cycle.
    colour = soft_colours[idx % len(soft_colours)]
    ax.plot(recall, precision, color=colour,
            label=f"{model_name} (AP = {avg_precision:.2f})")


ax.set_xlabel("Recall", fontsize=12)
ax.set_ylabel("Precision", fontsize=12)
ax.set_title("Precision-Recall Curves", fontsize=14, weight='bold')
ax.grid(True, linestyle='--', alpha=0.4)
ax.legend(loc="best")
fig.tight_layout()
plt.show()