# Model Evaluation Notebook

## Comprehensive Performance Metrics for V1 and V2 Models

This notebook evaluates all trained models (V1 and V2) and generates complete performance statistics including:

- **Accuracy**: Overall classification correctness
- **Precision**: Positive prediction accuracy
- **Recall**: True positive detection rate
- **F1-Score**: Harmonic mean of precision and recall
- **ROC-AUC**: Area under the ROC curve
- **KS Statistic**: Kolmogorov-Smirnov test statistic

### Model Inventory

- **V1 Models (5)**: Logistic Regression, Random Forest, XGBoost, LightGBM, CatBoost
- **V2 Models (15)**: 3 feature sets × 5 algorithms

### Feature Sets

- **V1**: 12 traditional features
- **V2 Feature Set A**: 4 composite scores only
- **V2 Feature Set B**: 4 composite scores + 4 key categoricals (8 features)
- **V2 Feature Set C**: 4 composite scores + 6 categoricals (10 features)

## Section 1: Imports and Configuration

In [None]:
# Standard library
import os
import warnings
from math import pi

# Scientific stack and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Preprocessing and class rebalancing
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Classifiers under evaluation
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Evaluation metrics
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)

# Notebook-wide configuration: silence library warnings and set plot styling
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Constants used by the cells below
RANDOM_STATE = 42  # seed passed to every split, sampler and estimator
TARGET = 'Defaulted'  # name of the label column
DATA_PATH = '../Github Original Data.csv'

print("✓ All libraries imported successfully")

## Section 2: Data Loading

In [None]:
# Read the raw CSV (per the original note, the same file used in training).
# latin-1 is requested explicitly when decoding the file.
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Quick sanity report: size, class counts and the share of positive labels.
print(f"Dataset shape: {df.shape}")

target_counts = df[TARGET].value_counts()
print(f"\nTarget variable '{TARGET}' distribution:")
print(target_counts)

default_rate_pct = df[TARGET].mean() * 100
print(f"\nDefault rate: {default_rate_pct:.2f}%")

## Section 3: Feature Definitions

### V1 Features (12 features)

Traditional features from initial modeling approach.

### V2 Feature Sets

- **Feature Set A**: Composite scores only (4 features)
- **Feature Set B**: Composite + key categoricals (8 features)
- **Feature Set C**: Extended feature set (10 features)

In [None]:
# V1 model inputs: the 12 raw categorical columns.
V1_FEATURES = [
    'Extra Income Brackets',
    'Categorize Rent Payment',
    'School Fees Categorical',
    'Age Group',
    'Education',
    'Loan Access',  # Prior Loan Access
    'CRB Class',
    'Logic on Income',  # Income Diversity
    'Categorizing Utility Expenses',
    'Expense Relative to Income',
    'Affordability (HH)',
    'Living',
]

# V2 feature sets are nested: each one extends the previous set.

# Feature Set A: the 4 composite scores only.
FEATURES_A = [
    'Financial_Resilience_Score',
    'Business_Quality_Score',
    'Stability_Score',
    'Expense_Management_Score',
]

# Feature Set B: composite scores + 4 key categoricals (8 features).
FEATURES_B = [
    *FEATURES_A,
    'Age Group',
    'Education',
    'CRB Class',
    'Living',
]

# Feature Set C: extended set (10 features, no Prior Loan).
FEATURES_C = [
    *FEATURES_B,
    'Logic on Income',
    'Marital status',
]

print("Feature sets defined:")
print(f"  V1 Features: {len(V1_FEATURES)} features")
print(f"  Feature Set A: {len(FEATURES_A)} features")
print(f"  Feature Set B: {len(FEATURES_B)} features")
print(f"  Feature Set C: {len(FEATURES_C)} features")

## Section 4: Composite Score Generation (for V2 Models)

In [None]:
# Each composite score is a weighted sum of components. A component reads one
# column of the row, lower-cases its text and looks for keyword substrings.
#
# A rule is (any_of, none_of, factor): it matches when at least one `any_of`
# token occurs in the text and no `none_of` token does. The first matching
# rule wins; if none match, the component's fallback factor is used.


def _level_rules(low_factor, top_tokens=('high',)):
    """Rules for a tiered column: top tier scores 1.0, a 'low' tier scores low_factor."""
    return (
        (top_tokens, (), 1.0),
        # 'low' counts only when the text does not also contain 'no'
        (('low',), ('no',), low_factor),
    )


def _diversity_rules(regular_factor, extra_factor):
    """Rules for the income-diversity column ('full' > 'regular' > 'extra')."""
    return (
        (('full',), (), 1.0),
        (('regular',), (), regular_factor),
        (('extra',), (), extra_factor),
    )


_EXPENSE_RATIO_RULES = (
    (('1/3',), (), 1.0),
    (('half',), (), 0.7),
    # '2/3' counts only when the text does not also contain 'more'
    (('2/3',), ('more',), 0.4),
)

# NOTE(review): this is a substring test, so a label such as "Unprofitable"
# would also match 'profitable' — confirm the category labels in the data.
_PROFITABLE_RULES = (
    (('profitable',), (), 1.0),
)

# Component tables: (row key, weight, rules, fallback factor)
_FINANCIAL_RESILIENCE_PARTS = (
    ('Extra_Income_Brackets', 35, _level_rules(0.3, ('moderate', 'high')), 0.6),
    ('Expense_Ratio', 30, _EXPENSE_RATIO_RULES, 0.5),
    ('Income_Diversity', 20, _diversity_rules(0.7, 0.5), 0.6),
    ('Savings_Category', 15, _level_rules(0.8), 0.6),
)

_BUSINESS_QUALITY_PARTS = (
    ('Rent_Category', 45, _level_rules(0.5), 0.6),
    ('Utility_Category', 30, _level_rules(0.5), 0.7),
    ('Affordability_Business', 25, _PROFITABLE_RULES, 0.5),
)

_STABILITY_PARTS = (
    ('SchoolFees_Category', 40, _level_rules(0.5), 0.9),
    # NOTE(review): the 'low' tier uses factor 1.1, above the 1.0 given to
    # moderate/high; the total is capped at 100 in calculate_stability.
    ('Regular_Income_Brackets', 30, _level_rules(1.1, ('moderate', 'high')), 0.85),
    ('Income_Diversity', 30, _diversity_rules(0.8, 0.6), 0.7),
)

_EXPENSE_MANAGEMENT_PARTS = (
    ('Expense_Ratio', 50, _EXPENSE_RATIO_RULES, 0.5),
    ('Affordability_HH', 35, _PROFITABLE_RULES, 0.5),
    ('Utility_Category', 15, _level_rules(0.5), 0.7),
)


def _matched_factor(text, rules, fallback):
    """Return the factor of the first rule matching `text`, else `fallback`."""
    for any_of, none_of, factor in rules:
        hit = any(token in text for token in any_of)
        blocked = any(token in text for token in none_of)
        if hit and not blocked:
            return factor
    return fallback


def _weighted_total(row, parts):
    """Sum weight * factor over `parts`, reading each field via row.get."""
    total = 0
    for key, weight, rules, fallback in parts:
        # A missing key is read as an empty string, which selects the fallback
        text = str(row.get(key, '')).lower()
        total += weight * _matched_factor(text, rules, fallback)
    return total


def calculate_financial_resilience(row):
    """
    Financial Resilience Score (0-100)

    Weights: Extra Income (35%), Expense Ratio (30%), Income Diversity (20%), Savings (15%)

    `row` is any mapping-like object exposing .get(key, default).
    """
    return _weighted_total(row, _FINANCIAL_RESILIENCE_PARTS)


def calculate_business_quality(row):
    """
    Business Quality Score (0-100)

    Weights: Rent (45%), Utility (30%), Business Affordability (25%)
    """
    return _weighted_total(row, _BUSINESS_QUALITY_PARTS)


def calculate_stability(row):
    """
    Stability Score (0-100)

    Weights: School Fees (40%), Regular Income (30%), Income Streams (30%)

    The raw total can exceed 100, so the result is capped at 100.
    """
    return min(_weighted_total(row, _STABILITY_PARTS), 100)


def calculate_expense_management(row):
    """
    Expense Management Score (0-100)

    Weights: Expense Ratio (50%), Affordability HH (35%), Utility (15%)
    """
    return _weighted_total(row, _EXPENSE_MANAGEMENT_PARTS)


print("✓ Composite score functions defined")

## Section 5: Prepare Intermediate Features for Composite Scores

In [None]:
# Generate the intermediate columns read by the composite-score functions.
# Each derived column gets a fixed name whether or not its raw source column
# is present in the CSV; absent sources fall back to a constant label.


def _filled_or_default(frame, source, fill_value):
    """Return frame[source] with NaNs replaced by fill_value, or fill_value itself if the column is absent."""
    if source in frame.columns:
        return frame[source].fillna(fill_value)
    return fill_value


def _income_brackets(amounts, low_cutoff, high_cutoff, label):
    """Bucket numeric amounts into "No/Low/Moderate/High <label>" strings.

    Exactly 0 maps to "No", values below low_cutoff to "Low", values below
    high_cutoff to "Moderate", everything else to "High".
    """
    return np.where(
        amounts == 0, f"No {label}",
        np.where(amounts < low_cutoff, f"Low {label}",
        np.where(amounts < high_cutoff, f"Moderate {label}", f"High {label}"))
    )


# Affordability Business
df["Affordability_Business"] = _filled_or_default(df, "Affordability", "Unknown")

# Affordability HH (Household): falls back to the business column set just above
if "Affordability (HH)" in df.columns:
    df["Affordability_HH"] = df["Affordability (HH)"].fillna("Unknown")
else:
    df["Affordability_HH"] = df["Affordability_Business"]

# Extra Income Brackets: use the ready-made brackets, else derive from amounts
if "Extra Income Brackets" in df.columns:
    df["Extra_Income_Brackets"] = df["Extra Income Brackets"].fillna("No Extra Income")
else:
    # Fix: the previous df.get("Extra Income", 0) handed a scalar 0 to
    # pd.to_numeric when the column was absent, and the following .fillna()
    # failed on that scalar. An index-aligned zero Series keeps the pipeline
    # working and labels every row "No Extra Income".
    if "Extra Income" in df.columns:
        extra_income_raw = df["Extra Income"]
    else:
        extra_income_raw = pd.Series(0, index=df.index)
    extra_income = pd.to_numeric(extra_income_raw, errors="coerce").fillna(0)
    df["Extra_Income_Brackets"] = _income_brackets(extra_income, 5000, 10000, "Extra Income")

# Expense Ratio
df["Expense_Ratio"] = _filled_or_default(df, "Expense Relative to Income", "Unknown")

# Income Diversity
df["Income_Diversity"] = _filled_or_default(df, "Logic on Income", "Unknown")

# Savings Category
df["Savings_Category"] = _filled_or_default(df, "Savings", "No Savings")

# Rent Category
df["Rent_Category"] = _filled_or_default(df, "Categorize Rent Payment", "No Rent")

# Utility Category
df["Utility_Category"] = _filled_or_default(df, "Categorizing Utility Expenses", "None")

# School Fees Category
df["SchoolFees_Category"] = _filled_or_default(df, "School Fees Categorical", "None")

# Regular Income Brackets: derived from the numeric amounts when available
if "Regular Income" in df.columns:
    # Unparseable or missing amounts are coerced to 0, i.e. "No Regular Income"
    regular_income = pd.to_numeric(df["Regular Income"], errors="coerce").fillna(0)
    df["Regular_Income_Brackets"] = _income_brackets(regular_income, 10000, 20000, "Regular Income")
else:
    df["Regular_Income_Brackets"] = "Unknown"

print("✓ Intermediate features prepared")

## Section 6: Generate Composite Scores

In [None]:
# Generate the 4 composite scores by applying each scoring function row-wise.
_score_builders = {
    "Financial_Resilience_Score": calculate_financial_resilience,
    "Business_Quality_Score": calculate_business_quality,
    "Stability_Score": calculate_stability,
    "Expense_Management_Score": calculate_expense_management,
}
for _score_name, _scorer in _score_builders.items():
    df[_score_name] = df.apply(_scorer, axis=1)

composite_cols = list(_score_builders)

# Print basic distribution statistics for each score column.
print("Composite Scores Generated:")
print("=" * 60)
for col in composite_cols:
    score_column = df[col]
    print(f"\n{col}:")
    print(f"  Mean: {score_column.mean():.2f}")
    print(f"  Std:  {score_column.std():.2f}")
    print(f"  Min:  {score_column.min():.2f}")
    print(f"  Max:  {score_column.max():.2f}")

## Section 7: Data Preprocessing and Splitting

In [None]:
def prepare_data(df, features, target, test_size=0.15, val_size=0.15):
    """
    Build train/validation/test matrices for one feature set.

    Steps: keep the requested features that exist in `df`, impute missing
    values, label-encode object columns, make a stratified train/val/test
    split, standardise with statistics from the training split, and
    oversample the training split with SMOTE.

    Args:
        df: source DataFrame containing the feature columns and `target`.
        features: requested feature column names; absent ones are skipped
            with a printed warning.
        target: name of the label column.
        test_size: fraction of all rows held out for testing.
        val_size: fraction of all rows held out for validation.

    Returns: X_train, X_val, X_test, y_train, y_val, y_test, encoders, scaler
        The training pair is SMOTE-resampled; all X matrices are scaled.
    """
    present = [name for name in features if name in df.columns]
    absent = set(features) - set(present)
    if absent:
        print(f"Warning: Missing features: {absent}")

    work = df[present + [target]].copy()

    # NOTE(review): imputation values and label encoders below are computed on
    # the full frame, before the split, so held-out rows influence them.

    # Impute: most frequent value for object columns, median otherwise
    for name in present:
        column = work[name]
        if column.dtype == "object":
            modes = column.mode()
            fill_value = modes[0] if len(modes) > 0 else "Unknown"
        else:
            fill_value = column.median()
        work[name] = column.fillna(fill_value)

    # Integer-encode object columns, keeping each fitted encoder by column
    encoders = {}
    for name in present:
        if work[name].dtype != "object":
            continue
        encoder = LabelEncoder()
        work[name] = encoder.fit_transform(work[name].astype(str))
        encoders[name] = encoder

    X = work[present]
    y = work[target]

    # Stratified split, first test vs. rest, then validation vs. train
    # (70/15/15 with the default sizes)
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=RANDOM_STATE, stratify=y
    )
    # val_size is a share of the whole data; rescale it to the remainder
    val_share_of_rest = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_share_of_rest,
        random_state=RANDOM_STATE, stratify=y_rest
    )

    # Fit the scaler on the training split only, then apply it everywhere
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Oversample the training split only
    sampler = SMOTE(random_state=RANDOM_STATE)
    X_train_smote, y_train_smote = sampler.fit_resample(X_train_scaled, y_train)

    print(f"  Train: {X_train.shape[0]} → {X_train_smote.shape[0]} (after SMOTE)")
    print(f"  Val:   {X_val.shape[0]}")
    print(f"  Test:  {X_test.shape[0]}")

    return (
        X_train_smote, X_val_scaled, X_test_scaled,
        y_train_smote, y_val, y_test,
        encoders, scaler,
    )


print("✓ Data preparation function defined")

## Section 8: Evaluation Helper Functions

In [None]:
def calculate_ks_statistic(y_true, y_pred_proba):
    """Return the Kolmogorov-Smirnov statistic: the largest TPR - FPR gap along the ROC curve."""
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_proba)
    separation = true_pos_rate - false_pos_rate
    return max(separation)


def evaluate_model(model, X_test, y_test, model_name):
    """
    Score a fitted classifier on a held-out set.

    Args:
        model: fitted estimator exposing predict and predict_proba.
        X_test: feature matrix to score.
        y_test: true labels for X_test.
        model_name: label stored under the 'Model' key.

    Returns dict with the model name and all 6 metrics
    (Accuracy, Precision, Recall, F1-Score, ROC-AUC, KS Statistic).
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score
    positive_scores = model.predict_proba(X_test)[:, 1]

    report = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, predicted_labels),
    }

    # Label-based metrics; zero_division=0 is passed to each scorer
    label_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    for metric_name, scorer in label_scorers:
        report[metric_name] = scorer(y_test, predicted_labels, zero_division=0)

    # Score-based metrics
    report['ROC-AUC'] = roc_auc_score(y_test, positive_scores)
    report['KS Statistic'] = calculate_ks_statistic(y_test, positive_scores)

    return report


print("✓ Evaluation functions defined")

## Section 9: V1 Model Training and Evaluation

Training 5 models with 12 traditional features.

In [None]:
# Prepare V1 data
print("Preparing V1 dataset...")
(
    X_train_v1, X_val_v1, X_test_v1,
    y_train_v1, y_val_v1, y_test_v1,
    encoders_v1, scaler_v1,
) = prepare_data(df, V1_FEATURES, TARGET)

# Accumulates one metrics dict per evaluated model (V1 here, V2 later)
all_results = []

# Model configurations (using simpler hyperparameters for faster training).
# Settings common to the XGBoost and LightGBM configurations:
_boost_kwargs = dict(max_depth=6, learning_rate=0.1, random_state=RANDOM_STATE)

v1_models = {
    'Logistic Regression V1': LogisticRegression(
        random_state=RANDOM_STATE, max_iter=1000, C=0.1
    ),
    'Random Forest V1': RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1
    ),
    'XGBoost V1': XGBClassifier(
        n_estimators=100, eval_metric='logloss', use_label_encoder=False,
        **_boost_kwargs
    ),
    'LightGBM V1': LGBMClassifier(
        n_estimators=100, verbose=-1, **_boost_kwargs
    ),
    'CatBoost V1': CatBoostClassifier(
        iterations=100, depth=6, learning_rate=0.1,
        random_state=RANDOM_STATE, verbose=False
    ),
}

print("\n" + "="*80)
print("TRAINING AND EVALUATING V1 MODELS")
print("="*80)

# Fit on the SMOTE-resampled training split, score on the untouched test split
for model_name, model in v1_models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train_v1, y_train_v1)

    metrics = evaluate_model(model, X_test_v1, y_test_v1, model_name)
    all_results.append(metrics)

    print(f"  ROC-AUC: {metrics['ROC-AUC']:.4f} | KS: {metrics['KS Statistic']:.4f} | F1: {metrics['F1-Score']:.4f}")

print("\n✓ V1 models training complete")

## Section 10: V2 Model Training and Evaluation

Training 15 models across 3 feature sets (A, B, C).

In [None]:
# Feature sets dictionary: set label -> list of feature columns
feature_sets = {
    'A': FEATURES_A,
    'B': FEATURES_B,
    'C': FEATURES_C
}

print("\n" + "="*80)
print("TRAINING AND EVALUATING V2 MODELS")
print("="*80)

# For every feature set: rebuild the data, then fit and score 5 fresh models
for set_name, features in feature_sets.items():
    print(f"\n{'='*80}")
    print(f"FEATURE SET {set_name} ({len(features)} features)")
    print(f"{'='*80}")

    print(f"\nPreparing Feature Set {set_name} dataset...")
    prepared = prepare_data(df, features, TARGET)
    X_train, X_val, X_test, y_train, y_val, y_test, encoders, scaler = prepared

    # New estimator instances are created on each pass, so no fitted state
    # carries over between feature sets.
    fresh_estimators = {
        'Logistic Regression': LogisticRegression(
            random_state=RANDOM_STATE, max_iter=1000, C=0.1
        ),
        'Random Forest': RandomForestClassifier(
            n_estimators=100, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1
        ),
        'XGBoost': XGBClassifier(
            n_estimators=100, max_depth=6, learning_rate=0.1,
            random_state=RANDOM_STATE, eval_metric='logloss', use_label_encoder=False
        ),
        'LightGBM': LGBMClassifier(
            n_estimators=100, max_depth=6, learning_rate=0.1,
            random_state=RANDOM_STATE, verbose=-1
        ),
        'CatBoost': CatBoostClassifier(
            iterations=100, depth=6, learning_rate=0.1,
            random_state=RANDOM_STATE, verbose=False
        ),
    }
    # Result names follow "<algorithm> V2 Feature Set <label>"
    v2_models = {
        f'{algorithm} V2 Feature Set {set_name}': estimator
        for algorithm, estimator in fresh_estimators.items()
    }

    # Train and evaluate each model
    for model_name, model in v2_models.items():
        print(f"\nTraining {model_name}...")
        model.fit(X_train, y_train)

        metrics = evaluate_model(model, X_test, y_test, model_name)
        all_results.append(metrics)

        print(f"  ROC-AUC: {metrics['ROC-AUC']:.4f} | KS: {metrics['KS Statistic']:.4f} | F1: {metrics['F1-Score']:.4f}")

print("\n✓ V2 models training complete")

## Section 11: Results Compilation

In [None]:
# One row per evaluated model: round to 4 decimals, then rank by ROC-AUC
# (best first) with a fresh 0..n-1 index.
results_df = (
    pd.DataFrame(all_results)
    .round(4)
    .sort_values('ROC-AUC', ascending=False)
    .reset_index(drop=True)
)

# Display results
_rule = "="*100
print("\n" + _rule)
print("COMPREHENSIVE MODEL EVALUATION RESULTS")
print(_rule)
print(results_df.to_string(index=False))
print(_rule)

## Section 12: Visualizations

### A. ROC-AUC Comparison

In [None]:
# ROC-AUC comparison bar chart: one horizontal bar per model, V1 models in
# steelblue and all others in coral, with reference lines at 0.60 and 0.68.
roc_chart_path = '../models/roc_auc_comparison.png'

# This is the first cell that writes into ../models; create the folder if it
# is missing so savefig does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(roc_chart_path), exist_ok=True)

plt.figure(figsize=(14, 10))
colors = ['steelblue' if 'V1' in model else 'coral' for model in results_df['Model']]
plt.barh(results_df['Model'], results_df['ROC-AUC'], color=colors)

plt.axvline(x=0.60, color='red', linestyle='--', linewidth=2, label='Baseline (0.60)')
plt.axvline(x=0.68, color='green', linestyle='--', linewidth=2, label='Target (0.68)')

plt.xlabel('ROC-AUC Score', fontsize=12)
plt.ylabel('Model', fontsize=12)
plt.title('Model Comparison: ROC-AUC Performance', fontsize=14, fontweight='bold')
plt.legend(fontsize=10)

# The x-axis starts at 0.5 and ends slightly past the best score
plt.xlim(0.5, max(results_df['ROC-AUC']) + 0.05)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()

plt.savefig(roc_chart_path, dpi=300, bbox_inches='tight')
plt.show()

print("✓ ROC-AUC comparison chart saved")

### B. KS Statistic Comparison

In [None]:
# KS statistic comparison bar chart: same layout as the ROC-AUC chart, with
# reference lines at 0.21 and 0.28.
plt.figure(figsize=(14, 10))

# V1 models in steelblue, everything else in coral
colors = results_df['Model'].map(
    lambda name: 'steelblue' if 'V1' in name else 'coral'
).tolist()
plt.barh(results_df['Model'], results_df['KS Statistic'], color=colors)

for _x, _color, _label in (
    (0.21, 'red', 'Baseline (0.21)'),
    (0.28, 'green', 'Target (0.28)'),
):
    plt.axvline(x=_x, color=_color, linestyle='--', linewidth=2, label=_label)

plt.xlabel('KS Statistic', fontsize=12)
plt.ylabel('Model', fontsize=12)
plt.title('Model Comparison: Kolmogorov-Smirnov Statistic', fontsize=14, fontweight='bold')
plt.legend(fontsize=10)

# The x-axis starts at 0.15 and ends slightly past the best score
_ks_upper = max(results_df['KS Statistic']) + 0.05
plt.xlim(0.15, _ks_upper)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()

plt.savefig('../models/ks_statistic_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ KS Statistic comparison chart saved")

### C. Performance Heatmap

In [None]:
# Performance heatmap: models as rows, the six metrics as columns.
_heatmap_metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'KS Statistic']
metrics_for_heatmap = results_df.set_index('Model')[_heatmap_metrics]

plt.figure(figsize=(12, 14))
sns.heatmap(
    metrics_for_heatmap,
    annot=True,
    fmt='.4f',
    cmap='RdYlGn',
    center=0.65,  # midpoint of the diverging colour map
    linewidths=0.5,
    cbar_kws={'label': 'Score'},
)

plt.title('Model Performance Heatmap - All Metrics', fontsize=14, fontweight='bold')
plt.xlabel('Metrics', fontsize=12)
plt.ylabel('Model', fontsize=12)
plt.tight_layout()

plt.savefig('../models/performance_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Performance heatmap saved")

### D. Multi-Metric Radar Chart (Top 5 Models)

In [None]:
# Radar chart for the top 5 models (results_df is already ranked by ROC-AUC).
top_5 = results_df.head(5)
categories = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'KS Statistic']
N = len(categories)

# One evenly spaced angle per metric; the first angle is repeated at the end
# so each polygon closes on itself.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles = angles + [angles[0]]

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

# One outline plus a translucent fill per model
colors_radar = plt.cm.Set3(np.linspace(0, 1, 5))
for model_color, (_, row) in zip(colors_radar, top_5.iterrows()):
    values = [row[metric] for metric in categories]
    values.append(values[0])  # close the polygon

    ax.plot(angles, values, 'o-', linewidth=2, label=row['Model'], color=model_color)
    ax.fill(angles, values, alpha=0.15, color=model_color)

# Axis cosmetics: metric names around the circle, radial scale from 0 to 1
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, size=10)
ax.set_ylim(0, 1)
ax.set_yticks([0.2, 0.4, 0.6, 0.8])
ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8'], size=8)
ax.grid(True, linestyle='--', alpha=0.7)
ax.set_title('Top 5 Models: Multi-Metric Comparison', size=14, fontweight='bold', pad=20)

plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=9)
plt.tight_layout()
plt.savefig('../models/top5_radar_chart.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Radar chart saved")

### E. Confusion Matrix (Best Model)

In [None]:
# Rebuild and refit the top-ranked model (row 0 of results_df) to draw its
# confusion matrix.
best_model_name = results_df.iloc[0]['Model']
print(f"\nGenerating confusion matrix for: {best_model_name}")

# Choose the data split from the model name: V1 models reuse the V1 split
# prepared earlier, V2 models get their feature set re-prepared.
_is_v1_model = 'V1' in best_model_name and 'V2' not in best_model_name
if _is_v1_model:
    X_train_best, y_train_best = X_train_v1, y_train_v1
    X_test_best, y_test_best = X_test_v1, y_test_v1
else:
    if 'Feature Set A' in best_model_name:
        _best_features = FEATURES_A
    elif 'Feature Set B' in best_model_name:
        _best_features = FEATURES_B
    else:  # Feature Set C
        _best_features = FEATURES_C
    X_train_best, _, X_test_best, y_train_best, _, y_test_best, _, _ = prepare_data(
        df, _best_features, TARGET
    )

# Choose the estimator from the model name. The first keyword found in the
# name wins; CatBoost is the default when none matches. Factories are lazy so
# only the selected estimator is instantiated.
_model_factories = (
    ('Logistic Regression', lambda: LogisticRegression(
        random_state=RANDOM_STATE, max_iter=1000, C=0.1)),
    ('Random Forest', lambda: RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1)),
    ('XGBoost', lambda: XGBClassifier(
        n_estimators=100, max_depth=6, learning_rate=0.1,
        random_state=RANDOM_STATE, eval_metric='logloss', use_label_encoder=False)),
    ('LightGBM', lambda: LGBMClassifier(
        n_estimators=100, max_depth=6, learning_rate=0.1,
        random_state=RANDOM_STATE, verbose=-1)),
)
for _keyword, _factory in _model_factories:
    if _keyword in best_model_name:
        best_model = _factory()
        break
else:  # CatBoost
    best_model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1,
                                    random_state=RANDOM_STATE, verbose=False)

best_model.fit(X_train_best, y_train_best)
y_pred_best = best_model.predict(X_test_best)

# Generate and save the confusion matrix
cm = confusion_matrix(y_test_best, y_pred_best)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Paid', 'Defaulted'])

fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, cmap='Blues', values_format='d')
plt.title(f'Confusion Matrix: {best_model_name}', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../models/best_model_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Confusion matrix saved")

## Section 13: Summary Statistics

In [None]:
def _print_averages(frame):
    """Print the mean ROC-AUC, KS statistic and F1-score of a results subset."""
    print(f"  Average ROC-AUC:      {frame['ROC-AUC'].mean():.4f}")
    print(f"  Average KS Statistic: {frame['KS Statistic'].mean():.4f}")
    print(f"  Average F1-Score:     {frame['F1-Score'].mean():.4f}")


print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)

# Best performing model: results_df is ranked by ROC-AUC, so it is row 0.
# From here on `best_model` is that results row, no longer a fitted estimator.
best_model = results_df.iloc[0]

print(f"\n{'='*80}")
print("BEST MODEL")
print(f"{'='*80}")
print(f"Model: {best_model['Model']}")
for _metric in ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'KS Statistic'):
    # "<metric>:" is left-aligned in a 15-character field so the values line up
    print(f"  {_metric + ':':<15}{best_model[_metric]:.4f}")

# V1 vs V2 comparison: split the results by version tag in the model name
_model_names = results_df['Model']
_is_v2 = _model_names.str.contains('V2')
_is_v1 = _model_names.str.contains('V1') & ~_is_v2
v1_results = results_df[_is_v1]
v2_results = results_df[_is_v2]

print(f"\n{'='*80}")
print("V1 vs V2 COMPARISON")
print(f"{'='*80}")

print(f"\nV1 Models (n={len(v1_results)}):")
_print_averages(v1_results)

print(f"\nV2 Models (n={len(v2_results)}):")
_print_averages(v2_results)

# Relative change of the V2 average over the V1 average, in percent
if len(v1_results) > 0 and len(v2_results) > 0:
    _v1_auc = v1_results['ROC-AUC'].mean()
    _v1_ks = v1_results['KS Statistic'].mean()
    roc_improvement = ((v2_results['ROC-AUC'].mean() - _v1_auc) / _v1_auc * 100)
    ks_improvement = ((v2_results['KS Statistic'].mean() - _v1_ks) / _v1_ks * 100)

    print(f"\nImprovement:")
    print(f"  ROC-AUC:      {roc_improvement:+.2f}%")
    print(f"  KS Statistic: {ks_improvement:+.2f}%")

# Feature set comparison (V2 only)
print(f"\n{'='*80}")
print("FEATURE SET COMPARISON (V2 MODELS)")
print(f"{'='*80}")

for feature_set in ['A', 'B', 'C']:
    fs_results = v2_results[v2_results['Model'].str.contains(f'Feature Set {feature_set}')]
    if len(fs_results) == 0:
        continue
    # Rows keep the ROC-AUC ranking, so the first row is this set's best model
    _set_best = fs_results.iloc[0]
    print(f"\nFeature Set {feature_set} (n={len(fs_results)}):")
    _print_averages(fs_results)
    print(f"  Best Model: {_set_best['Model']}")
    print(f"    ROC-AUC: {_set_best['ROC-AUC']:.4f}")

print("\n" + "="*80)

## Section 14: Export Results

In [None]:
# Export the ranked results as CSV and write a markdown summary report.
# Reads results_df, best_model (the top results row), v1_results and
# v2_results produced by the preceding cells.
csv_path = '../models/model_evaluation_results.csv'
md_path = '../models/MODEL_EVALUATION_REPORT.md'

# Create the output folder if needed, so this cell also works on its own
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# Save results to CSV
results_df.to_csv(csv_path, index=False)
print(f"\n✓ Results saved to: {csv_path}")

# DataFrame.to_markdown needs the optional 'tabulate' package; without it,
# fall back to a fenced plain-text table instead of aborting the export.
try:
    results_table = results_df.to_markdown(index=False)
except ImportError:
    results_table = "```\n" + results_df.to_string(index=False) + "\n```"

# Create markdown report (explicit UTF-8 so the file does not depend on the
# platform's default encoding)
with open(md_path, 'w', encoding='utf-8') as f:
    f.write("# Model Evaluation Report\n\n")
    f.write("## Executive Summary\n\n")
    f.write(f"Evaluated **{len(results_df)}** models across V1 and V2 architectures.\n\n")

    f.write("## Best Model\n\n")
    f.write(f"**{best_model['Model']}**\n\n")
    f.write(f"- Accuracy: {best_model['Accuracy']:.4f}\n")
    f.write(f"- Precision: {best_model['Precision']:.4f}\n")
    f.write(f"- Recall: {best_model['Recall']:.4f}\n")
    f.write(f"- F1-Score: {best_model['F1-Score']:.4f}\n")
    f.write(f"- ROC-AUC: {best_model['ROC-AUC']:.4f}\n")
    f.write(f"- KS Statistic: {best_model['KS Statistic']:.4f}\n\n")

    f.write("## Complete Results\n\n")
    f.write(results_table)
    f.write("\n\n")

    f.write("## V1 vs V2 Comparison\n\n")
    f.write(f"| Model Version | Avg ROC-AUC | Avg KS Stat | Avg F1-Score |\n")
    f.write(f"|---------------|-------------|-------------|--------------|\n")
    f.write(f"| V1 Models     | {v1_results['ROC-AUC'].mean():.4f}      | {v1_results['KS Statistic'].mean():.4f}      | {v1_results['F1-Score'].mean():.4f}       |\n")
    f.write(f"| V2 Models     | {v2_results['ROC-AUC'].mean():.4f}      | {v2_results['KS Statistic'].mean():.4f}      | {v2_results['F1-Score'].mean():.4f}       |\n")

    # Relative change of the V2 average ROC-AUC over the V1 average, in percent
    if len(v1_results) > 0 and len(v2_results) > 0:
        roc_improvement = ((v2_results['ROC-AUC'].mean() - v1_results['ROC-AUC'].mean()) / v1_results['ROC-AUC'].mean() * 100)
        f.write(f"\n**Improvement: {roc_improvement:+.2f}% in ROC-AUC**\n")

    f.write("\n## Feature Set Performance (V2)\n\n")
    for feature_set in ['A', 'B', 'C']:
        fs_results = v2_results[v2_results['Model'].str.contains(f'Feature Set {feature_set}')]
        if len(fs_results) > 0:
            # Rows keep the ROC-AUC ranking, so row 0 is this set's best model
            f.write(f"\n### Feature Set {feature_set}\n")
            f.write(f"- Average ROC-AUC: {fs_results['ROC-AUC'].mean():.4f}\n")
            f.write(f"- Average KS Statistic: {fs_results['KS Statistic'].mean():.4f}\n")
            f.write(f"- Best Model: {fs_results.iloc[0]['Model']} (ROC-AUC: {fs_results.iloc[0]['ROC-AUC']:.4f})\n")

print(f"✓ Markdown report saved to: {md_path}")

print("\n" + "="*80)
print("MODEL EVALUATION COMPLETE")
print("="*80)
print(f"\nDeliverables:")
print(f"  1. CSV Results:       {csv_path}")
print(f"  2. Markdown Report:   {md_path}")
print(f"  3. ROC-AUC Chart:     ../models/roc_auc_comparison.png")
print(f"  4. KS Stat Chart:     ../models/ks_statistic_comparison.png")
print(f"  5. Heatmap:           ../models/performance_heatmap.png")
print(f"  6. Radar Chart:       ../models/top5_radar_chart.png")
print(f"  7. Confusion Matrix:  ../models/best_model_confusion_matrix.png")
print("="*80)