# ROSE Women's Foundation - Loan Default Prediction Models
## Comprehensive ML Pipeline for Default Risk Assessment

This notebook implements:
1. Data Loading & Preprocessing
2. Multiple Model Training (Logistic Regression, Random Forest, XGBoost, LightGBM, CatBoost)
3. Evaluation Metrics (Accuracy, Precision, Recall, F1-Score, ROC-AUC, KS Statistic)
4. Model Comparison & Selection
5. Model Saving & Prediction Function
6. Model Interpretability (Logistic Regression coefficients and SHAP)

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import joblib

# Preprocessing
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, classification_report
)
from scipy import stats

# Interpretability
import shap

# Suppress every warning category so notebook output stays compact.
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# modelling libraries — confirm that is acceptable.
warnings.filterwarnings('ignore')
# Plot styling shared by all figures below.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('colorblind')
# Display all DataFrame columns instead of truncating wide frames.
pd.set_option('display.max_columns', None)

# Set random seed for reproducibility
# RANDOM_STATE is also passed explicitly to the splitters, SMOTE and the estimators below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

---
# Section 1: Data Loading & Preprocessing

In [None]:
# Read the raw loan data.
# The file is decoded as latin-1, which maps every byte to a character, so the
# read cannot fail on non-UTF-8 bytes in the CSV.
# The path is relative to the notebooks directory; adjust it when running from elsewhere.
DATA_PATH = '../Github Original Data.csv'
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Quick look at the size of the data and the balance of the target column.
defaulted_column = df['Defaulted']
default_rate_pct = defaulted_column.mean() * 100

print(f"Dataset shape: {df.shape}")
print("\nTarget variable 'Defaulted' distribution:")
print(defaulted_column.value_counts())
print(f"\nDefault rate: {default_rate_pct:.2f}%")

In [None]:
# Candidate predictors chosen from the EDA, grouped by expected strength.
# Tier 1 - strong predictors
TIER1_FEATURES = [
    'Extra Income Brackets',
    'Categorize Rent Payment',
    'School Fees Categorical',
    'Age Group',
    'Education',
    'Loan Access',  # Prior Loan Access
    'CRB Class',
]
# Tier 2 - moderate predictors
TIER2_FEATURES = [
    'Logic on Income',  # Income Diversity
    'Categorizing Utility Expenses',
    'Expense Relative to Income',
    'Affordability (HH)',
    'Living',
]
FEATURE_COLUMNS = TIER1_FEATURES + TIER2_FEATURES

TARGET = 'Defaulted'

# Split the candidate list by whether the column actually exists in the
# loaded data, preserving the order of FEATURE_COLUMNS in both lists.
columns_in_data = set(df.columns)
available_features = [name for name in FEATURE_COLUMNS if name in columns_in_data]
missing_features = [name for name in FEATURE_COLUMNS if name not in columns_in_data]

n_available = len(available_features)
n_missing = len(missing_features)
print(f"Available features ({n_available}): {available_features}")
print(f"\nMissing features ({n_missing}): {missing_features}")

In [None]:
# Working copy restricted to the usable predictors plus the target, so later
# imputation never touches the raw frame `df`.
df_model = df[[*available_features, TARGET]].copy()

print(f"Working dataset shape: {df_model.shape}")
print("\nMissing values per column:")
print(df_model.isna().sum())

In [None]:
# Impute missing predictor values in place on df_model:
#   - object (categorical) columns -> most frequent value, or 'Unknown' when
#     the column has no mode (i.e. it is entirely missing)
#   - all other columns            -> column median
# NOTE(review): the fill values are computed on the full dataset, before the
# train/validation/test split performed later in the notebook.

predictor_columns = [name for name in df_model.columns if name != TARGET]

for col in predictor_columns:
    column_values = df_model[col]
    if column_values.dtype == 'object':
        modes = column_values.mode()
        fill_value = modes[0] if len(modes) > 0 else 'Unknown'
    else:
        fill_value = column_values.median()
    df_model[col] = column_values.fillna(fill_value)

print("Missing values after imputation:")
print(df_model.isnull().sum().sum())

In [None]:
# Integer-encode every object-typed predictor with its own LabelEncoder.
# The fitted encoders are kept in `encoders` (keyed by column name) so the same
# mapping can be saved and reused at prediction time.
# The encoders are fitted on the full dataset, before the split.
encoders = {}
df_encoded = df_model.copy()

categorical_columns = [
    name for name in available_features if df_encoded[name].dtype == 'object'
]

for col in categorical_columns:
    encoder = LabelEncoder()
    as_text = df_encoded[col].astype(str)
    df_encoded[col] = encoder.fit_transform(as_text)
    encoders[col] = encoder
    print(f"Encoded {col}: {len(encoder.classes_)} classes")

print(f"\nTotal encoders created: {len(encoders)}")

In [None]:
# Feature matrix and target vector taken from the encoded frame.
X = df_encoded[available_features]
y = df_encoded[TARGET]

# Stratified 70/15/15 split done in two cuts.
# Cut 1: hold out 15% of all rows as the test split.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=RANDOM_STATE
)

# Cut 2: take 0.176 of the remaining 85% (~15% of the total) as validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.176, stratify=y_temp, random_state=RANDOM_STATE
)

# Report the size and default rate of each split.
split_summary = (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
)
for split_label, split_X, split_y in split_summary:
    print(f"{split_label} set: {split_X.shape[0]} samples ({split_y.mean()*100:.1f}% default)")

In [None]:
# Standardize the features using statistics learned from the training split
# only; validation and test are transformed with the same fitted scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


def _scaled_frame(values, reference):
    """Wrap a scaled array in a DataFrame that reuses the reference split's row index."""
    return pd.DataFrame(values, columns=available_features, index=reference.index)


# DataFrame views of the scaled arrays that keep the column names.
# In this notebook the models are fitted on the NumPy arrays above; the
# DataFrames are passed to the SHAP plots.
X_train_df = _scaled_frame(X_train_scaled, X_train)
X_val_df = _scaled_frame(X_val_scaled, X_val)
X_test_df = _scaled_frame(X_test_scaled, X_test)

print("Feature scaling complete.")

In [None]:
# Apply SMOTE to training data to handle class imbalance
# Only the (already scaled) training split is resampled; the validation and
# test splits are left untouched.
# NOTE: every entry in models_config sets 'use_smote' to False, so the training
# loop as configured never consumes X_train_smote / y_train_smote.
smote = SMOTE(random_state=RANDOM_STATE)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Compare sample counts before/after and show the resampled class counts.
print(f"Original training set: {len(y_train)} samples")
print(f"After SMOTE: {len(y_train_smote)} samples")
print(f"Class distribution after SMOTE: {np.bincount(y_train_smote)}")

---
# Section 2: Train Multiple Models

In [None]:
def calculate_ks_statistic(y_true, y_prob):
    """Calculate the Kolmogorov-Smirnov statistic between the two classes' scores.

    The predicted probabilities are split by true label (1 vs 0) and the
    two-sample KS statistic between those two score distributions is returned.

    Parameters
    ----------
    y_true : array-like
        True binary labels; rows equal to 1 and rows equal to 0 are compared.
        Lists, NumPy arrays and pandas Series are all accepted.
    y_prob : array-like
        Predicted scores/probabilities, aligned positionally with ``y_true``.

    Returns
    -------
    float
        The KS statistic reported by ``scipy.stats.ks_2samp``.

    Raises
    ------
    ValueError
        Raised by ``scipy.stats.ks_2samp`` when either class has no samples.
    """
    # Coerce to arrays so boolean masking works for any array-like input
    # (a plain list would otherwise compare as a whole: `[...] == 1` -> False).
    labels = np.asarray(y_true)
    scores = np.asarray(y_prob)

    # Separate probabilities by class
    prob_default = scores[labels == 1]
    prob_paid = scores[labels == 0]

    # Use scipy's KS test; only the statistic is needed, the p-value is dropped
    ks_stat, _ = stats.ks_2samp(prob_default, prob_paid)
    return ks_stat

def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on one evaluation set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``
    X_test : feature matrix to score
    y_test : true binary labels for ``X_test``
    model_name : str
        Label stored under the ``'Model'`` key of the metrics dict.

    Returns
    -------
    tuple
        ``(metrics, y_pred, y_prob)`` where ``metrics`` maps metric names to
        values, ``y_pred`` holds the predicted labels and ``y_prob`` the
        predicted probability of the second class (column index 1).
    """
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Metrics computed from hard label predictions.
    label_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )

    metrics = {'Model': model_name}
    for metric_label, scorer in label_scorers:
        metrics[metric_label] = scorer(y_test, y_pred)

    # Metrics computed from the predicted probabilities.
    metrics['ROC-AUC'] = roc_auc_score(y_test, y_prob)
    metrics['KS Statistic'] = calculate_ks_statistic(y_test, y_prob)

    return metrics, y_pred, y_prob

In [None]:
# Ratio of non-default to default rows in the training split; passed to
# XGBoost as scale_pos_weight to compensate for class imbalance.
n_positive = y_train.sum()
n_negative = len(y_train) - n_positive
scale_pos_weight = n_negative / n_positive
print(f"Scale pos weight: {scale_pos_weight:.2f}")


def _config_entry(estimator, param_grid, use_smote=False):
    """Bundle an estimator with its search space and its resampling flag."""
    return {'model': estimator, 'params': param_grid, 'use_smote': use_smote}


# One entry per candidate model: the base estimator, the hyperparameter search
# space, and whether to train on the SMOTE-resampled data. Every model relies
# on class weighting instead, so 'use_smote' is False throughout.
models_config = {
    'Logistic Regression': _config_entry(
        LogisticRegression(class_weight='balanced', max_iter=1000, random_state=RANDOM_STATE),
        {
            'C': [0.01, 0.1, 1, 10],
            'penalty': ['l2'],
        },
    ),
    'Random Forest': _config_entry(
        RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE),
        {
            'n_estimators': [100, 200],
            'max_depth': [5, 10, 15, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        },
    ),
    'XGBoost': _config_entry(
        # NOTE(review): `use_label_encoder` may be deprecated or ignored by the
        # installed XGBoost version — confirm before relying on it.
        XGBClassifier(
            scale_pos_weight=scale_pos_weight,
            random_state=RANDOM_STATE,
            eval_metric='logloss',
            use_label_encoder=False,
        ),
        {
            'n_estimators': [100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
        },
    ),
    'LightGBM': _config_entry(
        LGBMClassifier(
            class_weight='balanced',
            random_state=RANDOM_STATE,
            verbose=-1,
        ),
        {
            'n_estimators': [100, 200],
            'max_depth': [3, 5, 7, -1],
            'learning_rate': [0.01, 0.1, 0.2],
            'num_leaves': [31, 50, 100],
        },
    ),
    'CatBoost': _config_entry(
        CatBoostClassifier(
            auto_class_weights='Balanced',
            random_state=RANDOM_STATE,
            verbose=0,
        ),
        {
            'iterations': [100, 200],
            'depth': [4, 6, 8],
            'learning_rate': [0.01, 0.1, 0.2],
        },
    ),
}

print(f"Models to train: {list(models_config)}")

In [None]:
# Tune and fit every configured model with RandomizedSearchCV, then score the
# tuned estimator with evaluate_model.
#   trained_models  : model name -> best estimator found by the search
#   all_metrics     : list of metric dicts, one per model
#   all_predictions : model name -> {'y_pred', 'y_prob'} on the test split
# The validation split (X_val_scaled / y_val) is not consulted in this loop;
# every score printed after "Test Set Metrics" is computed on the test split.
trained_models, all_metrics, all_predictions = {}, [], {}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
banner = '=' * 60

for name, config in models_config.items():
    print(f"\n{banner}")
    print(f"Training: {name}")
    print(banner)

    # Train on the SMOTE-resampled data only when the config asks for it.
    X_fit, y_fit = (
        (X_train_smote, y_train_smote) if config['use_smote']
        else (X_train_scaled, y_train)
    )

    # Randomized hyperparameter search scored by cross-validated ROC-AUC.
    search = RandomizedSearchCV(
        estimator=config['model'],
        param_distributions=config['params'],
        n_iter=10,
        scoring='roc_auc',
        cv=cv,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    search.fit(X_fit, y_fit)

    best_model = search.best_estimator_
    trained_models[name] = best_model

    print(f"Best parameters: {search.best_params_}")
    print(f"Best CV ROC-AUC: {search.best_score_:.4f}")

    # Score the tuned estimator on the test split and keep its predictions
    # for the plots further down.
    metrics, y_pred, y_prob = evaluate_model(best_model, X_test_scaled, y_test, name)
    all_metrics.append(metrics)
    all_predictions[name] = {'y_pred': y_pred, 'y_prob': y_prob}

    print("\nTest Set Metrics:")
    for metric, value in metrics.items():
        if metric == 'Model':
            continue  # the name is a label, not a numeric score
        print(f"  {metric}: {value:.4f}")

---
# Section 3: Evaluation Metrics & Visualizations

In [None]:
# Plot confusion matrices for all models
# This is the first cell that writes into ../models, so make sure the directory
# exists here; otherwise savefig fails on a fresh checkout.
os.makedirs('../models', exist_ok=True)

# Size the grid from the number of models (three plots per row) instead of
# assuming exactly five models on a fixed 2x3 grid.
n_models = len(all_predictions)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, (name, preds) in zip(axes, all_predictions.items()):
    # Rows are actual classes, columns are predicted classes.
    cm = confusion_matrix(y_test, preds['y_pred'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=['Paid', 'Defaulted'],
                yticklabels=['Paid', 'Defaulted'])
    ax.set_title(f'{name}\nConfusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# Hide every unused subplot (only those beyond the last model, so a populated
# plot is never blanked out).
for spare_ax in axes[n_models:]:
    spare_ax.axis('off')

plt.tight_layout()
plt.savefig('../models/confusion_matrices.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# ROC curves of every model on the test split, drawn on one set of axes with
# the AUC shown in each legend entry.
fig, ax = plt.subplots(figsize=(10, 8))

for name, preds in all_predictions.items():
    test_scores = preds['y_prob']
    fpr, tpr, _ = roc_curve(y_test, test_scores)
    auc_value = roc_auc_score(y_test, test_scores)
    ax.plot(fpr, tpr, linewidth=2, label=f'{name} (AUC = {auc_value:.3f})')

# Diagonal reference line: performance of a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves - All Models', fontsize=14)
ax.legend(loc='lower right', fontsize=10)
ax.grid(True, alpha=0.3)

fig.savefig('../models/roc_curves.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Precision-recall curves of every model on the test split, on shared axes.
fig, ax = plt.subplots(figsize=(10, 8))

for name, preds in all_predictions.items():
    precision_points, recall_points, _ = precision_recall_curve(y_test, preds['y_prob'])
    ax.plot(recall_points, precision_points, label=f'{name}', linewidth=2)

ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.set_title('Precision-Recall Curves - All Models', fontsize=14)
ax.legend(loc='lower left', fontsize=10)
ax.grid(True, alpha=0.3)

fig.savefig('../models/precision_recall_curves.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# One horizontal bar chart per tree-based model showing its built-in feature
# importances, most important feature at the top.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.flatten()

tree_models = ['Random Forest', 'XGBoost', 'LightGBM', 'CatBoost']
n_features = len(available_features)
bar_positions = range(n_features)

for ax, name in zip(axes, tree_models):
    model = trained_models[name]

    # Fall back to all-zero bars when the estimator exposes no importances.
    importances = getattr(model, 'feature_importances_', None)
    if importances is None:
        importances = np.zeros(n_features)

    # Feature positions ordered from most to least important.
    ranking = np.argsort(importances)[::-1]
    ranked_names = [available_features[i] for i in ranking]

    ax.barh(bar_positions, importances[ranking], color='steelblue')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(ranked_names)
    ax.set_xlabel('Importance')
    ax.set_title(f'{name} - Feature Importance')
    # barh draws position 0 at the bottom; flip so rank 1 is on top.
    ax.invert_yaxis()

plt.tight_layout()
plt.savefig('../models/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

---
# Section 4: Model Comparison

In [None]:
# Collect the per-model metric dicts into one table indexed by model name.
comparison_df = pd.DataFrame(all_metrics).set_index('Model')

# Rounded copy for printing only; the CSV below keeps full precision.
comparison_display = comparison_df.round(4)

rule = "=" * 80
print(rule)
print("MODEL COMPARISON - ALL METRICS")
print(rule)
print(comparison_display.to_string())

# Persist the unrounded table next to the other model artifacts.
comparison_df.to_csv('../models/model_comparison.csv')
print("\nComparison saved to models/model_comparison.csv")

In [None]:
# Visualization: grouped bar chart with one group per metric and one bar per model
fig, ax = plt.subplots(figsize=(14, 8))

metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'KS Statistic']
x = np.arange(len(metrics_to_plot))

# Derive the bar width and tick offset from the number of models so the groups
# stay centred and do not overlap if models are added or removed.
# With five models this gives the same layout as width=0.15, offset=2*width.
n_models = len(comparison_df.index)
width = min(0.15, 0.8 / max(n_models, 1))

for i, model in enumerate(comparison_df.index):
    values = [comparison_df.loc[model, m] for m in metrics_to_plot]
    ax.bar(x + i*width, values, width, label=model)

ax.set_ylabel('Score', fontsize=12)
ax.set_title('Model Comparison Across All Metrics', fontsize=14)
ax.set_xticks(x + width * (n_models - 1) / 2)
ax.set_xticklabels(metrics_to_plot, fontsize=11)
ax.set_ylim(0, 1.0)

# Target thresholds. These must be drawn before the legend is created,
# otherwise their labels are not included in it.
ax.axhline(y=0.75, color='red', linestyle='--', alpha=0.5, label='ROC-AUC Target')
ax.axhline(y=0.70, color='orange', linestyle='--', alpha=0.5, label='F1 Target')
ax.legend(loc='upper right', fontsize=9)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../models/model_comparison_chart.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Select best model based on ROC-AUC on the validation split.
# The test split is deliberately kept out of the selection step: choosing the
# winner by its test score and then reporting that same test score would bias
# the reported metrics upward. The validation split is not used for fitting
# or tuning, so it gives a clean basis for the comparison.
validation_auc = {
    name: roc_auc_score(y_val, fitted.predict_proba(X_val_scaled)[:, 1])
    for name, fitted in trained_models.items()
}
best_model_name = max(validation_auc, key=validation_auc.get)
best_model = trained_models[best_model_name]
# Test-set metrics of the selected model (from the comparison table).
best_metrics = comparison_df.loc[best_model_name]

print("="*60)
print("BEST MODEL SELECTED")
print("="*60)
print(f"\nModel: {best_model_name}")
print(f"Validation ROC-AUC (used for selection): {validation_auc[best_model_name]:.4f}")
print(f"\nMetrics:")
for metric, value in best_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Check against success criteria
print("\n" + "="*60)
print("SUCCESS CRITERIA CHECK")
print("="*60)
# Each entry: (description, whether the threshold is met, observed value)
criteria = [
    ('ROC-AUC >= 0.75', best_metrics['ROC-AUC'] >= 0.75, best_metrics['ROC-AUC']),
    ('F1-Score >= 0.70', best_metrics['F1-Score'] >= 0.70, best_metrics['F1-Score']),
    ('KS Statistic >= 0.35', best_metrics['KS Statistic'] >= 0.35, best_metrics['KS Statistic']),
]

for criterion, passed, value in criteria:
    status = '✓ PASSED' if passed else '✗ NOT MET'
    print(f"{criterion}: {value:.4f} - {status}")

# Report the actual number of models compared rather than a fixed count.
print(f"\nCompared {len(trained_models)} models: ✓ PASSED")

---
# Section 5: Save Best Model

In [None]:
# Persist everything needed to score new borrowers later: the selected model,
# the fitted scaler and encoders, the feature order, and all tuned models.
os.makedirs('../models', exist_ok=True)

# (label printed in the confirmation line, object to save, file name)
artifacts_to_save = (
    ('Best model', best_model, 'best_loan_default_model.pkl'),
    ('Scaler', scaler, 'scaler.pkl'),
    ('Encoders', encoders, 'encoders.pkl'),
    ('Feature list', available_features, 'feature_list.pkl'),
    ('All models', trained_models, 'all_trained_models.pkl'),
)

for artifact_label, artifact, filename in artifacts_to_save:
    joblib.dump(artifact, f'../models/{filename}')
    print(f"{artifact_label} saved: models/{filename}")

---
# Section 6: Prediction Function

In [None]:
def predict_loan_default(borrower_features, model_path='../models/best_loan_default_model.pkl',
                         scaler_path='../models/scaler.pkl',
                         encoders_path='../models/encoders.pkl',
                         features_path='../models/feature_list.pkl'):
    """
    Predict loan default probability for a borrower.

    Parameters:
    -----------
    borrower_features : dict
        Dictionary with feature names as keys and values.
        Example: {
            'Extra Income Brackets': 'Low Extra Income',
            'Categorize Rent Payment': 'High Rent',
            'Age Group': 'Mid Life 40-49',
            ...
        }
        Keys that are not in the saved feature list are ignored.
    model_path, scaler_path, encoders_path, features_path : str
        Paths of the joblib artifacts holding the fitted model, the fitted
        scaler, the dict of fitted label encoders and the ordered feature list.

    Returns:
    --------
    dict with:
        - default_probability: float (0-1), rounded to 4 decimals
        - risk_category: str ('Low Risk', 'Medium Risk', 'High Risk')
        - recommendation: str
        - fallback_features: list of str, the categorical features whose value
          was missing or not known to the encoder and was replaced by encoded
          class 0 (empty when every value was recognised)

    Raises:
    -------
    ValueError
        If a feature without a saved encoder (a non-categorical feature) is
        missing from borrower_features; no sensible default exists for it.
    """
    # Local import keeps the function usable when copied out of the notebook.
    import warnings

    # Load model and preprocessors.
    # joblib.load unpickles arbitrary objects: only point these paths at
    # artifacts from a trusted source.
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)
    encoders = joblib.load(encoders_path)
    feature_list = joblib.load(features_path)

    # Build the encoded feature row in the exact order used for training.
    encoded_row = {}
    fallback_features = []
    for feature in feature_list:
        encoder = encoders.get(feature)

        if encoder is None:
            # Non-categorical feature: pass the value through unchanged.
            # A missing value is rejected here with a clear message instead
            # of failing later inside the scaler.
            if feature not in borrower_features:
                raise ValueError(
                    f"Missing value for feature '{feature}', which has no saved encoder"
                )
            encoded_row[feature] = borrower_features[feature]
            continue

        # Categorical feature: absent keys are treated as the string 'Unknown'.
        raw_value = str(borrower_features.get(feature, 'Unknown'))
        if raw_value in encoder.classes_:
            encoded_row[feature] = encoder.transform([raw_value])[0]
        else:
            # Unseen category: fall back to encoded class 0. LabelEncoder
            # orders its classes by sorted value, so this is the first class
            # in sort order, NOT the most frequent one. Record the feature so
            # the caller can see that the prediction used a fallback.
            encoded_row[feature] = 0
            fallback_features.append(feature)

    if fallback_features:
        warnings.warn(
            "Missing or unseen category replaced by encoded class 0 for: "
            + ", ".join(fallback_features),
            stacklevel=2,
        )

    X = pd.DataFrame([encoded_row], columns=feature_list)

    # Scale features with the scaler fitted on the training split
    X_scaled = scaler.transform(X)

    # Probability of the positive (default) class as a plain Python float
    probability = float(model.predict_proba(X_scaled)[0, 1])

    # Determine risk category: < 0.3 low, 0.3 to < 0.5 medium, >= 0.5 high
    if probability < 0.3:
        risk_category = 'Low Risk'
        recommendation = 'APPROVE - Low default risk. Standard loan terms recommended.'
    elif probability < 0.5:
        risk_category = 'Medium Risk'
        recommendation = 'REVIEW - Moderate default risk. Consider reduced loan amount or additional guarantor.'
    else:
        risk_category = 'High Risk'
        recommendation = 'CAUTION - High default risk. Recommend declining or requiring strong collateral.'

    return {
        'default_probability': round(probability, 4),
        'risk_category': risk_category,
        'recommendation': recommendation,
        'fallback_features': fallback_features,
    }

print("Prediction function defined successfully.")

In [None]:
# Smoke-test the prediction function on three hand-written borrower profiles.
def _show_prediction(heading, borrower):
    """Print the heading and the prediction for one borrower; return the result dict."""
    print(f"\n{heading}")
    outcome = predict_loan_default(borrower)
    print(f"  Default Probability: {outcome['default_probability']:.2%}")
    print(f"  Risk Category: {outcome['risk_category']}")
    print(f"  Recommendation: {outcome['recommendation']}")
    return outcome


print("="*70)
print("SAMPLE PREDICTIONS")
print("="*70)

# Sample 1: profile expected to be low risk
low_risk_borrower = {
    'Extra Income Brackets': 'Moderate to High Extra Income',
    'Categorize Rent Payment': 'High Rent',
    'School Fees Categorical': 'High School Fees',
    'Age Group': 'Mid Life 40-49',
    'Education': 'Tertiary level (Colleges, Universities, Polytechnics)',
    'Loan Access': 'No',
    'CRB Class': 'Active Low\x96Medium Risk',
    'Logic on Income': 'Income + Extra + Regular (Full Diversity)',
    'Categorizing Utility Expenses': 'High Utility Expenses',
    'Expense Relative to Income': '1/3 or Less of Income',
    'Affordability (HH)': 'Profitable (Affordable)',
    'Living': 'Peri-Urban',
}
result1 = _show_prediction("Sample 1: Low-Risk Borrower Profile", low_risk_borrower)

# Sample 2: profile expected to be high risk
high_risk_borrower = {
    'Extra Income Brackets': 'No Extra Income',
    'Categorize Rent Payment': 'Low Rent',
    'School Fees Categorical': 'No School Fees',
    'Age Group': 'Young Adults 21-29',
    'Education': 'Secondary Incomplete',
    'Loan Access': 'Yes',
    'CRB Class': 'Legacy',
    'Logic on Income': 'Income Only',
    'Categorizing Utility Expenses': 'No Utility Expenses',
    'Expense Relative to Income': 'More than 2/3 of Income',
    'Affordability (HH)': 'Low/Negative Profit (Unviable)',
    'Living': 'Urban',
}
result2 = _show_prediction("Sample 2: High-Risk Borrower Profile", high_risk_borrower)

# Sample 3: profile expected to be medium risk
medium_risk_borrower = {
    'Extra Income Brackets': 'Low Extra Income',
    'Categorize Rent Payment': 'Low Rent',
    'School Fees Categorical': 'Low School Fees',
    'Age Group': 'Early Mature 30-39',
    'Education': 'Secondary Complete',
    'Loan Access': 'Yes',
    'CRB Class': 'Active High\x96Medium High Risk',
    'Logic on Income': 'Income + Extra',
    'Categorizing Utility Expenses': 'Low Utility Expenses',
    'Expense Relative to Income': 'Half of Income',
    'Affordability (HH)': 'Profitable (Affordable)',
    'Living': 'Peri-Urban',
}
result3 = _show_prediction("Sample 3: Medium-Risk Borrower Profile", medium_risk_borrower)

---
# Section 7: Model Interpretability

In [None]:
# Inspect the fitted Logistic Regression: one coefficient per feature, ranked
# by absolute size. The model was fitted on standardized, label-encoded inputs.
lr_model = trained_models['Logistic Regression']

coefficients = pd.DataFrame({
    'Feature': available_features,
    'Coefficient': lr_model.coef_[0],
})
coefficients['Abs_Coefficient'] = coefficients['Coefficient'].abs()
coefficients = coefficients.sort_values('Abs_Coefficient', ascending=False)

rule = "=" * 60
print(rule)
print("LOGISTIC REGRESSION COEFFICIENTS")
print(rule)
print("\nPositive coefficients increase default probability")
print("Negative coefficients decrease default probability")
print("\n" + coefficients.to_string(index=False))

# Bar chart of the signed coefficients: red for positive, green otherwise.
bar_colors = [
    'red' if coefficient > 0 else 'green'
    for coefficient in coefficients['Coefficient']
]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(coefficients['Feature'], coefficients['Coefficient'], color=bar_colors)
ax.set_xlabel('Coefficient Value')
ax.set_title('Logistic Regression Coefficients\n(Red = Increases Default Risk, Green = Decreases Default Risk)')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
fig.tight_layout()
fig.savefig('../models/logistic_coefficients.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# SHAP Analysis for best model
# Produces `shap_values`: a 2-D array (test rows x features) of contributions
# towards the positive (default) class, used by the plots and table below.
print(f"\nGenerating SHAP values for {best_model_name}...")

# Create explainer based on model type
if best_model_name in ['Random Forest', 'XGBoost', 'LightGBM', 'CatBoost']:
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test_scaled)

    # For binary classification, keep only the positive class. Depending on
    # the SHAP version and model, per-class output comes back either as a
    # list of arrays (one per class) or as a single 3-D array with the class
    # on the last axis; handle both so downstream code always gets 2-D values.
    if isinstance(shap_values, list):
        shap_values = shap_values[1]
    elif np.ndim(shap_values) == 3:
        shap_values = shap_values[:, :, 1]
else:
    # For Logistic Regression: linear explainer with the scaled training data
    # as background.
    explainer = shap.LinearExplainer(best_model, X_train_scaled)
    shap_values = explainer.shap_values(X_test_scaled)

print("SHAP values computed successfully.")

In [None]:
# SHAP Summary Plot
# X_test_df holds the standardized test features (built in the scaling cell),
# so the feature values passed to the plot are in scaled units.
# show=False is passed so the title and savefig calls below act on the SHAP
# figure before it is displayed.
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test_df, feature_names=available_features, show=False)
plt.title(f'SHAP Feature Importance - {best_model_name}')
plt.tight_layout()
plt.savefig('../models/shap_summary.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# SHAP Bar Plot (Mean absolute SHAP values)
# Same inputs as the summary plot, drawn with plot_type='bar'.
# show=False is passed so the title and savefig calls below act on the SHAP
# figure before it is displayed.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_test_df, feature_names=available_features, 
                  plot_type='bar', show=False)
plt.title(f'Mean |SHAP| Feature Importance - {best_model_name}')
plt.tight_layout()
plt.savefig('../models/shap_importance_bar.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Rank features by mean absolute SHAP value across the test rows and print up
# to 15 of them (all features are shown when fewer are available).
mean_shap = np.mean(np.abs(shap_values), axis=0)

feature_importance_shap = pd.DataFrame({
    'Feature': available_features,
    'Mean |SHAP|': mean_shap,
})
feature_importance_shap = feature_importance_shap.sort_values(
    'Mean |SHAP|', ascending=False
)

rule = "=" * 60
print(rule)
print("TOP FEATURES BY SHAP IMPORTANCE")
print(rule)
print(feature_importance_shap.head(15).to_string(index=False))

---
# Final Summary

In [None]:
# Final recap of the run: dataset size, models trained, headline metrics of the
# selected model, and every file written under ../models.
banner = "=" * 70
print(banner)
print("LOAN DEFAULT PREDICTION MODEL - FINAL SUMMARY")
print(banner)

print("\n1. DATASET")
print(f"   - Total samples: {len(df)}")
print(f"   - Features used: {len(available_features)}")
print(f"   - Default rate: {df['Defaulted'].mean()*100:.2f}%")

print("\n2. MODELS TRAINED")
for model_name in trained_models:
    print(f"   - {model_name}")

print(f"\n3. BEST MODEL: {best_model_name}")
for metric_name in ('ROC-AUC', 'F1-Score', 'KS Statistic'):
    print(f"   - {metric_name}: {best_metrics[metric_name]:.4f}")

print("\n4. ARTIFACTS SAVED")
saved_artifacts = (
    'best_loan_default_model.pkl',
    'scaler.pkl',
    'encoders.pkl',
    'feature_list.pkl',
    'all_trained_models.pkl',
    'model_comparison.csv',
)
for artifact_name in saved_artifacts:
    print(f"   - models/{artifact_name}")

print("\n5. VISUALIZATIONS SAVED")
# Listed in the order the notebook writes them. model_comparison_chart.png is
# saved by the comparison bar-chart cell and belongs in this list as well.
saved_figures = (
    'confusion_matrices.png',
    'roc_curves.png',
    'precision_recall_curves.png',
    'feature_importance.png',
    'model_comparison_chart.png',
    'logistic_coefficients.png',
    'shap_summary.png',
    'shap_importance_bar.png',
)
for figure_name in saved_figures:
    print(f"   - models/{figure_name}")

print("\n" + banner)
print("READY FOR DEPLOYMENT")
print(banner)