# SHAP Model Interpretability Analysis
This notebook contains SHAP (SHapley Additive exPlanations) visualizations to interpret the credit risk model's decisions.

In [None]:
import shap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split

# Set style for better visuals
# Prefer the seaborn whitegrid style. Matplotlib raises OSError from style.use
# when a style name is unknown, so only that error triggers the 'ggplot'
# fallback; interrupts and unrelated failures are no longer swallowed.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('ggplot')

print("Loading data and model...")
# 1. Load Model and Metadata
# The three persisted artifacts are read in a fixed order: the model, the
# feature metadata, then the scaler.
rf_model, feat_info, scaler = (
    joblib.load(artifact_path)
    for artifact_path in (
        'models/best_model.pkl',
        'models/feature_info.pkl',
        'models/scaler.pkl',
    )
)
# Falls back to an empty list when the metadata has no 'feature_names' entry.
feature_names = feat_info.get('feature_names', [])

# 2. Reconstruct X_test (Matching training pipeline)
f_df = pd.read_csv('data/processed/customer_features.csv')
t_df = pd.read_csv('data/processed/target_variable.csv')
# Inner join: only customers present in both files are kept.
merged = f_df.merge(t_df, on='CustomerId', how='inner')


def _avg_per_transaction(df):
    """Return total_amount / transaction_count, treating a zero count as 1."""
    return df['total_amount'] / df['transaction_count'].replace(0, 1)


def _bucket_codes(series, inner_edges):
    """Bin *series* into four buckets split at *inner_edges*; return codes 0-3 as floats."""
    bins = [-np.inf, *inner_edges, np.inf]
    return pd.cut(series, bins=bins, labels=[0, 1, 2, 3]).astype(float)


# Ensure required feature engineering columns exist
# Every derived column is checked individually. Previously all six were gated
# on 'avg_amount' alone, so a file that had 'avg_amount' but lacked another
# derived column failed later with a KeyError when selecting feature_names.
# Builders are only called for missing columns, so the base columns are not
# touched when everything is already present.
derived_features = {
    'avg_amount': _avg_per_transaction,
    'log_total_amount': lambda df: np.log1p(df['total_amount'].clip(lower=0)),
    'log_transaction_count': lambda df: np.log1p(df['transaction_count'].clip(lower=0)),
    'avg_txn_size': _avg_per_transaction,
    'amount_category_code': lambda df: _bucket_codes(df['total_amount'], (100, 500, 2000)),
    'tx_count_category_code': lambda df: _bucket_codes(df['transaction_count'], (5, 20, 50)),
}
for column, build in derived_features.items():
    if column not in merged.columns:
        merged[column] = build(merged)

X = merged[feature_names]
y = merged['is_high_risk']
# Stratified 80/20 split with a fixed seed; only the test features are kept.
# NOTE(review): presumably these are the same split arguments used at training
# time -- confirm against the training script.
_, X_test_raw, _, _ = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# scaler.transform returns an array, so the column names are re-attached here.
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=feature_names)
print(f"Setup complete. X_test shape: {X_test.shape}")

In [None]:
# Create SHAP explainer
explainer = shap.TreeExplainer(rf_model)

# Use a sample of test data for faster computation
X_sample = X_test.sample(min(200, len(X_test)), random_state=42)

# Calculate SHAP values
shap_values = explainer.shap_values(X_sample)

# Handle binary classification output format differences.
# Depending on the SHAP version and model, shap_values is one of:
#   * a list with one (samples, features) array per class (older SHAP),
#   * a single (samples, features, classes) array (SHAP >= 0.45),
#   * a single (samples, features) array (single-output models).
# In every case class index 1 is used when it exists, and the matching
# 2-D slice is selected so the plots below always get (samples, features).
raw_values = shap_values if isinstance(shap_values, list) else np.asarray(shap_values)
if isinstance(raw_values, list):
    class_idx = 1 if len(raw_values) > 1 else 0
    shap_values_risk = raw_values[class_idx]
elif raw_values.ndim == 3:
    class_idx = 1 if raw_values.shape[-1] > 1 else 0
    shap_values_risk = raw_values[..., class_idx]
else:
    class_idx = 0
    shap_values_risk = raw_values

# expected_value may be a scalar or a per-class sequence; flatten it and take
# the entry for the selected class so base_value is always a plain float.
expected = np.ravel(explainer.expected_value)
base_value = float(expected[class_idx] if expected.size > class_idx else expected[0])

# 1. Summary Plot
# Plots the selected SHAP values against the sampled feature rows, then
# titles, saves and displays the figure.
plt.figure(figsize=(12, 8))
# show=False defers display so the title and savefig calls below run first.
shap.summary_plot(shap_values_risk, X_sample, show=False)
plt.title('SHAP Summary Plot: Feature Impact on Credit Risk', fontsize=16, fontweight='bold')
plt.tight_layout()
# Relative path: the PNG is written to the current working directory.
plt.savefig('shap_summary_plot.png', dpi=150, bbox_inches='tight')
plt.show()

# 2. Find High and Low Risk Examples
# Column 1 of predict_proba is used as the risk score; the rows with the
# largest and smallest score become the two example customers. Both indices
# are positions within X_sample.
probabilities = rf_model.predict_proba(X_sample)[:, 1]
high_risk_idx, low_risk_idx = probabilities.argmax(), probabilities.argmin()

# 3 & 4. Waterfall plots for the highest- and lowest-scoring sampled customers.
def _plot_waterfall(row_pos, label, out_path):
    """Draw, title, save and show the SHAP waterfall plot for one sampled customer.

    Args:
        row_pos: Positional index of the customer within X_sample,
            shap_values_risk and probabilities.
        label: Text placed before "Customer" in the plot title.
        out_path: File the figure is saved to.

    Returns:
        The shap.Explanation built for the row, or None if constructing it
        failed and only the raw-array fallback was attempted.
    """
    plt.figure(figsize=(12, 8))
    row_values = shap_values_risk[row_pos]
    explanation = None
    try:
        explanation = shap.Explanation(
            values=row_values,
            base_values=base_value,
            data=X_sample.iloc[row_pos],
            feature_names=X_sample.columns.tolist(),
        )
        shap.waterfall_plot(explanation, show=False)
    except Exception as exc:
        # Best-effort fallback kept from the original flow, but the reason is
        # now reported and KeyboardInterrupt/SystemExit are no longer caught.
        print(f"Explanation-based waterfall plot failed ({exc!r}); retrying with raw SHAP values.")
        shap.waterfall_plot(row_values, show=False)
    plt.title(f'SHAP Waterfall Plot: {label} Customer (Score: {probabilities[row_pos]:.1%})',
              fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.show()
    return explanation


# 3. Waterfall Plot - High Risk customer
exp_high = _plot_waterfall(high_risk_idx, 'High-Risk', 'shap_waterfall_highrisk.png')

# 4. Waterfall Plot - Low Risk customer
exp_low = _plot_waterfall(low_risk_idx, 'Low-Risk', 'shap_waterfall_lowrisk.png')