In [6]:
# 06 · Global SHAP Analysis
## 1. Load Data & Model
## 2. Calculate SHAP Values
## 3. Summary Plot
## 4. Top-20 Feature Bar

from pathlib import Path
import joblib, shap, numpy as np, pandas as pd, matplotlib.pyplot as plt

# Project root is taken as the parent of the current working directory.
# NOTE(review): assumes the notebook is launched from a direct sub-directory
# of the project — confirm.
ROOT = Path.cwd().parent

# NOTE: joblib.load unpickles its input; only load artefacts from a trusted source.
X = joblib.load(ROOT / "data/processed/X_train.pkl")
# Uses ROOT (all caps): the misspelling `Root` is undefined and raises
# NameError on a fresh kernel.
model = joblib.load(ROOT / "models/final/model.pkl")
print(X.shape)

(7043, 38)


In [7]:
# Build a tree-model SHAP explainer for the loaded model and compute SHAP
# values for every row of X. The additivity check is explicitly disabled.
# NOTE(review): the previous inline note read "XGBx" — presumably the model is
# an XGBoost estimator; confirm.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(
    X,
    check_additivity=False,
)

In [9]:
# Beeswarm ("dot") summary plot of the SHAP values for X, saved to disk.
# summary_plot defaults to plot_size="auto", which resizes the current figure
# and discards the figsize requested below; plot_size=None leaves the figure
# size unchanged so the 9x6 request is honoured.
fig = plt.figure(figsize=(9, 6))
shap.summary_plot(shap_values, X, plot_type="dot", plot_size=None, show=False)
plt.tight_layout()

summary_path = ROOT / "figures/shap_summary.png"
# savefig does not create missing directories.
summary_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(summary_path, dpi=150)
plt.close(fig)


In [15]:
# Load the fitted feature pipeline and read the output feature names from its
# "pre" step so that SHAP columns can be labelled.
# NOTE: joblib.load unpickles its input; only load artefacts from a trusted source.
PIPE = joblib.load(ROOT / "models/feature_pipeline_v2.pkl")
feature_names = PIPE.named_steps["pre"].get_feature_names_out()
# Reuse the names fetched above instead of querying the pipeline a second time.
X_df = pd.DataFrame(X, columns=feature_names)

# Global importance: mean of |SHAP| along axis 0.
# NOTE(review): assumes shap_values is a 2-D (rows, features) array — the check
# below fails loudly if it is not (e.g. one array per class).
importances = np.abs(shap_values).mean(0)
if importances.ndim != 1 or importances.shape[0] != len(feature_names):
    raise ValueError(
        f"SHAP importances have shape {importances.shape}, which does not match "
        f"the {len(feature_names)} feature names from the pipeline"
    )

# Indices of the 20 largest importances, in descending order.
idx = np.argsort(importances)[::-1][:20]

top_feats = feature_names[idx]

fig = plt.figure(figsize=(6, 5))
# Reverse so the most important feature is drawn at the top of the chart.
plt.barh(top_feats[::-1], importances[idx][::-1])
plt.xlabel("Mean |SHAP|")
plt.title("Top-20 Feature Importance (SHAP)")
plt.tight_layout()

bar_path = ROOT / "figures/feature_importance_bar.png"
# savefig does not create missing directories.
bar_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(bar_path, dpi=150)
plt.close(fig)


In [16]:
# Pick three representative rows for per-client SHAP plots: the first row whose
# predicted probability (column 1 of predict_proba) is below 0.1, the first in
# [0.45, 0.55], and the first above 0.9.
def _first_index(mask, label):
    """Return the position of the first True in `mask`.

    Raises IndexError with a descriptive message when no element matches,
    instead of the bare "index 0 is out of bounds" from `np.where(...)[0][0]`.
    """
    hits = np.flatnonzero(mask)
    if hits.size == 0:
        raise IndexError(f"no sample found in the {label} probability range")
    return hits[0]


# Score the data once and reuse the result for all three selections.
proba = model.predict_proba(X)[:, 1]
idx_good = _first_index(proba < .1, "low (< 0.1)")
idx_mid = _first_index((proba >= 0.45) & (proba <= 0.55), "middle (0.45-0.55)")
idx_bad = _first_index(proba > .9, "high (> 0.9)")
samples = [idx_good, idx_mid, idx_bad]

In [17]:
# Draw and save one waterfall plot per row index in `samples`; output files
# are numbered from 1 in the order of `samples`.
# NOTE(review): the file names say "force" although a waterfall plot is drawn.
for n, i in enumerate(samples, start=1):
    explanation = shap.Explanation(
        values=shap_values[i],
        base_values=explainer.expected_value,
        data=X_df.iloc[i],
        feature_names=X_df.columns,
    )
    # show=False keeps the figure open so it can be written to disk below.
    shap.plots.waterfall(explanation, max_display=12, show=False)
    out_path = ROOT / f"figures/client_{n}_force.png"
    plt.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close()