In [9]:
# Boxplot of per-fold deltas for each removed model
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Keep only the drop-one rows; the "none" baseline row carries no delta values.
df_deltas = df_all_ablation[df_all_ablation["removed"] != "none"]

# Read the fold count from the data so the titles stay correct if the number
# of CV folds changes (it used to be hard-coded as 5).
n_folds = df_deltas["fold"].nunique()

# (delta column, x-axis label, metric name used in the title, seaborn palette)
panels = [
    ("delta_roc_auc", "Delta ROC-AUC", "ROC-AUC", "Set2"),
    ("delta_pr_auc", "Delta PR-AUC", "PR-AUC", "Set3"),
]

for ax, (delta_col, x_label, metric_name, palette_name) in zip(axes, panels):
    sns.boxplot(data=df_deltas, y="removed", x=delta_col, ax=ax, palette=palette_name)
    # Reference line at zero: same score as the full stack.
    ax.axvline(0, color="red", linestyle="--", linewidth=1)
    ax.set_xlabel(x_label)
    ax.set_ylabel("")
    ax.set_title(f"{metric_name} drop when removing each model ({n_folds} folds)")

plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

In [None]:
# Per-model mean and standard deviation of each delta, taken across folds
is_ablated = df_all_ablation["removed"] != "none"
delta_stats = {
    "delta_roc_auc": ["mean", "std"],
    "delta_pr_auc": ["mean", "std"],
    "delta_log_loss": ["mean", "std"]
}

df_summary = (
    df_all_ablation[is_ablated]
    .groupby("removed")
    .agg(delta_stats)
    .round(4)
)

# The aggregation yields two-level (metric, statistic) column labels;
# flatten each pair into a single "<metric>_<statistic>" name.
df_summary.columns = ["_".join(level_pair) for level_pair in df_summary.columns]
df_summary = df_summary.reset_index()
df_summary

In [None]:
# Run the drop-one ablation on every CV fold and collect all rows in one table.
all_ablation = []
total_folds = len(cv_folds)  # progress is reported against the real fold count

for fold_idx, fold in enumerate(cv_folds, start=1):
    print(f"Processing fold {fold_idx}/{total_folds}...")

    # Fold-specific names: the loop must not overwrite the notebook-level
    # `train_idx` / `test_idx` that the single-fold cells read, otherwise
    # re-running those cells would silently switch to the last fold.
    fold_train_idx = np.array(fold["train_idx"])
    fold_test_idx = np.array(fold["test_idx"])

    results = ablation_study(X, y, fold_train_idx, fold_test_idx, base_models, n_splits=3)

    # Tag each row with its 1-based fold number, building new dicts instead of
    # mutating the ones returned by ablation_study.
    all_ablation.extend({**row, "fold": fold_idx} for row in results)

df_all_ablation = pd.DataFrame(all_ablation)
df_all_ablation.head(10)

## 3. Cross-fold stability

Check if ablation results are consistent across all 5 folds.

In [None]:
# One horizontal bar per base model; bar length is its meta-model coefficient.
coef_fig, coef_ax = plt.subplots(figsize=(8, 4))
coef_ax.barh(df_coef["model"], df_coef["coefficient"], color="mediumseagreen")
coef_ax.axvline(0, color="black", linestyle="-", linewidth=0.8)  # zero-weight reference
coef_ax.set_xlabel("Logistic Regression Coefficient")
coef_ax.set_title("Meta-model weights for each base model")
coef_ax.invert_yaxis()  # list the models top-down in df_coef row order
coef_fig.tight_layout()
plt.show()

In [None]:
# Fit the full stack on the fold given by the current `train_idx` / `test_idx`
# and inspect the weights of the returned meta-model.
fold_data = preprocess_fold(X, y, train_idx, test_idx, winsorize=False)
Xtr = fold_data["X_train"]
ytr = fold_data["y_train"]
Xte = fold_data["X_test"]
yte = fold_data["y_test"]

# Only the fitted meta-model is needed here; the second return value is discarded.
meta_model, _ = train_stacking(Xtr, ytr, Xte, base_models=base_models, n_splits=3)

coefficients = meta_model.coef_[0]
intercept = meta_model.intercept_[0]

# NOTE(review): pairing names with coefficients assumes the meta-model has one
# input per base model, in the iteration order of `base_models` — confirm
# against train_stacking.
df_coef = pd.DataFrame({
    "model": list(base_models),
    "coefficient": coefficients
})

print(f"Intercept: {intercept:.4f}\n")
df_coef

## 2. Meta-model coefficients

Check what weights the logistic regression meta-learner assigns to each base model.

In [None]:
# Bar charts of the single-fold deltas, one panel per metric
df_plot = df_ablation[df_ablation["removed"] != "none"].copy()

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (delta column, x-axis label, bar colour) for the left and right panels
delta_panels = [
    ("delta_roc_auc", "Delta ROC-AUC", "steelblue"),
    ("delta_pr_auc", "Delta PR-AUC", "coral"),
]

for panel_ax, (delta_col, x_label, bar_color) in zip(axes, delta_panels):
    panel_ax.barh(df_plot["removed"], df_plot[delta_col], color=bar_color)
    panel_ax.axvline(0, color="red", linestyle="--", linewidth=1)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title("Impact of removing each model")
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()

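Each delta is the metric without the model minus the metric of the full stack. A negative delta in ROC-AUC/PR-AUC means removing that model hurts performance (the model is important); for log loss the sign is reversed, because lower is better.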

In [None]:
# Base learners of the stack: label -> training function imported from models.*
base_models = {
    "logistic": train_logistic,
    "rf": train_random_forest,
    "gb": train_boosting,
    "xrfm": train_xrfm,
}

# Single-fold run (first fold only) to keep this exploratory step fast
fold_1 = cv_folds[0]
train_idx, test_idx = (np.array(fold_1[key]) for key in ("train_idx", "test_idx"))

ablation_results = ablation_study(
    X, y, train_idx, test_idx, base_models, n_splits=3
)
df_ablation = pd.DataFrame(ablation_results)
df_ablation

In [None]:
def ablation_study(X, y, train_idx, test_idx, base_models, n_splits=3):
    """
    Leave-one-model-out ablation of the stacking ensemble on a single fold.

    The fold is preprocessed once (with winsorize=False). A stack is then
    trained with all of ``base_models`` and, for each model in turn, with that
    model left out; every stack is scored on the fold's test split.

    Parameters:
        X, y: data passed through unchanged to ``preprocess_fold``.
        train_idx, test_idx: index collections defining the fold.
        base_models: mapping of model name -> value forwarded to
            ``train_stacking`` via its ``base_models`` argument.
        n_splits: forwarded to ``train_stacking``.

    Returns:
        A list of dicts. The first has ``removed == "none"`` and holds the
        full-stack ``roc_auc``, ``pr_auc`` and ``log_loss``. Each following
        dict corresponds to one removed model and additionally holds
        ``delta_*`` keys, computed as (score without the model) minus
        (full-stack score).
    """
    prepared = preprocess_fold(X, y, train_idx, test_idx, winsorize=False)
    X_train, y_train = prepared["X_train"], prepared["y_train"]
    X_test, y_test = prepared["X_test"], prepared["y_test"]

    def _score(models):
        # Train a stack on the given subset of models and score its test predictions.
        _, probs = train_stacking(X_train, y_train, X_test, base_models=models, n_splits=n_splits)
        return evaluate_model(y_test, probs)

    # Reference point: the stack built from every base model
    reference = _score(base_models)

    rows = [{
        "removed": "none",
        "roc_auc": reference["roc_auc"],
        "pr_auc": reference["pr_auc"],
        "log_loss": reference["log_loss"]
    }]

    # Leave each model out in turn and compare against the reference
    for dropped in base_models:
        remaining = {label: trainer for label, trainer in base_models.items() if label != dropped}
        scores = _score(remaining)

        rows.append({
            "removed": dropped,
            "roc_auc": scores["roc_auc"],
            "pr_auc": scores["pr_auc"],
            "log_loss": scores["log_loss"],
            "delta_roc_auc": scores["roc_auc"] - reference["roc_auc"],
            "delta_pr_auc": scores["pr_auc"] - reference["pr_auc"],
            "delta_log_loss": scores["log_loss"] - reference["log_loss"]
        })

    return rows

## 1. Ablation Study

Remove each model from stacking and measure performance drop.

In [None]:
# Input locations, relative to the notebook's working directory
DATA_PATH = "../data/data.csv"
TARGET = "default"
FOLDS_PATH = "../results/cv_folds.json"

X, y = load_data(DATA_PATH, TARGET)

# Decode the folds file explicitly as UTF-8 (the JSON interchange encoding) so
# the read does not depend on the platform's locale default.
with open(FOLDS_PATH, "r", encoding="utf-8") as f:
    cv_folds = json.load(f)

print(f"Loaded {len(cv_folds)} folds")
print(f"Data shape: {X.shape}")

## Load data & folds from main pipeline

In [None]:
import sys
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Put the parent of the working directory (presumably the repository root —
# confirm) at the front of sys.path. This must happen before the project
# imports below so those packages can be resolved.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Project-local modules: data loading/preprocessing, metrics, the four base
# model trainers, and the stacking routine.
from preprocessing.preprocess import load_data, preprocess_fold
from evaluation.metrics import evaluate_model
from models.logistic import train_logistic
from models.random_forest import train_random_forest
from models.boosting import train_boosting
from models.xrfm import train_xrfm
from models.stacking import train_stacking

# Notebook-wide plot defaults
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100

# Stacking Analysis

Analysis of individual model contributions to the stacking ensemble.