In [3]:
import sys
import os

# Resolve the parent of the current working directory and put it at the
# front of the import path so the project-local packages imported below
# resolve. The membership check keeps the cell idempotent on re-run.
PROJECT_ROOT = os.path.abspath(os.pardir)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("Project root added to PYTHONPATH:", PROJECT_ROOT)

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold

from preprocessing.preprocess import load_data, preprocess_fold


Project root added to PYTHONPATH: /Users/evamartin/Desktop/MDS/curs3/AML/projects/AML-Project3


In [4]:
# Dataset location (relative to the notebook) and name of the target column.
TARGET = "default"
DATA_PATH = "../data/data.csv"

# load_data is a project helper; it hands back the features and the target
# as two separate objects (presumably a DataFrame and a Series — confirm).
X, y = load_data(DATA_PATH, TARGET)

# Sanity check: dimensions first, then the relative class frequencies.
print("X shape:", X.shape)
print("y shape:", y.shape)
print("\nClass distribution:", y.value_counts(normalize=True), sep="\n")


X shape: (30000, 23)
y shape: (30000,)

Class distribution:
default
0    0.7788
1    0.2212
Name: proportion, dtype: float64


In [5]:
# Five stratified folds; shuffling with a fixed seed keeps the split
# assignment reproducible from one run to the next.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [6]:
# Only the first split produced by the CV iterator is used in this notebook.
train_idx, test_idx = next(cv.split(X, y))

# Project helper, called with winsorization switched off. The result is
# indexed by the four split names read out below.
fold_data = preprocess_fold(X, y, train_idx, test_idx, winsorize=False)

Xtr, Xte, ytr, yte = (
    fold_data[part] for part in ("X_train", "X_test", "y_train", "y_test")
)

print("Train shape:", Xtr.shape)
print("Test shape:", Xte.shape)


Train shape: (24000, 33)
Test shape: (6000, 33)


In [7]:
import sys
import os

# Same path bootstrap as the first cell of the notebook, repeated so this
# cell can be run on its own; the membership check makes it a no-op when
# the directory is already on sys.path.
PROJECT_ROOT = os.path.abspath(os.pardir)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
    
from copy import deepcopy
from evaluation.metrics import evaluate_model
from models.stacking import train_stacking

def stacking_ablation(Xtr, ytr, Xte, yte, base_models, n_splits=3,
                      metrics=("roc_auc", "pr_auc", "log_loss", "brier")):
    """
    Drop-one ablation for stacking.

    Trains the stack once with every base model, then once more per base
    model with that model left out, and reports how each metric moves.

    Parameters
    ----------
    Xtr, ytr : training features / labels, forwarded to ``train_stacking``.
    Xte : test features, forwarded to ``train_stacking``.
    yte : test labels, passed to ``evaluate_model`` with the predictions.
    base_models : dict
        Maps a model name to whatever ``train_stacking`` expects as a base
        model. The mapping is never mutated; reduced copies are shallow, so
        the model objects themselves are shared between runs, not copied.
    n_splits : int, default 3
        Forwarded unchanged to ``train_stacking``.
    metrics : sequence of str
        Keys of the ``evaluate_model`` result for which deltas are computed.

    Returns
    -------
    list of dict
        One row per run, all with the same keys: ``"removed"`` (``"none"``
        for the full stack, otherwise the dropped model's name), every entry
        returned by ``evaluate_model`` for that run, and ``"delta_<metric>"``
        = ablated value minus full-stack value (``0.0`` on the baseline row).
        The sign must be read per metric: a negative delta on a score and a
        positive delta on a loss both mean the removal hurt (assuming the
        usual higher-is-better / lower-is-better conventions).

    Raises
    ------
    KeyError
        If a requested metric is absent from the full-stack evaluation.
        Raised right after the baseline run, before any ablation run.
    """

    def _score(models):
        # Only the second element returned by train_stacking (the
        # predictions for Xte) is needed for scoring.
        _, prob = train_stacking(
            Xtr, ytr, Xte,
            base_models=models,
            n_splits=n_splits
        )
        return evaluate_model(yte, prob)

    # Full stacking is the reference every ablation is compared against.
    full = _score(base_models)

    # Fail fast: each further run is a complete stacking fit.
    missing = [key for key in metrics if key not in full]
    if missing:
        raise KeyError(f"metrics missing from evaluate_model output: {missing}")

    rows = [{
        "removed": "none",
        **full,
        **{f"delta_{key}": 0.0 for key in metrics},
    }]

    # Snapshot the names so iteration is unaffected by anything the
    # training code might do to the mapping.
    for name in list(base_models.keys()):
        # Filtered shallow copy: leaves the caller's dict untouched without
        # deep-copying (possibly large or non-copyable) model objects.
        reduced = {key: value for key, value in base_models.items() if key != name}
        m = _score(reduced)

        rows.append({
            "removed": name,
            **m,
            **{f"delta_{key}": m[key] - full[key] for key in metrics},
        })

    return rows


In [8]:
from models.logistic import train_logistic
from models.random_forest import train_random_forest
from models.boosting import train_boosting
from models.xrfm import train_xrfm

import pandas as pd

# Name -> training callable for every base learner in the stack; the
# callables are the project functions imported at the top of this cell.
base_models = dict(
    logistic=train_logistic,
    rf=train_random_forest,
    gb=train_boosting,
    xrfm=train_xrfm,
)

# One full stacking run plus one run per dropped base model.
rows = stacking_ablation(Xtr, ytr, Xte, yte, base_models=base_models, n_splits=3)

df_ablation = pd.DataFrame(rows)
df_ablation



[STACKING] Base model: logistic
  Inner fold 1/3
  Inner fold 2/3
  Inner fold 3/3

[STACKING] Base model: rf
  Inner fold 1/3
  Inner fold 2/3
  Inner fold 3/3

[STACKING] Base model: gb
  Inner fold 1/3
  Inner fold 2/3
  Inner fold 3/3


KeyboardInterrupt: 