In [5]:
import sys
import os

PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("Project root added to PYTHONPATH:", PROJECT_ROOT)

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold

from src.data.preprocess import load_data, preprocess_fold


Project root added to PYTHONPATH: /Users/evamartin/Desktop/MDS/curs3/AML/projects/AML-Project3


In [17]:
DATA_PATH = "../data/data.csv"
TARGET = "default"

X, y = load_data(DATA_PATH, TARGET)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("\nClass distribution:")
print(y.value_counts(normalize=True))


X shape: (30000, 23)
y shape: (30000,)

Class distribution:
default
0    0.7788
1    0.2212
Name: proportion, dtype: float64


In [18]:
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)


In [19]:
train_idx, test_idx = next(cv.split(X, y))

fold_data = preprocess_fold(
    X, y,
    train_idx, test_idx,
    winsorize=False
)

Xtr = fold_data["X_train"]
Xte = fold_data["X_test"]
ytr = fold_data["y_train"]
yte = fold_data["y_test"]

print("Train shape:", Xtr.shape)
print("Test shape:", Xte.shape)


Train shape: (24000, 23)
Test shape: (6000, 23)


In [20]:
print("Train mean (abs):", Xtr.mean().abs().mean())
print("Train std:", Xtr.std().mean())

print("Test mean (abs):", Xte.mean().abs().mean())
print("Test std:", Xte.std().mean())



Train mean (abs): 4.049578707792166e-17
Train std: 1.0000208339843975
Test mean (abs): 0.012427657996872699
Test std: 1.0717407042108311


In [21]:
fold_data_w = preprocess_fold(
    X, y,
    train_idx, test_idx,
    winsorize=True
)

Xtr_w = fold_data_w["X_train"]

print("With winsorization:")
print("Mean abs:", Xtr_w.mean().abs().mean())
print("Std:", Xtr_w.std().mean())


With winsorization:
Mean abs: 5.0510320563815817e-17
Std: 1.0000208339843975


In [23]:
from models.logistic import train_logistic
from evaluation.metrics import evaluate_model

model, y_prob = train_logistic(Xtr, ytr, Xte)

metrics = evaluate_model(yte, y_prob)
metrics


{'roc_auc': np.float64(0.7244675405388677),
 'pr_auc': np.float64(0.5016627284852238),
 'log_loss': 0.4657501909229874,
 'brier': np.float64(0.14510045548514897),
 'f1': 0.3628169014084507,
 'balanced_accuracy': np.float64(0.6078573712658855)}

In [24]:
y.value_counts()


default
0    23364
1     6636
Name: count, dtype: int64

In [26]:
from models.random_forest import train_random_forest
from evaluation.metrics import evaluate_model

rf_model, rf_prob = train_random_forest(Xtr, ytr, Xte)

rf_metrics = evaluate_model(yte, rf_prob)
rf_metrics


{'roc_auc': np.float64(0.7872542717960884),
 'pr_auc': np.float64(0.5603562617478409),
 'log_loss': 0.45852037443793414,
 'brier': np.float64(0.14547420623135726),
 'f1': 0.554773082942097,
 'balanced_accuracy': np.float64(0.7113991067007757)}

In [28]:
from models.boosting import train_boosting
from evaluation.metrics import evaluate_model

gb_model, gb_prob = train_boosting(Xtr, ytr, Xte)

gb_metrics = evaluate_model(yte, gb_prob)
gb_metrics


{'roc_auc': np.float64(0.7914176773446526),
 'pr_auc': np.float64(0.5484803806314421),
 'log_loss': 0.4248297126580792,
 'brier': np.float64(0.13332380025581247),
 'f1': 0.47642197374817696,
 'balanced_accuracy': np.float64(0.6589100408483248)}

In [45]:


xrfm_model, xrfm_prob = train_xrfm(Xtr, ytr, Xte)
xrfm_metrics = evaluate_model(yte, xrfm_prob)
xrfm_metrics





None
Fitting xRFM with 1 trees and 0 iterations per tree


Building trees:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting RFM with ntrain: 19200, d: 23, and nval: 4800
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 0: 16.75937509536743 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 1: 13.78135871887207 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 2: 12.78040599822998 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 3: 12.97499680519104 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 4: 12.524492979049683 seconds
Using cheap batch size
Optimal M batch size: 10000


Building trees:   0%|          | 0/1 [01:21<?, ?it/s]

Tree has no split, stopping training
Using hard routing for tree prediction





{'roc_auc': np.float64(0.7406024193090857),
 'pr_auc': np.float64(0.49155697512140567),
 'log_loss': 0.5322042371664031,
 'brier': np.float64(0.15055783885954827),
 'f1': 0.4444444444444444,
 'balanced_accuracy': np.float64(0.6430477904769764)}

In [54]:
_, val_prob = train_xrfm(X_tr2, y_tr2, X_val2)
print(val_prob.shape, val_prob.min(), val_prob.max())


TypeError: expected np.ndarray (got DataFrame)

In [56]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
from models.xx import train_xrfm


inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

tr_idx, val_idx = next(inner_cv.split(Xtr, ytr))

X_tr2 = Xtr.iloc[tr_idx]
y_tr2 = ytr.iloc[tr_idx]
X_val2 = Xtr.iloc[val_idx]

# esto es EXACTAMENTE lo que stacking hace
_, val_prob = train_xrfm(X_tr2, y_tr2, X_val2)

print(val_prob.shape, val_prob.min(), val_prob.max())




None
Fitting xRFM with 1 trees and 0 iterations per tree


Building trees:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting RFM with ntrain: 12800, d: 23, and nval: 3200
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 0: 4.989893913269043 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 1: 3.7895307540893555 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 2: 3.431349992752075 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 3: 3.824734926223755 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 4: 3.4753060340881348 seconds
Using cheap batch size
Optimal M batch size: 10000


Building trees:   0%|          | 0/1 [00:22<?, ?it/s]

Tree has no split, stopping training
Using hard routing for tree prediction
(8000,) 0.001 0.999





In [57]:
from models.logistic import train_logistic
from models.random_forest import train_random_forest
from models.boosting import train_boosting
from models.stacking import train_stacking
from evaluation.metrics import evaluate_model

base_models_fast = {
    "logistic": train_logistic,
    "rf": train_random_forest,
    "gb": train_boosting,
}


In [58]:
stack_model, stack_prob = train_stacking(
    Xtr, ytr, Xte,
    base_models=base_models_fast,
    n_splits=3
)

evaluate_model(yte, stack_prob)



Generating OOF predictions for logistic
  Inner fold 1/3
  Inner fold 2/3
  Inner fold 3/3

Generating OOF predictions for rf
  Inner fold 1/3
  Inner fold 2/3
  Inner fold 3/3

Generating OOF predictions for gb
  Inner fold 1/3
  Inner fold 2/3
  Inner fold 3/3


{'roc_auc': np.float64(0.7939362866706552),
 'pr_auc': np.float64(0.5662343820640482),
 'log_loss': 0.42294460715184307,
 'brier': np.float64(0.1323456771767259),
 'f1': 0.4744773942634905,
 'balanced_accuracy': np.float64(0.6579429877042416)}

In [59]:
base_models_full = {
    "logistic": train_logistic,
    "rf": train_random_forest,
    "gb": train_boosting,
    "xrfm": train_xrfm,
}

stack_model, stack_prob = train_stacking(
    Xtr, ytr, Xte,
    base_models=base_models_full,
    n_splits=3   # mantenlo bajo para no esperar horas
)

evaluate_model(yte, stack_prob)


Generating OOF predictions for logistic
  Inner fold 1/3
  Inner fold 2/3
  Inner fold 3/3

Generating OOF predictions for rf
  Inner fold 1/3
  Inner fold 2/3
  Inner fold 3/3

Generating OOF predictions for gb
  Inner fold 1/3
  Inner fold 2/3
  Inner fold 3/3





Generating OOF predictions for xrfm
  Inner fold 1/3
None
Fitting xRFM with 1 trees and 0 iterations per tree


Building trees:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting RFM with ntrain: 12800, d: 23, and nval: 3200
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 0: 4.781532049179077 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 1: 4.716062068939209 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 2: 3.7817418575286865 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 3: 3.5484349727630615 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 4: 3.9374899864196777 seconds
Using cheap batch size
Optimal M batch size: 10000


Building trees:   0%|          | 0/1 [00:24<?, ?it/s]


Tree has no split, stopping training
Using hard routing for tree prediction
None
Fitting xRFM with 1 trees and 0 iterations per tree


Building trees:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting RFM with ntrain: 12800, d: 23, and nval: 3200
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 0: 3.705414056777954 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 1: 3.390803098678589 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 2: 3.58070707321167 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 3: 3.4373340606689453 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 4: 3.985748052597046 seconds
Using cheap batch size
Optimal M batch size: 10000


Building trees:   0%|          | 0/1 [00:21<?, ?it/s]


Tree has no split, stopping training
Using hard routing for tree prediction
  Inner fold 2/3
None
Fitting xRFM with 1 trees and 0 iterations per tree


Building trees:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting RFM with ntrain: 12800, d: 23, and nval: 3200
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 0: 3.8490869998931885 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 1: 3.43896484375 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 2: 3.446969985961914 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 3: 3.3814690113067627 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 4: 3.447479009628296 seconds
Using cheap batch size
Optimal M batch size: 10000


Building trees:   0%|          | 0/1 [00:21<?, ?it/s]


Tree has no split, stopping training
Using hard routing for tree prediction
None
Fitting xRFM with 1 trees and 0 iterations per tree


Building trees:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting RFM with ntrain: 12800, d: 23, and nval: 3200
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 0: 3.786815881729126 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 1: 3.693286895751953 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 2: 3.5517830848693848 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 3: 3.3865201473236084 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 4: 3.411823034286499 seconds
Using cheap batch size
Optimal M batch size: 10000


Building trees:   0%|          | 0/1 [00:21<?, ?it/s]


Tree has no split, stopping training
Using hard routing for tree prediction
  Inner fold 3/3
None
Fitting xRFM with 1 trees and 0 iterations per tree


Building trees:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting RFM with ntrain: 12800, d: 23, and nval: 3200
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 0: 4.052418947219849 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 1: 3.629703998565674 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 2: 3.394035816192627 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 3: 3.7090930938720703 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 4: 3.7287979125976562 seconds
Using cheap batch size
Optimal M batch size: 10000


Building trees:   0%|          | 0/1 [00:21<?, ?it/s]


Tree has no split, stopping training
Using hard routing for tree prediction
None
Fitting xRFM with 1 trees and 0 iterations per tree


Building trees:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting RFM with ntrain: 12800, d: 23, and nval: 3200
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 0: 3.4065728187561035 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 1: 3.3845622539520264 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 2: 4.266696214675903 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 3: 3.398921251296997 seconds
Using cheap batch size
Optimal M batch size: 10000
Time taken for round 4: 3.313722848892212 seconds
Using cheap batch size
Optimal M batch size: 10000


Building trees:   0%|          | 0/1 [00:21<?, ?it/s]

Tree has no split, stopping training
Using hard routing for tree prediction





{'roc_auc': np.float64(0.7934972445432416),
 'pr_auc': np.float64(0.565097559700661),
 'log_loss': 0.423196126905582,
 'brier': np.float64(0.13243919656713043),
 'f1': 0.4692682926829268,
 'balanced_accuracy': np.float64(0.655307445535567)}