In [1]:
# A) Setup
!pip install --quiet scikit-learn==1.3.2 xgboost==1.7.6 lightgbm==4.0.0

from pathlib import Path
import numpy as np, pandas as pd, json, os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, f1_score, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

print("✅ Setup listo")


✅ Setup listo


In [2]:
# B) Carga de datasets de features (ya generados en p16/p18)
from google.colab import drive
drive.mount("/content/drive")

BASE = Path("/content/drive/MyDrive/CognitivaAI/p11_alt_backbones")
VAL_PATH = BASE/"val_patient_features_backbones.csv"
TEST_PATH = BASE/"test_patient_features_backbones.csv"

val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

print("VAL:", val_df.shape, "| TEST:", test_df.shape)

# Definir cohortes
val_df['cohort'] = val_df['patient_id'].str[:4]
test_df['cohort'] = test_df['patient_id'].str[:4]

y_val, y_test = val_df['y_true'].values, test_df['y_true'].values

# Selección de features (descartamos columnas no numéricas / ids)
X_val = val_df.drop(columns=['patient_id','y_true','cohort'])
X_test = test_df.drop(columns=['patient_id','y_true','cohort'])

print("Features totales:", X_val.shape[1])


Mounted at /content/drive
VAL: (69, 58) | TEST: (70, 58)
Features totales: 56


In [3]:
# C) Definición de varios meta-modelos
base_models = {
    "LR": LogisticRegression(max_iter=2000, class_weight="balanced"),
    "HGB": HistGradientBoostingClassifier(max_iter=500),
    "GB": GradientBoostingClassifier(n_estimators=300),
    "RF": RandomForestClassifier(n_estimators=400, max_depth=8),
    "LGBM": LGBMClassifier(n_estimators=300),
    "XGB": XGBClassifier(n_estimators=300, eval_metric="logloss", use_label_encoder=False)
}

print("Modelos preparados:", list(base_models.keys()))


Modelos preparados: ['LR', 'HGB', 'GB', 'RF', 'LGBM', 'XGB']




In [5]:
# D) Cross-validation con imputación selectiva (solo para modelos que NO aceptan NaN)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Modelos que toleran NaN directamente
ACCEPTS_NAN = {"HGB", "LGBM", "XGB"}  # HistGradientBoosting, LightGBM, XGBoost

meta_val, meta_test = {}, {}

def make_pipeline_if_needed(name, base_est):
    """Para LR/GB/RF/ET: imputer+flags (+scaler para LR) -> estimator"""
    needs_impute = name not in ACCEPTS_NAN
    if not needs_impute:
        return base_est, needs_impute
    steps = [
        ("imp", SimpleImputer(strategy="median", add_indicator=True)),
    ]
    if name == "LR":
        steps.append(("scaler", StandardScaler(with_mean=True, with_std=True)))
    steps.append(("clf", base_est))
    return Pipeline(steps), needs_impute

for name, model in base_models.items():
    print(f"\n🔹 Entrenando {name}")
    preds_val = np.zeros(len(y_val), dtype=float)
    preds_test = np.zeros(len(y_test), dtype=float)

    for train_idx, valid_idx in skf.split(X_val, y_val):
        X_tr, y_tr = X_val.iloc[train_idx], y_val[train_idx]
        X_va, y_va = X_val.iloc[valid_idx], y_val[valid_idx]

        est, imputed = make_pipeline_if_needed(name, model)

        # Fit en el fold de entrenamiento
        est.fit(X_tr, y_tr)

        # Calibración isotónica sobre el fold de validación
        cal = CalibratedClassifierCV(est, cv="prefit", method="isotonic")
        cal.fit(X_va, y_va)

        # Guardar OOF de validación
        preds_val[valid_idx] = cal.predict_proba(X_va)[:, 1]
        # Promediar predicción en test
        preds_test += cal.predict_proba(X_test)[:, 1] / skf.n_splits

    meta_val[name] = preds_val
    meta_test[name] = preds_test

print("OOF listo:", {k: v.shape for k, v in meta_val.items()})




🔹 Entrenando LR

🔹 Entrenando HGB

🔹 Entrenando GB

🔹 Entrenando RF

🔹 Entrenando LGBM
[LightGBM] [Info] Number of positive: 24, number of negative: 31
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 435
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.436364 -> initscore=-0.255933
[LightGBM] [Info] Start training from score -0.255933
[LightGBM] [Info] Number of positive: 25, number of negative: 30
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.454545 -> initscore=-0.182322
[LightGBM] [Info] Start training from score -0.182322
[LightGBM] [Info] Number of positive: 25, number of negative: 30
You can set `force_row_

In [6]:
# E) Construcción del meta dataset
meta_val_df = pd.DataFrame(meta_val)
meta_test_df = pd.DataFrame(meta_test)

# Meta-modelo logístico
meta_lr = LogisticRegression(max_iter=2000)
meta_lr.fit(meta_val_df, y_val)

oof_val = meta_lr.predict_proba(meta_val_df)[:,1]
oof_test = meta_lr.predict_proba(meta_test_df)[:,1]

print("Meta predicciones listas:", oof_val.shape, oof_test.shape)


Meta predicciones listas: (69,) (70,)


In [7]:
# F) Evaluación
def eval_preds(y_true, y_score, thr=0.5):
    return {
        "AUC": roc_auc_score(y_true, y_score),
        "PRAUC": average_precision_score(y_true, y_score),
        "Acc": accuracy_score(y_true, y_score>=thr),
        "P": precision_score(y_true, y_score>=thr),
        "R": recall_score(y_true, y_score>=thr),
        "F1": f1_score(y_true, y_score>=thr),
        "Brier": brier_score_loss(y_true, y_score),
        "n": len(y_true)
    }

print("[VAL]", eval_preds(y_val, oof_val, thr=0.5))
print("[TEST]", eval_preds(y_test, oof_test, thr=0.5))


[VAL] {'AUC': 0.9643463497453311, 'PRAUC': 0.9657703126868211, 'Acc': 0.9130434782608695, 'P': 0.9629629629629629, 'R': 0.8387096774193549, 'F1': 0.896551724137931, 'Brier': 0.07084253869021467, 'n': 69}
[TEST] {'AUC': 0.7294407894736843, 'PRAUC': 0.6877287173074216, 'Acc': 0.7142857142857143, 'P': 0.7727272727272727, 'R': 0.53125, 'F1': 0.6296296296296297, 'Brier': 0.22558116979629994, 'n': 70}
