In [1]:
# ============================================================
# 📓 p21_meta_refine: Meta-ensemble refinado con calibración
# ============================================================
# A) Librerías y paths
!pip install lightgbm xgboost --quiet

from pathlib import Path
import os, json
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, \
                            precision_score, recall_score, f1_score, brier_score_loss
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import HistGradientBoostingClassifier

import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import drive
drive.mount('/content/drive')

# Rutas
BASE = Path("/content/drive/MyDrive/CognitivaAI")
P20 = BASE/"p20_meta_calibration"
P21 = BASE/"p21_meta_refine"
P21.mkdir(parents=True, exist_ok=True)

print("Rutas listas:", P20, P21)


Mounted at /content/drive
Rutas listas: /content/drive/MyDrive/CognitivaAI/p20_meta_calibration /content/drive/MyDrive/CognitivaAI/p21_meta_refine


In [2]:
# B) Cargar features de pacientes de p19/p20
VAL_PATH = BASE/"p11_alt_backbones/val_patient_features_backbones.csv"
TEST_PATH= BASE/"p11_alt_backbones/test_patient_features_backbones.csv"

val_df  = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

print("VAL:", val_df.shape, "| TEST:", test_df.shape)
print("Cols totales:", len(val_df.columns)-2)

# Separar cohortes
val_df["cohort"]  = val_df.patient_id.str[:4]
test_df["cohort"] = test_df.patient_id.str[:4]

val_df.head(3)


VAL: (69, 58) | TEST: (70, 58)
Cols totales: 56


Unnamed: 0,patient_id,y_true,SwinTiny_mean,SwinTiny_p2,SwinTiny_top7,SwinTiny_trimmed20,convnext_tiny.in12k_ft_in1k_slices_mean,convnext_tiny.in12k_ft_in1k_slices_p2,convnext_tiny.in12k_ft_in1k_slices_top7,convnext_tiny.in12k_ft_in1k_slices_trimmed20,...,slice_preds_seedENS_p2,slice_preds_seedENS_top7,slice_preds_seedENS_trimmed20,slice_preds_top7,slice_preds_trimmed20,slices_preds_mean,slices_preds_p2,slices_preds_top7,slices_preds_trimmed20,cohort
0,OAS1_0003,1,0.458933,0.461509,0.514777,0.456984,0.455741,0.455741,0.455753,0.45574,...,0.707107,1.0,0.5,0.5,0.499985,,,,,OAS1
1,OAS1_0010,0,0.423731,0.428161,0.495524,0.416583,0.455743,0.455743,0.455756,0.455742,...,0.547723,0.857143,0.25,0.5,0.5,,,,,OAS1
2,OAS1_0016,1,0.463737,0.467343,0.530998,0.461,0.455753,0.455753,0.455765,0.455753,...,0.547723,0.857143,0.25,0.5,0.5,,,,,OAS1


In [3]:
# C) Filtrado de NaNs
nan_ratio = val_df.isna().mean().sort_values(ascending=False)
print("Top-10 NaN ratio (VAL):\n", nan_ratio.head(10))

# Mantener columnas con NaN < 0.4
keep_cols = [c for c in val_df.columns if c not in ["patient_id","y_true","cohort"]
             and val_df[c].isna().mean() <= 0.4]

X_val, y_val = val_df[keep_cols], val_df["y_true"].values
X_test, y_test = test_df[keep_cols], test_df["y_true"].values

print(f"Mantengo {len(keep_cols)} columnas; descarto {len(val_df.columns)-len(keep_cols)-3}")


Top-10 NaN ratio (VAL):
 patient_preds_ensemble_trimmed20    0.855072
patient_preds_ensemble_top7         0.855072
patient_preds_ensemble_p2           0.855072
patient_preds_mean                  0.855072
patient_preds_ensemble_mean         0.855072
slices_preds_trimmed20              0.855072
slices_preds_mean                   0.855072
slices_preds_p2                     0.855072
patient_preds_trimmed20             0.855072
patient_preds_top7                  0.855072
dtype: float64
Mantengo 36 columnas; descarto 20


In [5]:
# D) Base Learners & OOF (con saneo de NaN/inf y pipeline para LR)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# 1) Saneo: ±inf -> NaN (la imputación cubrirá NaN donde aplique)
for _df in (X_val, X_test):
    _df.replace([np.inf, -np.inf], np.nan, inplace=True)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def make_model(name: str):
    """Crea el estimador. LR lleva imputación+escalado; el resto, tal cual."""
    if name == "LR_l2":
        return Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=True, with_std=True)),
            ("clf", LogisticRegression(max_iter=4000, solver="lbfgs"))
        ])
    if name == "HGB":
        # HGB maneja NaN nativamente
        return HistGradientBoostingClassifier(random_state=42)
    if name == "LGBM":
        # LightGBM tolera NaN; dejamos sin imputar
        return lgb.LGBMClassifier(
            random_state=42,
            n_estimators=300,
            learning_rate=0.05,
            num_leaves=31
        )
    if name == "XGB":
        # XGBoost tolera NaN; dejamos sin imputar
        return xgb.XGBClassifier(
            use_label_encoder=False, eval_metric="logloss", random_state=42,
            n_estimators=400, max_depth=3, subsample=0.9, colsample_bytree=0.9
        )
    raise ValueError(f"Modelo no soportado: {name}")

models = {
    "LR_l2": make_model("LR_l2"),
    "HGB"  : make_model("HGB"),
    "LGBM" : make_model("LGBM"),
    "XGB"  : make_model("XGB"),
}

def get_proba(model, X):
    """Compat: obtiene probas aunque sea Pipeline."""
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    # Para modelos que no implementen predict_proba (no debería ocurrir aquí)
    if hasattr(model, "decision_function"):
        from sklearn.metrics import auc
        s = model.decision_function(X)
        # sigmoid por si acaso
        return 1.0 / (1.0 + np.exp(-s))
    raise AttributeError("El estimador no expone predict_proba ni decision_function.")

oof_preds, test_preds = {}, {}

for name, base_model in models.items():
    print(f"OOF para: {name}")
    oof = np.zeros(len(X_val), dtype=float)

    # Re-crea un modelo fresco por fold para evitar leakage de stats internas
    for tr, va in skf.split(X_val, y_val):
        model = make_model(name)
        Xtr, ytr = X_val.iloc[tr], y_val[tr]
        Xva      = X_val.iloc[va]

        model.fit(Xtr, ytr)
        oof[va] = get_proba(model, Xva)

    # Ajuste final en todo VAL para predecir TEST
    final_model = make_model(name)
    final_model.fit(X_val, y_val)
    test_prob = get_proba(final_model, X_test)

    oof_preds[name]  = oof
    test_preds[name] = test_prob

meta_val  = pd.DataFrame(oof_preds)
meta_test = pd.DataFrame(test_preds)
print("Meta VAL:", meta_val.shape, "| Meta TEST:", meta_test.shape)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df.replace([np.inf, -np.inf], np.nan, inplace=True)


OOF para: LR_l2
OOF para: HGB
OOF para: LGBM
[LightGBM] [Info] Number of positive: 24, number of negative: 31
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 435
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.436364 -> initscore=-0.255933
[LightGBM] [Info] Start training from score -0.255933
[LightGBM] [Info] Number of positive: 25, number of negative: 30
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 55, number of used featur

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Meta VAL: (69, 4) | Meta TEST: (70, 4)


In [6]:
# E) Meta-learners (XGB + LR calibrado + isotónica)
meta_val["y_true"] = y_val
meta_test["y_true"] = y_test

# Meta-XGB
meta_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
meta_xgb.fit(meta_val.drop(columns="y_true"), y_val)
val_probs = meta_xgb.predict_proba(meta_val.drop(columns="y_true"))[:,1]
test_probs= meta_xgb.predict_proba(meta_test.drop(columns="y_true"))[:,1]

# Calibración isotónica sobre OOF
iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(val_probs, y_val)
val_probs_iso  = iso.transform(val_probs)
test_probs_iso = iso.transform(test_probs)

print("Calibración isotónica aplicada")


Calibración isotónica aplicada


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [7]:
# F) Métricas globales
def metrics(y, p, thr=0.5):
    return {
        "AUC": roc_auc_score(y, p),
        "PRAUC": average_precision_score(y, p),
        "Acc": accuracy_score(y, p>thr),
        "P": precision_score(y, p>thr),
        "R": recall_score(y, p>thr),
        "F1": f1_score(y, p>thr),
        "Brier": brier_score_loss(y, p),
        "thr": thr,
        "n": len(y)
    }

# Threshold por F1
from sklearn.metrics import f1_score
thr_candidates = np.linspace(0.1,0.9,17)
best_thr, best_f1 = 0.5, -1
for t in thr_candidates:
    f = f1_score(y_val, val_probs_iso>t)
    if f > best_f1: best_f1, best_thr = f, t

print("Umbral F1 óptimo (VAL):", best_thr)

print("[VAL]", metrics(y_val, val_probs_iso, best_thr))
print("[TEST]", metrics(y_test, test_probs_iso, best_thr))


Umbral F1 óptimo (VAL): 0.45000000000000007
[VAL] {'AUC': np.float64(0.9545840407470289), 'PRAUC': np.float64(0.9312011260733736), 'Acc': 0.8695652173913043, 'P': 0.8235294117647058, 'R': 0.9032258064516129, 'F1': 0.8615384615384616, 'Brier': np.float64(0.08236714975845424), 'thr': np.float64(0.45000000000000007), 'n': 69}
[TEST] {'AUC': np.float64(0.6525493421052632), 'PRAUC': np.float64(0.5870064057564057), 'Acc': 0.6428571428571429, 'P': 0.6, 'R': 0.65625, 'F1': 0.6268656716417911, 'Brier': np.float64(0.28518702883724223), 'thr': np.float64(0.45000000000000007), 'n': 70}


In [8]:
# G) Guardar predicciones y resumen
out_val  = pd.DataFrame({"patient_id": val_df.patient_id,
                         "y_true": y_val, "y_score": val_probs_iso})
out_test = pd.DataFrame({"patient_id": test_df.patient_id,
                         "y_true": y_test, "y_score": test_probs_iso})

out_val.to_csv(P21/"p21_val_meta_preds.csv", index=False)
out_test.to_csv(P21/"p21_test_meta_preds.csv", index=False)

summary = {
    "VAL": metrics(y_val, val_probs_iso, best_thr),
    "TEST": metrics(y_test, test_probs_iso, best_thr),
    "thr_opt": best_thr,
    "features": keep_cols
}
with open(P21/"p21_meta_summary.json","w") as f: json.dump(summary,f,indent=2)

print("💾 Guardados en:", P21)


💾 Guardados en: /content/drive/MyDrive/CognitivaAI/p21_meta_refine
