In [1]:
# =======================
# P19: Meta-ensemble (OOF 5-fold) + Meta-XGB (fallback LR) + artefactos
# =======================
import sys, subprocess, json, numpy as np, pandas as pd
from pathlib import Path

# ---------- Intentar disponer de xgboost y lightgbm (opcionales) ----------
def _pip_install(pkg):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
    except Exception as e:
        print(f"[WARN] No pude instalar {pkg}: {e}")

try:
    import xgboost as xgb  # noqa
except Exception:
    _pip_install("xgboost")
    try: import xgboost as xgb  # noqa
    except: xgb = None

try:
    import lightgbm as lgb  # noqa
except Exception:
    _pip_install("lightgbm")
    try: import lightgbm as lgb  # noqa
    except: lgb = None

# ---------- Sklearn ----------
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Monta Drive si hace falta
try:
    from google.colab import drive
    drive.mount('/content/drive')
except Exception:
    pass

BASE = Path("/content/drive/MyDrive/CognitivaAI")
OUT  = BASE / "p19_meta_ensemble"
OUT.mkdir(parents=True, exist_ok=True)

# ---------- Helpers robustos ----------
def _clean_cols(df):
    def cleanse(s): return str(s).replace('\ufeff','').strip()
    df.columns = [cleanse(c) for c in df.columns]
    return df

def load_labels(path_csv):
    # Lee con utf-8-sig para eliminar BOM
    try:
        df = pd.read_csv(path_csv, encoding="utf-8-sig")
    except Exception:
        df = pd.read_csv(path_csv)
    df = _clean_cols(df)
    lower = {c.lower(): c for c in df.columns}
    # patient_id
    pid = next((lower[k] for k in ['patient_id','patientid','pid','patient'] if k in lower), None)
    assert pid is not None, f"No patient_id en {path_csv}. Cabeceras: {list(df.columns)}"
    # y_true
    ycol = next((lower[k] for k in ['y_true','y','target','label','truth','gt'] if k in lower), None)
    assert ycol is not None, f"No y_true/target/label en {path_csv}. Cabeceras: {list(df.columns)}"
    # cohort
    if 'cohort' not in df.columns:
        def infer(pidv):
            s = str(pidv).upper()
            return 'OAS1' if s.startswith('OAS1') else ('OAS2' if s.startswith('OAS2') else 'ALL')
        df['cohort'] = df[pid].map(infer)
    y = pd.to_numeric(df[ycol], errors='coerce')
    uniq = sorted(pd.Series(y).dropna().unique().tolist())
    if len(uniq)==2 and set(uniq) != {0,1}:
        y = (y == max(uniq)).astype(int)
    out = pd.DataFrame({
        'patient_id': df[pid].astype(str).str.replace('\ufeff','', regex=False).str.strip(),
        'y_true': y.astype(int),
        'cohort': df['cohort']
    }).drop_duplicates('patient_id')
    assert out['y_true'].isin([0,1]).all(), f"Valores de y_true no binarios en {path_csv}: {out['y_true'].unique()}"
    return out

def load_features(file_list):
    acc=None
    for f in file_list:
        f=Path(f)
        if not f.exists():
            continue
        df = pd.read_csv(f, encoding="utf-8-sig")
        df = _clean_cols(df)
        # Detect id
        idcol = 'patient_id' if 'patient_id' in df.columns else None
        if idcol is None:
            for c in df.columns:
                if c.lower() in ['patient_id','patientid','pid','patient']:
                    idcol=c; break
        if idcol is None:
            continue
        df = df.rename(columns={idcol:'patient_id'})
        # Limpia posibles colisiones (y_true/cohort)
        drop_like = {'y_true','y','target','label','truth','gt','cohort'}
        df = df.drop(columns=[c for c in df.columns if c.lower() in drop_like], errors='ignore')
        # Numéricas + id
        numcols = ['patient_id'] + [c for c in df.columns if c!='patient_id' and pd.api.types.is_numeric_dtype(df[c])]
        df = df[numcols].drop_duplicates('patient_id')
        acc = df if acc is None else acc.merge(df, on='patient_id', how='outer')
    if acc is None:
        raise FileNotFoundError("No encontré features. Revisa rutas.")
    acc['patient_id'] = acc['patient_id'].astype(str).str.replace('\ufeff','', regex=False).str.strip()
    return acc

def build_Xy(feat_df, lab_df):
    lab_df = lab_df.copy()
    lab_df['patient_id'] = lab_df['patient_id'].astype(str).str.replace('\ufeff','', regex=False).str.strip()
    feat_df = feat_df.copy()
    feat_df['patient_id'] = feat_df['patient_id'].astype(str).str.replace('\ufeff','', regex=False).str.strip()
    df = lab_df.merge(feat_df, on='patient_id', how='left', validate='one_to_one')
    y = df['y_true'].astype(int).values
    Xcols = [c for c in df.columns if c not in {'patient_id','y_true','cohort'} and pd.api.types.is_numeric_dtype(df[c])]
    X = df[Xcols].values
    meta = df[['patient_id','cohort','y_true']]
    return meta, X, y, Xcols

def metrics_block(y_true, y_prob):
    has_var = len(np.unique(y_true))>1
    return dict(
        AUC = float(roc_auc_score(y_true, y_prob)) if has_var else float('nan'),
        PRAUC = float(average_precision_score(y_true, y_prob)) if has_var else float('nan'),
        Brier = float(brier_score_loss(y_true, y_prob))
    )

# ---------- Rutas de entrada (labels p22, features p11 + extras si tienes) ----------
VAL_LABELS  = BASE/"p22_meta_ablation/p22_val_calibrations.csv"
TEST_LABELS = BASE/"p22_meta_ablation/p22_test_calibrations.csv"

FEATURES_VAL = [
    BASE/"p11_alt_backbones/val_patient_features_backbones.csv",
    # Añade aquí otros CSV de features VAL si los tienes
]
FEATURES_TEST = [
    BASE/"p11_alt_backbones/test_patient_features_backbones.csv",
    # Añade aquí otros CSV de features TEST
]

labels_val  = load_labels(VAL_LABELS)
labels_test = load_labels(TEST_LABELS)
feat_val  = load_features(FEATURES_VAL)
feat_test = load_features(FEATURES_TEST)
meta_val,  X_val,  y_val,  Xcols = build_Xy(feat_val, labels_val)
meta_test, X_test, y_test, _    = build_Xy(feat_test, labels_test)

# --- Drop columnas totalmente NaN en VAL+TEST y reporte de missingness ---
Xv_df = pd.DataFrame(X_val, columns=Xcols)
Xt_df = pd.DataFrame(X_test, columns=Xcols)
allnan = Xv_df.isna().all() & Xt_df.isna().all()
dropped_cols = Xv_df.columns[allnan].tolist()
if dropped_cols:
    print(f"[P19] Drop {len(dropped_cols)} columnas totalmente NaN:", dropped_cols[:8], "...")
    Xv_df = Xv_df.loc[:, ~allnan]
    Xt_df = Xt_df.loc[:, ~allnan]
Xcols = Xv_df.columns.tolist()
X_val = Xv_df.values
X_test = Xt_df.values
missing_report = Xv_df.isna().mean().sort_values(ascending=False)
missing_report.to_csv(OUT/"p19_missingness_val.csv")

print(f"[P19] VAL shape: {X_val.shape}, TEST shape: {X_test.shape}, #features={len(Xcols)}")

# ---------- Definición de base learners con imputación en pipeline ----------
def lr_pipeline():
    return Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('scale',  StandardScaler()),
        ('clf',    LogisticRegression(C=1.0, penalty='l2', solver='lbfgs',
                                      max_iter=5000, class_weight='balanced',
                                      random_state=RANDOM_STATE))
    ])

def tree_pipeline(estimator):
    # Imputación constante para que GB/RF/XGB/LGBM no fallen con NaN
    return Pipeline([
        ('impute', SimpleImputer(strategy='constant', fill_value=0.0)),
        ('clf', estimator)
    ])

base_learners = {
    'LR' : lr_pipeline(),
    'HGB': tree_pipeline(HistGradientBoostingClassifier(random_state=RANDOM_STATE)),
    'GB' : tree_pipeline(GradientBoostingClassifier(random_state=RANDOM_STATE)),
    'RF' : tree_pipeline(RandomForestClassifier(
            n_estimators=400, max_depth=None, n_jobs=-1,
            class_weight='balanced_subsample', random_state=RANDOM_STATE
    )),
}

if xgb is not None:
    from xgboost import XGBClassifier
    base_learners['XGB'] = tree_pipeline(XGBClassifier(
        n_estimators=500, max_depth=3, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
        random_state=RANDOM_STATE, n_jobs=-1, eval_metric='logloss'
    ))

if lgb is not None:
    from lightgbm import LGBMClassifier
    base_learners['LGBM'] = tree_pipeline(LGBMClassifier(
        n_estimators=800, num_leaves=31, learning_rate=0.03,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
        random_state=RANDOM_STATE
    ))

# ---------- OOF (5 folds) para cada base ----------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

oof_val  = {name: np.zeros(len(y_val), dtype=float) for name in base_learners}
pred_test = {name: np.zeros(len(y_test), dtype=float) for name in base_learners}

for name, model in base_learners.items():
    print(f"[P19] Base learner OOF → {name}")
    fold = 0
    for tr, va in skf.split(X_val, y_val):
        fold += 1
        m = model
        m.fit(X_val[tr], y_val[tr])
        # Predicciones VAL (fold) y TEST (promedio sobre folds)
        if hasattr(m, "predict_proba"):
            oof_val[name][va] = m.predict_proba(X_val[va])[:,1]
            pred_test[name]   += m.predict_proba(X_test)[:,1] / skf.n_splits
        else:
            # Fallbacks raros (no deberían activarse con nuestros pipelines)
            if hasattr(m, "decision_function"):
                from scipy.special import expit
                oof_val[name][va] = expit(m.decision_function(X_val[va]))
                pred_test[name]   += expit(m.decision_function(X_test)) / skf.n_splits
            else:
                oof_val[name][va] = m.predict(X_val[va]).astype(float)
                pred_test[name]   += m.predict(X_test).astype(float) / skf.n_splits

# ---------- Matrices meta (OOF para entrenar el meta) ----------
base_order = list(base_learners.keys())
X_val_meta  = np.column_stack([oof_val[k] for k in base_order])
X_test_meta = np.column_stack([pred_test[k] for k in base_order])

# ---------- Meta-modelo: XGB (si disponible) o LR (fallback) ----------
use_xgb = (xgb is not None)
if use_xgb:
    from xgboost import XGBClassifier
    meta = XGBClassifier(
        n_estimators=600, max_depth=3, learning_rate=0.05, subsample=0.8,
        colsample_bytree=0.8, reg_lambda=1.0, random_state=RANDOM_STATE,
        n_jobs=-1, eval_metric='logloss'
    )
else:
    meta = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs', max_iter=5000, random_state=RANDOM_STATE)

meta.fit(X_val_meta, y_val)
if hasattr(meta, "predict_proba"):
    p_val_meta  = meta.predict_proba(X_val_meta)[:,1]
    p_test_meta = meta.predict_proba(X_test_meta)[:,1]
else:
    p_val_meta  = meta.predict(X_val_meta).astype(float)
    p_test_meta = meta.predict(X_test_meta).astype(float)

# ---------- Guardar predicciones (incluye bases + meta) ----------
val_df = meta_val.copy()
test_df = meta_test.copy()
for i, k in enumerate(base_order):
    val_df[f"pred_{k.lower()}"]  = X_val_meta[:, i]
    test_df[f"pred_{k.lower()}"] = X_test_meta[:, i]
val_df["y_prob"]  = p_val_meta
test_df["y_prob"] = p_test_meta

val_df.to_csv(OUT/"p19_val_patient_preds.csv", index=False)
test_df.to_csv(OUT/"p19_test_patient_preds.csv", index=False)

# ---------- Resumen json (global y por cohorte, solo meta) ----------
def overview(df_split):
    rows=[]
    # Global
    rows.append(dict(Cohort="ALL", **metrics_block(df_split['y_true'].values, df_split['y_prob'].values)))
    # Por cohorte
    for coh in ["OAS1","OAS2"]:
        part = df_split[df_split["cohort"]==coh]
        if len(part)>1:
            rows.append(dict(Cohort=coh, **metrics_block(part['y_true'].values, part['y_prob'].values)))
    return rows

summary = {
    "base_learners": base_order,
    "meta_model": "XGBClassifier" if use_xgb else "LogisticRegression",
    "val_overview": overview(val_df),
    "test_overview": overview(test_df),
    "notes": "OOF stacking 5-folds; meta entrenado sobre OOF; predicciones TEST promediadas sobre folds."
}

with open(OUT/"p19_summary.json","w") as f:
    json.dump(summary, f, indent=2)

print("✅ Artefactos P19 guardados en:", OUT)
print("- p19_val_patient_preds.csv")
print("- p19_test_patient_preds.csv")
print("- p19_summary.json")
print("Base learners:", base_order, "| Meta:", summary["meta_model"])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[P19] VAL shape: (69, 56), TEST shape: (70, 56), #features=56
[P19] Base learner OOF → LR
[P19] Base learner OOF → HGB
[P19] Base learner OOF → GB
[P19] Base learner OOF → RF
[P19] Base learner OOF → XGB
[P19] Base learner OOF → LGBM
[LightGBM] [Info] Number of positive: 24, number of negative: 31
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 416
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.436364 -> initscore=-0.255933
[LightGBM] [Info] Start training from score -0.255933
[LightGBM] [Info] Number of positive: 25, number of negative: 30
[LightGBM] [Info] Auto-c



[LightGBM] [Info] Number of positive: 25, number of negative: 30
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 416
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.454545 -> initscore=-0.182322
[LightGBM] [Info] Start training from score -0.182322
[LightGBM] [Info] Number of positive: 25, number of negative: 31
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 410
[LightGBM] [Info] Number of data points in the train set: 56, number of used features: 35
[LightGBM] [Info] [binary:BoostFromSco

