In [1]:
# Celda 0 — Montar Drive y definir rutas
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
BASE = Path("/content/drive/MyDrive/CognitivaAI")
OUT  = BASE/"p26_intermodal"
OUT.mkdir(parents=True, exist_ok=True)

print("BASE:", BASE)
print("OUT :", OUT)


Mounted at /content/drive
BASE: /content/drive/MyDrive/CognitivaAI
OUT : /content/drive/MyDrive/CognitivaAI/p26_intermodal


In [2]:
# Celda 1 — Utilidades generales
import json, numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

def clean_cols(df):
    df.columns = [str(c).replace("\ufeff","").strip() for c in df.columns]
    return df

def infer_cohort(pid):
    s = str(pid).strip().upper()
    if s.startswith("OAS1"): return "OAS1"
    if s.startswith("OAS2"): return "OAS2"
    return "OAS1"

LEAK_PATTERNS = ["cdr","dement","dx","diagnos","group","converted","label","target","y_true","y"]

def is_leak_col(name):
    s = str(name).lower()
    return any(p in s for p in LEAK_PATTERNS) or s in {"patient_id","cohort"}

def metrics_from_scores(y, p):
    y = np.asarray(y).astype(int); p = np.asarray(p).astype(float)
    has_var = len(np.unique(y))>1
    return dict(
        AUC   = float(roc_auc_score(y,p)) if has_var else float("nan"),
        PRAUC = float(average_precision_score(y,p)) if has_var else float("nan"),
        Brier = float(brier_score_loss(y,p))
    )

def choose_thr_cost(y, p, C_FN=5.0, C_FP=1.0, n=1001):
    y = np.asarray(y).astype(int); p = np.asarray(p).astype(float)
    thr = np.linspace(0,1,n)
    best = None
    for t in thr:
        yhat = (p>=t).astype(int)
        FP = int(((yhat==1)&(y==0)).sum())
        FN = int(((yhat==0)&(y==1)).sum())
        cost = C_FN*FN + C_FP*FP
        if best is None or cost < best[0]:
            best = (cost, t, FP, FN)
    cost, t, FP, FN = best
    TP = int(((p>=t)&(y==1)).sum()); TN = int(((p< t)&(y==0)).sum())
    prec = TP/(TP+FP) if (TP+FP)>0 else float("nan")
    rec  = TP/(TP+FN) if (TP+FN)>0 else float("nan")
    acc  = (TP+TN)/(TP+TN+FP+FN)
    return dict(thr=float(t), cost=float(cost), TP=TP, FP=FP, TN=TN, FN=FN, Precision=float(prec), Recall=float(rec), Acc=float(acc))

def to_patient_id(id_val, cohort):
    s = str(id_val).strip().replace("\u200b","").replace("\ufeff","")
    if s.upper().startswith(("OAS1_","OAS2_")):
        return s.upper()
    # numérico -> zero-pad 4
    if s.isdigit():
        return f"{cohort}_{int(s):04d}"
    s2 = s.replace("-", "_").upper()
    if not s2.startswith(("OAS1_","OAS2_")):
        s2 = f"{cohort}_{s2}"
    return s2


In [3]:
# Celda 2 — Construir clínico consolidado desde Excels (OASIS-1 y OASIS-2)
import pandas as pd
from pathlib import Path

search_dirs = [
    BASE/"clinical"/"raw",
    BASE/"clinical",
    BASE,
    BASE.parent,  # ← carpeta padre del proyecto
]
xls_files = []
for d in search_dirs:
    if d.exists():
        xls_files += list(d.glob("*.xlsx")) + list(d.glob("*.xls"))

if not xls_files:
    raise FileNotFoundError(
        f"No encontré Excels clínicos en {search_dirs}. "
        "Sube los ficheros OASIS-1 (cross-sectional) y OASIS-2 (longitudinal)."
    )

def load_xls(p):
    try:
        df = pd.read_excel(p)
    except Exception:
        # fallback
        df = pd.read_excel(p, engine="openpyxl")
    df.columns = [str(c).strip() for c in df.columns]
    return df

df1_raw, df2_raw = None, None
for p in xls_files:
    df = load_xls(p)
    cols = set(c.lower() for c in df.columns)
    if {"subject id","mri id","group","visit"}.issubset(cols):
        df2_raw = df
    elif {"id","m/f","mmse","cdr"}.issubset(cols) or {"id","sex","mmse","cdr"}.issubset(cols):
        df1_raw = df

assert df1_raw is not None and df2_raw is not None, "No pude distinguir cuál es OASIS-1 y cuál OASIS-2."

# Renombrado estándar
df1 = df1_raw.rename(columns={
    "ID":"ID", "M/F":"Sex", "Educ":"Education", "Hand":"Hand", "Delay":"Delay"
})
df2 = df2_raw.rename(columns={
    "Subject ID":"ID", "M/F":"Sex", "EDUC":"Education", "MR Delay":"Delay"
})

df1["Cohort"] = "OASIS1"
df2["Cohort"] = "OASIS2"

# Target de referencia (NO se usará como feature)
if "Group" in df2.columns:
    df2["Target"] = df2["Group"].replace({"Nondemented":0, "Demented":1, "Converted":1})
else:
    df2["Target"] = np.nan

df1["Target"] = df1["CDR"].apply(lambda x: 0 if x==0 else 1)

# OASIS-2: conservar primera visita por paciente
if "Visit" in df2.columns:
    df2 = df2.sort_values(["ID","Visit"]).groupby("ID").first().reset_index()

cols_common = ["ID","Age","Sex","Education","SES","MMSE","CDR","eTIV","nWBV","ASF","Target","Delay"]
df1c = df1.reindex(columns=[c for c in cols_common if c in df1.columns]).copy()
df2c = df2.reindex(columns=[c for c in cols_common if c in df2.columns]).copy()
df1c["Cohort"]="OASIS1"; df2c["Cohort"]="OASIS2"
df_all = pd.concat([df1c, df2c], ignore_index=True)

# Imputación suave en Education/SES si existen
if "Education" in df_all.columns:
    df_all["Education"] = pd.to_numeric(df_all["Education"], errors="coerce")
    df_all["Education"].fillna(df_all["Education"].median(), inplace=True)
if "SES" in df_all.columns:
    df_all["SES"] = pd.to_numeric(df_all["SES"], errors="coerce")
    df_all["SES"].fillna(df_all["SES"].median(), inplace=True)

# patient_id compatible con P24
df_all["patient_id"] = [to_patient_id(i, c) for i,c in zip(df_all["ID"], df_all["Cohort"])]

# Anti-fuga: quitar columnas proxy de etiqueta
leak_cols = [c for c in ["Target","CDR","Group"] if c in df_all.columns]
clin_features = df_all.drop(columns=["ID","Cohort"] + leak_cols, errors="ignore").copy()
if "Sex" in clin_features.columns:
    clin_features["Sex"] = clin_features["Sex"].astype(str).str.strip()

# Dejar 1 fila por paciente
clin_features = clin_features.drop_duplicates(subset=["patient_id"]).reset_index(drop=True)

# Guardar consolidado
clin_path = OUT/"p26_clinical_consolidado.csv"
clin_features.to_csv(clin_path, index=False)
print("✅ Clínico consolidado:", clin_features.shape, "->", clin_path)
print("Columnas clínicas:", clin_features.columns.tolist())


✅ Clínico consolidado: (586, 10) -> /content/drive/MyDrive/CognitivaAI/p26_intermodal/p26_clinical_consolidado.csv
Columnas clínicas: ['Age', 'Sex', 'Education', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF', 'Delay', 'patient_id']


  df2["Target"] = df2["Group"].replace({"Nondemented":0, "Demented":1, "Converted":1})
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_all["Education"].fillna(df_all["Education"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_all["SES"].fillna(df_all["SES"].median(), inplace=True)


In [4]:
# Celda 3 — Cargar P24 (imagen) y el consolidado clínico
p24_val = clean_cols(pd.read_csv(BASE/"p24_meta_simple"/"p24_val_preds.csv"))
p24_tst = clean_cols(pd.read_csv(BASE/"p24_meta_simple"/"p24_test_preds.csv"))

# Cohort si falta
for df in (p24_val, p24_tst):
    if "cohort" not in df.columns:
        df["cohort"] = df["patient_id"].map(infer_cohort)

# Lista de 56 features de imagen (P24)
coef_df = pd.read_csv(BASE/"p24_meta_simple"/"p24_coefficients.csv")
IMG_FEATS = coef_df["feature"].tolist()
print(f"Features imagen (P24): {len(IMG_FEATS)} columnas")

# Clínico consolidado
clin = clean_cols(pd.read_csv(OUT/"p26_clinical_consolidado.csv"))
assert "patient_id" in clin.columns, "El consolidado clínico debe tener 'patient_id'."

# Merge con VAL/TEST de P24
val = p24_val.merge(clin, on="patient_id", how="left", suffixes=("","_clin"))
tst = p24_tst.merge(clin, on="patient_id", how="left", suffixes=("","_clin"))

# Columnas clínicas finales (excluyendo patient_id)
CLIN_COLS = [c for c in clin.columns if c!="patient_id"]
print(f"VAL: {val.shape} | TEST: {tst.shape} | #clin_cols={len(CLIN_COLS)}")


Features imagen (P24): 56 columnas
VAL: (69, 13) | TEST: (70, 13) | #clin_cols=9


In [5]:
# Celda 4 — Separar matrices IMG / CLIN y vectores
def split_xy(df):
    X_img = df.reindex(columns=IMG_FEATS, fill_value=np.nan)
    y     = df["y_true"].astype(int).values
    coh   = df["cohort"].astype(str).values
    y_img = df["y_prob"].astype(float).values  # proba calibrada de P24
    return X_img, y, coh, y_img

X_img_val, y_val, coh_val, yimg_val = split_xy(val)
X_img_tst, y_tst, coh_tst, yimg_tst = split_xy(tst)

X_clin_val = val[CLIN_COLS].copy()
X_clin_tst = tst[CLIN_COLS].copy()

print(f"VAL: X_img={X_img_val.shape}, X_clin={X_clin_val.shape}")
print(f"TEST: X_img={X_img_tst.shape}, X_clin={X_clin_tst.shape}")


VAL: X_img=(69, 56), X_clin=(69, 9)
TEST: X_img=(70, 56), X_clin=(70, 9)


In [6]:
# Celda 5 — Modelo clínico LR-EN (Repeated Stratified KFold)
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

clin_num = [c for c in CLIN_COLS if X_clin_val[c].dtype!=object]
clin_cat = [c for c in CLIN_COLS if X_clin_val[c].dtype==object]

pre_clin = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median")),
                          ("scaler", StandardScaler())]), clin_num),
        ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                          ("onehot", OneHotEncoder(handle_unknown="ignore"))]), clin_cat),
    ],
    remainder="drop",
    sparse_threshold=0.3
)

clf_clin = Pipeline(steps=[
    ("pre", pre_clin),
    ("lr", LogisticRegression(penalty="elasticnet", solver="saga",
                              l1_ratio=0.5, C=0.5, max_iter=5000))
])

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)
oof = np.zeros(len(y_val), dtype=float)
for tr, va in rskf.split(X_clin_val, y_val):
    m = clf_clin
    m.fit(X_clin_val.iloc[tr], y_val[tr])
    oof[va] = m.predict_proba(X_clin_val.iloc[va])[:,1]

clf_clin.fit(X_clin_val, y_val)
p_clin_tst = clf_clin.predict_proba(X_clin_tst)[:,1]

m_clin_val = metrics_from_scores(y_val, oof)
m_clin_tst = metrics_from_scores(y_tst, p_clin_tst)
print("CLÍNICO OOF VAL:", m_clin_val)
print("CLÍNICO TEST  :", m_clin_tst)


CLÍNICO OOF VAL: {'AUC': 0.58276740237691, 'PRAUC': 0.5248918812433309, 'Brier': 0.25712928295173343}
CLÍNICO TEST  : {'AUC': 0.5863486842105263, 'PRAUC': 0.5599631358460779, 'Brier': 0.23997569158513588}


In [10]:
# Celda 6_alt_v3 — Reintento con artefacto p3 (sin fuga y sin "drop" de columnas all-NaN)
import pandas as pd, numpy as np, pickle, joblib, json, warnings
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

warnings.filterwarnings("ignore", message="InconsistentVersionWarning")

def find_first(cand_dirs, names):
    for d in cand_dirs:
        for n in names:
            p = d/n
            if p.exists():
                return p
    return None

cand_dirs = [
    BASE/"clinical"/"final_models",
    BASE/"artifacts"/"clinic"/"final_models",
    BASE/"clinical",
    BASE/"artifacts"/"clinic",
    BASE,
    BASE.parent/"artifacts"/"clinic"/"final_models",
    BASE.parent
]
model_priority = [
    "model_lr_isotonic.pkl","model_lr_balanced.pkl",
    "model_xgb_isotonic.pkl","model_xgb_balanced.pkl",
    "model_rf_isotonic.pkl","model_rf_balanced.pkl",
]
model_path = find_first(cand_dirs, model_priority)

# deployment_config.json (opcional)
dep_cfg = find_first([BASE/"clinical", BASE/"artifacts"/"clinic", BASE, BASE.parent], ["deployment_config.json"])
if dep_cfg and dep_cfg.exists():
    try:
        cfg = json.load(open(dep_cfg))
        preferred = cfg.get("selected_model") or cfg.get("model_name")
        if preferred:
            p2 = find_first(cand_dirs, [preferred])
            if p2 and p2.exists():
                model_path = p2
                print("ℹ️ deployment_config.json ->", preferred)
    except Exception as e:
        print("⚠️ deployment_config.json:", e)

print("📦 Modelo clínico candidato:", model_path if model_path else "no encontrado")

feat_path = find_first([BASE/"clinical", BASE/"artifacts"/"clinic", BASE, BASE.parent], ["feature_columns.joblib"])
feature_cols = None
if feat_path and Path(feat_path).exists():
    try:
        feature_cols = [str(c) for c in joblib.load(feat_path)]
        # Anti-fuga por si en joblib venían estas columnas:
        feature_cols = [c for c in feature_cols if str(c).lower() not in {"cdr","group","target","y","label"}]
        print("🧩 feature_columns.joblib (#cols tras anti-fuga):", len(feature_cols))
    except Exception as e:
        print("⚠️ feature_columns.joblib:", e)

def try_load_model(p):
    if p is None: return None
    for loader in (joblib.load, lambda x: pickle.load(open(x,"rb")),
                   lambda x: pickle.load(open(x,"rb"), fix_imports=True, encoding="latin1")):
        try:
            return loader(p)
        except Exception as e:
            print("loader falló:", type(e).__name__, "-", e)
    return None

model_clin = try_load_model(model_path)

# Construir diseño SIN perder columnas (relleno constante para all-NaN)
def build_design(df_clin, expected_cols):
    if expected_cols is None:
        expected_cols = list(df_clin.columns)
    # Intersección + añade las que falten como NaN
    cols_in = [c for c in expected_cols if c in df_clin.columns]
    cols_out = [c for c in expected_cols if c not in df_clin.columns]
    X = df_clin.reindex(columns=cols_in, fill_value=np.nan).copy()
    # Rellenar all-NaN con 0 (evita drop del imputer por "sin observaciones")
    for c in X.columns:
        col = X[c]
        if (col.isna().all()):
            X[c] = 0.0
        elif col.dtype==object:
            X[c] = col.astype(str)
    # Añade explícitamente las ausentes como 0
    for c in cols_out:
        X[c] = 0.0
    return X[expected_cols].copy()

# Construye matrices con anti-fuga para el modelo (usa sólo CLIN_COLS conocidos)
expected = feature_cols if feature_cols is not None else CLIN_COLS
X_val_p3_raw = build_design(X_clin_val, expected)
X_tst_p3_raw = build_design(X_clin_tst, expected)

def predict_proba_robust(m, X):
    try:
        return m.predict_proba(X)[:,1]
    except Exception as e1:
        # fallback ligero numérico
        X2 = X.copy()
        num_cols = [c for c in X2.columns if X2[c].dtype != object]
        if num_cols:
            imp = SimpleImputer(strategy="constant", fill_value=0.0)  # ← clave: constante, no median
            X2[num_cols] = imp.fit_transform(X2[num_cols])
            sca = StandardScaler(with_mean=False)
            X2[num_cols] = sca.fit_transform(X2[num_cols])
        try:
            return m.predict_proba(X2)[:,1]
        except Exception as e2:
            raise RuntimeError(f"predict_proba fallo: {e1} | {e2}")

CLIN_OUT = BASE/"clinical"; CLIN_OUT.mkdir(parents=True, exist_ok=True)
p3_csv = CLIN_OUT/"p3_clinical_probs.csv"

if model_clin is not None:
    try:
        p_val_p3 = predict_proba_robust(model_clin, X_val_p3_raw)
        p_tst_p3 = predict_proba_robust(model_clin, X_tst_p3_raw)
        p3_df = pd.concat([
            pd.DataFrame({"patient_id": val["patient_id"], "split":"VAL",  "y_prob_clin": p_val_p3}),
            pd.DataFrame({"patient_id": tst["patient_id"], "split":"TEST", "y_prob_clin": p_tst_p3}),
        ], ignore_index=True)
        p3_df.to_csv(p3_csv, index=False)
        print("💾 Guardado (artefacto):", p3_csv, "→", p3_df.shape)
        # Override en memoria
        mv = val.merge(p3_df[p3_df["split"]=="VAL"][["patient_id","y_prob_clin"]], on="patient_id", how="left")
        mt = tst.merge(p3_df[p3_df["split"]=="TEST"][["patient_id","y_prob_clin"]], on="patient_id", how="left")
        oof = mv["y_prob_clin"].to_numpy()
        p_clin_tst = mt["y_prob_clin"].to_numpy()
        print("🔁 Override aplicado desde artefacto p3.")
    except Exception as e:
        print("⚠️ Artefacto no usable para predicción segura:", e)
        # Fall-back definitivo: usa Celda 5
        p3_df = pd.concat([
            pd.DataFrame({"patient_id": val["patient_id"], "split":"VAL",  "y_prob_clin": oof}),
            pd.DataFrame({"patient_id": tst["patient_id"], "split":"TEST", "y_prob_clin": p_clin_tst}),
        ], ignore_index=True)
        p3_df.to_csv(p3_csv, index=False)
        print("💾 Guardado (fallback Celda 5):", p3_csv, "→", p3_df.shape)
else:
    print("⚠️ No se pudo cargar el modelo p3. Se mantiene el fallback de Celda 5 (ya guardado).")




📦 Modelo clínico candidato: /content/drive/MyDrive/CognitivaAI/clinical/final_models/model_lr_isotonic.pkl
🧩 feature_columns.joblib (#cols tras anti-fuga): 8
⚠️ Artefacto no usable para predicción segura: predict_proba fallo: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- CDR
 | The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- CDR

💾 Guardado (fallback Celda 5): /content/drive/MyDrive/CognitivaAI/clinical/p3_clinical_probs.csv → (139, 3)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [11]:
import pandas as pd, numpy as np
from pathlib import Path

p3_csv = BASE/"clinical"/"p3_clinical_probs.csv"
assert p3_csv.exists(), "No existe p3_clinical_probs.csv (fallback). Vuelve a ejecutar la Celda 6_alt_v2/v3."

p3 = pd.read_csv(p3_csv)
print("p3_clinical_probs.csv:", p3.shape, "filas · splits:", p3["split"].value_counts().to_dict())

# Comprobamos merge con VAL/TEST de P24
mv = val.merge(p3[p3["split"]=="VAL"][["patient_id","y_prob_clin"]], on="patient_id", how="left")
mt = tst.merge(p3[p3["split"]=="TEST"][["patient_id","y_prob_clin"]], on="patient_id", how="left")
print("VAL merge OK:", mv["y_prob_clin"].notna().mean(), "TEST merge OK:", mt["y_prob_clin"].notna().mean())

# Si todo OK, seguimos con las celdas 7→11


p3_clinical_probs.csv: (139, 3) filas · splits: {'TEST': 70, 'VAL': 69}
VAL merge OK: 1.0 TEST merge OK: 1.0


In [12]:
# Celda 7_gen_p1 — Generar p1_oas2_img_probs.csv desde artefactos p13/p14 (si existen)
import pandas as pd, numpy as np
from pathlib import Path

def _clean_cols(df):
    df.columns = [str(c).replace("\ufeff","").strip() for c in df.columns];
    return df

def _to_patient_id(id_val, cohort="OAS2"):
    s = str(id_val).strip().replace("\u200b","").replace("\ufeff","")
    if s.upper().startswith(("OAS1_","OAS2_")): return s.upper()
    if s.isdigit(): return f"{cohort}_{int(s):04d}"
    s2 = s.replace("-", "_").upper()
    if not s2.startswith(("OAS1_","OAS2_")): s2 = f"{cohort}_{s2}"
    return s2

def _pick_prob_col(df):
    # candidatos habituales
    cand = [c for c in df.columns if str(c).lower() in
            {"y_prob","prob","yprob","pred_prob","prob1","score","y_hat","yhat","p","proba"}]
    cand = [c for c in cand if pd.api.types.is_numeric_dtype(df[c])]
    nums = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    # filtra etiquetas conocidas
    bad = {"y_true","label","target","cdr","group","diagnosis","dx"}
    nums = [c for c in nums if str(c).lower() not in bad]
    # dentro de [0,1] mayormente
    def in01(series):
        s = series.dropna()
        return len(s)>0 and (s.between(0,1).mean()>=0.95)
    for c in cand:
        if in01(df[c]): return c
    for c in nums:
        if in01(df[c]): return c
    return None

# 1) Candidatos
cands_dirs = [
    BASE/"p14_oasis2_images",
    BASE/"p13_oasis2_images",
    BASE.parent/"p14_oasis2_images",
    BASE.parent/"p13_oasis2_images",
]
cand_files = []
for d in cands_dirs:
    if d.exists():
        cand_files += list(d.glob("*patient_preds*.csv")) + list(d.glob("*patient_features*.csv"))
if not cand_files:
    print("ℹ️ No encontré CSV de p13/p14. Continuo sin p1 (no pasa nada).")
    p1_val = np.full(len(val), np.nan); p1_tst = np.full(len(tst), np.nan)
else:
    frames = []
    for f in cand_files:
        try:
            df = _clean_cols(pd.read_csv(f))
            # Detecta id de paciente
            idcol = None
            for k in ["patient_id","patient","id","subject id","subject_id","ID"]:
                if k in map(str.lower, df.columns):
                    # obtener nombre real respetando mayúsculas
                    idcol = [c for c in df.columns if c.lower()==k][0]; break
            if idcol is None:
                continue
            df["patient_id"] = df[idcol].astype(str).map(lambda x: _to_patient_id(x, "OAS2"))
            # Detecta prob
            pcol = _pick_prob_col(df)
            if pcol is None:
                continue
            sub = df[["patient_id", pcol]].rename(columns={pcol:"y_prob_img"})
            # Solo OAS2 + prob válida
            sub = sub[sub["patient_id"].str.upper().str.startswith("OAS2_")]
            sub = sub.dropna(subset=["y_prob_img"])
            frames.append(sub)
            print(f"↳ {f.name}: {len(sub)} filas con y_prob_img")
        except Exception as e:
            print(f"⚠️ {f.name}: {e}")

    if not frames:
        print("ℹ️ No logré extraer probabilidades de los CSVs. Sigo sin p1.")
        p1_val = np.full(len(val), np.nan); p1_tst = np.full(len(tst), np.nan)
    else:
        p1 = pd.concat(frames, ignore_index=True)
        # Si hay duplicados por paciente, promediamos
        p1 = p1.groupby("patient_id", as_index=False)["y_prob_img"].mean()
        # Guardar
        CLIN = BASE/"clinical"; CLIN.mkdir(parents=True, exist_ok=True)
        out_csv = CLIN/"p1_oas2_img_probs.csv"
        p1.to_csv(out_csv, index=False)
        print("💾 Guardado:", out_csv, "→", p1.shape)

        # Alinear con VAL/TEST actuales
        mv = val[["patient_id"]].merge(p1, on="patient_id", how="left")
        mt = tst[["patient_id"]].merge(p1, on="patient_id", how="left")
        p1_val = mv["y_prob_img"].to_numpy()
        p1_tst = mt["y_prob_img"].to_numpy()

        # Diagnóstico de cobertura
        m_val = (~np.isnan(p1_val)).mean()
        m_tst = (~np.isnan(p1_tst)).mean()
        print(f"Coverage VAL OAS2 p1: {m_val:.2%} | TEST OAS2 p1: {m_tst:.2%} (NaN en OAS1 por diseño)")


↳ val_patient_preds_oas2_effb3_p14.csv: 22 filas con y_prob_img
↳ test_patient_preds_oas2_effb3_p14.csv: 23 filas con y_prob_img
↳ val_patient_features_oas2_effb3_p14.csv: 22 filas con y_prob_img
↳ test_patient_features_oas2_effb3_p14.csv: 23 filas con y_prob_img
↳ val_patient_preds_oas2_effb3.csv: 22 filas con y_prob_img
↳ test_patient_preds_oas2_effb3.csv: 23 filas con y_prob_img
↳ val_patient_features_oas2_effb3.csv: 2 filas con y_prob_img
↳ test_patient_features_oas2_effb3.csv: 3 filas con y_prob_img
💾 Guardado: /content/drive/MyDrive/CognitivaAI/clinical/p1_oas2_img_probs.csv → (49, 2)
Coverage VAL OAS2 p1: 31.88% | TEST OAS2 p1: 32.86% (NaN en OAS1 por diseño)


In [13]:
# Sustituye el contenido de tu Celda 7 (o añádelo tras la 7_gen_p1 que ya corriste)

import numpy as np
# Si no existe p1_val/p1_tst, definelos como NaN (por si vienes del fallback)
if 'p1_val' not in globals(): p1_val = np.full(len(val), np.nan)
if 'p1_tst' not in globals(): p1_tst = np.full(len(tst), np.nan)

# Máscaras de cohorte
mask_oas2_val = (val["cohort"].values == "OAS2")
mask_oas2_tst = (tst["cohort"].values == "OAS2")

# Flag de presencia
p1_has_val = ~np.isnan(p1_val)
p1_has_tst = ~np.isnan(p1_tst)

# Media OAS2 en VAL (¡solo VAL para evitar fuga!)
m_oas2_val = np.nanmean(p1_val[mask_oas2_val]) if np.any(mask_oas2_val) else 0.5

# Imputación coherente
p1_fill_val = p1_val.copy()
p1_fill_tst = p1_tst.copy()

# En OAS2: faltantes → media VAL OAS2
p1_fill_val[~p1_has_val & mask_oas2_val] = m_oas2_val
p1_fill_tst[~p1_has_tst & mask_oas2_tst] = m_oas2_val  # usa SIEMPRE la media de VAL

# En OAS1: todo p1 es ausente → neutral 0.5
p1_fill_val[~mask_oas2_val] = 0.5
p1_fill_tst[~mask_oas2_tst] = 0.5

print(f"p1 imputado: m_oas2_val={m_oas2_val:.3f} | has_val={(p1_has_val).mean():.2%} | has_tst={(p1_has_tst).mean():.2%}")


p1 imputado: m_oas2_val=0.575 | has_val=31.88% | has_tst=32.86%


In [17]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def stack_cols(*cols):
    cols = [c.reshape(-1,1) if c.ndim==1 else c for c in cols]
    return np.hstack(cols)

# Variante A: SIN p1
X_metaA_val = stack_cols(yimg_val, oof)
X_metaA_tst = stack_cols(yimg_tst, p_clin_tst)
metaA = LogisticRegression(penalty=None, solver="lbfgs", max_iter=1000).fit(X_metaA_val, y_val)
pA_val = metaA.predict_proba(X_metaA_val)[:,1]
pA_tst = metaA.predict_proba(X_metaA_tst)[:,1]
mA_val = metrics_from_scores(y_val, pA_val)
mA_tst = metrics_from_scores(y_tst, pA_tst)

# Variante B: CON p1 (p1_fill + flag p1_has)
X_metaB_val = stack_cols(yimg_val, oof, p1_fill_val, p1_has_val.astype(float))
X_metaB_tst = stack_cols(yimg_tst, p_clin_tst, p1_fill_tst, p1_has_tst.astype(float))
metaB = LogisticRegression(penalty=None, solver="lbfgs", max_iter=1000).fit(X_metaB_val, y_val)
pB_val = metaB.predict_proba(X_metaB_val)[:,1]
pB_tst = metaB.predict_proba(X_metaB_tst)[:,1]
mB_val = metrics_from_scores(y_val, pB_val)
mB_tst = metrics_from_scores(y_tst, pB_tst)

print("LATE A (sin p1) VAL:", mA_val, "\nLATE A TEST:", mA_tst)
print("LATE B (con p1) VAL:", mB_val, "\nLATE B TEST:", mB_tst)

# Elegimos por AUC(VAL)
use_B = (mB_val["AUC"] > (mA_val["AUC"] + 1e-6))
p_meta_val = pB_val if use_B else pA_val
p_meta_tst = pB_tst if use_B else pA_tst
m_meta_val = mB_val if use_B else mA_val
m_meta_tst = mB_tst if use_B else mA_tst
meta_feats = ["p_img","p_clin","p1_fill","p1_has"] if use_B else ["p_img","p_clin"]
print("➡️ LATE elegido:", "con p1" if use_B else "sin p1", "| feats:", meta_feats)

LATE A (sin p1) VAL: {'AUC': 0.9142614601018676, 'PRAUC': 0.9233963998617327, 'Brier': 0.11381905289515709} 
LATE A TEST: {'AUC': 0.6965460526315789, 'PRAUC': 0.693915321534084, 'Brier': 0.24068705782718486}
LATE B (con p1) VAL: {'AUC': 0.9159592529711376, 'PRAUC': 0.9209221898268721, 'Brier': 0.11120259520047246} 
LATE B TEST: {'AUC': 0.7129934210526315, 'PRAUC': 0.7121843775894818, 'Brier': 0.23385491237404987}
➡️ LATE elegido: con p1 | feats: ['p_img', 'p_clin', 'p1_fill', 'p1_has']


In [20]:
# Celda 9 — Mid fusion (reconstruyendo las 56 features de imagen desde p11/p14/p13)

import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

def _clean_cols(df):
    df.columns = [str(c).replace("\ufeff","").strip() for c in df.columns]
    return df

def _std_pid(s):
    s = str(s).strip().replace("\u200b","").replace("\ufeff","")
    u = s.upper()
    if u.startswith(("OAS1_","OAS2_")): return u
    if s.isdigit(): return f"OAS1_{int(s):04d}"  # por seguridad; se sobreescribe según origen
    return u

def _load_feature_file(path, cohort_hint=None):
    df = _clean_cols(pd.read_csv(path))
    # Detectar columna id
    idcol = None
    for k in ["patient_id","patient","id","subject id","subject_id","ID"]:
        for c in df.columns:
            if c.lower() == k:
                idcol = c; break
        if idcol: break
    if idcol is None:
        return None
    df["patient_id"] = df[idcol].astype(str).map(_std_pid)
    # Si cohort_hint == "OAS2" y los ids no llevan prefijo, añádelo
    if cohort_hint == "OAS2":
        df["patient_id"] = df["patient_id"].apply(lambda x: x if x.startswith("OAS2_") else x.replace("OAS1_","OAS2_") if x.startswith("OAS1_") else ("OAS2_"+x if not x.startswith("OAS2_") else x))
    # Filtrar a columnas de interés (intersección con IMG_FEATS)
    keep = ["patient_id"] + [c for c in df.columns if c in IMG_FEATS]
    df = df[keep].copy()
    # Asegurar numéricas
    for c in df.columns:
        if c!="patient_id":
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def _merge_features(base_ids, dfs):
    """Combina DataFrames de features evitando sufijos: toma el primer no-NaN por columna."""
    out = pd.DataFrame({"patient_id": base_ids})
    for df in dfs:
        if df is None or df.empty:
            continue
        out = out.merge(df, on="patient_id", how="left", suffixes=(None, None))
        # Si aparecieran duplicados, resolveremos abajo al reindexar
    # Garantiza todas IMG_FEATS presentes
    for c in IMG_FEATS:
        if c not in out.columns:
            out[c] = np.nan
    # Ordena columnas
    out = out[["patient_id"] + IMG_FEATS]
    return out

# Rutas candidatas por split
sources_val = []
sources_tst = []

# OAS1 (p11)
p11 = BASE/"p11_alt_backbones"
if p11.exists():
    f_val_oas1 = p11/"val_patient_features_backbones.csv"
    f_tst_oas1 = p11/"test_patient_features_backbones.csv"
    if f_val_oas1.exists(): sources_val.append(("OAS1", f_val_oas1))
    if f_tst_oas1.exists(): sources_tst.append(("OAS1", f_tst_oas1))

# OAS2 (p14/p13)
for d in [BASE/"p14_oasis2_images", BASE/"p13_oasis2_images", BASE.parent/"p14_oasis2_images", BASE.parent/"p13_oasis2_images"]:
    if d.exists():
        for name in d.glob("*val*patient_features*.csv"):
            sources_val.append(("OAS2", name))
        for name in d.glob("*test*patient_features*.csv"):
            sources_tst.append(("OAS2", name))

print("Fuentes VAL:", [str(p) for _,p in sources_val])
print("Fuentes TEST:", [str(p) for _,p in sources_tst])

# Cargar y filtrar a columnas IMG_FEATS
val_dfs = []
for coh, path in sources_val:
    try:
        val_dfs.append(_load_feature_file(path, cohort_hint=coh))
    except Exception as e:
        print(f"⚠️ {path.name}: {e}")

tst_dfs = []
for coh, path in sources_tst:
    try:
        tst_dfs.append(_load_feature_file(path, cohort_hint=coh))
    except Exception as e:
        print(f"⚠️ {path.name}: {e}")

# Construir matrices de features alineadas al orden de val/tst
feat_val = _merge_features(val["patient_id"].values, val_dfs)
feat_tst = _merge_features(tst["patient_id"].values, tst_dfs)

# Diagnóstico de cobertura por columnas
cov_val = feat_val[IMG_FEATS].notna().mean()
cov_tst = feat_tst[IMG_FEATS].notna().mean()
print("Cobertura VAL (media por col):", float(cov_val.mean()))
print("Cobertura TEST (media por col):", float(cov_tst.mean()))

# === Preparar Mid: concat imagen + clínico + p1 (prellenado y flag) ===
# Nota: p1_fill_val/p1_fill_tst y p1_has_val/p1_has_tst fueron creados en el parche de Celda 7

import pandas as pd

val_p1_df = pd.DataFrame({"p1_fill": p1_fill_val, "p1_has": p1_has_val.astype(int)}, index=val.index)
tst_p1_df = pd.DataFrame({"p1_fill": p1_fill_tst, "p1_has": p1_has_tst.astype(int)}, index=tst.index)

# Usar feat_val/feat_tst para las columnas IMG_FEATS (no val[img_cols])
X_mid_val = pd.concat([feat_val[IMG_FEATS].reset_index(drop=True), X_clin_val.reset_index(drop=True), val_p1_df.reset_index(drop=True)], axis=1)
X_mid_tst = pd.concat([feat_tst[IMG_FEATS].reset_index(drop=True), X_clin_tst.reset_index(drop=True), tst_p1_df.reset_index(drop=True)], axis=1)

all_cols = list(X_mid_val.columns)
# clínicas num/cat ya detectadas antes (CLIN_COLS); extendemos con imagen + p1
mid_num = [c for c in all_cols if (c in IMG_FEATS) or (c in [*CLIN_COLS]) and (X_clin_val[c].dtype!=object) or (c in ["p1_fill","p1_has"])]
mid_cat = [c for c in CLIN_COLS if X_clin_val[c].dtype==object and c in all_cols]

# El ColumnTransformer se encargará de imputar medianas; si alguna col está all-NaN, sklearn la “saltará” sin romper
pre_mid = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median")),
                          ("scaler", StandardScaler())]), mid_num),
        ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                          ("onehot", OneHotEncoder(handle_unknown="ignore"))]), mid_cat),
    ],
    remainder="drop",
    sparse_threshold=0.3
)

mid_lr = Pipeline([
    ("pre", pre_mid),
    ("lr", LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=0.5, C=0.5, max_iter=5000))
])

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)
oof_mid = np.zeros(len(y_val), dtype=float)
for tr, va in rskf.split(X_mid_val, y_val):
    m = mid_lr
    m.fit(X_mid_val.iloc[tr], y_val[tr])
    oof_mid[va] = m.predict_proba(X_mid_val.iloc[va])[:,1]

mid_lr.fit(X_mid_val, y_val)
p_mid_tst = mid_lr.predict_proba(X_mid_tst)[:,1]

m_mid_val = metrics_from_scores(y_val, oof_mid)
m_mid_tst = metrics_from_scores(y_tst, p_mid_tst)
print("MID FUSION VAL :", m_mid_val)
print("MID FUSION TEST:", m_mid_tst)



Fuentes VAL: ['/content/drive/MyDrive/CognitivaAI/p11_alt_backbones/val_patient_features_backbones.csv', '/content/drive/MyDrive/CognitivaAI/p14_oasis2_images/val_patient_features_oas2_effb3_p14.csv', '/content/drive/MyDrive/CognitivaAI/p13_oasis2_images/val_patient_features_oas2_effb3.csv']
Fuentes TEST: ['/content/drive/MyDrive/CognitivaAI/p11_alt_backbones/test_patient_features_backbones.csv', '/content/drive/MyDrive/CognitivaAI/p14_oasis2_images/test_patient_features_oas2_effb3_p14.csv', '/content/drive/MyDrive/CognitivaAI/p13_oasis2_images/test_patient_features_oas2_effb3.csv']
Cobertura VAL (media por col): 0.5144927536231884
Cobertura TEST (media por col): 0.6224489795918366
MID FUSION VAL : {'AUC': 0.7971137521222411, 'PRAUC': 0.7766370505604872, 'Brier': 0.18454527361873402}
MID FUSION TEST: {'AUC': 0.6973684210526316, 'PRAUC': 0.6573379171919493, 'Brier': 0.22967954621192932}


In [23]:
# Celda 10 — Selección + umbrales coste 5:1 (por cohorte) + artefactos P26
import numpy as np, pandas as pd, json
from pathlib import Path
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

OUT = BASE/"p26_intermodal"
OUT.mkdir(parents=True, exist_ok=True)

def metrics_from_scores(y, p):
    y = np.asarray(y).astype(int); p = np.asarray(p, float)
    return dict(
        AUC=float(roc_auc_score(y, p)),
        PRAUC=float(average_precision_score(y, p)),
        Brier=float(brier_score_loss(y, p)),
    )

def confusion_at_thr(y_true, y_prob, thr):
    y_pred = (y_prob >= thr).astype(int)
    TP = int(((y_true==1)&(y_pred==1)).sum())
    FP = int(((y_true==0)&(y_pred==1)).sum())
    TN = int(((y_true==0)&(y_pred==0)).sum())
    FN = int(((y_true==1)&(y_pred==0)).sum())
    prec = TP/(TP+FP) if (TP+FP)>0 else np.nan
    rec  = TP/(TP+FN) if (TP+FN)>0 else np.nan
    acc  = (TP+TN)/(TP+TN+FP+FN)
    return TP,FP,TN,FN,prec,rec,acc

def best_cost_thr(y_true, y_prob, C_FN=5.0, C_FP=1.0, grid=1001):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob, float)
    thrs = np.linspace(0,1,grid)
    best = None
    for t in thrs:
        TP,FP,TN,FN,_,_,_ = confusion_at_thr(y_true,y_prob,t)
        cost = C_FN*FN + C_FP*FP
        if (best is None) or (cost < best["Cost"] - 1e-9):
            best = dict(Thr=float(t), Cost=float(cost), TP=TP, FP=FP, TN=TN, FN=FN)
    return best

# === 1) Tomamos el mejor "Late o Mid" por AUC(VAL) ===
# Variables creadas en celdas previas:
#   Late elegido → p_meta_val, p_meta_tst, m_meta_val, m_meta_tst, meta_feats
#   Mid          → oof_mid,     p_mid_tst,  m_mid_val,  m_mid_tst
late_is_better = (m_meta_val["AUC"] >= m_mid_val["AUC"] - 1e-6)
winner = "LATE" if late_is_better else "MID"

y_val_arr = y_val.astype(int)
y_tst_arr = y_tst.astype(int)

p_val_w = p_meta_val if winner=="LATE" else oof_mid
p_tst_w = p_meta_tst if winner=="LATE" else p_mid_tst

m_val_w = metrics_from_scores(y_val_arr, p_val_w)
m_tst_w = metrics_from_scores(y_tst_arr, p_tst_w)

print(f"🏁 P26 seleccionado: {winner}  |  VAL AUC={m_val_w['AUC']:.3f}  TEST AUC={m_tst_w['AUC']:.3f}")

# === 2) Umbrales por cohorte (coste FN:FP = 5:1) aprendidos en VAL y aplicados a TEST ===
C_FN, C_FP = 5.0, 1.0
rows_thr = []
rows_test = []

for coh in ["OAS1","OAS2"]:
    mask_val = (val["cohort"].values==coh)
    mask_tst = (tst["cohort"].values==coh)
    best = best_cost_thr(y_val_arr[mask_val], p_val_w[mask_val], C_FN=C_FN, C_FP=C_FP, grid=1001)
    thr = best["Thr"]
    TP,FP,TN,FN,prec,rec,acc = confusion_at_thr(y_tst_arr[mask_tst], p_tst_w[mask_tst], thr)
    rows_thr.append(dict(Cohort=coh, Thr_VAL=thr, Cost_VAL=best["Cost"], C_FN=C_FN, C_FP=C_FP))
    rows_test.append(dict(
        Cohort=coh, Thr=thr, TP=TP, FP=FP, TN=TN, FN=FN,
        Precision=float(prec), Recall=float(rec), Acc=float(acc),
        Cost=float(C_FN*FN + C_FP*FP)
    ))

thr_df  = pd.DataFrame(rows_thr)
test_df = pd.DataFrame(rows_test)
thr_df.to_csv(OUT/"p26_thresholds_cost_5to1.csv", index=False)
test_df.to_csv(OUT/"p26_test_report_cost_5to1.csv", index=False)

print("💾 Guardado umbrales:", OUT/"p26_thresholds_cost_5to1.csv")
print("💾 Guardado test@umbrales:", OUT/"p26_test_report_cost_5to1.csv")
print(test_df)

# === 3) Guardar predicciones y resumen ===
val_preds = pd.DataFrame({
    "patient_id": val["patient_id"].values,
    "cohort": val["cohort"].values,
    "y_true": y_val_arr,
    "y_prob": p_val_w
})
tst_preds = pd.DataFrame({
    "patient_id": tst["patient_id"].values,
    "cohort": tst["cohort"].values,
    "y_true": y_tst_arr,
    "y_prob": p_tst_w
})
val_preds.to_csv(OUT/"p26_val_preds.csv", index=False)
tst_preds.to_csv(OUT/"p26_test_preds.csv", index=False)

summary = dict(
    winner=winner,
    meta_features=(meta_feats if winner=="LATE" else "MID(IMG56+CLIN+p1)"),
    cost_weights=dict(C_FN=C_FN, C_FP=C_FP),
    VAL=m_val_w, TEST=m_tst_w,
    late_VAL=m_meta_val, late_TEST=m_meta_tst,
    mid_VAL=m_mid_val,  mid_TEST=m_mid_tst
)
json.dump(summary, open(OUT/"p26_summary.json","w"), indent=2)
print("💾 Guardado summary:", OUT/"p26_summary.json")


🏁 P26 seleccionado: LATE  |  VAL AUC=0.916  TEST AUC=0.713
💾 Guardado umbrales: /content/drive/MyDrive/CognitivaAI/p26_intermodal/p26_thresholds_cost_5to1.csv
💾 Guardado test@umbrales: /content/drive/MyDrive/CognitivaAI/p26_intermodal/p26_test_report_cost_5to1.csv
  Cohort    Thr  TP  FP  TN  FN  Precision    Recall       Acc  Cost
0   OAS1  0.307  14   9  18   6   0.608696  0.700000  0.680851  39.0
1   OAS2  0.195   8   4   7   4   0.666667  0.666667  0.652174  24.0
💾 Guardado summary: /content/drive/MyDrive/CognitivaAI/p26_intermodal/p26_summary.json


In [24]:
# Celda 11_fix — Bloques README / Informe / Bitácora (robustos, sin f-strings con if en el formato)
import pandas as pd, numpy as np
from pathlib import Path
from datetime import date

OUT = BASE/"p26_intermodal"
OUT.mkdir(parents=True, exist_ok=True)

# Carga lo que acabamos de guardar en la Celda 10
thr_df  = pd.read_csv(OUT/"p26_thresholds_cost_5to1.csv")
test_df = pd.read_csv(OUT/"p26_test_report_cost_5to1.csv")

# Extrae por cohorte para poner cifras en el texto
def _row(coh):
    r = test_df.loc[test_df["Cohort"]==coh]
    return r.iloc[0].to_dict() if len(r) else {}

r1 = _row("OAS1")
r2 = _row("OAS2")

# Helpers para strings con 3 decimales o '—' si falta
f3 = lambda x: f"{float(x):.3f}" if pd.notna(x) else "—"

# Valores del resumen (ya estaban en 'summary' de Celda 10)
late_val_auc  = f3(summary['late_VAL']['AUC'])
late_val_pra  = f3(summary['late_VAL']['PRAUC'])
late_val_bri  = f3(summary['late_VAL']['Brier'])
late_tst_auc  = f3(summary['late_TEST']['AUC'])
late_tst_pra  = f3(summary['late_TEST']['PRAUC'])
late_tst_bri  = f3(summary['late_TEST']['Brier'])

mid_val_auc   = f3(summary['mid_VAL']['AUC'])
mid_val_pra   = f3(summary['mid_VAL']['PRAUC'])
mid_val_bri   = f3(summary['mid_VAL']['Brier'])
mid_tst_auc   = f3(summary['mid_TEST']['AUC'])
mid_tst_pra   = f3(summary['mid_TEST']['PRAUC'])
mid_tst_bri   = f3(summary['mid_TEST']['Brier'])

winner_str = summary['winner']
meta_feats = summary['meta_features'] if isinstance(summary['meta_features'], list) else [str(summary['meta_features'])]

# Texto por cohorte
oas1_line = (
    f"OAS1 @ thr={f3(r1.get('Thr'))} → "
    f"TP={int(r1.get('TP',np.nan))}, FP={int(r1.get('FP',np.nan))}, TN={int(r1.get('TN',np.nan))}, FN={int(r1.get('FN',np.nan))} "
    f"→ R={f3(r1.get('Recall'))}, P={f3(r1.get('Precision'))}, Acc={f3(r1.get('Acc'))}, Coste={f3(r1.get('Cost'))}"
)
oas2_line = (
    f"OAS2 @ thr={f3(r2.get('Thr'))} → "
    f"TP={int(r2.get('TP',np.nan))}, FP={int(r2.get('FP',np.nan))}, TN={int(r2.get('TN',np.nan))}, FN={int(r2.get('FN',np.nan))} "
    f"→ R={f3(r2.get('Recall'))}, P={f3(r2.get('Precision'))}, Acc={f3(r2.get('Acc'))}, Coste={f3(r2.get('Cost'))}"
)

# === README block ===
blk_readme = f"""
### P26 — Intermodal (imagen + clínico) con fusión Late/Mid

**Selección por VAL:** {winner_str}
- **Late (p_img, p_clin{", p1_fill, p1_has" if len(meta_feats)>2 else ""})**
  - VAL: AUC={late_val_auc} | PR-AUC={late_val_pra} | Brier={late_val_bri}
  - TEST: AUC={late_tst_auc} | PR-AUC={late_tst_pra} | Brier={late_tst_bri}
- **Mid (IMG56 + clínico + p1)**
  - VAL: AUC={mid_val_auc} | PR-AUC={mid_val_pra} | Brier={mid_val_bri}
  - TEST: AUC={mid_tst_auc} | PR-AUC={mid_tst_pra} | Brier={mid_tst_bri}

**Decisión por coste (FN:FP=5:1, umbral aprendido en VAL y aplicado en TEST):**
- {oas1_line}
- {oas2_line}

_Artefactos_: `p26_val_preds.csv`, `p26_test_preds.csv`, `p26_thresholds_cost_5to1.csv`, `p26_test_report_cost_5to1.csv`, `p26_summary.json`.
"""

# === InformeTecnico block ===
blk_informe = f"""
## P26 — Intermodal (imagen + clínico)

**Diseño:**
1) **Clínico consolidado** OASIS-1/2 con anti-fuga (sin CDR/Group), imputación ligera y OHE.
2) Señales de imagen: **probabilidad P24** + **matriz 56 features** (p11 OAS1 + p14/p13 OAS2).
3) Señal parcial p1-OAS2 (~32% cobertura) integrada con **imputación por cohorte** (media VAL OAS2) + **flag de presencia**.
4) Dos estrategias:
   - **Late:** meta-LR sobre {{p_img, p_clin}} (+ p1_fill, p1_has).
   - **Mid:** LR-EN sobre {{IMG56, clínico, p1}}.
5) Selección por **AUC(VAL)** y decisión por **coste 5:1** (umbral por cohorte aprendido en VAL).

**Resultados:**
- LATE (seleccionado): VAL AUC={late_val_auc} · TEST AUC={late_tst_auc} (Brier TEST={late_tst_bri}).
- MID: VAL AUC={mid_val_auc} · TEST AUC={mid_tst_auc} (Brier TEST={mid_tst_bri}).

**Decisión clínico-operativa (5:1):**
- {oas1_line}
- {oas2_line}

**Notas:**
- La cobertura parcial de p1 se maneja con imputación **solo en OAS2** y `p1_has`.
- Late supera Mid en este dataset; en despliegue, monitorizar ECE/MCE por cohorte y considerar recalibración si ECE>0.2.
"""

# === Bitácora block ===
blk_bitacora = f"""
### {date.today()} — P26 intermodal completado

- Estrategia seleccionada: **{winner_str}** (meta-features: {", ".join(meta_feats)}).
- Late VAL: AUC={late_val_auc} | TEST: AUC={late_tst_auc}.
- Mid  VAL: AUC={mid_val_auc}  | TEST: AUC={mid_tst_auc}.
- Umbrales 5:1 por cohorte (aprendidos en VAL → aplicados en TEST):
  - {oas1_line}
  - {oas2_line}
- Artefactos guardados en `p26_intermodal/`.
"""

# Guarda bloques a disco y muestra README block como preview
(Path(OUT/"p26_readme_block.md").write_text(blk_readme, encoding="utf-8"))
(Path(OUT/"p26_informe_block.md").write_text(blk_informe, encoding="utf-8"))
(Path(OUT/"p26_bitacora_block.md").write_text(blk_bitacora, encoding="utf-8"))

print("✅ Bloques guardados en:", OUT)
print(blk_readme)


✅ Bloques guardados en: /content/drive/MyDrive/CognitivaAI/p26_intermodal

### P26 — Intermodal (imagen + clínico) con fusión Late/Mid

**Selección por VAL:** LATE  
- **Late (p_img, p_clin, p1_fill, p1_has)**  
  - VAL: AUC=0.916 | PR-AUC=0.921 | Brier=0.111  
  - TEST: AUC=0.713 | PR-AUC=0.712 | Brier=0.234
- **Mid (IMG56 + clínico + p1)**  
  - VAL: AUC=0.797 | PR-AUC=0.777 | Brier=0.185  
  - TEST: AUC=0.697 | PR-AUC=0.657 | Brier=0.230

**Decisión por coste (FN:FP=5:1, umbral aprendido en VAL y aplicado en TEST):**  
- OAS1 @ thr=0.307 → TP=14, FP=9, TN=18, FN=6 → R=0.700, P=0.609, Acc=0.681, Coste=39.000  
- OAS2 @ thr=0.195 → TP=8, FP=4, TN=7, FN=4 → R=0.667, P=0.667, Acc=0.652, Coste=24.000

_Artefactos_: `p26_val_preds.csv`, `p26_test_preds.csv`, `p26_thresholds_cost_5to1.csv`, `p26_test_report_cost_5to1.csv`, `p26_summary.json`.



In [25]:
# Celda 12 — Insertar P26 en P25 (master table + executive table)
import pandas as pd, numpy as np, json
from pathlib import Path
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

BASE = Path("/content/drive/MyDrive/CognitivaAI")
P25 = BASE/"p25_informe_final"
P26 = BASE/"p26_intermodal"
assert (P25/"p25_master_table.csv").exists(), "No encuentro p25_master_table.csv"
assert (P26/"p26_val_preds.csv").exists() and (P26/"p26_test_preds.csv").exists(), "Faltan preds de P26"
assert (P26/"p26_test_report_cost_5to1.csv").exists(), "Falta test_report_cost_5to1 de P26"
assert (P26/"p26_summary.json").exists(), "Falta summary de P26"

mt = pd.read_csv(P25/"p25_master_table.csv")
val = pd.read_csv(P26/"p26_val_preds.csv")
tst = pd.read_csv(P26/"p26_test_preds.csv")
rep = pd.read_csv(P26/"p26_test_report_cost_5to1.csv")
summary = json.load(open(P26/"p26_summary.json"))

def _metrics(df):
    y, p = df["y_true"].astype(int).to_numpy(), df["y_prob"].astype(float).to_numpy()
    return dict(
        AUC=float(roc_auc_score(y,p)),
        PRAUC=float(average_precision_score(y,p)),
        Brier=float(brier_score_loss(y,p))
    )

rows = []
# ALL
rows.append(dict(Pipeline="P26", Split="VAL",  Cohort="ALL", **_metrics(val),  Acc=np.nan, Precision=np.nan, Recall=np.nan, Thr=np.nan, Cost=np.nan, Notas=summary["winner"]))
rows.append(dict(Pipeline="P26", Split="TEST", Cohort="ALL", **_metrics(tst),  Acc=np.nan, Precision=np.nan, Recall=np.nan, Thr=np.nan, Cost=np.nan, Notas=summary["winner"]))
# OAS1 / OAS2
for coh in ["OAS1","OAS2"]:
    rows.append(dict(Pipeline="P26", Split="VAL",  Cohort=coh, **_metrics(val[val["cohort"]==coh]),  Acc=np.nan, Precision=np.nan, Recall=np.nan, Thr=np.nan, Cost=np.nan, Notas=summary["winner"]))
    rows.append(dict(Pipeline="P26", Split="TEST", Cohort=coh, **_metrics(tst[tst["cohort"]==coh]), Acc=np.nan, Precision=np.nan, Recall=np.nan, Thr=np.nan, Cost=np.nan, Notas=summary["winner"]))

mt2 = pd.concat([mt, pd.DataFrame(rows)], ignore_index=True)
mt2.to_csv(P25/"p25_master_table.csv", index=False)
print("✅ Actualizado:", P25/"p25_master_table.csv")

# Executive table (añadimos líneas P26: métricas y coste)
def f3(x):
    try: return f"{float(x):.3f}"
    except: return "—"

lines = []

# Métricas P26
all_val = _metrics(val); all_tst = _metrics(tst)
lines.append({"Pipeline":"P26","Cohorte":"ALL","Método":summary["winner"],
              "AUC":f3(all_tst["AUC"]),"PR-AUC":f3(all_tst["PRAUC"]),"Brier":f3(all_tst["Brier"]),
              "Acc":"nan","Prec":"nan","Rec":"nan","Thr":"nan","Coste":"nan"})
for coh in ["OAS1","OAS2"]:
    m = _metrics(tst[tst["cohort"]==coh])
    lines.append({"Pipeline":"P26","Cohorte":coh,"Método":summary["winner"],
                  "AUC":f3(m["AUC"]),"PR-AUC":f3(m["PRAUC"]),"Brier":f3(m["Brier"]),
                  "Acc":"nan","Prec":"nan","Rec":"nan","Thr":"nan","Coste":"nan"})

# Filas de coste 5:1 P26 @ TEST
for _,r in rep.iterrows():
    lines.append({"Pipeline":"P26","Cohorte":r["Cohort"],"Método":"cost-5:1",
                  "AUC":"—","PR-AUC":"—","Brier":"—",
                  "Acc":f3(r["Acc"]),"Prec":f3(r["Precision"]),"Rec":f3(r["Recall"]),
                  "Thr":f3(r["Thr"]),"Coste":f3(r["Cost"])})

# Releer executive existente y añadir P26 al final
exec_md = P25/"p25_executive_table.md"
from io import StringIO
def mk_table(rows):
    cols=["Pipeline","Cohorte","Método","AUC","PR-AUC","Brier","Acc","Prec","Rec","Thr","Coste"]
    out="| " + " | ".join(["Pipeline","Cohorte","Método","AUC","PR-AUC","Brier","Acc","Prec","Rec","Thr","Coste"]) + " |\n"
    out+="|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|\n"
    for r in rows:
        out+="| {Pipeline} | {Cohorte} | {Método} | {AUC} | {PR-AUC} | {Brier} | {Acc} | {Prec} | {Rec} | {Thr} | {Coste} |\n".format(**r)
    return out

# Cargamos tabla anterior si existe
old = ""
if exec_md.exists():
    old = exec_md.read_text(encoding="utf-8")

tbl = mk_table(lines)
new_md = (old.rstrip() + "\n" if old else "") + tbl
exec_md.write_text(new_md, encoding="utf-8")
print("✅ Actualizado:", exec_md)
print(tbl)


✅ Actualizado: /content/drive/MyDrive/CognitivaAI/p25_informe_final/p25_master_table.csv
✅ Actualizado: /content/drive/MyDrive/CognitivaAI/p25_informe_final/p25_executive_table.md
| Pipeline | Cohorte | Método | AUC | PR-AUC | Brier | Acc | Prec | Rec | Thr | Coste |
|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| P26 | ALL | LATE | 0.713 | 0.712 | 0.234 | nan | nan | nan | nan | nan |
| P26 | OAS1 | LATE | 0.754 | 0.736 | 0.208 | nan | nan | nan | nan | nan |
| P26 | OAS2 | LATE | 0.652 | 0.728 | 0.288 | nan | nan | nan | nan | nan |
| P26 | OAS1 | cost-5:1 | — | — | — | 0.681 | 0.609 | 0.700 | 0.307 | 39.000 |
| P26 | OAS2 | cost-5:1 | — | — | — | 0.652 | 0.667 | 0.667 | 0.195 | 24.000 |



In [26]:
# Celda 13 — P26 con umbrales de P24 (OAS1=0.435, OAS2=0.332)
import pandas as pd, numpy as np
from pathlib import Path

P26 = Path("/content/drive/MyDrive/CognitivaAI/p26_intermodal")
tst = pd.read_csv(P26/"p26_test_preds.csv")

def confusion(y_true, y_prob, thr):
    y_pred = (y_prob>=thr).astype(int)
    TP = int(((y_true==1)&(y_pred==1)).sum())
    FP = int(((y_true==0)&(y_pred==1)).sum())
    TN = int(((y_true==0)&(y_pred==0)).sum())
    FN = int(((y_true==1)&(y_pred==0)).sum())
    prec = TP/(TP+FP) if (TP+FP)>0 else np.nan
    rec  = TP/(TP+FN) if (TP+FN)>0 else np.nan
    acc  = (TP+TN)/len(y_true)
    return TP,FP,TN,FN,prec,rec,acc, (5*FN + 1*FP)

rows=[]
for coh,thr in [("OAS1",0.435),("OAS2",0.332)]:
    df = tst[tst["cohort"]==coh]
    TP,FP,TN,FN,P,R,A,C = confusion(df["y_true"].values, df["y_prob"].values, thr)
    rows.append(dict(Cohort=coh, Thr=thr, TP=TP, FP=FP, TN=TN, FN=FN,
                     Precision=P, Recall=R, Acc=A, Cost=C))
alt = pd.DataFrame(rows)
print(alt)
out = P26/"p26_test_report_cost_5to1_ALTthr_fromP24.csv"
alt.to_csv(out, index=False)
print("💾 Guardado:", out)


  Cohort    Thr  TP  FP  TN  FN  Precision    Recall       Acc  Cost
0   OAS1  0.435  11   6  21   9   0.647059  0.550000  0.680851    51
1   OAS2  0.332   7   4   7   5   0.636364  0.583333  0.608696    29
💾 Guardado: /content/drive/MyDrive/CognitivaAI/p26_intermodal/p26_test_report_cost_5to1_ALTthr_fromP24.csv


In [27]:
# Celda 14 — ECE/MCE por cohorte para P26 (10 bins)
import pandas as pd, numpy as np
from pathlib import Path

def ece_mce(y_true, y_prob, bins=10):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob).astype(float)
    edges = np.linspace(0,1,bins+1)
    ece=0.0; mce=0.0; n=len(y_true)
    for i in range(bins):
        m = (y_prob>=edges[i]) & (y_prob<edges[i+1] if i<bins-1 else y_prob<=edges[i+1])
        if m.sum()==0: continue
        conf = y_prob[m].mean()
        acc  = y_true[m].mean()
        gap = abs(acc-conf)
        ece += (m.mean())*gap
        mce = max(mce, gap)
    return ece, mce

P26 = Path("/content/drive/MyDrive/CognitivaAI/p26_intermodal")
tst = pd.read_csv(P26/"p26_test_preds.csv")

rows=[]
for coh in ["ALL","OAS1","OAS2"]:
    df = tst if coh=="ALL" else tst[tst["cohort"]==coh]
    ece,mce = ece_mce(df["y_true"], df["y_prob"], bins=10)
    rows.append(dict(Cohort=coh, ECE10=ece, MCE10=mce))
cal = pd.DataFrame(rows)
cal.to_csv(P26/"p26_test_calibration_ece.csv", index=False)
print(cal)
print("💾 Guardado:", P26/"p26_test_calibration_ece.csv")


  Cohort     ECE10     MCE10
0    ALL  0.178378  0.406751
1   OAS1  0.150002  0.577521
2   OAS2  0.312514  0.765920
💾 Guardado: /content/drive/MyDrive/CognitivaAI/p26_intermodal/p26_test_calibration_ece.csv


In [28]:
# P26b — Calibración por cohorte (Platt) + re-umbrales 5:1
import numpy as np, pandas as pd, json
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

P26 = BASE/"p26_intermodal"
val = pd.read_csv(P26/"p26_val_preds.csv")
tst = pd.read_csv(P26/"p26_test_preds.csv")

def platt_fit(y, p):
    # calibrador estilo Platt (LR binaria sobre el score)
    m = LogisticRegression(solver="lbfgs")
    m.fit(p.reshape(-1,1), y.astype(int))
    return m

def platt_pred(m, p):
    return m.predict_proba(p.reshape(-1,1))[:,1]

def best_cost_thr(y_true, y_prob, C_FN=5.0, C_FP=1.0, grid=1001):
    thrs = np.linspace(0,1,grid); best=None
    for t in thrs:
        y_pred = (y_prob>=t).astype(int)
        TP=((y_true==1)&(y_pred==1)).sum()
        FP=((y_true==0)&(y_pred==1)).sum()
        FN=((y_true==1)&(y_pred==0)).sum()
        cost = C_FN*FN + C_FP*FP
        if (best is None) or (cost < best["Cost"]-1e-9):
            best=dict(Thr=float(t), Cost=float(cost))
    return best

out = []
cal_preds = []

for coh in ["OAS1","OAS2"]:
    v = val[val["cohort"]==coh]; t = tst[tst["cohort"]==coh]
    yv, pv = v["y_true"].to_numpy(), v["y_prob"].to_numpy()
    yt, pt = t["y_true"].to_numpy(), t["y_prob"].to_numpy()

    # Calibración Platt por cohorte
    pl = platt_fit(yv, pv)
    pv_cal = platt_pred(pl, pv)
    pt_cal = platt_pred(pl, pt)

    # Métricas post-calibración
    auc_val = roc_auc_score(yv, pv_cal); auc_tst = roc_auc_score(yt, pt_cal)
    pr_val = average_precision_score(yv, pv_cal); pr_tst = average_precision_score(yt, pt_cal)
    bri_val = brier_score_loss(yv, pv_cal); bri_tst = brier_score_loss(yt, pt_cal)

    # Re-umbrales (coste 5:1) aprendidos en VAL-cal y aplicados en TEST-cal
    best = best_cost_thr(yv, pv_cal, C_FN=5.0, C_FP=1.0)
    thr = best["Thr"]
    ypred = (pt_cal>=thr).astype(int)
    TP=int(((yt==1)&(ypred==1)).sum())
    FP=int(((yt==0)&(ypred==1)).sum())
    TN=int(((yt==0)&(ypred==0)).sum())
    FN=int(((yt==1)&(ypred==0)).sum())
    prec = TP/(TP+FP) if (TP+FP)>0 else np.nan
    rec  = TP/(TP+FN) if (TP+FN)>0 else np.nan
    acc  = (TP+TN)/len(yt)
    cost = 5*FN + 1*FP

    out.append(dict(
        Cohort=coh, Thr=thr,
        VAL_AUC=auc_val, TEST_AUC=auc_tst,
        VAL_PRAUC=pr_val, TEST_PRAUC=pr_tst,
        VAL_Brier=bri_val, TEST_Brier=bri_tst,
        TP=TP, FP=FP, TN=TN, FN=FN,
        Precision=prec, Recall=rec, Acc=acc, Cost=cost
    ))

    cal_preds.append(pd.DataFrame({
        "patient_id": t["patient_id"].values, "cohort": coh,
        "y_true": yt, "y_prob_cal": pt_cal
    }))

res = pd.DataFrame(out)
cal_preds = pd.concat(cal_preds, ignore_index=True)

res_path = P26/"p26b_percohort_platt_cost5to1.csv"
cal_path = P26/"p26b_test_preds_calibrated.csv"
res.to_csv(res_path, index=False)
cal_preds.to_csv(cal_path, index=False)
print("💾 Guardado:", res_path)
print(res)


💾 Guardado: /content/drive/MyDrive/CognitivaAI/p26_intermodal/p26b_percohort_platt_cost5to1.csv
  Cohort    Thr   VAL_AUC  TEST_AUC  VAL_PRAUC  TEST_PRAUC  VAL_Brier  \
0   OAS1  0.340  0.909259  0.753704   0.920794    0.735858   0.130844   
1   OAS2  0.374  0.942149  0.651515   0.944233    0.727862   0.164272   

   TEST_Brier  TP  FP  TN  FN  Precision    Recall       Acc  Cost  
0    0.199075  14   9  18   6   0.608696  0.700000  0.680851    39  
1    0.240842   8   4   7   4   0.666667  0.666667  0.652174    24  


In [29]:
from pathlib import Path
import pandas as pd, numpy as np

BASE = Path("/content/drive/MyDrive/CognitivaAI")
P25 = BASE/"p25_informe_final"
P26 = BASE/"p26_intermodal"

mt = pd.read_csv(P25/"p25_master_table.csv")

valb = pd.read_csv(P26/"p26_val_preds.csv")  # mismas VAL (pre-cal), ok para AUC
tstb = pd.read_csv(P26/"p26b_test_preds_calibrated.csv")  # probas calibradas por cohorte
tstb = tstb.rename(columns={"y_prob_cal":"y_prob"})

def _metrics(df):
    from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
    y, p = df["y_true"].astype(int).to_numpy(), df["y_prob"].astype(float).to_numpy()
    return dict(AUC=float(roc_auc_score(y,p)),
                PRAUC=float(average_precision_score(y,p)),
                Brier=float(brier_score_loss(y,p)))

rows=[]
# ALL
rows.append(dict(Pipeline="P26b", Split="VAL",  Cohort="ALL", **_metrics(valb), Acc=np.nan, Precision=np.nan, Recall=np.nan, Thr=np.nan, Cost=np.nan, Notas="LATE+Platt"))
rows.append(dict(Pipeline="P26b", Split="TEST", Cohort="ALL", **_metrics(tstb), Acc=np.nan, Precision=np.nan, Recall=np.nan, Thr=np.nan, Cost=np.nan, Notas="LATE+Platt"))
# Cohortes
for coh in ["OAS1","OAS2"]:
    rows.append(dict(Pipeline="P26b", Split="VAL",  Cohort=coh, **_metrics(valb[valb["cohort"]==coh]), Acc=np.nan, Precision=np.nan, Recall=np.nan, Thr=np.nan, Cost=np.nan, Notas="LATE+Platt"))
    rows.append(dict(Pipeline="P26b", Split="TEST", Cohort=coh, **_metrics(tstb[tstb["cohort"]==coh]), Acc=np.nan, Precision=np.nan, Recall=np.nan, Thr=np.nan, Cost=np.nan, Notas="LATE+Platt"))

mt2 = pd.concat([mt, pd.DataFrame(rows)], ignore_index=True)
mt2.to_csv(P25/"p25_master_table.csv", index=False)
print("✅ Master table con P26b:", P25/"p25_master_table.csv")


✅ Master table con P26b: /content/drive/MyDrive/CognitivaAI/p25_informe_final/p25_master_table.csv
