In [1]:
!fusermount -u /content/drive
!rm -rf /content/drive
from google.colab import drive
drive.mount('/content/drive')


import os
base = "/content/drive/MyDrive/CognitivaAI/p13_oasis2_images"
for f in ["oas2_train_colab_mapped.csv",
          "oas2_val_colab_mapped.csv",
          "oas2_test_colab_mapped.csv"]:
    print(f, os.path.exists(os.path.join(base,f)))


fusermount: failed to unmount /content/drive: No such file or directory
Mounted at /content/drive
oas2_train_colab_mapped.csv True
oas2_val_colab_mapped.csv True
oas2_test_colab_mapped.csv True


In [2]:
# A) Setup & paths
import os, json, math, random, shutil
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import timm
from PIL import Image

from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.isotonic import IsotonicRegression

# Reproducibilidad
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = True

# Rutas
DRIVE = Path("/content/drive/MyDrive/CognitivaAI")
DATASETS = Path("/content/datasets")
OUT = DRIVE / "p14_oasis2_calib"
OUT.mkdir(parents=True, exist_ok=True)

MAP_DIR = DRIVE / "p13_oasis2_images"    # reusamos los mapas de p13
TRAIN_MAP = MAP_DIR / "oas2_train_colab_mapped.csv"
VAL_MAP   = MAP_DIR / "oas2_val_colab_mapped.csv"
TEST_MAP  = MAP_DIR / "oas2_test_colab_mapped.csv"

# Comprobación de GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Helpers
def ensure_abs_path_col(df, col="png_path"):
    # si hay rutas tipo 'data\...' o relativas, intentalas re-mapear a /content/datasets
    df = df.copy()
    df[col] = df[col].astype(str)
    bad = df[col].str.startswith("data\\") | df[col].str.startswith("data/")
    if bad.any():
        # asumimos que realmente están en /content/datasets/OAS2_PROCESSED/...
        df.loc[bad, col] = df.loc[bad, col].str.replace(r"^data[\\/]", str(DATASETS)+"/", regex=True)
    return df

def check_files(df, col="png_path", max_rows=30):
    df = df.copy()
    df["exists"] = df[col].apply(lambda p: Path(p).exists())
    n_bad = (~df["exists"]).sum()
    print(f"Missing files: {n_bad}/{len(df)}")
    if n_bad and max_rows:
        print(df.loc[~df["exists"], [col]].head(max_rows))
    return n_bad == 0


Device: cuda


In [5]:
# B) Carga mapas OAS2 y sanea columnas/rutas (parche: reconstruye scan_id si falta)
import pandas as pd, numpy as np, re, os
from pathlib import Path

TRAIN_MAP = "/content/drive/MyDrive/CognitivaAI/p13_oasis2_images/oas2_train_colab_mapped.csv"
VAL_MAP   = "/content/drive/MyDrive/CognitivaAI/p13_oasis2_images/oas2_val_colab_mapped.csv"
TEST_MAP  = "/content/drive/MyDrive/CognitivaAI/p13_oasis2_images/oas2_test_colab_mapped.csv"

needed = ["scan_id", "patient_id", "png_path", "target"]

scan_pat = re.compile(r"(OAS2_\d{4}_MR\d)")

def infer_scan_id_from_path(p: str) -> str | None:
    # intenta por regex completo
    m = scan_pat.search(p.replace("\\", "/"))
    if m:
        return m.group(1)
    # fallback: nombre del fichero sin sufijo _sliceXX
    stem = Path(p).stem  # p.ej. OAS2_0001_MR1_slice03
    if "_slice" in stem:
        return stem.split("_slice", 1)[0]
    # fallback: usa el nombre de carpeta si coincide el patrón
    parts = Path(p).parts
    for tok in parts[::-1]:
        if scan_pat.fullmatch(tok):
            return tok
    return None

def infer_patient_id_from_scan_id(scan_id: str) -> str | None:
    # OAS2_0001_MR1 -> OAS2_0001
    m = re.match(r"(OAS2_\d{4})_MR\d", scan_id)
    return m.group(1) if m else None

def normalize_paths(df: pd.DataFrame) -> pd.DataFrame:
    # normaliza separadores y, si ya están copiadas a /content/datasets, mantenlas
    df = df.copy()
    df["png_path"] = df["png_path"].astype(str).str.replace("\\", "/")
    return df

def ensure_cols(df: pd.DataFrame, name: str) -> pd.DataFrame:
    df = df.copy()
    df = normalize_paths(df)

    # target -> int {0,1}
    if "target" not in df.columns:
        raise AssertionError(f"{name}: falta columna 'target'")
    df["target"] = df["target"].astype(int)

    # scan_id
    if "scan_id" not in df.columns:
        df["scan_id"] = df["png_path"].map(infer_scan_id_from_path)
    # patient_id
    if "patient_id" not in df.columns:
        df["patient_id"] = df["scan_id"].map(infer_patient_id_from_scan_id)

    # valida que no haya nulos críticos
    miss = [c for c in ["scan_id", "patient_id", "png_path"] if df[c].isna().any()]
    if miss:
        bad = df[df[miss].isna().any(axis=1)].head(5)
        raise AssertionError(f"{name}: valores NaN en {miss}. Ejemplos:\n{bad}")

    # reordena columnas y devuelve sólo las necesarias + extras si quieres
    keep = [c for c in needed if c in df.columns]
    # deja extras (por si existieran) al final
    extras = [c for c in df.columns if c not in keep]
    df = df[keep + extras]

    # chequeo ligero de rutas
    # (no hacemos os.path.exists aquí para no ralentizar; ya chequeaste antes)
    return df

def load_and_fix(path: str, name: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    print(f"{name} leido: shape={df.shape} | cols={df.columns.tolist()}")
    df = ensure_cols(df, name)
    # opcional: sobreescribir el CSV con el parche para que p14 en adelante use la versión completa
    df[needed].to_csv(path, index=False)
    print(f"✅ {name} parcheado y guardado con columnas {needed}")
    return df

train_df = load_and_fix(TRAIN_MAP, "TRAIN")
val_df   = load_and_fix(VAL_MAP,   "VAL")
test_df  = load_and_fix(TEST_MAP,  "TEST")

# Muestra 3 filas de cada uno
from IPython.display import display
print("\nTRAIN sample:"); display(train_df.head(3))
print("\nVAL sample:");   display(val_df.head(3))
print("\nTEST sample:");  display(test_df.head(3))




TRAIN leido: shape=(2100, 3) | cols=['png_path', 'target', 'patient_id']
✅ TRAIN parcheado y guardado con columnas ['scan_id', 'patient_id', 'png_path', 'target']
VAL leido: shape=(440, 3) | cols=['png_path', 'target', 'patient_id']
✅ VAL parcheado y guardado con columnas ['scan_id', 'patient_id', 'png_path', 'target']
TEST leido: shape=(460, 3) | cols=['png_path', 'target', 'patient_id']
✅ TEST parcheado y guardado con columnas ['scan_id', 'patient_id', 'png_path', 'target']

TRAIN sample:


Unnamed: 0,scan_id,patient_id,png_path,target
0,OAS2_0001_MR1,OAS2_0001,/content/datasets/OAS2_PROCESSED/OAS2_0001_MR1...,0
1,OAS2_0001_MR1,OAS2_0001,/content/datasets/OAS2_PROCESSED/OAS2_0001_MR1...,0
2,OAS2_0001_MR1,OAS2_0001,/content/datasets/OAS2_PROCESSED/OAS2_0001_MR1...,0



VAL sample:


Unnamed: 0,scan_id,patient_id,png_path,target
0,OAS2_0002_MR3,OAS2_0002,/content/datasets/OAS2_PROCESSED/OAS2_0002_MR3...,1
1,OAS2_0002_MR3,OAS2_0002,/content/datasets/OAS2_PROCESSED/OAS2_0002_MR3...,1
2,OAS2_0002_MR3,OAS2_0002,/content/datasets/OAS2_PROCESSED/OAS2_0002_MR3...,1



TEST sample:


Unnamed: 0,scan_id,patient_id,png_path,target
0,OAS2_0014_MR2,OAS2_0014,/content/datasets/OAS2_PROCESSED/OAS2_0014_MR2...,1
1,OAS2_0014_MR2,OAS2_0014,/content/datasets/OAS2_PROCESSED/OAS2_0014_MR2...,1
2,OAS2_0014_MR2,OAS2_0014,/content/datasets/OAS2_PROCESSED/OAS2_0014_MR2...,1


In [7]:
# C-0) Montaje e inicialización de rutas (idempotente)
from pathlib import Path
import os, pandas as pd, shutil, json, sys, numpy as np

# Montaje solo si no está montado
if not Path("/content/drive").exists() or not os.listdir("/content/drive"):
    from google.colab import drive
    drive.mount("/content/drive")

DRIVE_BASE = Path("/content/drive/MyDrive/CognitivaAI")
DATASETS_SSD = Path("/content/datasets")                             # SSD runtime
OAS2_SRC    = DRIVE_BASE / "oas2_data" / "OAS2_PROCESSED"           # en Drive
OAS2_DST    = DATASETS_SSD / "OAS2_PROCESSED"                       # en SSD

P13_DIR     = DRIVE_BASE / "p13_oasis2_images"
TRAIN_MAP   = P13_DIR / "oas2_train_colab_mapped.csv"
VAL_MAP     = P13_DIR / "oas2_val_colab_mapped.csv"
TEST_MAP    = P13_DIR / "oas2_test_colab_mapped.csv"

print("DRIVE_BASE:", DRIVE_BASE)
print("OAS2_SRC exists:", OAS2_SRC.exists())
print("P13_DIR exists:", P13_DIR.exists())

DATASETS_SSD.mkdir(parents=True, exist_ok=True)



DRIVE_BASE: /content/drive/MyDrive/CognitivaAI
OAS2_SRC exists: True
P13_DIR exists: True


In [8]:
# C-1) Re-sync incremental Drive -> SSD si faltan PNG
from glob import glob

def count_pngs(base: Path):
    return len(list(base.glob("*.png"))) + sum(len(list(p.glob("*.png"))) for p in base.rglob("*") if p.is_dir())

dst_count_before = count_pngs(OAS2_DST) if OAS2_DST.exists() else 0
src_count = count_pngs(OAS2_SRC) if OAS2_SRC.exists() else 0

print(f"PNG en Drive: {src_count} | PNG en SSD (antes): {dst_count_before}")

# Si no hay destino o está incompleto, copiamos incrementalmente.
if not OAS2_DST.exists() or dst_count_before < src_count:
    OAS2_DST.mkdir(parents=True, exist_ok=True)
    # rsync-like con gsutil/rsync no está disponible; usamos shell cp -u recursivo
    # Nota: cp -u no sobreescribe si el destino es más reciente; suficiente para Colab.
    !rsync -ah --ignore-existing "{OAS2_SRC}/" "{OAS2_DST}/"

dst_count_after = count_pngs(OAS2_DST)
print(f"PNG en SSD (después): {dst_count_after}")
assert dst_count_after > 0, "No se copiaron PNG a SSD. Revisa OAS2_SRC."


PNG en Drive: 7340 | PNG en SSD (antes): 0
PNG en SSD (después): 7340


In [9]:
# C-2) Carga mapas y corrige las rutas a SSD (idempotente)
cols_needed = ["scan_id", "patient_id", "png_path", "target"]

def load_map(path: Path, name: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # Algunos mapas venían sin 'scan_id' en p13 (lo ya arreglaste, pero lo dejamos por robustez)
    if "scan_id" not in df.columns and "png_path" in df.columns:
        df = df.copy()
        # reconstruye scan_id si la ruta incorpora el patrón OAS2_xxxx_MRy_sliceNN.png
        df["scan_id"] = df["png_path"].apply(lambda p: Path(str(p)).name.split("_slice")[0])
    # Normaliza columnas
    missing = [c for c in cols_needed if c not in df.columns]
    if missing:
        raise AssertionError(f"{name} sin columnas requeridas: falta {missing}. Tiene: {df.columns.tolist()}")
    # Normaliza target a int
    df["target"] = df["target"].astype(int)
    # Parche de rutas → reemplaza prefijo de Drive/Windows por SSD POSIX
    def fix_path(p):
        p = str(p)
        p = p.replace("\\", "/")
        # si ya apunta a /content/datasets, mantener
        if p.startswith("/content/datasets/"):
            return p
        # convierte cualquier .../OAS2_PROCESSED/xxx.png a SSD
        if "OAS2_PROCESSED/" in p:
            tail = p.split("OAS2_PROCESSED/")[-1]
            return str(OAS2_DST / tail)
        # fallback: si solo nombre de archivo
        if p.endswith(".png"):
            return str(OAS2_DST / Path(p).name)
        return p
    df["png_path"] = df["png_path"].map(fix_path)
    return df

train_df = load_map(TRAIN_MAP, "TRAIN")
val_df   = load_map(VAL_MAP,   "VAL")
test_df  = load_map(TEST_MAP,  "TEST")

for name, df in [("TRAIN", train_df), ("VAL", val_df), ("TEST", test_df)]:
    miss = df["png_path"].map(lambda p: Path(p).exists()).value_counts().to_dict()
    print(f"{name}: filas={len(df)} | existen={{True:{miss.get(True,0)}, False:{miss.get(False,0)}}}")


TRAIN: filas=2100 | existen={True:2100, False:0}
VAL: filas=440 | existen={True:440, False:0}
TEST: filas=460 | existen={True:460, False:0}


In [10]:
# C-3) Filtrado de filas huérfanas (sin fichero)
def filter_existing(df: pd.DataFrame) -> pd.DataFrame:
    m = df["png_path"].map(lambda p: Path(p).exists())
    return df[m].reset_index(drop=True)

train_df_f = filter_existing(train_df)
val_df_f   = filter_existing(val_df)
test_df_f  = filter_existing(test_df)

print("TRAIN kept:", len(train_df_f), "/", len(train_df))
print("VAL   kept:", len(val_df_f),   "/", len(val_df))
print("TEST  kept:", len(test_df_f),  "/", len(test_df))

# Guarda de nuevo los mapas corregidos (apunta a SSD)
train_df_f.to_csv(TRAIN_MAP, index=False)
val_df_f.to_csv(VAL_MAP,   index=False)
test_df_f.to_csv(TEST_MAP, index=False)
print("Mapas SSD guardados en p13_oasis2_images con rutas /content/datasets/...")


TRAIN kept: 2100 / 2100
VAL   kept: 440 / 440
TEST  kept: 460 / 460
Mapas SSD guardados en p13_oasis2_images con rutas /content/datasets/...


In [11]:
# C-4) Comprobaciones
for name, df in [("TRAIN", train_df_f), ("VAL", val_df_f), ("TEST", test_df_f)]:
    pu = df["patient_id"].nunique()
    cu = df["target"].value_counts().to_dict()
    print(f"{name}: pacientes={pu} | clases={cu}")

# Dataset/Loader mínimos para probar
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image

class OAS2PNG(Dataset):
    def __init__(self, df, tfm=None):
        self.df = df.reset_index(drop=True)
        self.tfm = tfm
    def __len__(self):
        return len(self.df)
    def __getitem__(self, i):
        r = self.df.iloc[i]
        p = r.png_path
        if not Path(p).exists():
            # seguridad extra si algo se coló
            raise FileNotFoundError(p)
        im = Image.open(p).convert("RGB")
        if self.tfm: im = self.tfm(im)
        y  = int(r.target)
        pid= r.patient_id
        return im, y, pid

# TF dummy para probar forma
import torchvision.transforms as T
tfm = T.Compose([T.Resize((224,224)), T.ToTensor()])

train_loader = DataLoader(OAS2PNG(train_df_f, tfm), batch_size=32, shuffle=True, num_workers=2, pin_memory=True, drop_last=False)
xb, yb, pb = next(iter(train_loader))
print("Batch check:", xb.shape, yb.shape, torch.unique(yb, return_counts=True))


TRAIN: pacientes=105 | clases={1: 1100, 0: 1000}
VAL: pacientes=22 | clases={1: 220, 0: 220}
TEST: pacientes=23 | clases={1: 240, 0: 220}
Batch check: torch.Size([32, 3, 224, 224]) torch.Size([32]) (tensor([0, 1]), tensor([11, 21]))


In [12]:
# D1) Imports + rutas + device
import os, json, time, math, gc
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import timm
from PIL import Image
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve

from torchvision import transforms as T

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# Rutas (usamos los mapas con rutas SSD que guardaste en C4)
BASE_DRIVE = "/content/drive/MyDrive/CognitivaAI"
P14_DIR    = Path(BASE_DRIVE) / "p14_oasis2_images"
P14_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_MAP = Path(BASE_DRIVE) / "p13_oasis2_images/oas2_train_colab_mapped.csv"
VAL_MAP   = Path(BASE_DRIVE) / "p13_oasis2_images/oas2_val_colab_mapped.csv"
TEST_MAP  = Path(BASE_DRIVE) / "p13_oasis2_images/oas2_test_colab_mapped.csv"

for p in [TRAIN_MAP, VAL_MAP, TEST_MAP]:
    assert p.exists(), f"Falta mapa: {p}"


Device: cuda


In [13]:
# D2) Dataset + transforms (224 o 300 según prefieras; mantenemos 224 para consistencia)
IMG_SIZE = 224

tfm_train = T.Compose([
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomVerticalFlip(p=0.1),
    T.RandomRotation(10),
    T.ToTensor(),
    T.Normalize(mean=[0.5,0.5,0.5], std=[0.5,0.5,0.5]),
])
tfm_eval  = T.Compose([
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize(mean=[0.5,0.5,0.5], std=[0.5,0.5,0.5]),
])

class OAS2PNG(Dataset):
    def __init__(self, df, tfm):
        self.df  = df.reset_index(drop=True)
        self.tfm = tfm

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        im = Image.open(r.png_path).convert("RGB")
        im = self.tfm(im)
        y  = int(r.target)
        pid = r.patient_id
        return im, torch.tensor(y, dtype=torch.float32), pid


In [14]:
# D3) Cargar mapas + sanity + class weights
def load_map(path):
    df = pd.read_csv(path)
    need = ["scan_id","patient_id","png_path","target"]
    assert all(c in df.columns for c in need), f"Columnas requeridas {need} no están en {path}"
    # objetivo int
    df["target"] = df["target"].astype(int)
    return df[need]

train_df = load_map(TRAIN_MAP)
val_df   = load_map(VAL_MAP)
test_df  = load_map(TEST_MAP)

print("TRAIN:", train_df["target"].value_counts().to_dict(), "| filas:", len(train_df))
print("VAL:  ", val_df["target"].value_counts().to_dict(),   "| filas:", len(val_df))
print("TEST: ", test_df["target"].value_counts().to_dict(),  "| filas:", len(test_df))

# Class weights
pos = (train_df["target"]==1).sum()
neg = (train_df["target"]==0).sum()
w_pos = (neg + pos) / (2.0 * pos)
w_neg = (neg + pos) / (2.0 * neg)
class_weights = torch.tensor([w_neg, w_pos], dtype=torch.float32, device=DEVICE)
print("class weights (neg,pos):", (float(w_neg), float(w_pos)))


TRAIN: {1: 1100, 0: 1000} | filas: 2100
VAL:   {1: 220, 0: 220} | filas: 440
TEST:  {1: 240, 0: 220} | filas: 460
class weights (neg,pos): (1.05, 0.9545454545454546)


In [15]:
# D4) DataLoaders
BATCH = 64
train_loader = DataLoader(OAS2PNG(train_df, tfm_train), batch_size=BATCH, shuffle=True,  num_workers=2, pin_memory=True, drop_last=False)
val_loader   = DataLoader(OAS2PNG(val_df,   tfm_eval),  batch_size=BATCH, shuffle=False, num_workers=2, pin_memory=True, drop_last=False)
test_loader  = DataLoader(OAS2PNG(test_df,  tfm_eval),  batch_size=BATCH, shuffle=False, num_workers=2, pin_memory=True, drop_last=False)

xb, yb, pb = next(iter(train_loader))
print("Batch check:", xb.shape, yb.shape, torch.unique(yb.to(torch.int64), return_counts=True))


Batch check: torch.Size([64, 3, 224, 224]) torch.Size([64]) (tensor([0, 1]), tensor([30, 34]))


In [16]:
# D5) Modelo + optim + AMP + loop
EPOCHS = 12
LR     = 3e-4

model = timm.create_model("tf_efficientnet_b3_ns", pretrained=True, num_classes=1)
model.to(DEVICE)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler    = torch.cuda.amp.GradScaler()

bce = nn.BCEWithLogitsLoss(reduction="none")  # aplicamos peso manual

def run_epoch(loader, train=True):
    model.train(train)
    tot, n = 0.0, 0
    for xb, yb, _ in loader:
        xb = xb.to(DEVICE, non_blocking=True)
        yb = yb.to(DEVICE, non_blocking=True)

        with torch.cuda.amp.autocast():
            logits = model(xb).squeeze(1)            # (B,)
            loss_all = bce(logits, yb)               # (B,)
            # aplicar class weights
            w = torch.where(yb>0.5, class_weights[1], class_weights[0])
            loss = (loss_all * w).mean()

        if train:
            optimizer.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        tot += float(loss) * xb.size(0)
        n   += xb.size(0)
    return tot / max(1,n)

best_val = (1e9, 0)
CKPT = P14_DIR/"p14_effb3_oas2_best.pth"

for ep in range(1, EPOCHS+1):
    trL = run_epoch(train_loader, True)
    vaL = run_epoch(val_loader,   False)
    scheduler.step()
    print(f"Epoch {ep} | train_loss={trL:.4f} | val_loss={vaL:.4f}")
    if vaL < best_val[0] - 1e-5:
        best_val = (vaL, ep)
        torch.save(model.state_dict(), CKPT)
        print("✓ Guardado best:", CKPT)


  model = create_fn(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

  scaler    = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  tot += float(loss) * xb.size(0)


Epoch 1 | train_loss=1.5012 | val_loss=2.3290
✓ Guardado best: /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/p14_effb3_oas2_best.pth
Epoch 2 | train_loss=0.8362 | val_loss=0.9622
✓ Guardado best: /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/p14_effb3_oas2_best.pth
Epoch 3 | train_loss=0.5279 | val_loss=1.1499
Epoch 4 | train_loss=0.4188 | val_loss=0.8070
✓ Guardado best: /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/p14_effb3_oas2_best.pth
Epoch 5 | train_loss=0.3331 | val_loss=0.6517
✓ Guardado best: /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/p14_effb3_oas2_best.pth
Epoch 6 | train_loss=0.2278 | val_loss=0.8690
Epoch 7 | train_loss=0.1682 | val_loss=1.1683
Epoch 8 | train_loss=0.1380 | val_loss=0.9719
Epoch 9 | train_loss=0.1272 | val_loss=1.1599
Epoch 10 | train_loss=0.0977 | val_loss=1.0583
Epoch 11 | train_loss=0.0900 | val_loss=1.0723
Epoch 12 | train_loss=0.0782 | val_loss=1.0965


In [17]:
# E1) Inferencia helper
@torch.no_grad()
def infer_df(df, loader):
    model.eval()
    rows = []
    for xb, yb, pid in tqdm(loader, desc="Infer", total=math.ceil(len(df)/loader.batch_size)):
        xb = xb.to(DEVICE, non_blocking=True)
        logits = model(xb).squeeze(1).float().cpu().numpy()
        probs  = 1/(1+np.exp(-logits))
        for i in range(len(pid)):
            rows.append((pid[i], df.iloc[len(rows)]["png_path"], int(df.iloc[len(rows)]["target"]), float(probs[i])))
    out = pd.DataFrame(rows, columns=["patient_id","png_path","y_true","y_score"])
    return out


In [18]:
# E2) Cargar best y ejecutar
model.load_state_dict(torch.load(CKPT, map_location=DEVICE))
model.eval()

val_sl  = infer_df(val_df,  val_loader)
test_sl = infer_df(test_df, test_loader)

VAL_SL_PATH  = P14_DIR/"val_png_preds_oas2_effb3_p14.csv"
TEST_SL_PATH = P14_DIR/"test_png_preds_oas2_effb3_p14.csv"
val_sl.to_csv(VAL_SL_PATH, index=False)
test_sl.to_csv(TEST_SL_PATH, index=False)

print("VAL_SL", val_sl.shape, "TEST_SL", test_sl.shape)
display(val_sl.head(3))


Infer:   0%|          | 0/7 [00:00<?, ?it/s]

Infer:   0%|          | 0/8 [00:00<?, ?it/s]

VAL_SL (440, 4) TEST_SL (460, 4)


Unnamed: 0,patient_id,png_path,y_true,y_score
0,OAS2_0002,/content/datasets/OAS2_PROCESSED/OAS2_0002_MR3...,1,0.013529
1,OAS2_0002,/content/datasets/OAS2_PROCESSED/OAS2_0002_MR3...,1,0.910736
2,OAS2_0002,/content/datasets/OAS2_PROCESSED/OAS2_0002_MR3...,1,0.234597


In [19]:
# E3) Pooling por paciente
def agg_patient(df):
    g = df.groupby("patient_id")["y_score"]
    # trimmed20 requiere al menos 10 cortes: si no, usa mean
    def trimmed20(s):
        k = max(1, int(len(s)*.1)); j = max(k, int(len(s)*.9))
        if j>k:
            return s.sort_values().iloc[k:j].mean()
        return s.mean()
    def top7(s):
        return s.sort_values(ascending=False).head(7).mean()
    def pmean2(s):
        s = np.clip(s.values, 0, 1)
        return float(np.sqrt((s**2).mean()))
    out = pd.DataFrame({
        "y_true": df.groupby("patient_id")["y_true"].first(),
        "mean": g.mean(),
        "trimmed20": g.apply(trimmed20),
        "top7": g.apply(top7),
        "pmean_2": g.apply(pmean2),
    }).reset_index()
    return out

val_pt  = agg_patient(val_sl)
test_pt = agg_patient(test_sl)
VAL_PT_PATH  = P14_DIR/"val_patient_features_oas2_effb3_p14.csv"
TEST_PT_PATH = P14_DIR/"test_patient_features_oas2_effb3_p14.csv"
val_pt.to_csv(VAL_PT_PATH, index=False)
test_pt.to_csv(TEST_PT_PATH, index=False)
print("VAL_PT", val_pt.shape, "TEST_PT", test_pt.shape)
display(val_pt.head(2))


VAL_PT (22, 6) TEST_PT (23, 6)


Unnamed: 0,patient_id,y_true,mean,trimmed20,top7,pmean_2
0,OAS2_0002,1,0.760129,0.815769,0.996145,0.815837
1,OAS2_0004,0,0.57353,0.586414,0.937615,0.66231


In [20]:
# E4) Métricas rápidas F1-opt (umbral barrido) + AUC/PR
from sklearn.metrics import accuracy_score, precision_score, recall_score

def eval_curve(y_true, y_score):
    # AUC/PR-AUC pueden fallar si una sola clase; gestionamos
    try:
        auc = float(roc_auc_score(y_true, y_score))
    except Exception:
        auc = float("nan")
    try:
        prauc = float(average_precision_score(y_true, y_score))
    except Exception:
        prauc = float("nan")
    # F1-opt: buscamos umbral que maximiza F1
    ps, rs, th = precision_recall_curve(y_true, y_score)
    f1 = 2*ps*rs/(ps+rs+1e-9)
    k  = int(np.nanargmax(f1))
    best_thr = float(th[max(0, min(k, len(th)-1))]) if len(th)>0 else 0.5
    y_pred = (y_score >= best_thr).astype(int)
    acc = float(accuracy_score(y_true, y_pred))
    P   = float(precision_score(y_true, y_pred, zero_division=0))
    R   = float(recall_score(y_true, y_pred, zero_division=0))
    return {"AUC":auc,"PRAUC":prauc,"Acc":acc,"P":P,"R":R,"thr":best_thr,"n":int(len(y_true))}

summary = {
    "variant":"EffB3_OAS2_p14",
    "VAL": {k:eval_curve(val_pt["y_true"].values,  val_pt[k].values) for k in ["mean","trimmed20","top7","pmean_2"]},
    "TEST":{k:eval_curve(test_pt["y_true"].values, test_pt[k].values) for k in ["mean","trimmed20","top7","pmean_2"]},
}
SUM_PATH = P14_DIR/"p14_patient_eval_summary.json"
with open(SUM_PATH,"w") as f: json.dump(summary, f, indent=2)
summary


{'variant': 'EffB3_OAS2_p14',
 'VAL': {'mean': {'AUC': 0.8842975206611571,
   'PRAUC': 0.9165565428723323,
   'Acc': 0.8636363636363636,
   'P': 0.9,
   'R': 0.8181818181818182,
   'thr': 0.7601291615050286,
   'n': 22},
  'trimmed20': {'AUC': 0.8842975206611571,
   'PRAUC': 0.9165565428723323,
   'Acc': 0.8636363636363636,
   'P': 0.9,
   'R': 0.8181818181818182,
   'thr': 0.8116158433258533,
   'n': 22},
  'top7': {'AUC': 0.9008264462809917,
   'PRAUC': 0.9381118881118882,
   'Acc': 0.9090909090909091,
   'P': 1.0,
   'R': 0.8181818181818182,
   'thr': 0.9950901355062213,
   'n': 22},
  'pmean_2': {'AUC': 0.8760330578512396,
   'PRAUC': 0.913924963924964,
   'Acc': 0.8636363636363636,
   'P': 0.9,
   'R': 0.8181818181818182,
   'thr': 0.8158370829996843,
   'n': 22}},
 'TEST': {'mean': {'AUC': 0.7121212121212122,
   'PRAUC': 0.7787677385394105,
   'Acc': 0.6956521739130435,
   'P': 0.631578947368421,
   'R': 1.0,
   'thr': 0.20474459462566302,
   'n': 23},
  'trimmed20': {'AUC': 0.71

In [21]:
# =========================
# E5) Exportar CSVs + Summary + Catálogo (p14 → p11)
# =========================
import os, json, time, math
import numpy as np
import pandas as pd
from pathlib import Path

# ---- rutas base p14 y p11
BASE_P14 = Path("/content/drive/MyDrive/CognitivaAI/p14_oasis2_images")
BASE_P14.mkdir(parents=True, exist_ok=True)

BASE_P11 = Path("/content/drive/MyDrive/CognitivaAI/p11_alt_backbones")
BASE_P11.mkdir(parents=True, exist_ok=True)

CATALOG_JSON = BASE_P11 / "p11_backbone_catalog.json"
COMPARISON_CSV = BASE_P11 / "comparison_backbones_eval.csv"

# ---- Si vienes de E4, probablemente tienes val_sl y test_sl en memoria.
# Si no existen, los cargamos de nombres estándar (ajusta si cambiaste).
def _try_load(name_candidates: list[Path]) -> pd.DataFrame | None:
    for p in name_candidates:
        if p.exists():
            try:
                df = pd.read_csv(p)
                if {"patient_id","png_path","y_true","y_score"}.issubset(df.columns):
                    return df
            except Exception:
                pass
    return None

if "val_sl" not in globals() or "test_sl" not in globals():
    # busca primero en p14; como fallback, en p13 si reusaste nombres
    val_sl = _try_load([
        BASE_P14 / "val_png_preds_oas2_effb3_p14.csv",
        BASE_P14 / "val_png_preds_oas2_effb3.csv",
        Path("/content/drive/MyDrive/CognitivaAI/p13_oasis2_images/val_png_preds_oas2_effb3.csv")
    ])
    test_sl = _try_load([
        BASE_P14 / "test_png_preds_oas2_effb3_p14.csv",
        BASE_P14 / "test_png_preds_oas2_effb3.csv",
        Path("/content/drive/MyDrive/CognitivaAI/p13_oasis2_images/test_png_preds_oas2_effb3.csv")
    ])
    assert val_sl is not None and test_sl is not None, "No encuentro val_sl/test_sl ni en memoria ni en disco."

print("VAL_SL", val_sl.shape, "| cols:", list(val_sl.columns))
print("TEST_SL", test_sl.shape, "| cols:", list(test_sl.columns))

# ---- pooling a nivel paciente: mean, trimmed20, top7, pmean_2
def _trimmed_mean(s: pd.Series, frac=0.1):
    n = len(s)
    if n < 3:
        return float(s.mean())
    k = int(math.floor(n*frac))
    ss = s.sort_values().iloc[k:n-k] if (n - 2*k) > 0 else s
    return float(ss.mean())

def _topk_mean(s: pd.Series, k=7):
    return float(s.sort_values(ascending=False).head(k).mean())

def _pmean_2(s: pd.Series):
    x = np.clip(s.values, 0, 1).astype(float)
    return float(np.sqrt((x**2).mean()))

def pool_patient(df: pd.DataFrame) -> pd.DataFrame:
    g = df.groupby("patient_id")["y_score"]
    out = pd.DataFrame({
        "patient_id": list(g.groups.keys()),
        "mean":       g.mean().values,
        "trimmed20":  g.apply(lambda s: _trimmed_mean(s, 0.1)).values,
        "top7":       g.apply(lambda s: _topk_mean(s, 7)).values,
        "pmean_2":    g.apply(_pmean_2).values,
    })
    # adjunta y_true (asumimos constante por paciente)
    y = df.groupby("patient_id")["y_true"].first().reset_index()
    out = y.merge(out, on="patient_id", how="inner").rename(columns={"y_true":"y_true"})
    return out[["patient_id","y_true","mean","trimmed20","top7","pmean_2"]]

val_pt = pool_patient(val_sl)
tst_pt = pool_patient(test_sl)
print("VAL_PT", val_pt.shape, "| TEST_PT", tst_pt.shape)
display(val_pt.head(3)); display(tst_pt.head(3))

# ---- Guardar slice-level preds p14 (con sufijo _p14 para no pisar p13)
val_sl_out  = BASE_P14 / "val_png_preds_oas2_effb3_p14.csv"
test_sl_out = BASE_P14 / "test_png_preds_oas2_effb3_p14.csv"
val_sl.to_csv(val_sl_out, index=False)
test_sl.to_csv(test_sl_out, index=False)

# ---- Guardar features a paciente (las 4 columnas)
val_feat_out  = BASE_P14 / "val_patient_features_oas2_effb3_p14.csv"
test_feat_out = BASE_P14 / "test_patient_features_oas2_effb3_p14.csv"
val_pt.to_csv(val_feat_out, index=False)
tst_pt.to_csv(test_feat_out, index=False)

# ---- También guardamos "predicciones a paciente" (una sola columna elegida para ensambles)
# Por defecto usaremos 'trimmed20' como score principal (robusto)
score_for_ens = "trimmed20"
val_pred = val_pt[["patient_id","y_true",score_for_ens]].rename(columns={score_for_ens:"y_score"})
tst_pred = tst_pt[["patient_id","y_true",score_for_ens]].rename(columns={score_for_ens:"y_score"})

val_pred_out = BASE_P14 / "val_patient_preds_oas2_effb3_p14.csv"
tst_pred_out = BASE_P14 / "test_patient_preds_oas2_effb3_p14.csv"
val_pred.to_csv(val_pred_out, index=False)
tst_pred.to_csv(tst_pred_out, index=False)

print("💾 Guardados (p14):")
for p in [val_sl_out, test_sl_out, val_feat_out, test_feat_out, val_pred_out, tst_pred_out]:
    print(" -", p)

# ---- Summary JSON (usamos las métricas que ya calculaste en E4 si están,
#      si no, guardamos un stub simple con tamaños)
summary = {
    "variant": "EffB3_OAS2_p14",
    "VAL": {
        "n_patients": int(val_pt.shape[0]),
        "cols": ["mean","trimmed20","top7","pmean_2"]
    },
    "TEST": {
        "n_patients": int(tst_pt.shape[0]),
        "cols": ["mean","trimmed20","top7","pmean_2"]
    },
    "files": {
        "val_png": str(val_sl_out),
        "test_png": str(test_sl_out),
        "val_patient_features": str(val_feat_out),
        "test_patient_features": str(test_feat_out),
        "val_patient_preds": str(val_pred_out),
        "test_patient_preds": str(tst_pred_out),
    },
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}

# Si dejaste en una variable las métricas detalladas (dict_metrics) las añadimos
if "dict_metrics" in globals() and isinstance(dict_metrics, dict):
    summary["metrics"] = dict_metrics

summary_out = BASE_P14 / "p14_patient_eval_summary.json"
with open(summary_out, "w") as f:
    json.dump(summary, f, indent=2)
print("💾 Resumen guardado:", summary_out)

# ---- Registrar en catálogo p11 (para reutilizar en ensembles):
# añadimos una fuente 'oas2_effb3_p14' con las rutas esenciales.
# Escogemos el score 'y_score' (que proviene de trimmed20 arriba)
entry_key = "oas2_effb3_p14"

new_entry = {
    "VAL":  str(val_pred_out),
    "TEST": str(tst_pred_out),
    "id_col": "patient_id",
    "y_col":  "y_true",
    "score_col": "y_score",
    "notes": "EffNet-B3 OASIS-2 (p14). Score=y_score (trimmed20)."
}

catalog = {}
if CATALOG_JSON.exists():
    try:
        catalog = json.loads(Path(CATALOG_JSON).read_text())
    except Exception:
        catalog = {}

catalog[entry_key] = new_entry

with open(CATALOG_JSON, "w") as f:
    json.dump(catalog, f, indent=2)
print("📚 Catálogo actualizado:", CATALOG_JSON)
print("   → claves:", list(catalog.keys()))

# ---- (Opcional) Añadir fila a comparison_backbones_eval.csv con tus métricas principales de TEST
# Usaremos las que reportaste (TEST: trimmed20) si están en summary["metrics"]
try:
    row = {
        "variant": "EffB3_OAS2_p14_trimmed20",
        "AUC": None,
        "PRAUC": None,
        "Acc": None,
        "P": None,
        "R": None,
        "thr": None,
        "n": int(tst_pt.shape[0]),
        "when": time.strftime("%Y-%m-%d %H:%M:%S")
    }
    # si hay métricas en memoria (como las que compartiste), las volcamos
    if "dict_metrics" in globals() and "TEST" in dict_metrics and "trimmed20" in dict_metrics["TEST"]:
        m = dict_metrics["TEST"]["trimmed20"]
        for k in ["AUC","PRAUC","Acc","P","R","thr"]:
            row[k] = float(m.get(k, np.nan))

    df_cmp = pd.DataFrame([row])
    if COMPARISON_CSV.exists():
        prev = pd.read_csv(COMPARISON_CSV)
        df_cmp = pd.concat([prev, df_cmp], ignore_index=True)
    df_cmp.to_csv(COMPARISON_CSV, index=False)
    print("🧾 comparison_backbones_eval.csv actualizado:", COMPARISON_CSV)
except Exception as e:
    print("(!) No se pudo actualizar comparison_backbones_eval.csv:", e)

print("\n✅ E5 completado.")
print("   Si vas a re-fabricar features en p11, ejecuta el paso G/fix de catálogo y reconstrucción de features.")


VAL_SL (440, 4) | cols: ['patient_id', 'png_path', 'y_true', 'y_score']
TEST_SL (460, 4) | cols: ['patient_id', 'png_path', 'y_true', 'y_score']
VAL_PT (22, 6) | TEST_PT (23, 6)


Unnamed: 0,patient_id,y_true,mean,trimmed20,top7,pmean_2
0,OAS2_0002,1,0.760129,0.815769,0.996145,0.815837
1,OAS2_0004,0,0.57353,0.586414,0.937615,0.66231
2,OAS2_0005,0,0.324261,0.296181,0.653334,0.426674


Unnamed: 0,patient_id,y_true,mean,trimmed20,top7,pmean_2
0,OAS2_0014,1,0.662593,0.697188,0.985867,0.745301
1,OAS2_0040,1,0.80826,0.866753,0.999504,0.85836
2,OAS2_0054,1,0.554348,0.564679,0.949705,0.667179


💾 Guardados (p14):
 - /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/val_png_preds_oas2_effb3_p14.csv
 - /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/test_png_preds_oas2_effb3_p14.csv
 - /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/val_patient_features_oas2_effb3_p14.csv
 - /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/test_patient_features_oas2_effb3_p14.csv
 - /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/val_patient_preds_oas2_effb3_p14.csv
 - /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/test_patient_preds_oas2_effb3_p14.csv
💾 Resumen guardado: /content/drive/MyDrive/CognitivaAI/p14_oasis2_images/p14_patient_eval_summary.json
📚 Catálogo actualizado: /content/drive/MyDrive/CognitivaAI/p11_alt_backbones/p11_backbone_catalog.json
   → claves: ['SwinTiny', 'convnext_tiny.in12k_ft_in1k_slices', 'png_preds_d121', 'patient_preds', 'patient_preds_ensemble', 'patient_preds_plus', 'png_preds', 'slice_preds_plus', 'slice_preds_seedENS', 'slices_preds

In [22]:
# === G0: imports + helpers ===
import os, json, math, shutil
import numpy as np
import pandas as pd
from pathlib import Path

def safe_sigmoid(z):
    z = np.clip(z, -50, 50)
    return 1/(1+np.exp(-z))

def detect_col(cands, cols):
    """Devuelve la 1ª columna de 'cands' que exista en 'cols'.
    Si no hay, pero hay 'logit' o 'logits', se usará eso para construir score."""
    cols = list(cols)
    for c in cands:
        if c in cols:
            return c
    # fallback: logit/logits
    for c in ("logit", "logits"):
        if c in cols:
            return c
    raise KeyError(f"No se pudo detectar columna entre {cands} en {cols}")

def patient_pool(df):
    """Aggregaciones patient-level coherentes con p11/p14."""
    g = df.groupby("patient_id")["y_score"]
    out = pd.DataFrame({
        "mean": g.mean(),
        "trimmed20": g.apply(lambda s: s.sort_values().iloc[int(len(s)*.1):int(len(s)*.9)].mean()
                             if len(s)>=10 else s.mean()),
        "top7": g.apply(lambda s: s.sort_values(ascending=False).head(7).mean()),
        "pmean_2": g.apply(lambda s: (np.mean(np.power(np.clip(s,0,1),2)))**0.5),
    }).reset_index()
    # Añadimos y_true por paciente (asumimos consistente dentro de paciente)
    yt = df.groupby("patient_id")["y_true"].agg(lambda x: int(round(float(np.mean(x)))))
    out = out.merge(yt.rename("y_true"), on="patient_id", how="left")
    return out[["patient_id","y_true","mean","trimmed20","top7","pmean_2"]]

def read_preds(path, id_col=None, y_col=None, score_col=None):
    """Lee CSV de preds (slice- o patient-level). Normaliza a columnas: patient_id, y_true, y_score (si slice, trae png_path si existe)."""
    path = Path(path)
    df = pd.read_csv(path)
    cols = df.columns

    # Detectar id, y_true, score
    id_col    = id_col    or detect_col(["patient_id","pid","subject_id"], cols)
    y_col     = y_col     or detect_col(["y_true","target","label","y"], cols)
    score_raw = score_col or detect_col(["y_score","sigmoid(logit)","sigmoid(logits)","pred","score","prob","proba","logit","logits"], cols)

    df = df.copy()
    df = df.rename(columns={id_col:"patient_id", y_col:"y_true"})

    # Si el "score_raw" es logit/logits, convertir a prob
    if score_raw in ("logit","logits"):
        df["y_score"] = safe_sigmoid(df[score_raw].astype(float))
    else:
        df["y_score"] = df[score_raw].astype(float)

    # Si tiene png_path, lo conservamos (útil cuando viene por-slice)
    if "png_path" in df.columns:
        return df[["patient_id","png_path","y_true","y_score"]]
    else:
        return df[["patient_id","y_true","y_score"]]


In [23]:
# === G1: montar drive (seguro) + rutas ===
try:
    from google.colab import drive
    # Evita error de "ya montado"
    if not Path("/content/drive/MyDrive").exists():
        drive.mount("/content/drive")
except Exception as e:
    print("Colab drive mount (opcional):", e)

BASE = Path("/content/drive/MyDrive/CognitivaAI")
P11  = BASE / "p11_alt_backbones"
P14  = BASE / "p14_oasis2_images"
P13  = BASE / "p13_oasis2_images"
P11.mkdir(parents=True, exist_ok=True)

CATALOG_JSON = P11/"p11_backbone_catalog.json"

print("Rutas:")
print("  P11 =", P11)
print("  P14 =", P14)
print("  P13 =", P13)
print("  Catalog =", CATALOG_JSON)


Rutas:
  P11 = /content/drive/MyDrive/CognitivaAI/p11_alt_backbones
  P14 = /content/drive/MyDrive/CognitivaAI/p14_oasis2_images
  P13 = /content/drive/MyDrive/CognitivaAI/p13_oasis2_images
  Catalog = /content/drive/MyDrive/CognitivaAI/p11_alt_backbones/p11_backbone_catalog.json


In [24]:
# === G2: cargar/actualizar catálogo p11 ===
if CATALOG_JSON.exists():
    with open(CATALOG_JSON, "r") as f:
        catalog = json.load(f)
else:
    catalog = {}

# Añadir/actualizar entrada OAS2 p14 (por-slice → haremos pooling a paciente)
entry_p14 = {
    "VAL":  str(P14/"val_png_preds_oas2_effb3_p14.csv"),
    "TEST": str(P14/"test_png_preds_oas2_effb3_p14.csv"),
    "id_col":    "patient_id",
    "y_col":     "y_true",
    "score_col": "y_score",
    "granularity": "slice"
}
catalog["oas2_effb3_p14"] = entry_p14

# (Opcional) Asegurar que entradas antiguas mínimas siguen
# No tocamos el resto; sólo añadimos/actualizamos esta clave

with open(CATALOG_JSON, "w") as f:
    json.dump(catalog, f, indent=2)
print("📚 Catálogo actualizado:", CATALOG_JSON)
print("→ claves:", list(catalog.keys()))


📚 Catálogo actualizado: /content/drive/MyDrive/CognitivaAI/p11_alt_backbones/p11_backbone_catalog.json
→ claves: ['SwinTiny', 'convnext_tiny.in12k_ft_in1k_slices', 'png_preds_d121', 'patient_preds', 'patient_preds_ensemble', 'patient_preds_plus', 'png_preds', 'slice_preds_plus', 'slice_preds_seedENS', 'slices_preds', 'slice_preds', 'patient_eval_colab', 'oas2_effb3', 'oas2_effb3_p14']


In [25]:
# === G3: validar rutas de todos los tags del catálogo ===
for tag, meta in catalog.items():
    for split in ("VAL","TEST"):
        p = Path(meta[split])
        if not p.exists():
            raise FileNotFoundError(f"[{tag}] No existe {split} en: {p}")
print("✅ Todas las rutas del catálogo existen.")


✅ Todas las rutas del catálogo existen.


In [26]:
# === G4: reconstruir features p11 (incluye oas2_effb3_p14) ===
def build_features(catalog_dict):
    V = None
    T = None
    for tag, meta in catalog_dict.items():
        # 1) Leer preds VAL/TEST
        v = read_preds(meta["VAL"],  meta.get("id_col"), meta.get("y_col"), meta.get("score_col"))
        t = read_preds(meta["TEST"], meta.get("id_col"), meta.get("y_col"), meta.get("score_col"))

        # 2) ¿slice-level? pool a paciente
        is_slice = (meta.get("granularity","slice") == "slice") or ("png_path" in v.columns)
        if is_slice:
            v_pt = patient_pool(v)
            t_pt = patient_pool(t)
        else:
            # ya patient-level
            # normalizar nombres por si vinieran como mean/trimmed/top7/pmean_2
            # si sólo hay y_score, lo tratamos como 'mean'
            cols = set(v.columns)
            if {"mean","trimmed20","top7","pmean_2"}.issubset(cols):
                v_pt = v[["patient_id","y_true","mean","trimmed20","top7","pmean_2"]].copy()
                t_pt = t[["patient_id","y_true","mean","trimmed20","top7","pmean_2"]].copy()
            else:
                # construir 'mean' desde y_score si viniera suelto
                if "y_score" in v.columns:
                    v2 = v.groupby(["patient_id","y_true"])["y_score"].mean().reset_index().rename(columns={"y_score":"mean"})
                    t2 = t.groupby(["patient_id","y_true"])["y_score"].mean().reset_index().rename(columns={"y_score":"mean"})
                    for c in ("trimmed20","top7","pmean_2"):
                        v2[c] = v2["mean"]
                        t2[c] = t2["mean"]
                    v_pt, t_pt = v2, t2
                else:
                    raise ValueError(f"[{tag}] No encuentro columnas de features ni y_score patient-level.")

        # 3) Renombrar con prefijo de tag
        v_pt = v_pt.rename(columns={
            "mean":f"{tag}_mean","trimmed20":f"{tag}_trimmed20",
            "top7":f"{tag}_top7","pmean_2":f"{tag}_p2"
        })
        t_pt = t_pt.rename(columns={
            "mean":f"{tag}_mean","trimmed20":f"{tag}_trimmed20",
            "top7":f"{tag}_top7","pmean_2":f"{tag}_p2"
        })

        # 4) Merge acumulativo por patient_id + y_true (para mantener etiqueta)
        keep_cols_v = ["patient_id","y_true", f"{tag}_mean", f"{tag}_trimmed20", f"{tag}_top7", f"{tag}_p2"]
        keep_cols_t = ["patient_id","y_true", f"{tag}_mean", f"{tag}_trimmed20", f"{tag}_top7", f"{tag}_p2"]

        V = v_pt[keep_cols_v] if V is None else V.merge(v_pt[keep_cols_v], on=["patient_id","y_true"], how="outer")
        T = t_pt[keep_cols_t] if T is None else T.merge(t_pt[keep_cols_t], on=["patient_id","y_true"], how="outer")

    return V, T

VAL, TEST = build_features(catalog)

# Ordenar columnas: primero id/label y luego features
idcols = ["patient_id","y_true"]
val_cols = idcols + sorted([c for c in VAL.columns if c not in idcols])
tst_cols = idcols + sorted([c for c in TEST.columns if c not in idcols])
VAL = VAL[val_cols].sort_values("patient_id").reset_index(drop=True)
TEST = TEST[tst_cols].sort_values("patient_id").reset_index(drop=True)

# Guardar
val_out  = P11/"val_patient_features_backbones.csv"
test_out = P11/"test_patient_features_backbones.csv"
VAL.to_csv(val_out, index=False)
TEST.to_csv(test_out, index=False)

print("✅ Guardado features:")
print("  ", val_out, "| shape:", VAL.shape)
print("  ", test_out, "| shape:", TEST.shape)

print("\nPreview VAL:")
display(VAL.head(3))
print("\nPreview TEST:")
display(TEST.head(3))


✅ Guardado features:
   /content/drive/MyDrive/CognitivaAI/p11_alt_backbones/val_patient_features_backbones.csv | shape: (69, 58)
   /content/drive/MyDrive/CognitivaAI/p11_alt_backbones/test_patient_features_backbones.csv | shape: (70, 58)

Preview VAL:


Unnamed: 0,patient_id,y_true,SwinTiny_mean,SwinTiny_p2,SwinTiny_top7,SwinTiny_trimmed20,convnext_tiny.in12k_ft_in1k_slices_mean,convnext_tiny.in12k_ft_in1k_slices_p2,convnext_tiny.in12k_ft_in1k_slices_top7,convnext_tiny.in12k_ft_in1k_slices_trimmed20,...,slice_preds_seedENS_mean,slice_preds_seedENS_p2,slice_preds_seedENS_top7,slice_preds_seedENS_trimmed20,slice_preds_top7,slice_preds_trimmed20,slices_preds_mean,slices_preds_p2,slices_preds_top7,slices_preds_trimmed20
0,OAS1_0003,1,0.458933,0.461509,0.514777,0.456984,0.455741,0.455741,0.455753,0.45574,...,0.5,0.707107,1.0,0.5,0.5,0.499985,,,,
1,OAS1_0010,0,0.423731,0.428161,0.495524,0.416583,0.455743,0.455743,0.455756,0.455742,...,0.3,0.547723,0.857143,0.25,0.5,0.5,,,,
2,OAS1_0016,1,0.463737,0.467343,0.530998,0.461,0.455753,0.455753,0.455765,0.455753,...,0.3,0.547723,0.857143,0.25,0.5,0.5,,,,



Preview TEST:


Unnamed: 0,patient_id,y_true,SwinTiny_mean,SwinTiny_p2,SwinTiny_top7,SwinTiny_trimmed20,convnext_tiny.in12k_ft_in1k_slices_mean,convnext_tiny.in12k_ft_in1k_slices_p2,convnext_tiny.in12k_ft_in1k_slices_top7,convnext_tiny.in12k_ft_in1k_slices_trimmed20,...,slice_preds_seedENS_mean,slice_preds_seedENS_p2,slice_preds_seedENS_top7,slice_preds_seedENS_trimmed20,slice_preds_top7,slice_preds_trimmed20,slices_preds_mean,slices_preds_p2,slices_preds_top7,slices_preds_trimmed20
0,OAS1_0002,0,0.478841,0.481466,0.537003,0.471988,0.455743,0.455743,0.455754,0.455742,...,0.15,0.387298,0.428571,0.0625,0.5,0.5,0.431681,0.462238,0.609769,0.422858
1,OAS1_0023,1,0.457738,0.459757,0.508314,0.456675,0.455755,0.455755,0.455763,0.455754,...,0.35,0.591608,1.0,0.3125,0.5,0.5,0.440384,0.483223,0.655163,0.431404
2,OAS1_0070,0,0.479852,0.482387,0.535784,0.476712,0.455746,0.455746,0.455759,0.455746,...,0.3,0.547723,0.857143,0.25,0.5,0.5,0.403323,0.466808,0.670567,0.382019


In [27]:
# === G5 (opcional): pequeña sanity-check de ensemble con columnas nuevas ===
def pick_cols(df, keys):
    exist = [k for k in keys if k in df.columns]
    if not exist:
        print("No existen columnas solicitadas:", keys)
        return None
    return df[exist].copy()

strong_keys = [
    # añade/quita columnas que tengas en tu catálogo
    "SwinTiny_top7",              # si existe (de p11)
    "patient_preds_plus_mean",    # si existe (de p10/p11)
    "oas2_effb3_p14_mean",        # NUEVA
    "oas2_effb3_mean",            # si estaba la de p13
]

V_feat = pick_cols(VAL, strong_keys)
T_feat = pick_cols(TEST, strong_keys)

if V_feat is not None and T_feat is not None:
    V_tmp = VAL[["patient_id","y_true"]].copy()
    T_tmp = TEST[["patient_id","y_true"]].copy()
    V_tmp["avg"] = V_feat.mean(axis=1)
    T_tmp["avg"] = T_feat.mean(axis=1)

    from sklearn.metrics import roc_auc_score, average_precision_score
    def eval_bin(y, s, name="avg"):
        # threshold al mejor F1 en VAL sólo como demo
        from sklearn.metrics import f1_score, precision_recall_curve
        prec, rec, thr = precision_recall_curve(y, s)
        f1s = 2*prec*rec/(prec+rec+1e-9)
        i = int(np.nanargmax(f1s))
        thr_opt = thr[max(0, min(i, len(thr)-1))]
        yhat = (s >= thr_opt).astype(int)
        out = dict(
            AUC=float(roc_auc_score(y, s)) if len(np.unique(y))>1 else float("nan"),
            PRAUC=float(average_precision_score(y, s)),
            Acc=float((yhat==y).mean()),
            P=float(( (yhat==1)&(y==1) ).sum() / max(1,(yhat==1).sum())),
            R=float(( (yhat==1)&(y==1) ).sum() / max(1,(y==1).sum())),
            thr=float(thr_opt),
            n=int(len(y)),
        )
        return out

    print("[VAL avg]", eval_bin(V_tmp["y_true"].values, V_tmp["avg"].values))
    print("[TEST avg]", eval_bin(T_tmp["y_true"].values, T_tmp["avg"].values))


[VAL avg] {'AUC': 0.8599320882852293, 'PRAUC': 0.8541141379080951, 'Acc': 0.8260869565217391, 'P': 0.7567567567567568, 'R': 0.9032258064516129, 'thr': 0.4896392077041317, 'n': 69}
[TEST avg] {'AUC': 0.7401315789473684, 'PRAUC': 0.7381621772526545, 'Acc': 0.7142857142857143, 'P': 0.6875, 'R': 0.6875, 'thr': 0.4967604759603226, 'n': 70}


In [29]:
# === E6 (robusto a NaNs): stacking con imputación + indicadores de ausencia ===
import numpy as np, pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score

# ------------------------------------------------------------------
# 0) Entradas (usa las tablas ya creadas en G/E5)
# ------------------------------------------------------------------
VAL_path  = "/content/drive/MyDrive/CognitivaAI/p11_alt_backbones/val_patient_features_backbones.csv"
TEST_path = "/content/drive/MyDrive/CognitivaAI/p11_alt_backbones/test_patient_features_backbones.csv"

VAL  = pd.read_csv(VAL_path)
TEST = pd.read_csv(TEST_path)

assert {'patient_id','y_true'}.issubset(VAL.columns) and {'patient_id','y_true'}.issubset(TEST.columns)

# Conjunto de features candidato (el que te falló)
FEATURES_RAW = [
    'SwinTiny_top7', 'SwinTiny_mean',
    'convnext_tiny.in12k_ft_in1k_slices_top7',
    'png_preds_d121_trimmed20',
    'patient_preds_plus_mean',
    'slice_preds_plus_mean',
    'oas2_effb3_top7', 'oas2_effb3_mean',
    'oas2_effb3_p14_top7', 'oas2_effb3_p14_mean'
]

# Asegura que existen (las que no existan se ignoran con aviso)
FEATURES = [c for c in FEATURES_RAW if c in VAL.columns and c in TEST.columns]
missing_cols = [c for c in FEATURES_RAW if c not in FEATURES]
if missing_cols:
    print("⚠️ Estas columnas no están en ambas tablas y se omiten:", missing_cols)

# ------------------------------------------------------------------
# 1) Diagnóstico de NaNs por columna
# ------------------------------------------------------------------
def nan_report(df, name):
    na = df[FEATURES].isna().mean().sort_values(ascending=False)
    print(f"\n=== NaN ratio por columna en {name} ===")
    print(na[na>0].to_string())

nan_report(VAL,  "VAL")
nan_report(TEST, "TEST")

# Opcional: ver distribución por cohorte
def cohort(pid):
    # Heurística: OAS1_... u OAS2_...
    return "OAS2" if str(pid).startswith("OAS2_") else "OAS1"

VAL['_cohort']  = VAL['patient_id'].map(cohort)
TEST['_cohort'] = TEST['patient_id'].map(cohort)

print("\nVAL cohort counts:", VAL['_cohort'].value_counts().to_dict())
print("TEST cohort counts:", TEST['_cohort'].value_counts().to_dict())

# ------------------------------------------------------------------
# 2) Filtrado de columnas con NaN excesivo
#    (si una feature está casi vacía en VAL, mejor no usarla)
# ------------------------------------------------------------------
NAN_CUTOFF = 0.40  # puedes mover a 0.5 si quieres ser más permisivo
keep = [c for c in FEATURES if VAL[c].isna().mean() <= NAN_CUTOFF]
drop = [c for c in FEATURES if c not in keep]
print(f"\n✅ Mantengo {len(keep)} columnas; ❌ descarto por NaN>={NAN_CUTOFF}: {drop}")

FEATURES = keep

# ------------------------------------------------------------------
# 3) Construye X/y e indicadores de ausencia (missingness flags)
# ------------------------------------------------------------------
def build_Xy(df, features):
    X = df[features].copy()
    y = df['y_true'].astype(int).values
    # Indicadores de ausencia por columna (1 si faltaba)
    miss_flags = {f"{c}_isnan": df[c].isna().astype(int).values for c in features}
    X = X.fillna(np.nan)  # aseguramos NaN explícito
    # Concatenamos flags al final
    for k,v in miss_flags.items():
        X[k] = v
    return X.values, y, list(X.columns)

X_val_raw,  y_val,  feat_names = build_Xy(VAL,  FEATURES)
X_test_raw, y_test, _          = build_Xy(TEST, FEATURES)

# ------------------------------------------------------------------
# 4A) Modelo 1: LogisticRegression con imputación (mediana) + escalado
#     (imputador se entrena SOLO en VAL)
# ------------------------------------------------------------------
lr_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),   # imputa NaN en features originales y en flags (casi todo 0/1)
    ("scaler",  StandardScaler(with_mean=True, with_std=True)),
    ("clf",     LogisticRegression(
        penalty="l2", C=1.0, solver="lbfgs",
        class_weight=None, max_iter=2000, random_state=42
    ))
])

lr_pipe.fit(X_val_raw, y_val)
p_val_lr  = lr_pipe.predict_proba(X_val_raw)[:,1]
p_test_lr = lr_pipe.predict_proba(X_test_raw)[:,1]

def metrics_bin(y, p, name=""):
    # Umbral por F1-opt en VAL → aquí devolvemos el mejor de un barrido
    thr_grid = np.linspace(0.05, 0.95, 19)
    f1_best, thr_best, acc_best, p_best, r_best = -1, 0.5, None, None, None
    for thr in thr_grid:
        yhat = (p >= thr).astype(int)
        tp = ((yhat==1)&(y==1)).sum()
        fp = ((yhat==1)&(y==0)).sum()
        fn = ((yhat==0)&(y==1)).sum()
        prec = tp/(tp+fp+1e-9)
        rec  = tp/(tp+fn+1e-9)
        f1   = 2*prec*rec/(prec+rec+1e-9)
        acc  = (yhat==y).mean()
        if f1 > f1_best:
            f1_best, thr_best, acc_best, p_best, r_best = f1, thr, acc, prec, rec
    d = dict(
        AUC=float(roc_auc_score(y, p)),
        PRAUC=float(average_precision_score(y, p)),
        Acc=float(acc_best), P=float(p_best), R=float(r_best),
        thr=float(thr_best), n=int(len(y))
    )
    print(f"[{name}] {d}")
    return d

print("\n📈 LogisticRegression con imputación/flags")
val_lr = metrics_bin(y_val,  p_val_lr,  name="VAL|LR")
# Usa el thr óptimo de VAL para TEST:
thr = val_lr["thr"]
yhat_test = (p_test_lr >= thr).astype(int)
test_lr = dict(
    AUC=float(roc_auc_score(y_test, p_test_lr)),
    PRAUC=float(average_precision_score(y_test, p_test_lr)),
    Acc=float(accuracy_score(y_test, yhat_test)),
    P=float(precision_score(y_test, yhat_test)),
    R=float(recall_score(y_test, yhat_test)),
    thr=float(thr), n=int(len(y_test))
)
print("[TEST|LR]", test_lr)

# ------------------------------------------------------------------
# 4B) (OPCIONAL) Modelo 2: HistGradientBoosting (tolera NaNs)
#     Nota: HGB no necesita imputación ni escalado y maneja NaN nativamente.
# ------------------------------------------------------------------
use_hgb = True
if use_hgb:
    hgb = HistGradientBoostingClassifier(
        learning_rate=0.05, max_depth=None, max_iter=400,
        l2_regularization=0.0, random_state=42
    )
    # En HGB conviene NO pasar los flags (porque ya usa NaN nativo); separamos:
    # Seleccionamos solo las columnas originales (sin *_isnan)
    n_orig = len(FEATURES)
    Xv_hgb  = X_val_raw[:, :n_orig]
    Xt_hgb  = X_test_raw[:, :n_orig]

    hgb.fit(Xv_hgb, y_val)
    p_val_hgb  = hgb.predict_proba(Xv_hgb)[:,1]
    p_test_hgb = hgb.predict_proba(Xt_hgb)[:,1]

    print("\n🌳 HistGradientBoosting (sin imputación, NaN nativo)")
    val_hgb = metrics_bin(y_val,  p_val_hgb,  name="VAL|HGB")
    thr_h = val_hgb["thr"]
    yhat_t = (p_test_hgb >= thr_h).astype(int)
    test_hgb = dict(
        AUC=float(roc_auc_score(y_test, p_test_hgb)),
        PRAUC=float(average_precision_score(y_test, p_test_hgb)),
        Acc=float(accuracy_score(y_test, yhat_t)),
        P=float(precision_score(y_test, yhat_t)),
        R=float(recall_score(y_test, yhat_t)),
        thr=float(thr_h), n=int(len(y_test))
    )
    print("[TEST|HGB]", test_hgb)

# ------------------------------------------------------------------
# 5) (OPCIONAL) Guardado de predicciones para comparar en p11/p14
# ------------------------------------------------------------------
out_dir = "/content/drive/MyDrive/CognitivaAI/p14_oasis2_images"
pd.DataFrame({
    "patient_id": VAL["patient_id"], "y_true": VAL["y_true"],
    "y_pred_lr": p_val_lr
}).to_csv(f"{out_dir}/val_stack_lr_oas2_oas1.csv", index=False)
pd.DataFrame({
    "patient_id": TEST["patient_id"], "y_true": TEST["y_true"],
    "y_pred_lr": p_test_lr
}).to_csv(f"{out_dir}/test_stack_lr_oas2_oas1.csv", index=False)

if use_hgb:
    pd.DataFrame({
        "patient_id": VAL["patient_id"], "y_true": VAL["y_true"],
        "y_pred_hgb": p_val_hgb
    }).to_csv(f"{out_dir}/val_stack_hgb_oas2_oas1.csv", index=False)
    pd.DataFrame({
        "patient_id": TEST["patient_id"], "y_true": TEST["y_true"],
        "y_pred_hgb": p_test_hgb
    }).to_csv(f"{out_dir}/test_stack_hgb_oas2_oas1.csv", index=False)

print("\n✅ E6-fix completado (con manejo de NaNs).")




=== NaN ratio por columna en VAL ===
oas2_effb3_p14_top7                        0.681159
oas2_effb3_p14_mean                        0.681159
oas2_effb3_mean                            0.681159
oas2_effb3_top7                            0.681159
SwinTiny_mean                              0.318841
SwinTiny_top7                              0.318841
slice_preds_plus_mean                      0.318841
patient_preds_plus_mean                    0.318841
png_preds_d121_trimmed20                   0.318841
convnext_tiny.in12k_ft_in1k_slices_top7    0.318841

=== NaN ratio por columna en TEST ===
oas2_effb3_p14_top7                        0.671429
oas2_effb3_p14_mean                        0.671429
oas2_effb3_mean                            0.671429
oas2_effb3_top7                            0.671429
SwinTiny_mean                              0.328571
SwinTiny_top7                              0.328571
slice_preds_plus_mean                      0.328571
patient_preds_plus_mean                