In [1]:
# A) Montar Drive y copiar a SSD local
from google.colab import drive
drive.mount('/content/drive')

!mkdir -p /content/datasets/OAS2_PROCESSED
!rsync -ah --info=progress2 "/content/drive/MyDrive/CognitivaAI/oas2_data/OAS2_PROCESSED/" "/content/datasets/OAS2_PROCESSED/"

# B) Rutas Colab
from pathlib import Path
BASE = Path("/content/datasets/OAS2_PROCESSED")
TRN  = BASE/"oas2_train_colab_mapped.csv"
VAL  = BASE/"oas2_val_colab_mapped.csv"
TST  = BASE/"oas2_test_colab_mapped.csv"

for p in [TRN,VAL,TST]:
    assert p.exists(), f"No existe {p}"

# C) DataLoader (usa png_path absoluto en SSD local)
import pandas as pd
train_df = pd.read_csv(TRN)
val_df   = pd.read_csv(VAL)
test_df  = pd.read_csv(TST)

# Si tus png_path son rutas locales (ya apuntan a /content/datasets/..), no necesitas reemplazos.
# Si aún apuntaran a 'data/OAS2_PROCESSED', puedes hacer:
for df in [train_df, val_df, test_df]:
    df["png_path"] = df["png_path"].astype(str).str.replace(r"^data/", "/content/datasets/", regex=True)


Mounted at /content/drive
        969.34M 100%    3.09MB/s    0:04:58 (xfr#7345, to-chk=0/7346)


In [5]:
# =========================
# 1) Lectura limpia de mapas y normalización de rutas
# =========================
import os, pandas as pd
from pathlib import Path

BASE = Path("/content/drive/MyDrive/CognitivaAI/p13_oasis2_images")
POSIX_ROOT = "/content/datasets/OAS2_PROCESSED"   # destino real en Colab SSD

def load_and_fix_map(csv_path: Path) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    # normaliza separadores (Windows -> POSIX)
    df["png_path"] = df["png_path"].astype(str).str.replace("\\", "/", regex=False)

    # 2) Si aún hay rutas "data/OAS2_PROCESSED/..." cámbialas a /content/datasets/...
    df["png_path"] = df["png_path"].str.replace(
        r"^data/OAS2_PROCESSED", POSIX_ROOT, regex=True
    )

    # 3) Normaliza y valida existencia
    df["png_path"] = df["png_path"].apply(lambda p: os.path.normpath(p))
    missing = df[~df["png_path"].apply(os.path.exists)]
    if len(missing):
        print(f"⚠️ {csv_path.name}: {len(missing)} rutas no existen. Ejemplo:")
        print(missing["png_path"].head(3).to_string(index=False))
        # si quieres, elimina las faltantes:
        df = df[df["png_path"].apply(os.path.exists)].copy()

    # 4) Tipos y columnas
    if "target" in df.columns:
        df["target"] = df["target"].astype(int)
    return df.reset_index(drop=True)

train_df = load_and_fix_map(BASE/"oas2_train_colab_mapped.csv")
val_df   = load_and_fix_map(BASE/"oas2_val_colab_mapped.csv")
test_df  = load_and_fix_map(BASE/"oas2_test_colab_mapped.csv")

print("✅ Mapas listos:",
      f"train={len(train_df)} val={len(val_df)} test={len(test_df)}")

# =========================
# 2) Dataset robusto (normaliza rutas en __getitem__)
# =========================
from PIL import Image
from torch.utils.data import Dataset, DataLoader

class OAS2PNG(Dataset):
    def __init__(self, frame: pd.DataFrame, transform=None):
        self.df = frame.copy()
        self.tfm = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        # normalización extra de seguridad:
        p = str(r.png_path).replace("\\", "/")
        p = os.path.normpath(p)
        if not os.path.exists(p):
            raise FileNotFoundError(f"No existe: {p}")
        im = Image.open(p).convert("RGB")
        if self.tfm: im = self.tfm(im)
        return im, int(r.target)

# =========================
# 3) Reconstruye los DataLoaders DESPUÉS de este fix
# =========================
import torch
from torchvision import transforms as T

tfm_train = T.Compose([
    T.Resize((224,224)), T.ToTensor()
])
tfm_eval  = T.Compose([
    T.Resize((224,224)), T.ToTensor()
])

# Nota: para depurar I/O pon num_workers=0 y luego súbelo a 2–4 cuando vaya bien.
train_loader = DataLoader(OAS2PNG(train_df, tfm_train), batch_size=64, shuffle=True,  num_workers=0, pin_memory=True)
val_loader   = DataLoader(OAS2PNG(val_df,   tfm_eval),  batch_size=64, shuffle=False, num_workers=0, pin_memory=True)
test_loader  = DataLoader(OAS2PNG(test_df,  tfm_eval),  batch_size=64, shuffle=False, num_workers=0, pin_memory=True)

# Sanity check: iterar un batch de cada split
xb, yb = next(iter(train_loader))
print("OK train:", xb.shape, yb.shape, torch.unique(yb, return_counts=True))
xb, yb = next(iter(val_loader))
print("OK val  :", xb.shape, yb.shape, torch.unique(yb, return_counts=True))
xb, yb = next(iter(test_loader))
print("OK test :", xb.shape, yb.shape, torch.unique(yb, return_counts=True))




✅ Mapas listos: train=180 val=40 test=60
OK train: torch.Size([64, 3, 224, 224]) torch.Size([64]) (tensor([1]), tensor([64]))
OK val  : torch.Size([40, 3, 224, 224]) torch.Size([40]) (tensor([1]), tensor([40]))
OK test : torch.Size([60, 3, 224, 224]) torch.Size([60]) (tensor([1]), tensor([60]))


In [6]:
import timm, torch, torch.nn as nn, torch.optim as optim
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = timm.create_model("efficientnet_b3", pretrained=True, num_classes=1)
model = model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)

def run_epoch(loader, train=True):
    model.train(train)
    loss_sum, n = 0.0, 0
    for xb,yb in tqdm(loader, disable=True):
        xb, yb = xb.to(device), yb.float().to(device)
        with torch.set_grad_enabled(train):
            logits = model(xb).squeeze(1)
            loss = criterion(logits, yb)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        loss_sum += loss.item()*xb.size(0)
        n += xb.size(0)
    return loss_sum/n

best = (1e9, None)
for epoch in range(1, 11):
    trL = run_epoch(train_loader, True)
    vaL = run_epoch(val_loader,   False)
    print(f"Epoch {epoch} | train_loss={trL:.4f} | val_loss={vaL:.4f}")
    if vaL < best[0]:
        best = (vaL, epoch)
        torch.save(model.state_dict(), "/content/effb3_oas2_best.pth")
        print("✓ Guardado best.")


Epoch 1 | train_loss=2.3336 | val_loss=0.0037
✓ Guardado best.
Epoch 2 | train_loss=0.0808 | val_loss=0.0000
✓ Guardado best.
Epoch 3 | train_loss=0.0000 | val_loss=0.0000
✓ Guardado best.
Epoch 4 | train_loss=0.0000 | val_loss=0.0000
✓ Guardado best.
Epoch 5 | train_loss=0.0000 | val_loss=0.0000
✓ Guardado best.
Epoch 6 | train_loss=0.0000 | val_loss=0.0000
✓ Guardado best.
Epoch 7 | train_loss=0.0000 | val_loss=0.0000
Epoch 8 | train_loss=0.0000 | val_loss=0.0000
Epoch 9 | train_loss=0.0000 | val_loss=0.0000
✓ Guardado best.
Epoch 10 | train_loss=0.0000 | val_loss=0.0000
✓ Guardado best.


In [7]:
import pandas as pd
from pathlib import Path

BASE = Path("/content/drive/MyDrive/CognitivaAI/p13_oasis2_images")
train = pd.read_csv(BASE/"oas2_train_colab_mapped.csv")
val   = pd.read_csv(BASE/"oas2_val_colab_mapped.csv")
test  = pd.read_csv(BASE/"oas2_test_colab_mapped.csv")

for name, df in [("TRAIN", train), ("VAL", val), ("TEST", test)]:
    print(name, df['patient_id'].nunique(), "pacientes", "| filas:", len(df))

def overlap(a,b, key):
    A=set(a[key]); B=set(b[key]); return sorted(A&B)
print("Overlap pacientes TR-VAL:", overlap(train,val,'patient_id'))
print("Overlap pacientes TR-TEST:", overlap(train,test,'patient_id'))
print("Overlap pacientes VAL-TEST:", overlap(val,test,'patient_id'))


TRAIN 9 pacientes | filas: 180
VAL 2 pacientes | filas: 40
TEST 3 pacientes | filas: 60
Overlap pacientes TR-VAL: []
Overlap pacientes TR-TEST: []
Overlap pacientes VAL-TEST: []


In [8]:
def summarize(df):
    by_p = df.groupby('patient_id')['target'].max()  # etiqueta paciente = OR de sus slices
    print("Pacientes por clase:", by_p.value_counts(dropna=False).to_dict())
    print("Slices por clase:", df['target'].value_counts(dropna=False).to_dict())

print("TRAIN"); summarize(train)
print("\nVAL");  summarize(val)
print("\nTEST"); summarize(test)


TRAIN
Pacientes por clase: {1: 9}
Slices por clase: {1: 180}

VAL
Pacientes por clase: {1: 2}
Slices por clase: {1: 40}

TEST
Pacientes por clase: {1: 3}
Slices por clase: {1: 60}


In [9]:
import os, random
def spot_check(df, n=20):
    xs = df.sample(min(n, len(df)), random_state=0)['png_path'].tolist()
    miss = [x for x in xs if not os.path.exists(x)]
    print(f"Chequé {len(xs)} | missing={len(miss)}")
    if miss: print("Ejemplos faltantes:", miss[:5])
print("TRAIN"); spot_check(train)
print("VAL");   spot_check(val)
print("TEST");  spot_check(test)


TRAIN
Chequé 20 | missing=0
VAL
Chequé 20 | missing=0
TEST
Chequé 20 | missing=0


In [11]:
# ================================
# Verificación OASIS-2: labels vs inventario de slices
# - Comprueba cobertura de labels.csv
# - Lista scans sin etiqueta
# - Repara rutas (Windows -> Linux/Colab) opcionalmente
# - Guarda dataset filtrado (opcional)
# ================================
import os
import sys
import json
import pandas as pd
import numpy as np
from pathlib import Path

# ----------------
# Parámetros
# ----------------
# Archivo de etiquetas OASIS-2 (ya verificado: 150 filas, target ∈ {0,1})
LABELS_CSV = Path("/content/drive/MyDrive/CognitivaAI/oas2_data/oas2_labels.csv")  # <- cambia si lo tienes en otra ruta

# Inventario por-slice OASIS-2 (el que generaste con p4 adaptado a OAS2)
# Suele llamarse 'oas2_slices_dataset.csv' y tener columnas: scan_id, patient_id, png_path, ...
INVENTORY_CSV = Path("data/OAS2_PROCESSED/oas2_slices_dataset.csv")  # local
# En Colab, si lo copiaste a SSD:
COLAB_INVENTORY_CSV = Path("/content/datasets/OAS2_PROCESSED/oas2_slices_dataset.csv")

# Salida (opcional) con sólo filas etiquetadas y rutas reparadas para el entorno actual
SAVE_FILTERED = True
OUT_FILTERED = Path("data/OAS2_PROCESSED/oas2_slices_dataset_labeled.csv")  # local
COLAB_OUT_FILTERED = Path("/content/datasets/OAS2_PROCESSED/oas2_slices_dataset_labeled.csv")

# Si ejecutas en Colab y ya moviste imágenes a /content/datasets/…, activa remapeo:
RUNNING_IN_COLAB = Path("/content").exists()
PREFERRED_ROOT = Path("/content/datasets/OAS2_PROCESSED") if RUNNING_IN_COLAB else Path("data/OAS2_PROCESSED")

# ----------------
# Utilidades
# ----------------
def normalize_scan_id(s: str) -> str:
    """Hace robusta la igualdad de scan_id (espacios, etc.)."""
    return str(s).strip()

def fix_path_str(p: str) -> str:
    """Normaliza separadores y quita comillas extra si hubiera."""
    if pd.isna(p):
        return p
    s = str(p).strip().strip('"').strip("'")
    # Windows -> POSIX separadores
    s = s.replace("\\", "/")
    return s

def try_remap_png_path(p: str, preferred_root: Path) -> str:
    """
    Si 'p' no existe, intenta remapearlo a 'preferred_root' conservando el nombre de archivo.
    """
    if pd.isna(p):
        return p
    p2 = Path(p)
    if p2.exists():
        return str(p2)
    # si no existe, reensambla como preferred_root / filename
    fname = Path(p).name
    cand = preferred_root / fname
    return str(cand)

def check_exists_fraction(paths, sample=50):
    """Chequea una muestra de rutas para estimar existencia."""
    paths = [Path(x) for x in paths[:sample] if isinstance(x, str)]
    if not paths:
        return 0.0, 0, 0
    ok = sum(p.exists() for p in paths)
    return ok / len(paths), ok, len(paths)

# ----------------
# Carga
# ----------------
assert LABELS_CSV.exists(), f"No encuentro LABELS_CSV: {LABELS_CSV}"
labels = pd.read_csv(LABELS_CSV)
assert {"patient_id", "scan_id", "target"}.issubset(labels.columns), f"labels.csv debe tener columnas: patient_id, scan_id, target. Tiene: {list(labels.columns)}"

# El inventario puede estar en local o en Colab; elige el que exista
if COLAB_INVENTORY_CSV.exists():
    inventory_path = COLAB_INVENTORY_CSV
elif INVENTORY_CSV.exists():
    inventory_path = INVENTORY_CSV
else:
    raise FileNotFoundError("No encuentro inventario ni en local ni en Colab.")

inv = pd.read_csv(inventory_path)
assert {"scan_id", "patient_id", "png_path"}.issubset(inv.columns), f"Inventario debe incluir scan_id, patient_id, png_path. Tiene: {list(inv.columns)}"

# Normalizaciones
labels["scan_id"] = labels["scan_id"].map(normalize_scan_id)
labels["patient_id"] = labels["patient_id"].astype(str).str.strip()
labels["target"] = labels["target"].astype(int)

inv["scan_id"] = inv["scan_id"].map(normalize_scan_id)
inv["patient_id"] = inv["patient_id"].astype(str).str.strip()
inv["png_path"] = inv["png_path"].map(fix_path_str)

# ----------------
# Cobertura y resumen
# ----------------
scan_labels = set(labels["scan_id"].unique())
scan_inventory = set(inv["scan_id"].unique())

covered = scan_inventory & scan_labels
missing = sorted(scan_inventory - scan_labels)
coverage = len(covered) / max(1, len(scan_inventory))

print("=== Cobertura de labels sobre inventario OASIS-2 ===")
print(f"- Scans en inventario: {len(scan_inventory)}")
print(f"- Scans con label:     {len(scan_labels)}")
print(f"- Scans cubiertos:     {len(covered)}  ({coverage:.1%})")
print(f"- Scans SIN label:     {len(missing)}")

if missing:
    print("  Ejemplos (hasta 15):", missing[:15])

print("\n=== Distribución target (labels) ===")
print(labels["target"].value_counts(dropna=False).rename("count"))

# ----------------
# Join labels → inventario
# ----------------
merged = inv.merge(labels[["scan_id", "target"]], on="scan_id", how="left", validate="m:1")
n_total = len(merged)
n_missing_target = merged["target"].isna().sum()
print(f"\nTras merge: filas={n_total} | target NaN={n_missing_target}")

# ----------------
# (Opcional) Filtrar a sólo filas etiquetadas y remapear rutas a SSD/Carpeta preferida
# ----------------
labeled = merged[merged["target"].isin([0, 1])].copy()

# Remapeo de rutas (si la ruta original no existe, construye <PREFERRED_ROOT>/<filename>)
labeled["png_path"] = labeled["png_path"].map(lambda s: try_remap_png_path(s, PREFERRED_ROOT))

# Verificación rápida de existencias (muestra)
frac, ok, tot = check_exists_fraction(labeled["png_path"].tolist(), sample=50)
print(f"\nChequeo de existencia de rutas (muestra 50): {ok}/{tot} ({frac:.0%})")

# (Opcional) Guardar dataset filtrado
if SAVE_FILTERED:
    out_path = COLAB_OUT_FILTERED if RUNNING_IN_COLAB else OUT_FILTERED
    out_path.parent.mkdir(parents=True, exist_ok=True)
    labeled.to_csv(out_path, index=False)
    print(f"\n✅ Guardado dataset filtrado en: {out_path} | filas={len(labeled)}")

# ----------------
# (Opcional) Sanidad por paciente
# ----------------
by_scan = labeled.groupby(["scan_id", "target"]).size().reset_index(name="n_slices")
by_patient = labeled.groupby(["patient_id", "target"]).size().reset_index(name="n_slices")

print("\n=== Resumen por scan_id (muestra 5) ===")
print(by_scan.head(5).to_string(index=False))

print("\n=== Resumen por patient_id (muestra 5) ===")
print(by_patient.head(5).to_string(index=False))

# Reglas simples de sanidad
assert (labeled["target"].isin([0,1]).all()), "Hay targets no binarios tras filtrado."
assert labeled["png_path"].notna().all(), "Hay png_path vacíos tras filtrado."

print("\n✅ Verificación completada.")


=== Cobertura de labels sobre inventario OASIS-2 ===
- Scans en inventario: 150
- Scans con label:     150
- Scans cubiertos:     150  (100.0%)
- Scans SIN label:     0

=== Distribución target (labels) ===
target
1    78
0    72
Name: count, dtype: int64


KeyError: 'target'

In [14]:
# ==== Parche robusto para columna 'target' en OASIS-2 ====
import pandas as pd
from pathlib import Path

BASE = Path("/content/drive/MyDrive/CognitivaAI")
LABELS_CSV = BASE / "oas2_data" / "oas2_labels.csv"
INV_CSV    = BASE / "oas2_data" / "OAS2_PROCESSED" / "oas2_slices_inventory.csv"
DATASET_CSV= BASE / "oas2_data" / "OAS2_PROCESSED" / "oas2_slices_dataset.csv"  # si lo quieres regenerar

def coalesce_target(df, prefer="target"):
    """
    Asegura una columna 'target' única tomando la primera disponible de:
    target, target_x, target_y, label, y_true.
    """
    candidates = ["target", "target_x", "target_y", "label", "y_true"]
    if prefer in df.columns:
        # ya existe tal cual
        return df
    # crea 'target' combinando
    tgt = None
    for c in candidates:
        if c in df.columns:
            if tgt is None:
                tgt = df[c].copy()
            else:
                tgt = tgt.fillna(df[c])
    if tgt is not None:
        df["target"] = tgt
    # limpia sufijos si existían
    drop_cols = [c for c in ["target_x","target_y","label","y_true"] if c in df.columns]
    if drop_cols:
        df = df.drop(columns=drop_cols)
    return df

# 1) Carga labels "oficiales" (150 filas, target ∈ {0,1})
labels = pd.read_csv(LABELS_CSV, dtype=str)
# normaliza tipos
labels["target"] = labels["target"].astype(float).astype(int)
labels["patient_id"] = labels["patient_id"].astype(str).str.strip()
labels["scan_id"]    = labels["scan_id"].astype(str).str.strip()

# 2) Versión 1-sesión por paciente (conservando 'target')
#    -> si ya tenías tu propia función, lo clave es que no te dejes 'target'
labels_1sess = (
    labels.sort_values(["patient_id","scan_id"])
          .groupby("patient_id", as_index=False)
          .apply(lambda g: g.iloc[0])        # ó tu pick_one_session(g)
          .reset_index(drop=True)
)[["patient_id","scan_id","target"]].copy()

# 3) Carga inventario (todas las imágenes generadas, sin target)
inv = pd.read_csv(INV_CSV)
# por si hay separadores de ruta extraños, normaliza el scan_id (debería venir bien)
inv["scan_id"] = inv["scan_id"].astype(str).str.strip()
inv["patient_id"] = inv["patient_id"].astype(str).str.strip()

# 4) Cobertura de labels vs inventario (por scan_id)
inv_scans = set(inv["scan_id"].unique())
lab_scans = set(labels_1sess["scan_id"].unique())
missing   = sorted(inv_scans - lab_scans)
print("=== Cobertura de labels sobre inventario OASIS-2 ===")
print(f"- Scans en inventario: {len(inv_scans)}")
print(f"- Scans con label:     {len(lab_scans)}")
print(f"- Scans cubiertos:     {len(inv_scans & lab_scans)}  ({100*len(inv_scans & lab_scans)/max(1,len(inv_scans)):.1f}%)")
print(f"- Scans SIN label:     {len(missing)}")
if missing:
    print("  * Ejemplos sin label:", missing[:10])

print("\n=== Distribución target (labels) ===")
print(labels_1sess["target"].value_counts())

# 5) Merge inventario + labels (many images -> one scan)
dataset = inv.merge(
    labels_1sess[["scan_id","target"]],
    on="scan_id", how="left", validate="many_to_one"
)

# 6) Asegura columna 'target' estándar tras el merge
dataset = coalesce_target(dataset, prefer="target")

# 7) Comprobaciones y guardado opcional del dataset enriquecido
n_nan = int(dataset["target"].isna().sum())
print(f"\nDataset enriquecido: filas={len(dataset)} | target NaN={n_nan}")
if n_nan:
    # si quedaran NaN, los imprimimos por si hay algún scan raro
    print("Ejemplos con target NaN:")
    display(dataset[dataset["target"].isna()].head())

# Guardar (si quieres actualizar el dataset por-slice con target)
dataset.to_csv(DATASET_CSV, index=False)
print(f"✅ Dataset por-slice con target guardado en: {DATASET_CSV}")

# 8) Estadísticas rápidas por clase en el dataset
print("\n=== Distribución target (dataset) ===")
print(dataset["target"].astype(int).value_counts())


=== Cobertura de labels sobre inventario OASIS-2 ===
- Scans en inventario: 367
- Scans con label:     150
- Scans cubiertos:     150  (40.9%)
- Scans SIN label:     217
  * Ejemplos sin label: ['OAS2_0001_MR2', 'OAS2_0002_MR1', 'OAS2_0002_MR2', 'OAS2_0004_MR2', 'OAS2_0005_MR2', 'OAS2_0005_MR3', 'OAS2_0007_MR1', 'OAS2_0007_MR3', 'OAS2_0008_MR2', 'OAS2_0009_MR1']

=== Distribución target (labels) ===
target
1    78
0    72
Name: count, dtype: int64

Dataset enriquecido: filas=7340 | target NaN=4340
Ejemplos con target NaN:


  .apply(lambda g: g.iloc[0])        # ó tu pick_one_session(g)


Unnamed: 0,scan_id,patient_id,png_path,source_hdr,has_mask,mask_source,target
20,OAS2_0001_MR2,OAS2_0001,data\OAS2_PROCESSED\OAS2_0001_MR2_slice00.png,data\OAS2_RAW\OAS2_0001_MR2\RAW\mpr-1.nifti.hdr,1,otsu,
21,OAS2_0001_MR2,OAS2_0001,data\OAS2_PROCESSED\OAS2_0001_MR2_slice01.png,data\OAS2_RAW\OAS2_0001_MR2\RAW\mpr-1.nifti.hdr,1,otsu,
22,OAS2_0001_MR2,OAS2_0001,data\OAS2_PROCESSED\OAS2_0001_MR2_slice02.png,data\OAS2_RAW\OAS2_0001_MR2\RAW\mpr-1.nifti.hdr,1,otsu,
23,OAS2_0001_MR2,OAS2_0001,data\OAS2_PROCESSED\OAS2_0001_MR2_slice03.png,data\OAS2_RAW\OAS2_0001_MR2\RAW\mpr-1.nifti.hdr,1,otsu,
24,OAS2_0001_MR2,OAS2_0001,data\OAS2_PROCESSED\OAS2_0001_MR2_slice04.png,data\OAS2_RAW\OAS2_0001_MR2\RAW\mpr-1.nifti.hdr,1,otsu,


✅ Dataset por-slice con target guardado en: /content/drive/MyDrive/CognitivaAI/oas2_data/OAS2_PROCESSED/oas2_slices_dataset.csv

=== Distribución target (dataset) ===


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [17]:
# ==== HOTFIX OASIS-2 en Colab: normalizar rutas, re-hacer dataset y splits (1 sesión/paciente) ====
import os, re, json
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# --- 1) Rutas base
DS_ROOT      = Path("/content/datasets/OAS2_PROCESSED")         # donde rsync copió los PNG
INV_CSV      = DS_ROOT / "oas2_slices_inventory.csv"            # inventario que se generó al exportar
LABELS_CSV   = Path("/content/drive/MyDrive/CognitivaAI/oas2_data/oas2_labels.csv")  # tu labels correcto (150 filas)
OUT_DATASET  = DS_ROOT / "oas2_slices_dataset.csv"              # dataset final por-slice (con target)
SPLIT_DIR    = Path("/content/drive/MyDrive/CognitivaAI/p13_oasis2_images")          # mapas de split

SPLIT_DIR.mkdir(parents=True, exist_ok=True)

assert INV_CSV.exists(), f"No encuentro inventario: {INV_CSV}"
assert LABELS_CSV.exists(), f"No encuentro labels: {LABELS_CSV}"

# --- 2) Cargar inventario y normalizar 'png_path' a POSIX absoluto en /content
inv = pd.read_csv(INV_CSV)
if "png_path" not in inv.columns:
    raise ValueError("Inventario no tiene columna 'png_path'.")

# normaliza separadores y reescribe prefijo data/OAS2_PROCESSED -> /content/datasets/OAS2_PROCESSED
png_path = inv["png_path"].astype(str).str.replace("\\", "/", regex=False)
png_path = png_path.str.replace(r"^data/OAS2_PROCESSED", str(DS_ROOT).rstrip("/"), regex=True)

# si quedara algún relativo suelto, únelos por seguridad
png_path = png_path.apply(lambda p: p if p.startswith("/content/") else str((DS_ROOT / Path(p).name)))

inv["png_path"] = png_path

# --- 3) Cargar labels (150 scans, 0/1 en target), mantener sólo Control/Dementia/Converted->1 (ya lo hiciste)
labels = pd.read_csv(LABELS_CSV)
# columnas esperadas: patient_id, scan_id, target (0/1)
needed = {"patient_id","scan_id","target"}
missing = needed - set(labels.columns)
if missing:
    raise ValueError(f"Faltan columnas en labels: {missing}")

# --- 4) Unir por scan_id y quedarnos con filas cuyo PNG existe
ds = inv.merge(labels[["scan_id","target","patient_id"]], on="scan_id", how="inner", suffixes=("","_lbl"))

exists_mask = ds["png_path"].apply(lambda p: Path(p).exists())
missing = (~exists_mask).sum()
print(f"Dataset tras merge: filas={len(ds)} | missing_files={missing}")
if missing:
    print("Ejemplos faltantes (mostrando 5):")
    print(ds.loc[~exists_mask, ["scan_id","png_path"]].head())

ds = ds.loc[exists_mask].copy()
ds.rename(columns={"patient_id":"patient_id"}, inplace=True)  # ya viene de inventario; la del merge se llama patient_id (ok)
ds = ds[["scan_id","patient_id","png_path","target"]]

# Guardar dataset por-slice con target
ds.to_csv(OUT_DATASET, index=False)
print(f"✅ Guardado dataset por-slice: {OUT_DATASET} | filas={len(ds)}")

# --- 5) Forzar política 1 sesión/paciente ANTES de split
# Preferimos MR1 si existe; si no, la MRx más baja
def _mr_num(s):
    m = re.search(r"_MR(\d+)", s)
    return int(m.group(1)) if m else 999

def pick_one_session(g):
    # g: filas de un paciente (varios scan_id repetidos por 20 slices)
    scans = g["scan_id"].unique().tolist()
    mr1 = [s for s in scans if s.endswith("_MR1")]
    if mr1:
        keep = mr1[0]
    else:
        keep = sorted(scans, key=_mr_num)[0]
    return g[g["scan_id"] == keep]

ds_1sess = (ds
            .groupby("patient_id", as_index=False, group_keys=False)
            .apply(pick_one_session)
            .reset_index(drop=True))

# --- 6) Construir tabla de pacientes (una fila por paciente, con su target)
pt = (ds_1sess[["patient_id","target","scan_id"]]
      .drop_duplicates(["patient_id"])
      .reset_index(drop=True))

print(f"Pacientes etiquetados (tras 1 sesión): {pt.shape}")
print(pt["target"].value_counts())

# sanity: debe haber ambas clases para poder estratificar; si no, lanza aviso
if pt["target"].nunique() < 2:
    raise RuntimeError("Solo hay una clase en OASIS-2 etiquetado tras 1 sesión/paciente. Revisa labels o añade más pacientes/clase.")

# --- 7) Split estratificado por paciente (70/15/15 como ejemplo)
train_pat, tmp_pat = train_test_split(pt, test_size=0.30, random_state=42, stratify=pt["target"])
val_pat,   test_pat= train_test_split(tmp_pat, test_size=0.50, random_state=42, stratify=tmp_pat["target"])

def rows_for(pat_df):
    pset = set(pat_df["patient_id"])
    return ds_1sess[ds_1sess["patient_id"].isin(pset)].copy()

train_df = rows_for(train_pat)
val_df   = rows_for(val_pat)
test_df  = rows_for(test_pat)

# --- 8) Guardar mapas para DataLoaders
def save_map(df, path_csv):
    out = df[["png_path","target","patient_id"]].copy()
    out.rename(columns={"png_path":"png_path", "target":"target", "patient_id":"patient_id"}, inplace=True)
    out.to_csv(path_csv, index=False)
    return out

tr_csv = SPLIT_DIR / "oas2_train_colab_mapped.csv"
va_csv = SPLIT_DIR / "oas2_val_colab_mapped.csv"
te_csv = SPLIT_DIR / "oas2_test_colab_mapped.csv"

save_map(train_df, tr_csv)
save_map(val_df,   va_csv)
save_map(test_df,  te_csv)

print("\n=== Resumen splits (por slice, 1 sesión/paciente) ===")
for name, df, pats in [
    ("TRAIN", train_df, train_pat),
    ("VAL",   val_df,   val_pat),
    ("TEST",  test_df,  test_pat),
]:
    n_p = pats["patient_id"].nunique()
    print(f"{name}: pacientes={n_p} | filas={len(df)} | clases: {pats['target'].value_counts().to_dict()}")

print("\nMapas guardados en:")
print(" -", tr_csv)
print(" -", va_csv)
print(" -", te_csv)



Dataset tras merge: filas=3000 | missing_files=0
✅ Guardado dataset por-slice: /content/datasets/OAS2_PROCESSED/oas2_slices_dataset.csv | filas=3000
Pacientes etiquetados (tras 1 sesión): (150, 3)
target
1    78
0    72
Name: count, dtype: int64

=== Resumen splits (por slice, 1 sesión/paciente) ===
TRAIN: pacientes=105 | filas=2100 | clases: {1: 55, 0: 50}
VAL: pacientes=22 | filas=440 | clases: {0: 11, 1: 11}
TEST: pacientes=23 | filas=460 | clases: {1: 12, 0: 11}

Mapas guardados en:
 - /content/drive/MyDrive/CognitivaAI/p13_oasis2_images/oas2_train_colab_mapped.csv
 - /content/drive/MyDrive/CognitivaAI/p13_oasis2_images/oas2_val_colab_mapped.csv
 - /content/drive/MyDrive/CognitivaAI/p13_oasis2_images/oas2_test_colab_mapped.csv


  .apply(pick_one_session)


In [18]:
# A) Setup
import os, json, math, random, time, gc, csv
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
import torchvision.transforms as T
from sklearn.metrics import roc_auc_score, average_precision_score

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BASE = Path("/content/drive/MyDrive/CognitivaAI")
OUT  = BASE / "p13_oasis2_images"
OUT.mkdir(parents=True, exist_ok=True)

MAP_TRAIN = OUT / "oas2_train_colab_mapped.csv"
MAP_VAL   = OUT / "oas2_val_colab_mapped.csv"
MAP_TEST  = OUT / "oas2_test_colab_mapped.csv"

assert MAP_TRAIN.exists() and MAP_VAL.exists() and MAP_TEST.exists(), "Faltan mapas OAS2."

# Utils
def seed_everything(s=42):
    random.seed(s); np.random.seed(s); torch.manual_seed(s); torch.cuda.manual_seed_all(s)
seed_everything(42)

# Métricas paciente-level (AUC/PR-AUC con guardas por clase única)
def metrics_binary(y_true, y_score, thr=None):
    y_true = np.asarray(y_true).astype(int)
    y_score = np.asarray(y_score).astype(float)

    out = {}
    try:
        out["AUC"] = roc_auc_score(y_true, y_score)
    except Exception:
        out["AUC"] = float("nan")
    try:
        out["PRAUC"] = average_precision_score(y_true, y_score)
    except Exception:
        out["PRAUC"] = float("nan")

    if thr is None:
        # F1-opt approx: barrido 200 umbrales
        thrs = np.linspace(0, 1, 201)
        best = (0, 0, 0)  # (F1, thr, (P,R,Acc))
        for t in thrs:
            yp = (y_score >= t).astype(int)
            tp = (yp & (y_true==1)).sum()
            fp = (yp & (y_true==0)).sum()
            fn = ((1-yp) & (y_true==1)).sum()
            tn = ((1-yp) & (y_true==0)).sum()
            P = tp/(tp+fp+1e-9); R = tp/(tp+fn+1e-9); Acc = (tp+tn)/(len(y_true)+1e-9)
            F1 = 2*P*R/(P+R+1e-9)
            if F1 > best[0]: best = (F1, t, (P,R,Acc))
        thr = best[1]
        P,R,Acc = best[2]
    else:
        yp = (y_score >= thr).astype(int)
        tp = (yp & (y_true==1)).sum()
        fp = (yp & (y_true==0)).sum()
        fn = ((1-yp) & (y_true==1)).sum()
        tn = ((1-yp) & (y_true==0)).sum()
        P = tp/(tp+fp+1e-9); R = tp/(tp+fn+1e-9); Acc = (tp+tn)/(len(y_true)+1e-9)

    out.update(dict(Acc=Acc, P=P, R=R, thr=float(thr), n=int(len(y_true))))
    return out

# Pooling a paciente
def patient_pool(df_sl):
    g = df_sl.groupby("patient_id")["y_score"]
    mean = g.mean()
    trimmed20 = g.apply(lambda s: s.sort_values().iloc[int(len(s)*.1):int(len(s)*.9)].mean() if len(s)>=10 else s.mean())
    top7 = g.apply(lambda s: s.sort_values(ascending=False).head(7).mean())
    p2 = g.apply(lambda s: (np.mean(np.clip(s,0,1)**2))**0.5)
    y = df_sl.groupby("patient_id")["y_true"].first()
    out = pd.DataFrame({"patient_id": y.index, "y_true": y.values,
                        "mean": mean.reindex(y.index).values,
                        "trimmed20": trimmed20.reindex(y.index).values,
                        "top7": top7.reindex(y.index).values,
                        "pmean_2": p2.reindex(y.index).values})
    return out

print("✅ Setup OK | Device:", DEVICE)


✅ Setup OK | Device: cuda


In [19]:
# B) Dataset y transforms
class OAS2PNG(Dataset):
    def __init__(self, df, tfm):
        self.df = df.reset_index(drop=True).copy()
        self.tfm = tfm
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        r = self.df.iloc[i]
        im = Image.open(r.png_path).convert("RGB")
        if self.tfm: im = self.tfm(im)
        y = int(r.target)
        return im, y

tfm_train = T.Compose([
    T.Resize((256,256)),
    T.RandomResizedCrop(224, scale=(0.85, 1.0)),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize([0.5,0.5,0.5],[0.5,0.5,0.5]),
])
tfm_eval = T.Compose([
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize([0.5,0.5,0.5],[0.5,0.5,0.5]),
])

train_df = pd.read_csv(MAP_TRAIN)
val_df   = pd.read_csv(MAP_VAL)
test_df  = pd.read_csv(MAP_TEST)

for name, df in [("TRAIN",train_df),("VAL",val_df),("TEST",test_df)]:
    miss = (~df["png_path"].apply(lambda p: Path(p).exists())).sum()
    print(f"{name}: filas={len(df)} | missing_files={miss}")
    assert miss==0, f"{name} aún tiene rutas no existentes."

train_loader = DataLoader(OAS2PNG(train_df, tfm_train), batch_size=64, shuffle=True,  num_workers=2, pin_memory=True)
val_loader   = DataLoader(OAS2PNG(val_df,   tfm_eval),  batch_size=64, shuffle=False, num_workers=2, pin_memory=True)
test_loader  = DataLoader(OAS2PNG(test_df,  tfm_eval),  batch_size=64, shuffle=False, num_workers=2, pin_memory=True)

xb, yb = next(iter(train_loader))
print("Batch check:", xb.shape, yb.shape, yb.unique(return_counts=True))


TRAIN: filas=2100 | missing_files=0
VAL: filas=440 | missing_files=0
TEST: filas=460 | missing_files=0
Batch check: torch.Size([64, 3, 224, 224]) torch.Size([64]) (tensor([0, 1]), tensor([28, 36]))


In [20]:
# C) Modelo + entrenamiento
model = timm.create_model("efficientnet_b3", pretrained=True, num_classes=1)
model = model.to(DEVICE)

opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
loss_fn = nn.BCEWithLogitsLoss()

def run_epoch(loader, train=True):
    model.train(train)
    tot, n = 0.0, 0
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.float().to(DEVICE)
        with torch.set_grad_enabled(train):
            logits = model(xb).flatten()
            loss = loss_fn(logits, yb)
        if train:
            opt.zero_grad(set_to_none=True)
            loss.backward()
            opt.step()
        tot += float(loss)*len(xb); n += len(xb)
    return tot/max(n,1)

best = (1e9, None)
for epoch in range(1, 11):
    trL = run_epoch(train_loader, True)
    vaL = run_epoch(val_loader,   False)
    print(f"Epoch {epoch} | train_loss={trL:.4f} | val_loss={vaL:.4f}")
    if vaL < best[0]:
        best = (vaL, f"p13_effb3_oas2_best.pth")
        torch.save(model.state_dict(), OUT / best[1])
        print("✓ Guardado best.")


Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  tot += float(loss)*len(xb); n += len(xb)


Epoch 1 | train_loss=1.5634 | val_loss=1.7870
✓ Guardado best.
Epoch 2 | train_loss=0.8380 | val_loss=1.6327
✓ Guardado best.
Epoch 3 | train_loss=0.5681 | val_loss=1.0657
✓ Guardado best.
Epoch 4 | train_loss=0.3889 | val_loss=0.9876
✓ Guardado best.
Epoch 5 | train_loss=0.2782 | val_loss=0.9404
✓ Guardado best.
Epoch 6 | train_loss=0.2231 | val_loss=1.2317
Epoch 7 | train_loss=0.1786 | val_loss=1.2734
Epoch 8 | train_loss=0.1153 | val_loss=1.5895
Epoch 9 | train_loss=0.1216 | val_loss=1.3243
Epoch 10 | train_loss=0.0792 | val_loss=1.3142


In [21]:
# D) Inferencia por-slice
def infer_df(df, loader, model_path):
    m = timm.create_model("efficientnet_b3", pretrained=False, num_classes=1).to(DEVICE)
    m.load_state_dict(torch.load(model_path, map_location=DEVICE))
    m.eval()
    scores = []
    with torch.no_grad():
        for xb, _ in tqdm(loader, desc="Infer"):
            xb = xb.to(DEVICE)
            logits = m(xb).flatten()
            prob = torch.sigmoid(logits).detach().cpu().numpy()
            scores.append(prob)
    scores = np.concatenate(scores)
    out = df.copy()
    out["y_true"] = df["target"].astype(int).values
    out["y_score"] = scores
    return out[["patient_id","png_path","y_true","y_score"]]

best_path = OUT / "p13_effb3_oas2_best.pth"
val_sl  = infer_df(val_df,  val_loader,  best_path)
test_sl = infer_df(test_df, test_loader, best_path)

val_sl.to_csv(OUT/"val_png_preds_oas2_effb3.csv", index=False)
test_sl.to_csv(OUT/"test_png_preds_oas2_effb3.csv", index=False)

print("VAL_SL", val_sl.shape, "TEST_SL", test_sl.shape)
val_sl.head(3)


Infer:   0%|          | 0/7 [00:00<?, ?it/s]

Infer:   0%|          | 0/8 [00:00<?, ?it/s]

VAL_SL (440, 4) TEST_SL (460, 4)


Unnamed: 0,patient_id,png_path,y_true,y_score
0,OAS2_0002,/content/datasets/OAS2_PROCESSED/OAS2_0002_MR3...,1,0.969628
1,OAS2_0002,/content/datasets/OAS2_PROCESSED/OAS2_0002_MR3...,1,0.99779
2,OAS2_0002,/content/datasets/OAS2_PROCESSED/OAS2_0002_MR3...,1,0.967749


In [22]:
# E) Paciente-level + métricas
val_pt  = patient_pool(val_sl)
test_pt = patient_pool(test_sl)
print("VAL_PT", val_pt.shape, "TEST_PT", test_pt.shape)
display(val_pt.head(2))

def eval_by_variant(df_pt, name):
    for k in ["mean","trimmed20","top7","pmean_2"]:
        m = metrics_binary(df_pt["y_true"].values, df_pt[k].values)
        print(f"[{name}:{k}] {m}")

eval_by_variant(val_pt,  "VAL")
eval_by_variant(test_pt, "TEST")

# Guardados (para integrarlos en catálogo/ensembles si quieres)
val_pt.to_csv(OUT/"val_patient_preds_oas2_effb3.csv", index=False)
test_pt.to_csv(OUT/"test_patient_preds_oas2_effb3.csv", index=False)

summary = {
    "variant": "EffB3_OAS2",
    "VAL_mean": metrics_binary(val_pt["y_true"],  val_pt["mean"]),
    "TEST_mean":metrics_binary(test_pt["y_true"], test_pt["mean"]),
    "VAL_trimmed20": metrics_binary(val_pt["y_true"],  val_pt["trimmed20"]),
    "TEST_trimmed20":metrics_binary(test_pt["y_true"], test_pt["trimmed20"]),
    "VAL_top7": metrics_binary(val_pt["y_true"],  val_pt["top7"]),
    "TEST_top7":metrics_binary(test_pt["y_true"], test_pt["top7"]),
    "VAL_p2": metrics_binary(val_pt["y_true"],  val_pt["pmean_2"]),
    "TEST_p2":metrics_binary(test_pt["y_true"], test_pt["pmean_2"]),
}
with open(OUT/"p13_patient_eval_summary.json","w") as f:
    json.dump(summary, f, indent=2)
print("💾 Resumen guardado:", OUT/"p13_patient_eval_summary.json")


VAL_PT (22, 6) TEST_PT (23, 6)


Unnamed: 0,patient_id,y_true,mean,trimmed20,top7,pmean_2
0,OAS2_0002,1,0.682128,0.726299,0.988161,0.770463
1,OAS2_0004,0,0.638993,0.673595,0.984751,0.743807


[VAL:mean] {'AUC': np.float64(0.9008264462809917), 'PRAUC': np.float64(0.9285204991087346), 'Acc': np.float64(0.8636363635971074), 'P': np.float64(0.89999999991), 'R': np.float64(0.8181818181074381), 'thr': 0.655, 'n': 22}
[VAL:trimmed20] {'AUC': np.float64(0.9008264462809917), 'PRAUC': np.float64(0.9285204991087346), 'Acc': np.float64(0.8636363635971074), 'P': np.float64(0.89999999991), 'R': np.float64(0.8181818181074381), 'thr': 0.6900000000000001, 'n': 22}
[VAL:top7] {'AUC': np.float64(0.9090909090909091), 'PRAUC': np.float64(0.9321969696969699), 'Acc': np.float64(0.8636363635971074), 'P': np.float64(0.89999999991), 'R': np.float64(0.8181818181074381), 'thr': 0.985, 'n': 22}
[VAL:pmean_2] {'AUC': np.float64(0.9090909090909091), 'PRAUC': np.float64(0.9321969696969699), 'Acc': np.float64(0.8636363635971074), 'P': np.float64(0.89999999991), 'R': np.float64(0.8181818181074381), 'thr': 0.745, 'n': 22}
[TEST:mean] {'AUC': np.float64(0.7651515151515151), 'PRAUC': np.float64(0.7881059028117

In [23]:
# F) Normalización mínima (ya sale como y_score, y_true, patient_id)
# Esta celda es un recordatorio de formato correcto (no cambia nada si ya viene OK).
def norm_cols(path):
    df = pd.read_csv(path)
    ren = {}
    if "sigmoid(logit)" in df.columns: ren["sigmoid(logit)"]="y_score"
    if "sigmoid(logits)" in df.columns: ren["sigmoid(logits)"]="y_score"
    if "pred" in df.columns: ren["pred"]="y_score"
    if "target" in df.columns and "y_true" not in df.columns: ren["target"]="y_true"
    if ren:
        df = df.rename(columns=ren)
        df.to_csv(path, index=False)
        print("Normalizado:", path.name, "->", list(df.columns))
    else:
        print("OK:", path.name, "->", list(df.columns))

norm_cols(OUT/"val_png_preds_oas2_effb3.csv")
norm_cols(OUT/"test_png_preds_oas2_effb3.csv")


OK: val_png_preds_oas2_effb3.csv -> ['patient_id', 'png_path', 'y_true', 'y_score']
OK: test_png_preds_oas2_effb3.csv -> ['patient_id', 'png_path', 'y_true', 'y_score']


In [26]:
# === Gfix v2: lector robusto para score y logits, y rebuild de features ===
import pandas as pd, numpy as np, json, os
from pathlib import Path

CATALOG_JSON = "/content/drive/MyDrive/CognitivaAI/p11_alt_backbones/p11_backbone_catalog.json"
P11 = Path("/content/drive/MyDrive/CognitivaAI/p11_alt_backbones")

def safe_sigmoid(z):
    z = np.clip(z, -50, 50)
    return 1.0/(1.0+np.exp(-z))

# Candidatos de columnas
ID_CANDS    = ["patient_id","scan_id","subject_id","id"]
Y_CANDS     = ["y_true","target","label","y","gt"]
SCORE_CANDS = [
    # primero columnas que ya son probas
    "y_score","sigmoid(logit)","sigmoid(logits)","pred","prob","proba","score",
    # luego logits crudos
    "logit","logits",
]

def detect_col(cands, cols, must=False):
    for c in cands:
        if c in cols:
            return c
    if must:
        raise KeyError(f"No se pudo detectar ninguna entre {cands} en {list(cols)}")
    return None

def read_preds(path):
    df = pd.read_csv(path)
    cols = list(df.columns)

    id_col = detect_col(ID_CANDS, cols, must=True)
    y_col  = detect_col(Y_CANDS,  cols, must=True)
    s_col  = detect_col(SCORE_CANDS, cols, must=True)

    # Normalización de score
    if s_col in ("logit","logits"):
        df["y_score"] = safe_sigmoid(df[s_col].astype(float))
    else:
        df["y_score"] = df[s_col].astype(float)

    # Normalizar nombres id / y
    if id_col != "patient_id":
        df = df.rename(columns={id_col:"patient_id"})
    if y_col != "y_true":
        df = df.rename(columns={y_col:"y_true"})

    # Forzar tipos
    df["patient_id"] = df["patient_id"].astype(str)
    df["y_true"] = df["y_true"].astype(int)
    return df[["patient_id","y_true","y_score"]]

def patient_pool(df: pd.DataFrame) -> pd.DataFrame:
    g = df.groupby("patient_id")["y_score"]
    def trimmed20(s):
        if len(s) < 10:  # fallback si hay pocas slices
            return s.mean()
        s2 = s.sort_values()
        lo = int(len(s2)*0.10); hi = int(len(s2)*0.90)
        return s2.iloc[lo:hi].mean()
    out = pd.DataFrame({
        "patient_id": g.mean().index,
        "mean":       g.mean().values,
        "trimmed20":  g.apply(trimmed20).values,
        "top7":       g.apply(lambda s: s.sort_values(ascending=False).head(7).mean()).values,
        "pmean_2":    g.apply(lambda s: (np.mean(np.clip(s,0,1)**2))**0.5).values,
    })
    return out

# --- Rebuild de features VAL/TEST desde catálogo ---
with open(CATALOG_JSON, "r") as f:
    catalog = json.load(f)

def build_features(cat: dict) -> tuple[pd.DataFrame, pd.DataFrame]:
    features_val = None
    features_test= None

    for tag, paths in cat.items():
        val_path = paths["VAL"]
        test_path= paths["TEST"]
        if not os.path.exists(val_path):
            raise FileNotFoundError(f"[{tag}] No existe VAL: {val_path}")
        if not os.path.exists(test_path):
            raise FileNotFoundError(f"[{tag}] No existe TEST: {test_path}")

        v = read_preds(val_path)
        t = read_preds(test_path)

        # Pool a paciente (si ya vienen a paciente, simplemente habrá 1 fila por paciente)
        v_pt = patient_pool(v).rename(columns={
            "mean":      f"{tag}_mean",
            "trimmed20": f"{tag}_trimmed20",
            "top7":      f"{tag}_top7",
            "pmean_2":   f"{tag}_p2",
        })
        t_pt = patient_pool(t).rename(columns={
            "mean":      f"{tag}_mean",
            "trimmed20": f"{tag}_trimmed20",
            "top7":      f"{tag}_top7",
            "pmean_2":   f"{tag}_p2",
        })

        # Recuperar y_true por paciente desde v/t originales
        yv = v[["patient_id","y_true"]].drop_duplicates()
        yt = t[["patient_id","y_true"]].drop_duplicates()
        v_pt = yv.merge(v_pt, on="patient_id", how="left")
        t_pt = yt.merge(t_pt, on="patient_id", how="left")

        if features_val is None:
            features_val = v_pt
            features_test= t_pt
        else:
            features_val = features_val.merge(v_pt, on=["patient_id","y_true"], how="outer")
            features_test= features_test.merge(t_pt, on=["patient_id","y_true"], how="outer")

        print(f"✅ {tag}: VAL {v.shape}->{v_pt.shape} | TEST {t.shape}->{t_pt.shape} (cols score detectadas OK)")

    return features_val, features_test

VAL, TEST = build_features(catalog)
VAL.to_csv(P11/"val_patient_features_backbones.csv", index=False)
TEST.to_csv(P11/"test_patient_features_backbones.csv", index=False)
print("💾 Guardado:",
      str(P11/"val_patient_features_backbones.csv"),
      "|",
      str(P11/"test_patient_features_backbones.csv"))




✅ SwinTiny: VAL (940, 3)->(47, 6) | TEST (940, 3)->(47, 6) (cols score detectadas OK)
✅ convnext_tiny.in12k_ft_in1k_slices: VAL (940, 3)->(47, 6) | TEST (940, 3)->(47, 6) (cols score detectadas OK)
✅ png_preds_d121: VAL (940, 3)->(47, 6) | TEST (940, 3)->(47, 6) (cols score detectadas OK)
✅ patient_preds: VAL (10, 3)->(10, 6) | TEST (47, 3)->(47, 6) (cols score detectadas OK)
✅ patient_preds_ensemble: VAL (10, 3)->(10, 6) | TEST (47, 3)->(47, 6) (cols score detectadas OK)
✅ patient_preds_plus: VAL (47, 3)->(47, 6) | TEST (47, 3)->(47, 6) (cols score detectadas OK)
✅ png_preds: VAL (940, 3)->(47, 6) | TEST (940, 3)->(47, 6) (cols score detectadas OK)
✅ slice_preds_plus: VAL (940, 3)->(47, 6) | TEST (940, 3)->(47, 6) (cols score detectadas OK)
✅ slice_preds_seedENS: VAL (940, 3)->(47, 6) | TEST (940, 3)->(47, 6) (cols score detectadas OK)
✅ slices_preds: VAL (200, 3)->(10, 6) | TEST (940, 3)->(47, 6) (cols score detectadas OK)
✅ slice_preds: VAL (1068, 3)->(47, 6) | TEST (1068, 3)->(47, 