In [5]:
import pandas as pd
from pathlib import Path

# Ruta al Excel original
EXCEL_PATH = Path("data\oasis_longitudinal_demographics-8d83e569fa2e2d30.xlsx")

# Lee Excel y estandariza columnas clave
df = pd.read_excel(EXCEL_PATH)

# Normaliza nombres esperados: ajusta si tu Excel difiere
# Columnas típicas de OASIS-2:
#   - "Subject ID": OAS2_0001
#   - "MRI ID": OAS2_0001_MR1
#   - "Group": "Control" / "Dementia" / (posible "Converted")
cols = {c.lower().strip(): c for c in df.columns}
def pick(name):
    return cols.get(name, name)

df = df.rename(columns={
    pick("subject id"): "patient_id",
    pick("mri id"): "scan_id",
    pick("group"): "group"
})

# Filtra filas con scan_id válido (evita NaN/filas agregadas)
df = df[df["scan_id"].notna()].copy()
df["patient_id"] = df["patient_id"].astype(str).str.strip()
df["scan_id"]   = df["scan_id"].astype(str).str.strip()
df["group"]     = df["group"].astype(str).str.strip()

# --- OPCIÓN A: convertir "Converted" a 1 (positivo) ---
map_group_to_target = {"Control": 0, "Dementia": 1, "Converted": 1}
# --- OPCIÓN B: excluir "Converted" ---
# df = df[df["group"].isin(["Control","Dementia"])]
# map_group_to_target = {"Control": 0, "Dementia": 1}

df["target"] = df["group"].map(map_group_to_target)

# Si alguna fila queda sin target (p.ej. grupos inesperados), elimínala
df = df[df["target"].isin([0,1])].copy()

# Selección columnas finales (puedes añadir edad, sexo si están)
labels = df[["patient_id", "scan_id", "target"]].drop_duplicates()

OUT = Path("data/oas2_labels.csv")     # <- pon aquí donde quieres dejarlo
labels.to_csv(OUT, index=False)
print(f"Guardado labels en: {OUT} | filas={len(labels)}")
print(labels.head())


Guardado labels en: data\oas2_labels.csv | filas=37
   patient_id        scan_id  target
33  OAS2_0018  OAS2_0018_MR1     1.0
34  OAS2_0018  OAS2_0018_MR3     1.0
35  OAS2_0018  OAS2_0018_MR4     1.0
36  OAS2_0020  OAS2_0020_MR1     1.0
37  OAS2_0020  OAS2_0020_MR2     1.0


  EXCEL_PATH = Path("data\oasis_longitudinal_demographics-8d83e569fa2e2d30.xlsx")


In [1]:
# -*- coding: utf-8 -*-
import re
import pandas as pd
import numpy as np
from pathlib import Path

# -----------------------------
# 1) Rutas de entrada / salida
# -----------------------------
EXCEL_PATH = Path("data\oasis_longitudinal_demographics-8d83e569fa2e2d30.xlsx")

# inventario que generaste al exportar PNGs:
INV_CSV    = Path("data\OAS2_PROCESSED\oas2_slices_inventory.csv")  # tiene scan_id, png_path, ...
OUT_CSV    = Path("data\oas2_labels.csv")  # salida final (scan_id, patient_id, target)

# ---------------------------------------------------
# 2) Helpers de normalización de nombres y valores
# ---------------------------------------------------
def norm_colname(c: str) -> str:
    """normaliza nombres de columna: minúsculas, sin espacios/guiones/underscore repetidos"""
    c = c.strip().lower()
    c = re.sub(r"[\s\-]+", "_", c)
    return c

def pick_col(cols, *cands):
    """
    Devuelve el primer nombre de columna presente en 'cols'
    que matchee alguno de los candidatos (normalizando).
    Ej: pick_col(df.columns, "mri id","mri_id","mr id")
    """
    nmap = {norm_colname(c): c for c in cols}
    for cand in cands:
        nc = norm_colname(cand)
        if nc in nmap:
            return nmap[nc]
    # intentar búsqueda 'fuzzy' por tokens
    for c in cols:
        if all(tok in norm_colname(c) for tok in norm_colname(cands[0]).split("_")):
            return c
    return None

def map_group_to_target_raw(g):
    """Mapea valor textual del grupo a target binario, tolerante a variantes."""
    if g is None or (isinstance(g, float) and np.isnan(g)):
        return np.nan
    s = str(g).strip().lower()
    # variantes típicas en OASIS
    if s in {"nondemented", "non-demented", "control", "cn"}:
        return 0
    if s in {"demented", "ad", "alzheimers"}:
        return 1
    if s in {"converted", "converter", "conversion"}:
        # << opción A: contar como 1 (positivo) >>
        return 1
        # << opción B: si prefieres excluir Converted, devuelve np.nan y luego filtras >>
        # return np.nan
    # valores inesperados -> nan
    return np.nan

# --------------------------------------
# 3) Cargar Excel y normalizar columnas
# --------------------------------------
xl = pd.ExcelFile(EXCEL_PATH)
# toma la primera hoja que contenga "oasis" o "longitudinal" o usa la primera
sheet = None
for s in xl.sheet_names:
    ls = s.lower()
    if ("oasis" in ls) or ("longitud" in ls) or ("demograph" in ls):
        sheet = s
        break
if sheet is None:
    sheet = xl.sheet_names[0]

df = pd.read_excel(EXCEL_PATH, sheet_name=sheet)
orig_cols = df.columns.tolist()

# Renombrar columnas clave de forma robusta
col_patient = pick_col(df.columns, "Subject ID", "subject_id", "subject")
col_mri     = pick_col(df.columns, "MRI ID", "MR ID", "mri_id", "mriid", "mrid")
col_group   = pick_col(df.columns, "Group", "Diagnosis", "diag", "group")

if not col_patient or not col_mri or not col_group:
    raise RuntimeError(
        f"No pude localizar columnas clave.\n"
        f"Encontré: patient={col_patient}, mri={col_mri}, group={col_group}\n"
        f"Columnas disponibles: {orig_cols}"
    )

df = df.rename(columns={
    col_patient: "patient_id",
    col_mri:     "scan_id",
    col_group:   "group",
})

# limpiar strings
for c in ["patient_id", "scan_id", "group"]:
    df[c] = df[c].astype(str).str.strip()

# eliminar filas sin scan_id válido
df = df[df["scan_id"].str.len() > 0].copy()

# ------------------------------------------------
# 4) Generar target binario desde "group" robusto
# ------------------------------------------------
df["target"] = df["group"].apply(map_group_to_target_raw)

# quitar filas sin target (si hubo grupos raros o si decidiste excluir "Converted")
df = df[df["target"].isin([0, 1])].copy()

# normalizar formato de scan_id (OAS2_####_MR\d)
# si tu Excel lo trae perfecto, esto no toca nada.
pat = re.compile(r"^OAS2_\d{4}_MR\d$", re.IGNORECASE)
def fix_scan_id(x):
    s = x.strip()
    s = s.replace(" ", "_")
    s = s.upper()
    # a veces ponen "OAS2-0001 MR1" → convertir a OAS2_0001_MR1
    s = s.replace("-", "_")
    s = re.sub(r"MR[_\s]*", "MR", s)
    if "OAS2_" in s and "_MR" in s:
        return s
    # si no cuadra, lo dejamos tal cual
    return s

df["scan_id"] = df["scan_id"].apply(fix_scan_id)

# ------------------------------------------------------
# 5) Cruzar con inventario real de imágenes por scan_id
# ------------------------------------------------------
inv = pd.read_csv(INV_CSV)
if "scan_id" not in inv.columns:
    # En tu inventario venía seguro 'scan_id'; por si acaso:
    guess = pick_col(inv.columns, "scan_id", "mri id", "mri_id", "mr id")
    if not guess:
        raise RuntimeError(f"Inventario {INV_CSV} no tiene scan_id localizable. Cols={inv.columns}")
    inv = inv.rename(columns={guess: "scan_id"})

inv["scan_id"] = inv["scan_id"].astype(str).str.strip().apply(fix_scan_id)

# nos quedamos sólo con scan_id que EXISTEN en el inventario
labels = df[df["scan_id"].isin(inv["scan_id"].unique())].copy()

# -----------------------------------------
# 6) (Opcional) 1 sesión por paciente
#    Reglas:
#     - si tiene alguna sesión "Demented/Converted" → nos quedamos con la última por MR#
#     - si sólo tiene "Nondemented" → nos quedamos con la primera por MR#
# -----------------------------------------
def mr_number(sid: str) -> int:
    m = re.search(r"_MR(\d+)$", sid)
    return int(m.group(1)) if m else 0

labels["_mr"] = labels["scan_id"].apply(mr_number)

def pick_one_session(g):
    # g: dataframe por patient_id
    g = g.sort_values("_mr")
    pos = g[g["target"] == 1]
    if len(pos):
        return pos.iloc[[-1]]   # la última con target=1
    return g.iloc[[0]]          # sino, la primera

labels_1sess = labels.groupby("patient_id", as_index=False, group_keys=False).apply(pick_one_session).copy()
labels_1sess = labels_1sess.drop(columns=["_mr"])

# -----------------------------------------
# 7) Guardar
# -----------------------------------------
labels_final = labels_1sess[["patient_id", "scan_id", "target"]].drop_duplicates()

OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
labels_final.to_csv(OUT_CSV, index=False)

print(f"Guardado labels en: {OUT_CSV} | filas={len(labels_final)}")
print(labels_final.head(10))

# -------------------------
# 8) Diagnósticos útiles
# -------------------------
print("\nCobertura contra inventario:")
print("  - scan_id en inventario:", inv["scan_id"].nunique())
print("  - scan_id con label final:", labels_final["scan_id"].nunique())
miss = sorted(set(inv["scan_id"].unique()) - set(labels_final["scan_id"].unique()))
print("  - scan_id SIN label (muestras):", miss[:10])

print("\nDistribución de target:")
print(labels_final["target"].value_counts(dropna=False))


  EXCEL_PATH = Path("data\oasis_longitudinal_demographics-8d83e569fa2e2d30.xlsx")
  INV_CSV    = Path("data\OAS2_PROCESSED\oas2_slices_inventory.csv")  # tiene scan_id, png_path, ...
  OUT_CSV    = Path("data\oas2_labels.csv")  # salida final (scan_id, patient_id, target)


Guardado labels en: data\oas2_labels.csv | filas=150
   patient_id        scan_id  target
0   OAS2_0001  OAS2_0001_MR1       0
4   OAS2_0002  OAS2_0002_MR3       1
5   OAS2_0004  OAS2_0004_MR1       0
7   OAS2_0005  OAS2_0005_MR1       0
12  OAS2_0007  OAS2_0007_MR4       1
13  OAS2_0008  OAS2_0008_MR1       0
16  OAS2_0009  OAS2_0009_MR2       1
18  OAS2_0010  OAS2_0010_MR2       1
19  OAS2_0012  OAS2_0012_MR1       0
22  OAS2_0013  OAS2_0013_MR1       0

Cobertura contra inventario:
  - scan_id en inventario: 367
  - scan_id con label final: 150
  - scan_id SIN label (muestras): ['OAS2_0001_MR2', 'OAS2_0002_MR1', 'OAS2_0002_MR2', 'OAS2_0004_MR2', 'OAS2_0005_MR2', 'OAS2_0005_MR3', 'OAS2_0007_MR1', 'OAS2_0007_MR3', 'OAS2_0008_MR2', 'OAS2_0009_MR1']

Distribución de target:
target
1    78
0    72
Name: count, dtype: int64


  labels_1sess = labels.groupby("patient_id", as_index=False, group_keys=False).apply(pick_one_session).copy()


In [None]:
# ========================================
# P4-aplicado-a-OASIS2 (RAW → Slices PNG)
#   - Entrada: /data/OAS2_RAW/OAS2_XXXX_MR{1..4}/RAW/*mpr-*.hdr
#   - Salida : OUT_DIR con PNGs y oas2_slices_inventory.csv
#   - Normalización: máscara FSL (si hay) + z-score + stretch [0.5..99.5]
#   - Cortes: 20 axiales, equiespaciados, evitando extremos (edge_crop=0.08)
# ========================================

import os, sys, json, math, glob, warnings
from pathlib import Path
from typing import Optional, Tuple, List

import numpy as np
import pandas as pd
import nibabel as nib
from tqdm import tqdm
import cv2

# ---- Parámetros principales ----
OAS2_RAW_DIR = Path("data/OAS2_RAW")         # <--- tu raíz OAS2 RAW
OUT_DIR      = Path("data/OAS2_PROCESSED")   # <--- salida (cámbialo si quieres)
OUT_DIR.mkdir(parents=True, exist_ok=True)

NUM_SLICES  = 20
EDGE_CROP   = 0.08
APPLY_CLAHE = False  # en P4 lo dejamos opcional (P2 lo aplicaba siempre)

# Si tienes un CSV con etiquetas por scan_id (p.ej., OAS2_0001_MR1),
# puedes indicarlo aquí para generar un CSV final unido a labels:
LABELS_CSV: Optional[Path] = Path("data/oas2_labels.csv")
LABELS_SCAN_COL = "scan_id"
LABELS_TARGET_COL = "target"

# Dependencias opcionales (nilearn) para remuestrear máscara exacta por affine
USE_NILEARN = True
try:
    from nilearn.image import resample_to_img
except Exception:
    USE_NILEARN = False
    warnings.warn("nilearn no disponible; se hará remuestreo simple por forma (ndi.zoom).")

import scipy.ndimage as ndi

# -----------------------------------------
# Utilidades (idénticas o equivalentes a P4)
# -----------------------------------------
CLAHE = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))

def log(msg: str): print(msg, flush=True)

def best_volume_path_oas2(mr_folder: Path) -> Optional[Path]:
    """
    OAS2: usamos RAW/*mpr-*.hdr (no hay PROCESSED típico OAS1).
    """
    raw_hdrs = sorted((mr_folder / "RAW").glob("*mpr-*.hdr"))
    if raw_hdrs:
        return raw_hdrs[0]
    return None

def squeeze_to_3d(arr: np.ndarray) -> np.ndarray:
    """
    Salida 3D (H,W,D) desde 2D/3D/4D:
    - 2D → añade eje D=1
    - 3D → igual
    - 4D → si última dim=1 → squeeze; si >1 → primer volumen
    """
    if arr.ndim == 2:  return arr[..., None]
    if arr.ndim == 3:  return arr
    if arr.ndim == 4:
        if arr.shape[-1] == 1:
            return np.squeeze(arr, axis=-1)
        else:
            return arr[..., 0]
    raise ValueError(f"Volumen con ndim no soportado: {arr.ndim}")

def load_volume_3d(vol_hdr_path: Path) -> Tuple[np.ndarray, nib.spatialimages.SpatialImage]:
    img = nib.load(str(vol_hdr_path))
    vol = img.get_fdata().astype(np.float32)
    vol = squeeze_to_3d(vol)
    return vol, img

def _load_fsl_mask_any(mr_folder: Path) -> Tuple[Optional[np.ndarray], Optional[nib.Nifti1Image], str]:
    """
    OAS2 puede tener FSL_SEG igual que OAS1. Priorizamos:
      1) *_fseg*.hdr (segmentación tisular → bin>0)
      2) *_mask*.hdr / *_brain*.hdr (binaria)
    """
    seg_dir = mr_folder / "FSL_SEG"
    if not seg_dir.exists():
        return None, None, "none"
    cands_fseg = sorted(seg_dir.glob("*fseg*.hdr"))
    cands_mask = sorted(list(seg_dir.glob("*mask*.hdr")) + list(seg_dir.glob("*brain*.hdr")))
    try:
        if cands_fseg:
            mimg = nib.load(str(cands_fseg[0]))
            mdat = mimg.get_fdata()
            return (mdat > 0).astype(np.uint8), mimg, "fseg>0"
        if cands_mask:
            mimg = nib.load(str(cands_mask[0]))
            mdat = mimg.get_fdata()
            return (mdat > 0.5).astype(np.uint8), mimg, "mask/bin"
    except Exception:
        return None, None, "none"
    return None, None, "none"

def _resample_mask_to_shape(mask: np.ndarray, target_shape: Tuple[int,int,int]) -> np.ndarray:
    if mask.shape == target_shape:
        return (mask > 0.5).astype(np.uint8)
    zx = target_shape[0] / mask.shape[0]
    zy = target_shape[1] / mask.shape[1]
    zz = target_shape[2] / mask.shape[2]
    resized = ndi.zoom(mask.astype(np.uint8), zoom=(zx, zy, zz), order=0)
    return (resized > 0.5).astype(np.uint8)

def _resample_mask_to_img(mask_img: nib.Nifti1Image, target_img: nib.spatialimages.SpatialImage) -> np.ndarray:
    mask_bin = (mask_img.get_fdata() > 0).astype(np.int16)
    mask_bin_img = nib.Nifti1Image(mask_bin, affine=mask_img.affine)
    mask_rs = resample_to_img(mask_bin_img, target_img, interpolation='nearest', force_resample=True)
    return (squeeze_to_3d(mask_rs.get_fdata()) > 0.5).astype(np.uint8)

def make_otsu_mask(vol_u8: np.ndarray) -> np.ndarray:
    """
    Otsu 2D sobre proyección media axial → expandida a 3D.
    """
    avg2d = vol_u8.mean(axis=2).astype(np.uint8)
    thr, _ = cv2.threshold(avg2d, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    mask2d = (avg2d > thr).astype(np.uint8)
    return np.repeat(mask2d[:, :, None], vol_u8.shape[2], axis=2)

def build_brain_mask(mr_folder: Path,
                     vol_hdr_path: Path,
                     vol_float: np.ndarray,
                     target_img: nib.spatialimages.SpatialImage) -> Tuple[np.ndarray, str]:
    """
    1) Intenta FSL_SEG (remuestreo exacto por affine si hay nilearn; si no, por forma).
    2) Valida tamaño (5%..95% voxels). Si falla → Otsu.
    """
    m_arr, m_img, m_src = _load_fsl_mask_any(mr_folder)
    if m_arr is not None:
        try:
            if USE_NILEARN and m_img is not None:
                m_rs = _resample_mask_to_img(m_img, target_img)
            else:
                m_rs = _resample_mask_to_shape(m_arr, vol_float.shape)
            frac = float(m_rs.mean())
            if 0.05 <= frac <= 0.95:
                return (m_rs.astype(np.uint8), m_src)
        except Exception:
            pass

    # Fallback: Otsu (necesita u8 preliminar)
    vmin, vmax = float(vol_float.min()), float(vol_float.max())
    if vmax > vmin:
        vol_u8_tmp = ((vol_float - vmin) / (vmax - vmin + 1e-8) * 255).astype(np.uint8)
    else:
        vol_u8_tmp = np.zeros_like(vol_float, dtype=np.uint8)
    return (make_otsu_mask(vol_u8_tmp), "otsu")

def normalize_volume_zscore_u8(vol_float: np.ndarray, mask: Optional[np.ndarray]) -> np.ndarray:
    """
    Z-score dentro de máscara si existe; luego stretch robusto a [0..255] por percentiles (0.5, 99.5).
    """
    v = vol_float.astype(np.float32)
    if mask is not None and mask.shape == v.shape and mask.sum() >= 50:
        m = mask.astype(bool)
        mu, sd = v[m].mean(), v[m].std() + 1e-8
        v = (v - mu) / sd
    else:
        mu, sd = v.mean(), v.std() + 1e-8
        v = (v - mu) / sd

    lo, hi = np.percentile(v, [0.5, 99.5])
    v = np.clip((v - lo) / (hi - lo + 1e-8), 0, 1)
    return (v * 255).astype(np.uint8)

def select_slices_indices(depth: int, num_slices: int, edge_crop: float = 0.08) -> List[int]:
    z0 = int(depth * edge_crop)
    z1 = int(depth * (1 - edge_crop))
    z1 = max(z1, z0 + num_slices)
    idxs = np.linspace(z0, z1 - 1, num_slices).astype(int)
    idxs = np.unique(np.clip(idxs, 0, depth - 1)).tolist()
    return idxs

def save_slices_png(volume_u8: np.ndarray,
                    scan_id: str,
                    out_dir: Path,
                    num_slices: int,
                    edge_crop: float,
                    apply_clahe: bool) -> List[str]:
    out_dir.mkdir(parents=True, exist_ok=True)
    saved = []
    H, W, D = volume_u8.shape
    idxs = select_slices_indices(D, num_slices, edge_crop)
    for i, z in enumerate(idxs):
        sl = volume_u8[:, :, z]
        if apply_clahe:
            sl = CLAHE.apply(sl)
        sl_rgb = cv2.cvtColor(sl, cv2.COLOR_GRAY2RGB)
        pth = out_dir / f"{scan_id}_slice{i:02d}.png"
        cv2.imwrite(str(pth), sl_rgb)
        saved.append(str(pth))
    return saved

# -----------------------------------------
# Bucle principal OASIS-2
# -----------------------------------------
mr_folders = []
for k in range(1, 5):
    mr_folders += [p for p in OAS2_RAW_DIR.glob(f"OAS2_*_MR{k}") if p.is_dir()]
mr_folders = sorted(mr_folders)
log(f"Carpetas OASIS-2 MR1..MR4 encontradas: {len(mr_folders)}")

rows = []
n_ok, n_fail = 0, 0

for folder in tqdm(mr_folders, desc="Procesando OASIS-2"):
    scan_id = folder.name  # p.ej. OAS2_0123_MR2
    vol_hdr = best_volume_path_oas2(folder)
    if vol_hdr is None:
        n_fail += 1
        print(f"[WARN] {scan_id}: no se encontró RAW/*mpr-*.hdr")
        continue

    try:
        # 1) Carga volumen y ref espacial
        vol_float, target_img = load_volume_3d(vol_hdr)   # (H,W,D) float32

        # 2) Máscara (FSL si hay; si no Otsu)
        mask, mask_src = build_brain_mask(folder, vol_hdr, vol_float, target_img)

        # 3) z-score+stretch → uint8
        vol_u8 = normalize_volume_zscore_u8(vol_float, mask)

        # 4) Guardar slices
        paths = save_slices_png(
            vol_u8, scan_id, OUT_DIR,
            num_slices=NUM_SLICES, edge_crop=EDGE_CROP, apply_clahe=APPLY_CLAHE
        )

        # 5) Registrar inventario
        for pth in paths:
            rows.append({
                "scan_id": scan_id,
                "patient_id": scan_id.split("_MR")[0],  # OAS2_0123
                "png_path": pth,
                "source_hdr": str(vol_hdr),
                "has_mask": int(mask is not None),
                "mask_source": mask_src,
            })
        n_ok += 1

    except Exception as e:
        n_fail += 1
        print(f"[WARN] {scan_id}: error procesando {vol_hdr.name} → {e}")

df_inv = pd.DataFrame(rows)
inv_csv = OUT_DIR / "oas2_slices_inventory.csv"
df_inv.to_csv(inv_csv, index=False)

log(f"\n✔ Procesados OK: {n_ok} | ✖ Fallidos: {n_fail} | PNG totales: {len(rows)}")
log(f"Inventario guardado en: {inv_csv}")

# -----------------------------------------
# (Opcional) Unir etiquetas, si LABELS_CSV existe
# -----------------------------------------
if LABELS_CSV is not None and LABELS_CSV.exists():
    lab = pd.read_csv(LABELS_CSV)
    assert LABELS_SCAN_COL in lab.columns, f"Falta columna '{LABELS_SCAN_COL}' en labels"
    assert LABELS_TARGET_COL in lab.columns, f"Falta columna '{LABELS_TARGET_COL}' en labels"
    out_df = df_inv.merge(lab[[LABELS_SCAN_COL, LABELS_TARGET_COL]],
                          left_on="scan_id", right_on=LABELS_SCAN_COL, how="left")
    # renombra target
    out_df = out_df.rename(columns={LABELS_TARGET_COL: "target"})
    # (si quieres filtrar solo los que tienen target no nulo)
    # out_df = out_df[~out_df["target"].isna()].copy()

    # dataset completo (por-slice) con target
    ds_csv = OUT_DIR / "oas2_slices_dataset.csv"
    out_df.to_csv(ds_csv, index=False)
    log(f"Dataset por-slice con target guardado en: {ds_csv}")

# Pequeño resumen a ojo
try:
    display(df_inv.sample(min(len(df_inv), 5)))
except Exception:
    pass


Carpetas OASIS-2 MR1..MR4 encontradas: 367


Procesando OASIS-2: 100%|██████████| 367/367 [05:22<00:00,  1.14it/s]


✔ Procesados OK: 367 | ✖ Fallidos: 0 | PNG totales: 7340
Inventario guardado en: data\OAS2_PROCESSED\oas2_slices_inventory.csv
Dataset por-slice con target guardado en: data\OAS2_PROCESSED\oas2_slices_dataset.csv





Unnamed: 0,scan_id,patient_id,png_path,source_hdr,has_mask,mask_source
3311,OAS2_0078_MR3,OAS2_0078,data\OAS2_PROCESSED\OAS2_0078_MR3_slice11.png,data\OAS2_RAW\OAS2_0078_MR3\RAW\mpr-1.nifti.hdr,1,otsu
4533,OAS2_0111_MR1,OAS2_0111,data\OAS2_PROCESSED\OAS2_0111_MR1_slice13.png,data\OAS2_RAW\OAS2_0111_MR1\RAW\mpr-1.nifti.hdr,1,otsu
3664,OAS2_0089_MR3,OAS2_0089,data\OAS2_PROCESSED\OAS2_0089_MR3_slice04.png,data\OAS2_RAW\OAS2_0089_MR3\RAW\mpr-1.nifti.hdr,1,otsu
7038,OAS2_0181_MR2,OAS2_0181,data\OAS2_PROCESSED\OAS2_0181_MR2_slice18.png,data\OAS2_RAW\OAS2_0181_MR2\RAW\mpr-1.nifti.hdr,1,otsu
1864,OAS2_0047_MR1,OAS2_0047,data\OAS2_PROCESSED\OAS2_0047_MR1_slice04.png,data\OAS2_RAW\OAS2_0047_MR1\RAW\mpr-1.nifti.hdr,1,otsu


: 

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold, train_test_split

BASE = Path("data")
INV  = BASE/"OAS2_PROCESSED/oas2_slices_inventory.csv"   # inventario que generaste
LAB  = BASE/"oas2_labels.csv"                            # ¡el bueno! (150 filas)
DS   = BASE/"OAS2_PROCESSED/oas2_slices_dataset.csv"     # saldrá aquí tras el merge

INV, LAB


(WindowsPath('data/OAS2_PROCESSED/oas2_slices_inventory.csv'),
 WindowsPath('data/oas2_labels.csv'))

In [4]:
inv = pd.read_csv(INV)
lab = pd.read_csv(LAB)

# Asegura tipos/campos
for c in ["scan_id","patient_id"]:
    inv[c] = inv[c].astype(str).str.strip()
    lab[c] = lab[c].astype(str).str.strip()

# Merge por scan_id (1 sesión por paciente ya resuelta en LAB)
ds = inv.merge(lab[["scan_id","patient_id","target"]], on="scan_id", how="inner", suffixes=("", "_lbl"))
ds["target"] = ds["target"].astype(int)

print("Inventario total:", inv["scan_id"].nunique(), "scans")
print("Labels:", len(lab), "scans")
print("Dataset etiquetado:", ds["scan_id"].nunique(), "scans | filas:", len(ds))
ds.head(3)


Inventario total: 367 scans
Labels: 150 scans
Dataset etiquetado: 150 scans | filas: 3000


Unnamed: 0,scan_id,patient_id,png_path,source_hdr,has_mask,mask_source,patient_id_lbl,target
0,OAS2_0001_MR1,OAS2_0001,data\OAS2_PROCESSED\OAS2_0001_MR1_slice00.png,data\OAS2_RAW\OAS2_0001_MR1\RAW\mpr-1.nifti.hdr,1,otsu,OAS2_0001,0
1,OAS2_0001_MR1,OAS2_0001,data\OAS2_PROCESSED\OAS2_0001_MR1_slice01.png,data\OAS2_RAW\OAS2_0001_MR1\RAW\mpr-1.nifti.hdr,1,otsu,OAS2_0001,0
2,OAS2_0001_MR1,OAS2_0001,data\OAS2_PROCESSED\OAS2_0001_MR1_slice02.png,data\OAS2_RAW\OAS2_0001_MR1\RAW\mpr-1.nifti.hdr,1,otsu,OAS2_0001,0


In [5]:
DS.parent.mkdir(parents=True, exist_ok=True)
ds.to_csv(DS, index=False)
print("✔ Guardado dataset:", DS, "| filas=", len(ds))


✔ Guardado dataset: data\OAS2_PROCESSED\oas2_slices_dataset.csv | filas= 3000


In [6]:
# Paciente->target (un único valor por paciente)
pt = ds.groupby("patient_id", as_index=False).agg({"target":"max"})
y  = pt["target"].values
groups = pt["patient_id"].values

# Split estratificado por paciente
pt_train, pt_temp = train_test_split(pt, test_size=0.4, stratify=y, random_state=42)
y_temp = pt_temp["target"].values
pt_val, pt_test = train_test_split(pt_temp, test_size=0.5, stratify=y_temp, random_state=42)

def subset_by_patients(df, patient_ids):
    return df[df["patient_id"].isin(set(patient_ids))].copy()

train_df = subset_by_patients(ds, pt_train["patient_id"])
val_df   = subset_by_patients(ds, pt_val["patient_id"])
test_df  = subset_by_patients(ds, pt_test["patient_id"])

print("Pacientes →",
      "train:", pt_train.shape[0],
      "val:",   pt_val.shape[0],
      "test:",  pt_test.shape[0])

print("Slices →",
      "train:", len(train_df),
      "val:",   len(val_df),
      "test:",  len(test_df))


Pacientes → train: 90 val: 30 test: 30
Slices → train: 1800 val: 600 test: 600


In [7]:
OUT_DIR = Path("data/OAS2_PROCESSED")  # local; en Colab será /content/datasets/OAS2_PROCESSED
OUT_DIR.mkdir(parents=True, exist_ok=True)

def save_map(df, name):
    cols = ["png_path","target","patient_id","scan_id","source_hdr","has_mask","mask_source"]
    m = df[cols].copy()
    m.to_csv(OUT_DIR/name, index=False)
    print("💾", name, "| filas=", len(m))

save_map(train_df, "oas2_train_colab_mapped.csv")
save_map(val_df,   "oas2_val_colab_mapped.csv")
save_map(test_df,  "oas2_test_colab_mapped.csv")


💾 oas2_train_colab_mapped.csv | filas= 1800
💾 oas2_val_colab_mapped.csv | filas= 600
💾 oas2_test_colab_mapped.csv | filas= 600
