What this file does:


1.  Imports the required libraries.
2.  Defines the root folder, directories, and CSV paths.
3.  Defines helper functions that make ID lookup easier.
4.  Uploads files (.img, .hdr, and .txt files).
5.  Sorts files by type (.hdr, .img, and .txt) into their root paths and puts them into a pandas table to make lookup easier.
6.  **(upload previous CSV/Excel if you want it all in one place)**
7.  Extracts relevant data from .txt files and includes them into the pandas table.
8.  **Volume processing** (loads, resamples, normalizes, and masks each MRI volume)
9.  Converts MRI slices into PNG images.
10. **(If previous CSV/EXCEL sheet was uploaded, merge data tables)**
11. Zip up .png files for individual patient information.

Outputs:
1.  Zip files for individual people, including PNG files of their MRI scans.

**Process:
Input the .img, .hdr, and .txt files from FSL_SEG in each of the patients' folders**


In [None]:
!pip -q install nibabel scikit-image torch torchvision pandas scikit-learn tqdm openpyxl


In [None]:
# ---- Paths & imports ----
import os, re, glob, io
import numpy as np
import pandas as pd
from google.colab import files
from tqdm import tqdm

# Imaging / preprocessing
import nibabel as nib
from skimage.transform import resize
from skimage.filters import threshold_otsu
from skimage import morphology
from scipy import ndimage as ndi
from PIL import Image

# Fixed seed so NumPy-based randomness in this session is repeatable
SEED = 42
np.random.seed(SEED)

# Input side: everything the user uploads is stored under ROOT
ROOT = "/content/oasis_inputs"
MRI_DIR = os.path.join(ROOT, "mri_uploads")     # uploaded .hdr/.img/.txt files
LABELS_DIR = os.path.join(ROOT, "labels")       # cleaned label file

# Output side: everything the pipeline produces is stored under OUT_ROOT
OUT_ROOT = "/content/oasis_pipeline"
PNG_DIR = os.path.join(OUT_ROOT, "pngs")

# CSV artifacts written by the later cells
MRI_INDEX_CSV = os.path.join(ROOT, "mri_upload_index.csv")
SUBJ_FEATS_CSV = os.path.join(OUT_ROOT, "subject_features_from_fast.csv")
MANIFEST_CSV = os.path.join(OUT_ROOT, "manifest_slices.csv")
MERGED_CSV = os.path.join(OUT_ROOT, "manifest_with_tabular_and_labels.csv")
CLEAN_LABELS_CSV = os.path.join(LABELS_DIR, "oasis_labels_cleaned.csv")  # only written if labels are uploaded

# Tunable knobs
K_SLICES = 5      # slices per visit (NOTE(review): not referenced by the cells visible here)
TARGET_MM = 1.0   # target voxel spacing (mm) handed to the resampling step
PNG_SIZE = 224    # edge length in pixels of exported PNGs (author's note: ResNet input size)

# Create every working directory up front; harmless if they already exist
for _folder in (MRI_DIR, LABELS_DIR, OUT_ROOT, PNG_DIR):
    os.makedirs(_folder, exist_ok=True)

# Echo the resolved locations so the user can verify them
for _tag, _location in (
    ("ROOT     :", ROOT),
    ("MRI_DIR  :", MRI_DIR),
    ("LABELS   :", LABELS_DIR),
    ("OUT_ROOT :", OUT_ROOT),
    ("PNG_DIR  :", PNG_DIR),
):
    print(_tag, _location)


### Citations
**AI Assistance**

Used OpenAI's ChatGPT to help implement loading the data through zip paths and Google Drive.

In [None]:
# ---------------------------
# Helpers: ID normalization
# ---------------------------
def _norm(s: str) -> str:
    if pd.isna(s): return ""
    s = str(s).strip().upper()
    s = s.replace(" ", "").replace("-", "_")
    return s

def _visit_id_from_any(s: str) -> str:
    """Extract the first visit ID (e.g. ``OAS1_0001_MR1``) found in *s*.

    The input is stringified and upper-cased before matching, so the
    search is case-insensitive. Returns ``""`` for falsy input or when
    no visit ID is present.
    """
    if not s:
        return ""
    found = re.search(r"(OAS1|OAS2)_[0-9]{4}_MR[0-9]+", str(s).upper())
    if found is None:
        return ""
    return found.group(0)

def _subject_root_from_any(s: str) -> str:
    """Extract the first subject root (e.g. ``OAS1_0001``) found in *s*.

    The input is stringified and upper-cased before matching. Returns
    ``""`` for falsy input or when no subject root is present.
    """
    if not s:
        return ""
    found = re.search(r"(OAS1|OAS2)_[0-9]{4}", str(s).upper())
    if found is None:
        return ""
    return found.group(0)


In [None]:
# ---------------------------
# 1) Upload MRI files (.hdr/.img/.txt) — multi-select
# ---------------------------
# Tell the user which file types are expected before the upload widget opens
for _hint in (
    "  Select ALL your MRI files (multi-select ok):",
    "   - .hdr  (Analyze header)",
    "   - .img  (Analyze image)",
    "   - .txt  (FAST segmentation report from FSL_SEG)",
):
    print(_hint)
uploaded = files.upload()  # returns {filename: raw bytes}; multi-select allowed

# Persist each uploaded payload into MRI_DIR under its original filename
for name in uploaded:
    content = uploaded[name]
    dst = os.path.join(MRI_DIR, name)
    with open(dst, "wb") as f:
        f.write(content)
    print("Saved:", dst)

print("\nFiles uploaded:")
!ls -lh $MRI_DIR

In [None]:
# ---------------------------
# 2) Build index (per visit)
# ---------------------------
# One row per visit_id (e.g. OAS1_0001_MR1) with the paths of the files
# that were uploaded for it. Missing files are recorded as "".
_INDEX_COLUMNS = ["visit_id", "subject_root", "hdr_path", "img_path", "fast_txt_path"]

hdrs = sorted(glob.glob(os.path.join(MRI_DIR, "*.hdr")))
imgs = sorted(glob.glob(os.path.join(MRI_DIR, "*.img")))
txts = sorted(glob.glob(os.path.join(MRI_DIR, "*.txt")))

by_vis = {}  # key: visit_id (OAS1_0001_MR1) -> {"hdr_path": ..., "img_path": ..., ...}

# (paths, record field, keep_first). When several files map to one visit,
# the last .hdr/.img in sorted order wins while the first .txt wins.
for paths, field, keep_first in (
    (hdrs, "hdr_path", False),
    (imgs, "img_path", False),
    (txts, "fast_txt_path", True),
):
    for p in paths:
        vis = _visit_id_from_any(os.path.basename(p)) or _visit_id_from_any(p)
        if not vis:
            # No visit id in the name. Such .txt reports are not lost: the
            # FAST parser step globs every .txt and infers the id from the body.
            continue
        rec = by_vis.setdefault(vis, {})
        if not (keep_first and field in rec):
            rec[field] = p
        rec["subject_root"] = _subject_root_from_any(vis)

rows = [
    {
        "visit_id": vis,
        "subject_root": rec.get("subject_root", _subject_root_from_any(vis)),
        "hdr_path": rec.get("hdr_path", ""),
        "img_path": rec.get("img_path", ""),
        "fast_txt_path": rec.get("fast_txt_path", ""),
    }
    for vis, rec in by_vis.items()
]
# Explicit columns keep the frame well-formed when nothing was indexed;
# without them sort_values("visit_id") raises KeyError on an empty frame.
df_mri_index = (
    pd.DataFrame(rows, columns=_INDEX_COLUMNS)
    .sort_values("visit_id")
    .reset_index(drop=True)
)

print(f"Indexed {len(df_mri_index)} visits from uploads.")
display(df_mri_index.head(10))

df_mri_index.to_csv(MRI_INDEX_CSV, index=False)
print("Saved MRI index to:", MRI_INDEX_CSV)

# Sanity checks: flag visits for which neither a header nor an image was found
missing_pairs = df_mri_index[(df_mri_index["hdr_path"]=="") & (df_mri_index["img_path"]=="")]
if len(missing_pairs):
    print("\n Some visits are missing both .hdr and .img (no image slices will be produced):")
    display(missing_pairs)
else:
    print("\n All indexed visits have at least one volume file (hdr or img).")


### Citation
**AI Assistance**

Used OpenAI's ChatGPT to help implement loading in patient MRI data

In [None]:
# ---------------------------
# 3) (Optional) Upload Excel/CSV with Patient ID & CDR → cleaned labels (visit-level)
# ---------------------------
print("\n (Optional) Upload your Excel/CSV with columns: 'Patient ID' and 'CDR'")
print("    - If you skip this, we will still produce tabular features + slice PNGs.")
try:
    uploaded_labels = files.upload()  # user may cancel/skip
    if uploaded_labels:
        # Only the first selected file is used
        labels_name = list(uploaded_labels.keys())[0]
        labels_path = os.path.join(LABELS_DIR, labels_name)
        with open(labels_path, "wb") as f:
            f.write(uploaded_labels[labels_name])
        print("Saved label file to:", labels_path)

        # Load & clean
        if labels_name.lower().endswith(".xlsx"):
            df_labels_raw = pd.read_excel(labels_path)
        else:
            df_labels_raw = pd.read_csv(labels_path)

        df_labels = df_labels_raw.copy()
        # str() guards against non-string headers (e.g. numeric column names)
        df_labels.columns = [str(c).strip() for c in df_labels.columns]
        # Explicit raise instead of assert: assertions are stripped under -O.
        # The surrounding except still reports it as a skipped upload.
        if "Patient ID" not in df_labels.columns or "CDR" not in df_labels.columns:
            raise ValueError("Need 'Patient ID' and 'CDR' columns")

        df_labels["patient_id_norm"] = df_labels["Patient ID"].apply(_norm)
        # _visit_id_from_any returns "" when nothing matches; map that to None
        # so dropna() below really removes rows without a usable visit id.
        df_labels["visit_id"] = df_labels["patient_id_norm"].apply(
            lambda pid: _visit_id_from_any(pid) or None
        )
        df_labels["cdr"] = df_labels["CDR"]

        def cdr_to_label(x):
            """Map a CDR score to a class label: 0 -> "CN", >= 0.5 -> "AD", else None."""
            try:
                v = float(x)
            except (TypeError, ValueError):
                return None
            if v == 0.0: return "CN"
            if v >= 0.5: return "AD"
            return None

        df_labels["label"] = df_labels["cdr"].apply(cdr_to_label)
        df_labels = df_labels.dropna(subset=["visit_id","label"]).reset_index(drop=True)
        df_labels[["visit_id","cdr","label"]].to_csv(CLEAN_LABELS_CSV, index=False)
        print(f"Cleaned labels saved to: {CLEAN_LABELS_CSV}")
        display(df_labels[["visit_id","cdr","label"]].head(10))
    else:
        print("Skipped label upload (no file chosen).")
except Exception as e:
    # Deliberately best-effort: labels are optional, so any failure here is
    # reported and the rest of the pipeline continues without them.
    print("Skipped label upload:", e)

In [None]:
# ---------------------------
# 4) FAST parser → tabular features (visit-level)
# ---------------------------
def parse_fast_txt_relaxed(txt_path):
    """Parse one FAST text report into a flat feature dict.

    The only hard requirement is a line starting with ``Volumes:``
    (case-insensitive) that holds at least three numbers; the first three
    are read as CSF, GM and WM and scaled by 1/1000 (author's note: the
    inputs are typically mm^3, which makes the results millilitres).
    The visit ID and subject root are looked up in the file body first and
    in the path second.

    Returns ``None`` when the file cannot be read or no usable
    ``Volumes:`` line exists.
    """
    try:
        with open(txt_path, "r", errors="ignore") as fh:
            body = fh.read()
    except Exception:
        return None

    # first line that starts with "volumes:" (case-insensitive)
    volumes_line = None
    for line in body.splitlines():
        if line.strip().lower().startswith("volumes:"):
            volumes_line = line
            break
    if not volumes_line:
        return None

    # pull every numeric token (ints, decimals, scientific notation)
    tokens = re.findall(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", volumes_line)
    if len(tokens) < 3:
        return None

    # identifiers: prefer what the report says, fall back to the path
    visit = _visit_id_from_any(body) or _visit_id_from_any(txt_path)
    subject = _subject_root_from_any(body) or _subject_root_from_any(txt_path)

    mm3_to_ml = 1/1000.0
    csf_ml = float(tokens[0]) * mm3_to_ml
    gm_ml = float(tokens[1]) * mm3_to_ml
    wm_ml = float(tokens[2]) * mm3_to_ml
    brain_ml = gm_ml + wm_ml  # "brain" here means GM + WM only
    has_brain = brain_ml > 0

    features = {
        "visit_id": visit,
        "subject_root": subject,
        "csf_ml": csf_ml,
        "gm_ml": gm_ml,
        "wm_ml": wm_ml,
        "brain_ml": brain_ml,
        "csf_ratio": csf_ml/brain_ml if has_brain else np.nan,
        "gm_ratio": gm_ml/brain_ml if has_brain else np.nan,
        "wm_ratio": wm_ml/brain_ml if has_brain else np.nan,
        "gm_wm_ratio": gm_ml/wm_ml if wm_ml > 0 else np.nan,
        "source_txt": os.path.basename(txt_path),
    }
    return features

# Parse every uploaded txt (including ones not matched earlier)
# Keys produced by parse_fast_txt_relaxed, in output-column order.
_FEAT_COLUMNS = [
    "visit_id", "subject_root",
    "csf_ml", "gm_ml", "wm_ml", "brain_ml",
    "csf_ratio", "gm_ratio", "wm_ratio", "gm_wm_ratio",
    "source_txt",
]

all_txts = sorted(glob.glob(os.path.join(MRI_DIR, "*.txt")))
feat_rows = []
for t in tqdm(all_txts, desc="Parsing FAST txt → tabular"):
    d = parse_fast_txt_relaxed(t)
    if d:
        feat_rows.append(d)

# Explicit columns: when no report could be parsed the frame would otherwise
# have no columns at all and the merge below would raise KeyError.
df_feats = pd.DataFrame(feat_rows, columns=_FEAT_COLUMNS)

# Ensure we have a row for every visit in df_mri_index (even if no FAST txt)
if not df_mri_index.empty:
    df_feats = df_mri_index[["visit_id","subject_root"]].merge(
        df_feats, on=["visit_id","subject_root"], how="left"
    )

# Deduplicate visits (keep the row with most non-null values)
if not df_feats.empty:
    df_feats["nonnull"] = df_feats.notna().sum(axis=1)
    df_feats = (df_feats.sort_values(["visit_id","nonnull"], ascending=[True, False])
                        .drop(columns=["nonnull"])
                        .drop_duplicates(subset=["visit_id"], keep="first"))

df_feats.to_csv(SUBJ_FEATS_CSV, index=False)
print("Tabular features saved →", SUBJ_FEATS_CSV, df_feats.shape)
display(df_feats.head(8))

In [None]:
# ---------------------------
# 5) Volume → CENTRAL slice PNGs, build manifest (best for AD)
# ---------------------------
import os, re, glob
import numpy as np
import nibabel as nib
from skimage.transform import resize
from skimage.filters import threshold_otsu
from skimage import morphology
from scipy import ndimage as ndi
from PIL import Image
import pandas as pd
from tqdm import tqdm

# Respect prior knobs if they exist; otherwise fall back to the defaults below
if "TARGET_MM" not in globals():
    TARGET_MM = 1.0
if "PNG_SIZE" not in globals():
    PNG_SIZE = 224
if "MANIFEST_CSV" not in globals():
    MANIFEST_CSV = "/content/oasis_pipeline/manifest_slices.csv"
if "PNG_DIR" not in globals():
    # the directory is only created here when no earlier cell defined it
    PNG_DIR = "/content/oasis_pipeline/pngs"
    os.makedirs(PNG_DIR, exist_ok=True)

# Width (mm) of the window around the brain mid-plane: slices within ±30 mm are kept
CENTRAL_WIDTH_MM = 60.0
# Target number of slices sampled from that window for each visit
DESIRED_CENTRAL_K = 12

# ---- helpers defined here so this cell is self-contained
def _score_volume_name(name: str) -> int:
    """Rank likely T1 volumes; exclude seg/prob maps.

    Returns -1 for names containing a segmentation/probability marker,
    otherwise 100 / 90 / 80 for names containing "masked_gfc", starting
    with "mpr", or containing "t1" (checked in that order), and 10 for
    anything else. Matching is case-insensitive.
    """
    n = name.lower()
    if any(t in n for t in ("_seg", "fseg", "probseg", "prob", "pve")):
        return -1
    if "masked_gfc" in n:   return 100
    if n.startswith("mpr"): return 90
    if "t1" in n:           return 80
    return 10


def _prefer_t1_in_folder(hdr_or_img_path: str) -> str:
    """Given one file path, search its folder for the best anatomical (T1) volume.

    Candidates are the ``*.hdr``, ``*.nii``, ``*.nii.gz`` and ``*.img``
    files in the same folder, ranked by ``_score_volume_name``. When the
    input path carries a visit ID, files whose name carries a *different*
    visit ID are ignored. Without this, a shared upload folder makes every
    visit resolve to the same top-ranked file.

    Returns the best non-excluded candidate. If every candidate is
    excluded, returns the input path when it exists, otherwise the
    top-ranked candidate, otherwise ``""``. Non-string or empty input
    yields ``""``.
    """
    if not isinstance(hdr_or_img_path, str) or not hdr_or_img_path:
        return ""

    def _visit_of(text: str) -> str:
        # Same visit-ID pattern the rest of the notebook uses
        m = re.search(r"(OAS1|OAS2)_[0-9]{4}_MR[0-9]+", text.upper())
        return m.group(0) if m else ""

    folder = os.path.dirname(hdr_or_img_path)
    cands = []
    for ext in ("*.hdr", "*.nii", "*.nii.gz", "*.img"):
        cands += glob.glob(os.path.join(folder, ext))

    # Keep files of the same visit plus files whose name has no visit ID
    # (e.g. per-visit folders with generic file names).
    wanted = _visit_of(os.path.basename(hdr_or_img_path)) or _visit_of(hdr_or_img_path)
    if wanted:
        cands = [p for p in cands if _visit_of(os.path.basename(p)) in ("", wanted)]

    ranked = sorted(((_score_volume_name(os.path.basename(p)), p) for p in cands), reverse=True)
    for sc, p in ranked:
        if sc >= 0:
            return p
    return hdr_or_img_path if os.path.exists(hdr_or_img_path) else (ranked[0][1] if ranked else "")

def load_any(path):
    """Load an image file with nibabel.

    Returns a ``(volume, voxel_sizes, affine)`` tuple: the data as a
    float32 array (for 4-D data only index 0 along the last axis is
    kept), the first three header zooms as floats, and the image affine
    (``None`` if the image object has no ``affine`` attribute).
    """
    image = nib.load(path)
    data = image.get_fdata().astype(np.float32)
    if data.ndim == 4:
        data = data[..., 0]
    zooms = image.header.get_zooms()[:3]
    voxel_sizes = tuple(float(z) for z in zooms)
    affine = getattr(image, "affine", None)
    return data, voxel_sizes, affine

def resample_iso(vol, voxel_mm, target_mm=1.0):
    """Resize *vol* so that each voxel spans roughly *target_mm* per axis.

    The output length of every axis is ``round(length * spacing /
    target_mm)``, but never smaller than 32, 32 and 16 voxels for the
    first, second and third axis respectively. Resizing is done with
    ``skimage.transform.resize`` (range preserved, anti-aliasing on).
    """
    sx, sy, sz = voxel_mm
    lengths = (vol.shape[0], vol.shape[1], vol.shape[2])
    minimums = (32, 32, 16)  # lower bounds so tiny inputs still give a usable grid
    out_shape = tuple(
        max(int(round(length * spacing / target_mm)), floor)
        for length, spacing, floor in zip(lengths, (sx, sy, sz), minimums)
    )
    return resize(vol, out_shape, preserve_range=True, anti_aliasing=True)

def norm_robust(vol, p1=1, p99=99):
    """Clip *vol* to its [p1, p99] percentile range and rescale to about [0, 1].

    Percentiles are computed over finite values only; with no finite
    values the range defaults to (0, 1). The result is float32.
    """
    finite_vals = vol[np.isfinite(vol)]
    if finite_vals.size:
        lo, hi = np.percentile(finite_vals, [p1, p99])
    else:
        lo, hi = (0, 1)
    clipped = np.clip(vol, lo, hi)
    # the small epsilon avoids division by zero for constant volumes
    scaled = (clipped - lo) / (hi - lo + 1e-8)
    return scaled.astype(np.float32)

def skull_strip(vol_norm):
    """Build a binary foreground mask (uint8) from an intensity volume.

    Steps, as implemented: Otsu threshold over the finite values
    (threshold 0.0 if there are none), morphological closing with a
    radius-2 ball, hole filling, then keeping only the largest connected
    component.
    """
    finite_vals = vol_norm[np.isfinite(vol_norm)]
    cutoff = threshold_otsu(finite_vals) if finite_vals.size else 0.0

    foreground = vol_norm > cutoff
    foreground = morphology.binary_closing(foreground, morphology.ball(2))
    foreground = ndi.binary_fill_holes(foreground)

    labelled, n_components = ndi.label(foreground)
    if n_components == 0:
        return foreground.astype(np.uint8)

    # voxel count per component; component labels start at 1
    component_sizes = ndi.sum(foreground, labelled, index=np.arange(1, n_components + 1))
    largest = int(np.argmax(component_sizes)) + 1
    return (labelled == largest).astype(np.uint8)

def choose_central_slices(mask, vox, desired_k=12, central_width_mm=60.0):
    """Pick third-axis indices from a window centred on the mask's mid-plane.

    The mid-plane is the median of the indices whose slice contains any
    mask voxel; the window spans ``central_width_mm / 2`` on each side,
    converted to voxels with the third entry of *vox*. Between 5 and
    *desired_k* evenly spaced indices are drawn from the window and
    returned sorted and de-duplicated. If the mask is empty or the
    window collapses, the five indices around the centre that lie
    inside the volume are returned instead.
    """
    _, _, sz = vox
    depth = mask.shape[2]

    def _five_around(center):
        return [z for z in range(center - 2, center + 3) if 0 <= z < depth]

    occupied = np.where(mask.sum(axis=(0, 1)) > 0)[0]
    if occupied.size == 0:
        return _five_around(depth // 2)

    mid = int(np.median(occupied))
    half_vox = int(round((central_width_mm / 2.0) / max(sz, 1e-6)))
    lo = max(0, mid - half_vox)
    hi = min(depth - 1, mid + half_vox)
    if hi <= lo:
        return _five_around(mid)

    count = max(5, min(desired_k, hi - lo + 1))
    picks = np.linspace(lo, hi, count).astype(int)
    return sorted({int(i) for i in picks if 0 <= i < depth})

def slice_height_metrics(mask, vox, z_idxs, central_width_mm=60.0, affine=None):
    """Describe where each requested slice sits within the masked extent.

    Args:
        mask: 3-D array; a slice counts as occupied when its sum over the
            first two axes is > 0.
        vox: three voxel sizes; only the third (slice spacing) is used.
        z_idxs: third-axis indices to describe.
        central_width_mm: full width of the central window; a slice is
            central when its distance to the median occupied slice is at
            most half of this.
        affine: optional 4x4 index-to-world matrix. When given, the world
            z coordinate of the slice centre is reported, else NaN.

    Returns:
        One dict per index with keys ``z_index``, ``brain_height_mm``,
        ``z_mm_from_brain_min``, ``z_frac_0to1``, ``dist_to_mid_mm``,
        ``is_central`` and ``world_z_mm``. An empty mask yields ``[]``
        (previously this raised ValueError from ``min()`` on an empty
        array).
    """
    _, _, sz = vox
    z_has_brain = np.where(mask.sum(axis=(0, 1)) > 0)[0]
    if z_has_brain.size == 0:
        # nothing to measure against; let the caller decide how to fall back
        return []

    zmin, zmax = int(z_has_brain.min()), int(z_has_brain.max())
    brain_height_mm = (zmax - zmin + 1) * sz
    z_mid = int(np.median(z_has_brain))
    half_win_mm = central_width_mm / 2.0

    # in-plane centre of the grid, used as the probe point for world coordinates
    X, Y = mask.shape[0], mask.shape[1]
    cx, cy = (X - 1) / 2.0, (Y - 1) / 2.0

    rows = []
    for z in z_idxs:
        z_mm_from_min = (z - zmin) * sz
        z_frac = float(np.clip(z_mm_from_min / max(brain_height_mm, 1e-6), 0, 1))
        dist_mid_mm = abs((z - z_mid) * sz)
        if affine is not None:
            xyz = affine @ np.array([cx, cy, float(z), 1.0])
            world_z_mm = float(xyz[2])
        else:
            world_z_mm = np.nan
        rows.append(dict(
            z_index=int(z),
            brain_height_mm=float(brain_height_mm),
            z_mm_from_brain_min=float(z_mm_from_min),
            z_frac_0to1=z_frac,
            dist_to_mid_mm=float(dist_mid_mm),
            is_central=bool(dist_mid_mm <= half_win_mm),
            world_z_mm=world_z_mm
        ))
    return rows

def slice_to_png(sl2d, out_path, size=224):
    """Write a 2-D slice to *out_path* as a ``size`` x ``size`` RGB PNG.

    The slice is resized, clipped to [0, 1], scaled to 8-bit, and the
    single grey channel is replicated into three identical channels.
    """
    scaled = resize(sl2d, (size, size), preserve_range=True, anti_aliasing=True)
    gray = (np.clip(scaled, 0, 1) * 255).astype(np.uint8)
    rgb = np.repeat(gray[..., np.newaxis], 3, axis=-1)
    Image.fromarray(rgb).save(out_path, "PNG")

# ---- central-only export
# Column order of the slice manifest. Passing it to the DataFrame keeps the
# manifest well-formed (with headers) even when no slice was exported.
_MANIFEST_COLUMNS = [
    "visit_id", "subject_root", "png_path", "slice_index",
    "brain_height_mm", "z_mm_from_brain_min", "z_frac_0to1",
    "dist_to_mid_mm", "is_central", "world_z_mm",
]
slice_rows = []

for _, r in tqdm(df_mri_index.iterrows(), total=len(df_mri_index), desc="Exporting CENTRAL PNG slices"):
    # Prefer true T1 in the same folder; avoid *_seg/*_fseg
    base_path = r["hdr_path"] if isinstance(r["hdr_path"], str) and len(r["hdr_path"]) else r["img_path"]
    vol_path  = _prefer_t1_in_folder(base_path)

    if not isinstance(vol_path, str) or not len(vol_path) or not os.path.exists(vol_path):
        print("Missing/invalid volume for visit:", r.get("visit_id"), "— skipping")
        continue

    try:
        vol, vox, affine = load_any(vol_path)
        src_shape = vol.shape[:3]
        vol = resample_iso(vol, vox, target_mm=TARGET_MM)

        # After resampling, the original voxel size and affine no longer describe
        # the grid. Rescale both so the mm-based slice window and world_z_mm refer
        # to the resampled volume. This is a no-op when the shape is unchanged.
        ratio = [float(o) / float(n) for o, n in zip(src_shape, vol.shape[:3])]  # old voxels per new voxel
        vox = tuple(v * s for v, s in zip(vox, ratio))
        if affine is not None:
            # new index -> old index, assuming resize aligns the outer edges of the
            # two grids (pixel-edge convention) — TODO confirm for the installed skimage
            new_to_old = np.eye(4)
            for ax, s in enumerate(ratio):
                new_to_old[ax, ax] = s
                new_to_old[ax, 3] = 0.5 * s - 0.5
            affine = np.asarray(affine, dtype=float) @ new_to_old

        vol = norm_robust(vol)
        mask = skull_strip(vol)

        # 1) choose only central candidates
        z_idxs = choose_central_slices(mask, vox,
                                       desired_k=DESIRED_CENTRAL_K,
                                       central_width_mm=CENTRAL_WIDTH_MM)

        # 2) compute metrics and filter to is_central == True
        metrics_all = slice_height_metrics(mask, vox, z_idxs,
                                           central_width_mm=CENTRAL_WIDTH_MM,
                                           affine=affine)
        metrics = [m for m in metrics_all if m["is_central"]]
        z_idxs  = [m["z_index"] for m in metrics]

        # 3) fallback: if none inside window, take 5 closest-to-mid brain slices
        if len(z_idxs) == 0:
            z_has_brain = np.where(mask.sum(axis=(0,1)) > 0)[0]
            if z_has_brain.size == 0:
                print("No brain voxels found for visit:", r.get("visit_id"), "— skipping")
                continue
            z_mid = int(np.median(z_has_brain))
            cands = sorted(z_has_brain.tolist(), key=lambda z: abs(z - z_mid))[:5]
            metrics = slice_height_metrics(mask, vox, cands,
                                           central_width_mm=CENTRAL_WIDTH_MM,
                                           affine=affine)
            metrics = sorted(metrics, key=lambda m: abs(m["dist_to_mid_mm"]))[:5]
            z_idxs  = [m["z_index"] for m in metrics]

    except Exception as e:
        # Best-effort per visit: one unreadable volume must not stop the batch
        print(f"⚠️ Volume failed for visit {r.get('visit_id')}: {vol_path} -> {e}")
        continue

    visit_id = r["visit_id"]
    subj_root = r.get("subject_root", visit_id.split("_MR")[0])

    for z, m in zip(z_idxs, metrics):
        sl = vol[:, :, z]
        if (sl > 0).sum() < 50:
            # skip slices that are (almost) entirely background
            continue
        png_name = f"{visit_id}_central_ax{z}.png"
        out_path = os.path.join(PNG_DIR, png_name)
        slice_to_png(sl, out_path, size=PNG_SIZE)

        slice_rows.append({
            "visit_id": visit_id,
            "subject_root": subj_root,
            "png_path": out_path,
            "slice_index": int(z),
            "brain_height_mm": m["brain_height_mm"],
            "z_mm_from_brain_min": m["z_mm_from_brain_min"],
            "z_frac_0to1": m["z_frac_0to1"],
            "dist_to_mid_mm": m["dist_to_mid_mm"],
            # report the computed flag: fallback slices may lie outside the window
            "is_central": bool(m["is_central"]),
            "world_z_mm": m["world_z_mm"]
        })

df_manifest = pd.DataFrame(slice_rows, columns=_MANIFEST_COLUMNS)
df_manifest.to_csv(MANIFEST_CSV, index=False)
print("CENTRAL-ONLY manifest →", MANIFEST_CSV, df_manifest.shape)
display(df_manifest.head(10))
print("PNG dir:", PNG_DIR)


### Citation
**AI Assistance**

Used OpenAI ChatGPT to help create multiple slices of MRI patient scans

In [None]:
# ---------------------------
# 6) (Optional) Merge with labels (if provided)
# ---------------------------
_MERGE_KEYS = ["visit_id", "subject_root"]


def _with_merge_keys(df):
    """Return a copy of *df* that is guaranteed to contain the merge-key columns.

    A frame built from an empty list has no columns at all, and merging on
    a missing column raises KeyError. Missing keys are added as empty
    object columns so the merge yields an empty result instead.
    """
    out = df.copy()
    for key in _MERGE_KEYS:
        if key not in out.columns:
            out[key] = pd.Series(index=out.index, dtype="object")
    return out


# Images + tabular features are merged the same way with or without labels
df_all = _with_merge_keys(df_manifest).merge(
    _with_merge_keys(df_feats), on=_MERGE_KEYS, how="left"
)

if os.path.exists(CLEAN_LABELS_CSV):
    df_labels_clean = pd.read_csv(CLEAN_LABELS_CSV)
    # NOTE(review): a visit listed with two different CDR values keeps both
    # rows here and therefore duplicates its slices in the merged table.
    labels_visit = df_labels_clean[["visit_id","cdr","label"]].drop_duplicates()
    df_all = df_all.merge(labels_visit, on="visit_id", how="left")
    df_all.to_csv(MERGED_CSV, index=False)
    print("Merged (images + tabular + labels) →", MERGED_CSV, df_all.shape)
    display(df_all.head(10))
else:
    # Still save a merged file without labels for later
    df_all.to_csv(MERGED_CSV, index=False)
    print("Labels not found; saved merged (images + tabular) without labels →", MERGED_CSV, df_all.shape)
    display(df_all.head(10))

print("\n Done. Outputs:")
print("   • Tabular features:", SUBJ_FEATS_CSV)
print("   • Slice PNGs dir  :", PNG_DIR)
print("   • Slice manifest  :", MANIFEST_CSV)
print("   • Merged CSV      :", MERGED_CSV, "(with labels if you uploaded them)")

### Citation
**AI Assistance**

Used OpenAI ChatGPT to help make the optional cell

In [None]:
import os, re, glob, shutil
import pandas as pd
from tqdm import tqdm

# Reuse the locations chosen in earlier cells when they exist, so a customised
# OUT_ROOT / PNG_DIR is not silently replaced; the fallbacks equal the
# previously hard-coded paths.
_out_root = globals().get("OUT_ROOT", "/content/oasis_pipeline")
PNG_DIR = globals().get("PNG_DIR", os.path.join(_out_root, "pngs"))
BY_VISIT_DIR = os.path.join(_out_root, "pngs_by_visit")   # one sub-folder per visit
EXPORT_ZIPS_DIR = os.path.join(_out_root, "exports")      # per-visit ZIPs + the combined ZIP
os.makedirs(BY_VISIT_DIR, exist_ok=True)
os.makedirs(EXPORT_ZIPS_DIR, exist_ok=True)

def _visit_id_from_any(s: str) -> str:
    """Extract the first visit ID (e.g. ``OAS1_0001_MR1``) found in *s*.

    The input is stringified and upper-cased before matching; ``""`` is
    returned when nothing matches.
    """
    found = re.search(r"(OAS1|OAS2)_[0-9]{4}_MR[0-9]+", str(s).upper())
    if found is None:
        return ""
    return found.group(0)

# 1) Build mapping: visit_id -> list of png paths
pngs = sorted(glob.glob(os.path.join(PNG_DIR, "*.png")))
visit_map = {}

# The in-memory manifest is the preferred source; filenames are the fallback
use_manifest = "df_manifest" in globals() and "visit_id" in df_manifest.columns
if not use_manifest:
    print("No df_manifest with visit_id found; parsing visit_id from filenames")
    for p in pngs:
        from_name = _visit_id_from_any(os.path.basename(p))
        vi = from_name or _visit_id_from_any(p) or "_unknown_visit"
        visit_map.setdefault(vi, []).append(p)
else:
    print("Using df_manifest to map visit_ids → PNGs")
    for _, r in df_manifest.iterrows():
        vi = r.get("visit_id") or _visit_id_from_any(r.get("png_path"))
        if vi:
            visit_map.setdefault(vi, []).append(r["png_path"])

# Optional tables from earlier cells; each flag is False when the table is absent or empty
_has_feats = "df_feats" in globals() and not pd.DataFrame(df_feats).empty
_has_labels = "labels_visit" in globals() and not pd.DataFrame(labels_visit).empty

# 2) Copy PNGs + write per-visit CSVs
print(f"Organizing {sum(len(v) for v in visit_map.values())} PNGs into folders and writing CSVs…")
for vi, paths in tqdm(visit_map.items()):
    out_dir = os.path.join(BY_VISIT_DIR, vi)
    os.makedirs(out_dir, exist_ok=True)

    # Copy PNGs. Always overwrite: slice PNGs are regenerated under the same
    # file name on every run, so skipping existing copies would leave stale
    # images in the per-visit folders (and in the ZIPs built from them).
    for src in paths:
        if not isinstance(src, str) or not os.path.exists(src):
            continue
        dst = os.path.join(out_dir, os.path.basename(src))
        try:
            shutil.copy2(src, dst)
        except OSError as e:
            print("copy failed:", src, "->", e)

    # Build per-visit manifest CSV
    if use_manifest:
        df_v = df_manifest[df_manifest['visit_id'] == vi].copy()
        # Attach labels if present
        if _has_labels:
            df_v = df_v.merge(labels_visit, on='visit_id', how='left')
    else:
        # Minimal manifest from filenames
        df_v = pd.DataFrame({
            "visit_id": [vi]*len(paths),
            "png_path": [os.path.join(out_dir, os.path.basename(p)) for p in paths]
        })
    man_csv = os.path.join(out_dir, f"manifest_{vi}.csv")
    df_v.to_csv(man_csv, index=False)

    # Build per-visit tabular CSV (FAST-derived volumes/ratios)
    if _has_feats:
        df_tab = df_feats[df_feats['visit_id'] == vi].copy()
        if df_tab.empty:
            # create a 1-row placeholder so downstream code doesn't break
            df_tab = pd.DataFrame([{"visit_id": vi}])
        tab_csv = os.path.join(out_dir, f"tabular_{vi}.csv")
        df_tab.to_csv(tab_csv, index=False)

print("Per-visit folders with PNGs + CSVs created at:", BY_VISIT_DIR)

# 3) Zip each visit
print("Zipping each visit…")
made = []  # paths of the per-visit archives that were written
for vi in tqdm(sorted(visit_map)):
    folder = os.path.join(BY_VISIT_DIR, vi)
    if os.path.isdir(folder):
        zip_base = os.path.join(EXPORT_ZIPS_DIR, vi)
        shutil.make_archive(zip_base, 'zip', folder)
        made.append(zip_base + ".zip")

print(f"Created {len(made)} ZIPs in:", EXPORT_ZIPS_DIR)

# 4) One big ZIP with all per-visit folders
# (archives everything currently under BY_VISIT_DIR)
BIG_ZIP_BASE = os.path.join(EXPORT_ZIPS_DIR, "all_visits_packages")
BIG_ZIP = BIG_ZIP_BASE + ".zip"
shutil.make_archive(BIG_ZIP_BASE, 'zip', BY_VISIT_DIR)
print("Big ZIP:", BIG_ZIP)

# 5) Show sizes for convenience
def _fmt_size(bytes_):
    """Format a byte count with one decimal and a binary unit (B … PB).

    The value is divided by 1024 until it drops below 1024 or the unit
    list is exhausted, in which case it is reported in PB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = bytes_
    step = 0
    while step < len(units) and not (value < 1024.0):
        value /= 1024.0
        step += 1
    suffix = units[step] if step < len(units) else "PB"
    return f"{value:3.1f} {suffix}"

# Only the first 10 per-visit archives are listed to keep the output short
print("\n ZIP file sizes:")
for z in made[:10]:
    try:
        print(f" - {os.path.basename(z)}  {_fmt_size(os.path.getsize(z))}")
    except OSError as e:
        # Report the problem instead of hiding it; a bare except would also
        # swallow KeyboardInterrupt.
        print(f" - {os.path.basename(z)}  (size unavailable: {e})")
print(f" - {os.path.basename(BIG_ZIP)}  {_fmt_size(os.path.getsize(BIG_ZIP))}")

# Optional download prompt for the big zip:
# from google.colab import files
# files.download(BIG_ZIP)


### Citation
**AI Assistance**

Used OpenAI ChatGPT to help make a zip file with all the extracted data from each patient data file


# Citation
OpenAI. (2025). ChatGPT (Nov 25 version) [Large language model]

ChatGPT was used for help with implementing the syntax of the preprocessing code and of the steps that upload the patient data files, process them, and extract the necessary features [Large language model]

Conversations with the user took place in November 2025

Link to ChatGPT: https://chatgpt.com/