In [1]:
import os
from collections import Counter, defaultdict

DATA_DIR = "data/Sagittal_T1_FLAIR"  # change if needed

# Per-patient tallies: how many scans, and which file names make up that number.
patient_counter = Counter()
patient_files = defaultdict(list)

for fname in os.listdir(DATA_DIR):
    if not fname.endswith(".nii.gz"):
        continue  # only compressed NIfTI volumes are counted
    # The patient ID is the integer that precedes the first "_".
    id_token = fname.partition("_")[0]
    try:
        patient_id = int(id_token)
    except ValueError:
        # Names with a non-numeric prefix are reported and left out of the tallies.
        print(f"Could not parse patient ID from: {fname}")
        continue
    patient_counter[patient_id] += 1
    patient_files[patient_id].append(fname)

print("\n================ SUMMARY ================")
print(f"Total files          : {sum(patient_counter.values())}")
print(f"Unique patients      : {len(patient_counter)}")
print(f"Repeated patients    : {sum(1 for p in patient_counter if patient_counter[p] > 1)}")

print("\n=========== PATIENT SCAN COUNTS ==========")
# Keys are unique integers, so sorting the keys gives the same order as sorting the items.
for pid in sorted(patient_counter):
    count = patient_counter[pid]
    print(f"Patient {pid:>3} -> {count} scan(s)")

print("\n====== PATIENTS WITH MULTIPLE SCANS ======")
for pid in sorted(patient_files):
    files = patient_files[pid]
    if len(files) <= 1:
        continue  # single-scan patients are already covered by the table above
    print(f"\nPatient {pid} ({len(files)} scans):")
    for f in files:
        print(f"   {f}")


Could not parse patient ID from: 436@_20200819_Lumbar_Spine_Sagittal_T1_FLAIR_s5.nii.gz
Could not parse patient ID from: 368@_20200805_Lumbar_Spine_Sagittal_T1_FLAIR_s5.nii.gz

Total files          : 240
Unique patients      : 231
Repeated patients    : 9

Patient   1 -> 1 scan(s)
Patient   2 -> 1 scan(s)
Patient   3 -> 1 scan(s)
Patient   6 -> 2 scan(s)
Patient   9 -> 1 scan(s)
Patient  10 -> 1 scan(s)
Patient  12 -> 1 scan(s)
Patient  13 -> 1 scan(s)
Patient  14 -> 1 scan(s)
Patient  15 -> 1 scan(s)
Patient  18 -> 1 scan(s)
Patient  22 -> 1 scan(s)
Patient  23 -> 1 scan(s)
Patient  26 -> 1 scan(s)
Patient  31 -> 1 scan(s)
Patient  33 -> 1 scan(s)
Patient  36 -> 1 scan(s)
Patient  37 -> 1 scan(s)
Patient  39 -> 1 scan(s)
Patient  42 -> 1 scan(s)
Patient  46 -> 1 scan(s)
Patient  48 -> 1 scan(s)
Patient  55 -> 1 scan(s)
Patient  58 -> 1 scan(s)
Patient  65 -> 1 scan(s)
Patient  67 -> 1 scan(s)
Patient  68 -> 1 scan(s)
Patient  71 -> 1 scan(s)
Patient  73 -> 1 scan(s)
Patient  74 -> 1 s

In [2]:
import os, re
import pandas as pd
from collections import Counter, defaultdict

# Folder whose *.nii.gz entries are listed and matched to patients by file-name prefix.
DATA_DIR = "data/Sagittal_T1_FLAIR"     # change if needed
# Spreadsheet loaded with pandas below; it must contain "ID" and "BMD" columns.
XLSX_PATH = "data/metadata.xlsx"        # change if needed

def parse_patient_id(filename: str):
    """
    Extract the patient ID from the start of a scan filename.

    The ID is the run of digits that opens the text before the first "_".
    Characters that trail the digits inside that first token are ignored,
    so a malformed prefix such as '368@' still resolves to 368.

    Examples:
      '368@_20200805_...'  -> 368
      '368_20200805_...'   -> 368
      'T1_FLAIR_s5.nii.gz' -> None   (digits are not at the start)

    Args:
        filename: File name without a directory prefix; the ID must be the
            very first thing in it.

    Returns:
        The patient ID as an int, or None when the name does not begin
        with at least one digit.
    """
    first_token = filename.split("_")[0]
    # Anchor at the start of the token. An unanchored search would turn
    # tokens such as 'T1' or 's5' into bogus patient IDs (1, 5) whenever a
    # file lacks the numeric prefix, instead of letting the caller report
    # the name as unparsed.
    m = re.match(r"\d+", first_token)
    return int(m.group()) if m else None

# --------- load metadata ----------
df = pd.read_excel(XLSX_PATH)

# The sheet has to expose both the patient key and the BMD value.
required = {"ID", "BMD"}
missing_cols = required - set(df.columns)
if missing_cols:
    raise ValueError(f"metadata.xlsx missing columns: {missing_cols}. Found: {list(df.columns)}")

# Force both columns to numeric; anything unparseable becomes NA.
id_numeric = pd.to_numeric(df["ID"], errors="coerce")
df["ID"] = id_numeric.astype("Int64")  # nullable integer dtype, so NA rows survive the cast
df["BMD"] = pd.to_numeric(df["BMD"], errors="coerce")

# One mask serves both purposes: reporting the bad rows and excluding them.
invalid_mask = df["ID"].isna() | df["BMD"].isna()
df_bad = df[invalid_mask]
if len(df_bad):
    print("\nWARNING: Rows with invalid ID or BMD (these will be ignored):")
    print(df_bad)

df_clean = df[~invalid_mask].copy()
df_clean["ID"] = df_clean["ID"].astype(int)

# IDs that occur on more than one metadata row
clean_ids = df_clean["ID"]
dup_ids = clean_ids[clean_ids.duplicated()].unique().tolist()

meta_ids = set(clean_ids.tolist())
meta_map = dict(zip(clean_ids, df_clean["BMD"]))

# --------- scan files ----------
all_files = [name for name in os.listdir(DATA_DIR) if name.endswith(".nii.gz")]

file_ids = []
unparsed = []
id_to_files = defaultdict(list)
for f in all_files:
    pid = parse_patient_id(f)
    if pid is not None:
        file_ids.append(pid)
        id_to_files[pid].append(f)
    else:
        # Keep the name so it can be listed at the end of the report.
        unparsed.append(f)

file_id_counts = Counter(file_ids)
folder_ids = set(file_id_counts)

# --------- compare ----------
ids_in_files_not_in_meta = sorted(folder_ids - meta_ids)
ids_in_meta_not_in_files = sorted(meta_ids - folder_ids)
ids_common = sorted(folder_ids & meta_ids)

print("\n=================== SUMMARY ===================")
print(f"Total .nii.gz files                 : {len(all_files)}")
print(f"Files with parsed patient IDs       : {sum(file_id_counts.values())}")
print(f"Files that FAILED ID parsing        : {len(unparsed)}")
print(f"Unique patient IDs in folder        : {len(folder_ids)}")
print(f"Unique patient IDs in metadata      : {len(meta_ids)}")
print(f"Patient IDs present in BOTH         : {len(ids_common)}")

print("\n=============== METADATA QUALITY ===============")
print(f"Duplicate IDs in metadata           : {len(dup_ids)}")
if dup_ids:
    print("Duplicate ID examples:", dup_ids[:20])

print("\n=============== MISMATCH REPORT ================")
print(f"IDs in folder but NOT in metadata   : {len(ids_in_files_not_in_meta)}")
print(f"IDs in metadata but NOT in folder   : {len(ids_in_meta_not_in_files)}")

if ids_in_files_not_in_meta:
    print("\nFirst 25 IDs in folder missing metadata:", ids_in_files_not_in_meta[:25])

if ids_in_meta_not_in_files:
    print("\nFirst 25 IDs in metadata missing files:", ids_in_meta_not_in_files[:25])

print("\n=========== REPEATED PATIENTS (folder) =========")
# Most scans first; ties are broken by ascending patient ID.
repeated = sorted(
    ((pid, c) for pid, c in file_id_counts.items() if c > 1),
    key=lambda x: (-x[1], x[0]),
)
print(f"Patients with >1 scan in folder     : {len(repeated)}")
for pid, c in repeated[:30]:
    print(f"Patient {pid} -> {c} scans")

if repeated:
    print("\nExample files for first 5 repeated patients:")
    for pid, c in repeated[:5]:
        print(f"\nPatient {pid} ({c} scans):")
        example_files = sorted(id_to_files[pid])[:10]
        for f in example_files:
            print("  ", f)

if unparsed:
    print("\n=========== UNPARSED FILENAMES ===========")
    for f in unparsed[:20]:
        print("  ", f)
    # Summarise whatever did not fit in the first 20 lines.
    if len(unparsed) > 20:
        print(f"  ... and {len(unparsed)-20} more")



Total .nii.gz files                 : 242
Files with parsed patient IDs       : 242
Files that FAILED ID parsing        : 0
Unique patient IDs in folder        : 233
Unique patient IDs in metadata      : 231
Patient IDs present in BOTH         : 231

Duplicate IDs in metadata           : 0

IDs in folder but NOT in metadata   : 2
IDs in metadata but NOT in folder   : 0

First 25 IDs in folder missing metadata: [368, 436]

Patients with >1 scan in folder     : 9
Patient 6 -> 2 scans
Patient 125 -> 2 scans
Patient 166 -> 2 scans
Patient 267 -> 2 scans
Patient 346 -> 2 scans
Patient 367 -> 2 scans
Patient 376 -> 2 scans
Patient 439 -> 2 scans
Patient 560 -> 2 scans

Example files for first 5 repeated patients:

Patient 6 (2 scans):
   6_20220722_Lumbar_Spine_Sagittal_T1_FLAIR_s4.nii.gz
   6_20220722_Lumbar_Spine_Sagittal_T1_FLAIR_s4_1.nii.gz

Patient 125 (2 scans):
   125_20220808_Lumbar_Spine_Sagittal_T1_FLAIR_s10.nii.gz
   125_20220808_Lumbar_Spine_Sagittal_T1_FLAIR_s6.nii.gz

Patient 

In [1]:
import nibabel as nib
import numpy as np

# Change this to ONE of your .nii or .nii.gz paths
path = r"data/Sagittal_T1_FLAIR/100_20220730_Lumbar_Spine_Sagittal_T1_FLAIR_s6.nii.gz"

img = nib.load(path)
aff = img.affine
data = img.get_fdata()   # float array

print("File:", path)
print("Shape (X,Y,Z,...) :", data.shape)
print("Affine:\n", aff)

# Orientation: which array axis corresponds to L/R, P/A, I/S in real space
try:
    axcodes = nib.aff2axcodes(aff)  # e.g. ('R','A','S')
    print("Orientation codes (axis0,axis1,axis2):", axcodes)

    # Sagittal slices are stacked along the Left/Right axis, i.e. the first
    # of the three spatial axes whose code is 'L' or 'R' (None if there is none).
    sag_axis = next(
        (axis for axis, code in enumerate(axcodes[:3]) if code in ("L", "R")),
        None,
    )
    print("Sagittal axis index:", sag_axis)

    # Slicing hint for each possible axis; any other value hits the fallback.
    slicing_hints = {
        0: "Sagittal slicing: data[mid, :, :]   (axis 0)",
        1: "Sagittal slicing: data[:, mid, :]   (axis 1)",
        2: "Sagittal slicing: data[:, :, mid]   (axis 2)",
    }
    print(slicing_hints.get(sag_axis, "Could not detect sagittal axis (unexpected orientation)."))

except Exception as e:
    print("Orientation detection failed:", e)

# For comparison: the slice produced by indexing the middle of axis 2 directly
if data.ndim >= 3:
    mid = data.shape[2] // 2
    sl = data[:, :, mid]
    print("\nIf you do data[:, :, mid], slice shape =", sl.shape, "(this is what your current slicing would output)")


File: data/Sagittal_T1_FLAIR/100_20220730_Lumbar_Spine_Sagittal_T1_FLAIR_s6.nii.gz
Shape (X,Y,Z,...) : (512, 512, 18)
Affine:
 [[-1.39474599e-02 -1.65454186e-02  4.49647141e+00 -2.76760006e+01]
 [-5.46721518e-01 -3.56541481e-04 -1.14903599e-01  1.73287994e+02]
 [ 7.78735674e-04 -5.46649516e-01 -1.36019558e-01  3.66510010e+02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00]]
Orientation codes (axis0,axis1,axis2): ('P', 'I', 'R')
Sagittal axis index: 2
Sagittal slicing: data[:, :, mid]   (axis 2)

If you do data[:, :, mid], slice shape = (512, 512) (this is what your current slicing would output)
