# 1. Organize Raw ADNI Data from CSV

This notebook organizes the raw ADNI dataset using the provided CSV file that contains metadata about each neuroimaging file. The CSV contains information about Image Data ID, Subject, Group, and other metadata.

**CSV Structure:**
- `Image Data ID`: Unique identifier for each image (e.g., 'I325923')
- `Subject`: Subject identifier (e.g., '116_S_4855')
- `Group`: Research group classification (AD, CN, or MCI; the `EMCI` and `LMCI` subtypes are also accepted and are merged into MCI)
- Other columns: Sex, Age, Visit, Modality, etc.

The notebook copies each NIfTI file into a group-specific folder (`ad/`, `cn/`, or `mci/`) under the output directory and renames it to `<Subject>_<Image Data ID>` with a `.nii` or `.nii.gz` extension.

In [None]:
import os
import re
import shutil
from collections import Counter
from datetime import datetime
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

### Define Paths

In [None]:
# Path to your CSV file containing the metadata
csv_file = Path("PATH_TO_DATA")

# Base path where your NIfTI files are located
# Update this to point to your actual ADNI data directory
data_base_path = Path("PATH_TO_DATA")

# Output path for organized data
output_path = Path("PATH_TO_DATA")
ad_path = output_path / "ad"
cn_path = output_path / "cn"
mci_path = output_path / "mci"

# Create output directories
ad_path.mkdir(parents=True, exist_ok=True)
cn_path.mkdir(parents=True, exist_ok=True)
mci_path.mkdir(parents=True, exist_ok=True)

print(f"CSV file: {csv_file}")
print(f"Data base path: {data_base_path}")
print(f"Output path: {output_path}")

### Load and Inspect CSV Data

In [None]:
# Load the CSV file - try different parsing approaches
try:
    # First, try with comma delimiter (standard)
    df = pd.read_csv(csv_file, delimiter=',')
    print(f"Loaded CSV with {len(df)} rows using comma delimiter")
    print("\nColumn names:")
    print(df.columns.tolist())
    print("\nFirst few rows:")
    print(df.head())
    print("\nGroup distribution:")
    print(df['Group'].value_counts())
except Exception as e:
    print(f"Error with comma delimiter: {e}")
    
    try:
        # Try with semicolon delimiter
        df = pd.read_csv(csv_file, delimiter=';')
        print(f"Loaded CSV with {len(df)} rows using semicolon delimiter")
        print("\nColumn names:")
        print(df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())
        
        # Check if we have the malformed single column issue
        if len(df.columns) == 1 and ',' in df.columns[0]:
            print("\nDetected malformed CSV - column names are in one cell.")
            print("Attempting to fix...")
            
            # Extract the column names from the first column header
            header_col = df.columns[0]
            column_names = [col.strip().strip('"') for col in header_col.split(',')]
            print(f"Extracted column names: {column_names}")
            
            # Read the CSV again with proper column names
            df = pd.read_csv(csv_file, delimiter=';', names=column_names, skiprows=1)
            print(f"Fixed CSV with {len(df)} rows and {len(df.columns)} columns")
            print("\nColumn names:")
            print(df.columns.tolist())
            print("\nFirst few rows:")
            print(df.head())
        
        print("\nGroup distribution:")
        print(df['Group'].value_counts())
        
    except Exception as e2:
        print(f"Error with semicolon delimiter: {e2}")
        print("Please check the file path and format")

### Data Validation and Cleaning

In [None]:
# Remove any rows with missing essential information
initial_count = len(df)
df_clean = df.dropna(subset=['Image Data ID', 'Subject', 'Group'])
dropped_count = initial_count - len(df_clean)

print(f"Dropped {dropped_count} rows with missing essential data")
print(f"Remaining: {len(df_clean)} rows")

# Make baseline-only filtering optional with a flag
BASELINE_ONLY = False  # <-- Set this to True to only use baseline scans for MCI/EMCI/LMCI

print("\n=== MCI PROCESSING (includes MCI, EMCI, LMCI) ===")
mci_types = ['MCI', 'EMCI', 'LMCI']
mci_before = len(df_clean[df_clean['Group'].isin(mci_types)])
print(f"MCI/EMCI/LMCI entries before processing: {mci_before}")

# Show breakdown by type
mci_breakdown = df_clean[df_clean['Group'].isin(mci_types)]['Group'].value_counts()
print(f"Breakdown: {dict(mci_breakdown)}")

if BASELINE_ONLY:
    # For MCI patients (all types), filter to only baseline/initial scans
    mci_mask = df_clean['Group'].isin(mci_types)
    mci_patients = df_clean[mci_mask].copy()

    # Filter MCI patients to only include visits with 'bl' or 'init'
    mci_baseline_mask = mci_patients['Visit'].str.contains('bl|init', case=False, na=False)
    mci_baseline = mci_patients[mci_baseline_mask].copy()

    # For each MCI subject, keep only the first baseline/initial scan
    mci_filtered = mci_baseline.sort_values(['Subject', 'Visit']).groupby('Subject').first().reset_index()

    # Normalize all MCI types to 'MCI' for consistency
    mci_filtered['Group'] = 'MCI'

    print(f"MCI entries after baseline filtering: {len(mci_baseline)}")
    print(f"MCI entries after keeping only first baseline per subject: {len(mci_filtered)}")

    # Combine non-MCI data with filtered MCI data
    non_mci_data = df_clean[~df_clean['Group'].isin(mci_types)]
    df_clean = pd.concat([non_mci_data, mci_filtered], ignore_index=True)

    mci_after = len(df_clean[df_clean['Group'] == 'MCI'])
    print(f"Final MCI entries (normalized from MCI/EMCI/LMCI): {mci_after}")
    print(f"Total MCI entries removed: {mci_before - mci_after}")
else:
    # Normalize all MCI types to 'MCI' for consistency, but keep all scans/rows
    df_clean['Group'] = df_clean['Group'].replace({'EMCI': 'MCI', 'LMCI': 'MCI'})

    mci_after = len(df_clean[df_clean['Group'] == 'MCI'])
    print(f"Final MCI entries (normalized from MCI/EMCI/LMCI): {mci_after}")
    print(f"Total MCI entries removed: {mci_before - mci_after}")  # This will likely be 0

# Show some statistics
print("\n=== DATA STATISTICS ===")
print(f"Unique subjects: {df_clean['Subject'].nunique()}")
print(f"Unique images: {df_clean['Image Data ID'].nunique()}")
print("\nGroup distribution:")
print(df_clean['Group'].value_counts())

print("\nSample subjects by group:")
for group in ['AD', 'CN', 'MCI']:
    subjects = df_clean[df_clean['Group'] == group]['Subject'].unique()[:3]
    print(f"  {group}: {subjects}")

if BASELINE_ONLY:
    print("\nMCI visit types (should only be baseline/initial):")
else:
    print("\nMCI visit types (all visit types for MCI retained):")

if len(df_clean[df_clean['Group'] == 'MCI']) > 0:
    mci_visits = df_clean[df_clean['Group'] == 'MCI']['Visit'].value_counts()
    print(mci_visits)

### File Finding Functions

In [None]:
def find_nifti_file(subject_id, image_id, base_path, acq_date=None, description=None):
    """Find the NIfTI file for a given subject and image ID.
    
    Since Image Data ID doesn't appear in filenames, we use:
    1. Subject ID to find the directory
    2. Acquisition date to match the scan date folder
    3. Description to match scan type (FDG, PIB, etc.)
    
    File structure: base_path/SUBJECT_ID/SCAN_TYPE/DATE/FILENAME.nii.gz
    """
    from datetime import datetime
    
    # First, look for the subject directory
    subject_path = base_path / subject_id
    if not subject_path.exists():
        print(f"  Warning: Subject directory not found: {subject_path}")
        return None
    
    # Search for any .nii or .nii.gz files in subject's subdirectories
    nifti_files = []
    for pattern in ["**/*.nii", "**/*.nii.gz"]:
        nifti_files.extend(list(subject_path.glob(pattern)))
    
    if not nifti_files:
        print(f"  Warning: No NIfTI files found for subject {subject_id}")
        return None
    
    # If only one file, return it
    if len(nifti_files) == 1:
        return nifti_files[0]
    
    # Multiple files found - try to match using acquisition date and description
    print(f"  Info: Found {len(nifti_files)} files for {subject_id}")
    
    best_matches = []
    
    # Try to match by acquisition date if provided
    if acq_date:
        try:
            # Parse the acquisition date (format: M/D/YYYY)
            target_date = datetime.strptime(acq_date, "%m/%d/%Y")
            target_date_str = target_date.strftime("%Y-%m-%d")
            
            for file_path in nifti_files:
                # Check if date folder matches
                date_folder = file_path.parent.name  # e.g., "2011-06-09_08_23_48.0"
                if target_date_str in date_folder:
                    best_matches.append(file_path)
                    
        except ValueError:
            print(f"  Warning: Could not parse acquisition date: {acq_date}")
    
    # If we found date matches, use those
    if best_matches:
        if len(best_matches) == 1:
            print(f"  ✓ Matched by date: {best_matches[0].name}")
            return best_matches[0]
        else:
            print(f"  Info: Multiple date matches, trying description filter...")
            nifti_files = best_matches
    
    # Try to match by description/modality (FDG, PIB, etc.)
    if description:
        desc_upper = description.upper()
        modality_matches = []
        
        for file_path in nifti_files:
            file_upper = str(file_path).upper()
            # Check for common scan types
            if 'FDG' in desc_upper and 'FDG' in file_upper:
                modality_matches.append(file_path)
            elif 'PIB' in desc_upper and 'PIB' in file_upper:
                modality_matches.append(file_path)
            elif 'AMYLOID' in desc_upper and ('PIB' in file_upper or 'AMYLOID' in file_upper):
                modality_matches.append(file_path)
        
        if modality_matches:
            print(f"  ✓ Matched by modality: {modality_matches[0].name}")
            return modality_matches[0]
    
    # If no specific matches, return the first file but warn
    print(f"  Warning: Multiple files, no clear match. Using first file:")
    for i, f in enumerate(nifti_files[:3]):  # Show first 3
        print(f"    {i+1}. {f.name}")
    if len(nifti_files) > 3:
        print(f"    ... and {len(nifti_files)-3} more")
    
    return nifti_files[0]

def get_destination_path(group, subject_id, image_id, source_file):
    """Get the destination path for a file based on group."""
    if group == 'AD':
        dest_dir = ad_path
    elif group == 'CN':
        dest_dir = cn_path
    elif group in ['MCI', 'EMCI', 'LMCI']:
        # All MCI types go to the same MCI folder
        dest_dir = mci_path
    else:
        return None
    
    # Preserve the original file extension (.nii or .nii.gz)
    original_name = source_file.name
    if original_name.endswith('.nii.gz'):
        extension = '.nii.gz'
    elif original_name.endswith('.nii'):
        # Check if it's actually compressed despite .nii extension
        try:
            with open(source_file, 'rb') as f:
                header = f.read(2)
            if header == b'\x1f\x8b':  # gzip magic number
                extension = '.nii.gz'
                print(f"  Note: Correcting extension for compressed file: {original_name}")
            else:
                extension = '.nii'
        except:
            extension = '.nii'  # fallback
    else:
        extension = '.nii'  # fallback
    
    # Create standardized filename: Subject_ImageID.extension
    dest_filename = f"{subject_id}_{image_id}{extension}"
    return dest_dir / dest_filename

# Test the function with a few examples
print("Testing file search with first few entries...")
print(f"Base path: {data_base_path}")
print(f"Base path exists: {data_base_path.exists()}")
print()

test_rows = df_clean.head(5)
for _, row in test_rows.iterrows():
    subject_id = row['Subject']
    image_id = row['Image Data ID']
    group = row['Group']
    acq_date = row.get('Acq Date', None)
    description = row.get('Description', None)
    
    print(f"Testing: {subject_id} / {image_id} / {group}")
    print(f"  Acq Date: {acq_date}, Description: {description}")
    
    found_file = find_nifti_file(subject_id, image_id, data_base_path, acq_date, description)
    if found_file:
        print(f"  ✓ Found: {found_file}")
        dest_path = get_destination_path(group, subject_id, image_id, found_file)
        print(f"  → Would copy to: {dest_path}")
    else:
        print(f"  ✗ Not found")
    print()

### Organize Files

In [None]:
# Process each row in the CSV
# Note: MCI patients are filtered to only include baseline/initial scans
success_count = 0
error_count = 0
group_counts = {'AD': 0, 'CN': 0, 'MCI': 0}
errors = []
skipped_duplicates = 0

print("Starting file organization...")
print(f"Processing {len(df_clean)} entries...")
print("Note: MCI patients filtered to only baseline/initial scans\n")

for idx, row in tqdm(df_clean.iterrows(), total=len(df_clean), desc="Organizing files"):
    try:
        # Extract information from row
        subject_id = row['Subject']
        image_id = row['Image Data ID']
        group = row['Group']
        acq_date = row.get('Acq Date', None)
        description = row.get('Description', None)
        
        # Skip if group is not one of the expected values (MCI types are normalized to 'MCI' earlier)
        if group not in ['AD', 'CN', 'MCI', 'EMCI', 'LMCI']:
            errors.append(f"Row {idx}: Invalid group '{group}' for subject {subject_id}")
            error_count += 1
            continue
        
        # Find the source NIfTI file
        source_file = find_nifti_file(subject_id, image_id, data_base_path, acq_date, description)
        if source_file is None:
            errors.append(f"Row {idx}: File not found for {subject_id}/{image_id}")
            error_count += 1
            continue
        
        # Get destination path
        dest_path = get_destination_path(group, subject_id, image_id, source_file)
        if dest_path is None:
            errors.append(f"Row {idx}: Invalid group '{group}'")
            error_count += 1
            continue
        
        # Copy file if it doesn't exist
        if dest_path.exists():
            skipped_duplicates += 1
        else:
            shutil.copy2(source_file, dest_path)
        
        group_counts[group] += 1
        success_count += 1
        
    except Exception as e:
        errors.append(f"Row {idx}: Unexpected error - {str(e)}")
        error_count += 1

# Print summary
print(f"\n{'='*50}")
print("ORGANIZATION SUMMARY")
print(f"{'='*50}")
print(f"Successfully processed: {success_count} files")
print(f"Errors encountered: {error_count} files")
print(f"Skipped duplicates: {skipped_duplicates} files")
print(f"\nFiles organized by group:")
for group, count in group_counts.items():
    print(f"  {group}: {count} files")

if errors:
    print(f"\nFirst 10 errors:")
    for error in errors[:10]:
        print(f"  {error}")
    if len(errors) > 10:
        print(f"  ... and {len(errors) - 10} more errors")

### Verification and Final Summary

In [None]:
# Verify the organization by counting actual files
print("\n" + "="*50)
print("VERIFICATION")
print("="*50)

actual_counts = {}
all_organized_files = []

for group_name, group_path in [('AD', ad_path), ('CN', cn_path), ('MCI', mci_path)]:
    files = list(group_path.glob('*.nii*'))
    actual_counts[group_name] = len(files)
    all_organized_files.extend(files)
    print(f"\n{group_name} folder:")
    print(f"  Files found: {len(files)}")
    if files:
        print(f"  Example files: {[f.name for f in files[:3]]}")
        if len(files) > 3:
            print(f"  ... and {len(files) - 3} more")

total_organized = sum(actual_counts.values())
print(f"\nTotal files organized: {total_organized}")
print(f"Total CSV entries: {len(df_clean)}")

# Better success rate calculation
csv_entries_processed = success_count  # From the processing loop
print(f"CSV entries successfully processed: {csv_entries_processed}")
print(f"Processing success rate: {csv_entries_processed/len(df_clean)*100:.1f}%")

# Analyze subjects with multiple scans - both in CSV and in organized files
print(f"\n" + "="*30)
print("MULTIPLE SCANS ANALYSIS")
print("="*30)

# CSV analysis
subject_counts_csv = df_clean['Subject'].value_counts()
multiple_scans_csv = subject_counts_csv[subject_counts_csv > 1]

# Organized files analysis - extract subject IDs from organized filenames
organized_subjects = []
for file_path in all_organized_files:
    # Extract subject ID from filename (assuming format: SUBJECT_ID_*.nii)
    filename = file_path.name
    if '_' in filename:
        subject_id = filename.split('_')[0] + '_' + filename.split('_')[1] + '_' + filename.split('_')[2]
        organized_subjects.append(subject_id)

from collections import Counter
organized_subject_counts = Counter(organized_subjects)
multiple_scans_organized = {subj: count for subj, count in organized_subject_counts.items() if count > 1}

print(f"Subjects with multiple scans in CSV: {len(multiple_scans_csv)}")
print(f"Subjects with multiple scans organized: {len(multiple_scans_organized)}")

if multiple_scans_organized:
    print(f"\nTop subjects with most organized scans:")
    sorted_subjects = sorted(multiple_scans_organized.items(), key=lambda x: x[1], reverse=True)
    for subj, count in sorted_subjects[:5]:
        csv_count = subject_counts_csv.get(subj, 0)
        print(f"  {subj}: {count} organized files (from {csv_count} CSV entries)")

# Check for any mismatches
print(f"\n" + "="*25)
print("CONSISTENCY CHECK")
print("="*25)
total_unique_subjects_csv = len(df_clean['Subject'].unique())
total_unique_subjects_organized = len(set(organized_subjects))
print(f"Unique subjects in CSV: {total_unique_subjects_csv}")
print(f"Unique subjects organized: {total_unique_subjects_organized}")

if total_unique_subjects_organized < total_unique_subjects_csv:
    missing_subjects = set(df_clean['Subject'].unique()) - set(organized_subjects)
    print(f"⚠️  Missing subjects: {len(missing_subjects)}")
    if len(missing_subjects) <= 10:
        print(f"  Missing: {list(missing_subjects)}")
    else:
        print(f"  First 10 missing: {list(missing_subjects)[:10]}")
        
print("\nNote: Multiple scans per subject are EXPECTED and beneficial for longitudinal analysis.")

print("\n" + "="*50)
print("DATA ORGANIZATION COMPLETED!")
print("="*50)
print("\nNext steps:")
print("1. Verify the file organization looks correct")
print("2. Run the next notebook: 02_preprocess_3d.ipynb")
print("3. For MCI data with multiple scans, consider using 04_mci_conversion_split_improved.ipynb")