# UDS Data Cleaning and Preparation

This notebook extracts the C1/C2 Neuropsych Battery variable catalog from the UDS PDF,
aligns the investigator CSV to those variables, and prepares analysis-ready subsets.

Outputs saved to the configured output directory include:
- `variable_catalog.csv`
- `cleaned_subset.parquet` (catalog variables, plus the `has_MMSE` / `has_MOCA` flag columns used for the split)
- `availability_summary.csv` (column-wise non-missing counts)
- `stats.txt` (empty-rows summary)
- Optional: `availability_heatmap.png`
- `mmse_only.parquet`, `moca_only.parquet` (mutually exclusive subsets)

Requirements: `pandas`, `pdfplumber`, `pyarrow` (Parquet output), `matplotlib`, `seaborn` (for optional heatmap).


In [None]:
# Parameters

# Input files: the UDS PDF the catalog is built from and the investigator CSV.
PDF_PATH = '../../data-files/rdd_uds.pdf'
CSV_PATH = '../../data-files/investigator_nacc67.csv'

# Inclusive, zero-based PDF page span holding the C1/C2 tables.
PAGE_RANGE = (23, 27)

# Folder that receives the files written by this notebook.
OUT_DIR = '../../outputs/uds_extraction'

# Column lists handed to align_dataset_to_catalog; extend if needed.
MMSE_COLS = ['NACCMMSE']
MOCA_COLS = ['NACCMOCA']

# When True, plot_availability_heatmap is called after the stats are written.
PLOT_HEATMAP = True


In [None]:
from pathlib import Path
import pandas as pd
import sys

# Locate the nearest directory (the working directory first, then each parent
# in turn) that contains a 'src' entry, and append it to sys.path so that the
# 'src' package can be imported. project_root stays None when nothing matches.
project_root = next(
    filter(
        lambda folder: (folder / 'src').exists(),
        (Path.cwd(), *Path.cwd().parents),
    ),
    None,
)
if project_root is not None and str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.data.uds_extraction import (
    build_variable_catalog,
    load_nacc_csv,
    align_dataset_to_catalog,
    compute_empty_rows_mask,
    plot_availability_heatmap,
)

# Turn the configured output location into a Path and make sure it exists;
# missing parent folders are created and an existing folder is not an error.
out_dir = Path(OUT_DIR)
out_dir.mkdir(exist_ok=True, parents=True)

# Trailing expression: the notebook shows the directory as a POSIX-style string.
out_dir.as_posix()


## Build variable catalog from PDF


In [None]:
# Build the catalog from the configured PDF pages and store it as CSV
# (without the index column) inside the output directory.
catalog_path = out_dir / 'variable_catalog.csv'
catalog = build_variable_catalog(PDF_PATH, PAGE_RANGE)
catalog.to_csv(catalog_path, index=False)

# Trailing expression: preview the first ten catalog rows in the notebook.
catalog.head(10)


## Load CSV and align to catalog


In [None]:
# Load the investigator CSV and align it to the catalog. The helper returns the
# aligned frame together with an availability summary.
df = load_nacc_csv(CSV_PATH)
cleaned, availability = align_dataset_to_catalog(
    df,
    catalog,
    mmse_cols=MMSE_COLS,
    moca_cols=MOCA_COLS,
)

# pyarrow compatibility, step 1: let pandas pick the best available dtype
# for every column.
cleaned = cleaned.convert_dtypes()

# Step 2: whatever is still stored as generic Python objects is cast to the
# pandas "string" dtype before it is written to Parquet.
for col in cleaned.columns:
    if pd.api.types.is_object_dtype(cleaned[col].dtype):
        cleaned[col] = cleaned[col].astype("string")

# Step 3: write both artifacts; the Parquet engine is pinned to pyarrow.
cleaned_path = out_dir / 'cleaned_subset.parquet'
availability_path = out_dir / 'availability_summary.csv'
cleaned.to_parquet(cleaned_path, index=False, engine='pyarrow')
availability.to_csv(availability_path, index=False)

# Trailing expression: show the shapes of both frames in the notebook.
cleaned.shape, availability.shape


In [None]:
# Count the rows flagged by compute_empty_rows_mask versus the remaining rows,
# then write the three-line summary to stats.txt and echo it in the notebook.
empty_mask = compute_empty_rows_mask(cleaned)
stats_txt = ''.join([
    f'Rows total: {len(cleaned)}\n',
    f'Completely empty (all -4/NaN): {int(empty_mask.sum())}\n',
    f'With some data: {int((~empty_mask).sum())}\n',
])
stats_path = out_dir / 'stats.txt'
stats_path.write_text(stats_txt)
print(stats_txt)

# The heatmap is only produced when the PLOT_HEATMAP parameter is truthy.
if PLOT_HEATMAP:
    heatmap_path = out_dir / 'availability_heatmap.png'
    plot_availability_heatmap(cleaned, out_path=str(heatmap_path))


## Quick previews


In [None]:
# Render both frames in the notebook output: the aligned subset first, then
# the availability summary.
for frame in (cleaned, availability):
    display(frame)


## Split into MMSE-only and MOCA-only (XOR) and save


In [None]:
# Select the rows for which exactly one of the has_MMSE / has_MOCA flags is
# set (element-wise XOR of the two columns), working on an independent copy.
has_mmse = cleaned["has_MMSE"]
has_moca = cleaned["has_MOCA"]
xor_mask = has_mmse ^ has_moca
filtered = cleaned.loc[xor_mask].copy()

# Partition the selection by flag; missing flag values count as False here.
mmse_rows = filtered["has_MMSE"].fillna(False)
moca_rows = filtered["has_MOCA"].fillna(False)
df_mmse_only = filtered.loc[mmse_rows].copy()
df_moca_only = filtered.loc[moca_rows].copy()

# Write each subset to its own Parquet file (index column omitted).
mmse_only_path = out_dir / 'mmse_only.parquet'
df_mmse_only.to_parquet(mmse_only_path, index=False)
moca_only_path = out_dir / 'moca_only.parquet'
df_moca_only.to_parquet(moca_only_path, index=False)

# Report how many rows went into each file.
summary = (
    f"Saved MMSE-only rows: {len(df_mmse_only)} to {mmse_only_path}\n"
    f"Saved MOCA-only rows: {len(df_moca_only)} to {moca_only_path}"
)
print(summary)
