# UDS Table Extraction and Dataset Cleaning (Clean Notebook)

This notebook focuses on extracting the C1/C2 Neuropsych Battery variable catalog from the UDS PDF and aligning the investigator CSV to those variables.

Outputs saved to the configured output directory include:
- `variable_catalog.csv`
- `cleaned_subset.parquet` (only catalog variables)
- `availability_summary.csv` (column-wise non-missing counts)
- `stats.txt` (empty-rows summary)
- Optional: `availability_heatmap.png`

Requirements: `pandas`, `pdfplumber`, `matplotlib`, `seaborn` (for optional heatmap).


In [1]:
# Parameters
CSV_PATH = '../../data-files/investigator_nacc67.csv'
PDF_PATH = '../../data-files/rdd_uds.pdf'
PAGE_RANGE = (23, 27)  # inclusive zero-based pages for C1/C2 tables
OUT_DIR = '../../outputs/uds_extraction'
MMSE_COLS = ['NACCMMSE']  # extend if needed
MOCA_COLS = ['NACCMOCA']  # extend if needed
PLOT_HEATMAP = True


In [2]:
from pathlib import Path
import pandas as pd
from src.data.uds_extraction import (
    build_variable_catalog,
    load_nacc_csv,
    align_dataset_to_catalog,
    compute_empty_rows_mask,
    plot_availability_heatmap,
)

out_dir = Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)
out_dir.as_posix()


'../../outputs/uds_extraction'

## Build variable catalog from PDF


In [3]:
catalog = build_variable_catalog(PDF_PATH, PAGE_RANGE)
catalog_path = out_dir / 'variable_catalog.csv'
catalog.to_csv(catalog_path, index=False)
catalog.head(10)


Unnamed: 0,form_field,variable_name,label,source_page
0,C1,MMSECOMP,Was any part of the MMSE completed?,26
1,C1 Neuropsychological Battery\nSummary Scores,MMSELOC,Administration of the MMSE was:,27
2,C1,MMSELAN,Language of MMSE administration,28
3,C1 Neuropsychological Battery\nSummary Scores,MMSELANX,Language of MMSE administration —\nOther (spec...,29
4,C1,MMSEVIS,Subject was unable to complete one or\nmore se...,30
5,C1 Neuropsychological Battery\nSummary Scores,MMSEHEAR,Subject was unable to complete one or\nmore se...,31
6,C1,MMSEORDA,Orientation subscale score — Time,32
7,C1 Neuropsychological Battery\nSummary Scores,MMSEORLO,Orientation subscale score — Place,33
8,C1,PENTAGON,Intersecting pentagon subscale score,34
9,C1 Neuropsychological Battery\nSummary Scores,NACCMMSE,Total MMSE score (using D-L-R-O-W),35


## Load CSV and align to catalog


In [4]:
df = load_nacc_csv(CSV_PATH)
cleaned, availability = align_dataset_to_catalog(
    df, catalog, mmse_cols=MMSE_COLS, moca_cols=MOCA_COLS
)
cleaned_path = out_dir / 'cleaned_subset.parquet'
availability_path = out_dir / 'availability_summary.csv'
cleaned.to_parquet(cleaned_path, index=False)
availability.to_csv(availability_path, index=False)
cleaned.shape, availability.shape


  return pd.read_csv(path)


ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

## Empty-rows statistics and optional heatmap


In [None]:
empty_mask = compute_empty_rows_mask(cleaned)
stats_txt = (
    f'Rows total: {len(cleaned)}\n'
    f'Completely empty (all -4/NaN): {int(empty_mask.sum())}\n'
    f'With some data: {int((~empty_mask).sum())}\n'
)
(out_dir / 'stats.txt').write_text(stats_txt)
print(stats_txt)
if PLOT_HEATMAP:
    plot_availability_heatmap(cleaned, out_path=str(out_dir / 'availability_heatmap.png'))


## Quick previews


In [None]:
display(cleaned.head())
display(availability.head())
# End of notebook


In [None]:
# Look at a few sample rows with some data
rows_with_data = df_neither[~completely_empty]

if len(rows_with_data) > 0:
    print(f"\nSample of rows with some data (showing first 5):")
    print(rows_with_data.head())
    
    # Show which columns have data in these sample rows
    print("\nNon-empty values in first sample row:")
    first_row = rows_with_data.iloc[0]
    for col in first_row.index:
        if col not in ['has_MMSE', 'has_MOCA'] and pd.notna(first_row[col]) and first_row[col] != -4:
            print(f"  {col}: {first_row[col]}")