# UDS Table Extraction and Dataset Cleaning (Clean Notebook)

This notebook focuses on extracting the C1/C2 Neuropsych Battery variable catalog from the UDS PDF and aligning the investigator CSV to those variables.

Outputs saved to the configured output directory include:
- `variable_catalog.csv`
- `cleaned_subset.parquet` (only catalog variables)
- `availability_summary.csv` (column-wise non-missing counts)
- `stats.txt` (empty-rows summary)
- Optional: `availability_heatmap.png`

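Requirements: `pandas`, `numpy`, `pdfplumber`, `scikit-learn` (for the factor analysis), a Parquet engine such as `pyarrow`, and `matplotlib` / `seaborn` (for the heatmaps and scree plots).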


In [None]:
# Parameters
# All paths are relative, so they resolve against the notebook's working directory.

# Investigator CSV handed to load_nacc_csv().
CSV_PATH = '../../data-files/investigator_nacc67.csv'
# PDF handed to build_variable_catalog() together with PAGE_RANGE.
PDF_PATH = '../../data-files/rdd_uds.pdf'
PAGE_RANGE = (23, 27)  # inclusive zero-based pages for C1/C2 tables
# Directory that receives every artifact written by this notebook (created if missing).
OUT_DIR = '../../outputs/uds_extraction'
# Column-name lists forwarded to align_dataset_to_catalog() as mmse_cols / moca_cols.
MMSE_COLS = ['NACCMMSE']  # extend if needed
MOCA_COLS = ['NACCMOCA']  # extend if needed
# Gates the availability heatmap and is the default for the `plot` argument of
# compute_and_save_correlations().
PLOT_HEATMAP = True


In [None]:
import sys
from pathlib import Path

import pandas as pd

# Make the project's 'src' package importable: starting at the current working
# directory and walking up through its parents, take the first directory that
# contains a 'src' entry and append it to sys.path (once).
_cwd = Path.cwd()
project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / 'src').exists()),
    None,
)
if project_root is not None and str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.data.uds_extraction import (
    build_variable_catalog,
    load_nacc_csv,
    align_dataset_to_catalog,
    compute_empty_rows_mask,
    plot_availability_heatmap,
)

# Output directory for every artifact; missing parents are created as needed.
out_dir = Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)
# Last expression of the cell: echoes the output directory as the cell result.
out_dir.as_posix()


## Build variable catalog from PDF


In [None]:
# Build the variable catalog from PDF_PATH / PAGE_RANGE (the extraction logic
# lives in src.data.uds_extraction) and persist it as CSV without the index.
catalog = build_variable_catalog(PDF_PATH, PAGE_RANGE)
catalog_path = out_dir / 'variable_catalog.csv'
catalog.to_csv(catalog_path, index=False)
# Last expression of the cell: shows the first 10 catalog rows as the cell output.
catalog.head(10)


## Load CSV and align to catalog


In [None]:
# Load the investigator CSV and align it to the catalog; the helper returns the
# aligned subset together with an availability summary table.
df = load_nacc_csv(CSV_PATH)
cleaned, availability = align_dataset_to_catalog(
    df,
    catalog,
    mmse_cols=MMSE_COLS,
    moca_cols=MOCA_COLS,
)

# Step 1: let pandas pick the best nullable dtype for each column
# (Int64, Float64, string, ...), so numeric columns are ready for the
# correlation analysis further down.
cleaned = cleaned.convert_dtypes()

# Step 2: anything still typed 'object' (likely mixed text/numbers) is cast to
# the string dtype. This avoids the 'ArrowTypeError' on Parquet export without
# losing data; such columns can still be cast to numeric later if needed.
leftover_object_cols = list(cleaned.select_dtypes(include=['object']).columns)
for col in leftover_object_cols:
    cleaned[col] = cleaned[col].astype("string")

# Persist both tables next to the catalog.
cleaned_path = out_dir / 'cleaned_subset.parquet'
cleaned.to_parquet(cleaned_path, index=False)

availability_path = out_dir / 'availability_summary.csv'
availability.to_csv(availability_path, index=False)

# Last expression of the cell: shapes of both tables as the cell output.
cleaned.shape, availability.shape


## Empty-rows statistics and optional heatmap


In [None]:
# Boolean mask over the rows of `cleaned`; summed below to count the rows the
# helper flags as empty and, inverted, the rows that carry some data.
empty_mask = compute_empty_rows_mask(cleaned)

stats_txt = "".join(
    [
        f'Rows total: {len(cleaned)}\n',
        f'Completely empty (all -4/NaN): {int(empty_mask.sum())}\n',
        f'With some data: {int((~empty_mask).sum())}\n',
    ]
)

# Write the summary to disk and echo it in the notebook.
stats_file = out_dir / 'stats.txt'
stats_file.write_text(stats_txt)
print(stats_txt)

# The heatmap is optional and controlled by the PLOT_HEATMAP parameter.
if PLOT_HEATMAP:
    heatmap_file = out_dir / 'availability_heatmap.png'
    plot_availability_heatmap(cleaned, out_path=str(heatmap_file))


## Quick previews


In [None]:
# Render the aligned dataset and the availability summary inline for a quick
# visual check (display() is provided by the notebook environment).
display(cleaned)
display(availability)
# End of the extraction/cleaning part; the analysis cells follow below.


## Split into MMSE-only and MOCA-only (XOR) and save


In [None]:
# XOR of the two indicator columns: a row is kept only when exactly one of
# has_MMSE / has_MOCA is set.
mmse_flag = cleaned["has_MMSE"]
moca_flag = cleaned["has_MOCA"]
xor_mask = mmse_flag ^ moca_flag
filtered = cleaned.loc[xor_mask].copy()

# Partition the kept rows by which indicator is set.
df_mmse_only = filtered.loc[filtered["has_MMSE"]].copy()
df_moca_only = filtered.loc[filtered["has_MOCA"]].copy()

# Write each subset to its own Parquet file (index not stored).
mmse_only_path = out_dir / 'mmse_only.parquet'
moca_only_path = out_dir / 'moca_only.parquet'
for subset, target in ((df_mmse_only, mmse_only_path), (df_moca_only, moca_only_path)):
    subset.to_parquet(target, index=False)

summary_lines = [
    f"Saved MMSE-only rows: {len(df_mmse_only)} to {mmse_only_path}",
    f"Saved MOCA-only rows: {len(df_moca_only)} to {moca_only_path}",
]
print("\n".join(summary_lines))

## Correlation analysis for MMSE-only and MOCA-only sets


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def _prepare_numeric(df: pd.DataFrame, drop_cols=None) -> pd.DataFrame:
    drop_cols = set(drop_cols or [])
    # Select numeric columns only
    num = df.select_dtypes(include=["number"]).copy()
    # Drop known indicator/ID columns if present
    for col in ["has_MMSE", "has_MOCA"]:
        if col in num.columns:
            drop_cols.add(col)
    num = num.drop(columns=[c for c in drop_cols if c in num.columns], errors="ignore")
    # Drop columns that are all NA or constant
    non_na = num.dropna(axis=1, how="all")
    nunique = non_na.nunique(dropna=True)
    non_constant = non_na.loc[:, nunique > 1]
    # Further drop near-zero-variance (NZV) columns to aid factor convergence
    # Criteria: very small variance OR one level dominates (>= 99% same value)
    if non_constant.shape[1] == 0:
        return non_constant
    variances = non_constant.var(ddof=0)
    var_mask = variances > 1e-6
    # Dominant level frequency
    freq_mask = []
    for c in non_constant.columns:
        vc = non_constant[c].value_counts(normalize=True, dropna=True)
        max_prop = float(vc.iloc[0]) if len(vc) else 1.0
        freq_mask.append(max_prop < 0.99)
    freq_mask = pd.Series(freq_mask, index=non_constant.columns)
    keep_mask = var_mask & freq_mask
    nzv_filtered = non_constant.loc[:, keep_mask]
    return nzv_filtered

def compute_and_save_correlations(df: pd.DataFrame, label: str, out_dir: Path,
                                  methods=("pearson", "spearman"),
                                  plot=PLOT_HEATMAP) -> None:
    """Compute and save one correlation matrix per method for ``df``.

    The frame is first reduced with ``_prepare_numeric``. For every entry of
    ``methods`` the matrix is written to ``corr_{label}_{method}.csv`` in
    ``out_dir``; when ``plot`` is truthy a heatmap is also saved as
    ``corr_{label}_{method}.png``. If fewer than two numeric columns remain,
    a message is printed and nothing is written.
    """
    numeric = _prepare_numeric(df)
    n_numeric = numeric.shape[1]
    if n_numeric < 2:
        print(f"[{label}] Not enough numeric columns for correlation (found {n_numeric}). Skipping.")
        return

    for method in methods:
        matrix = numeric.corr(method=method)
        stem = f"corr_{label}_{method}"

        csv_target = out_dir / f"{stem}.csv"
        matrix.to_csv(csv_target)
        print(f"[{label}] Saved {method} correlation matrix to {csv_target}")

        if not plot:
            continue

        # Figure size grows with the matrix but is clamped to [8, 20] x [6, 20].
        n_rows, n_cols = matrix.shape
        fig_width = max(8, min(20, 0.35 * n_cols))
        fig_height = max(6, min(20, 0.35 * n_rows))
        plt.figure(figsize=(fig_width, fig_height))
        sns.heatmap(
            matrix,
            cmap="vlag",
            center=0,
            square=True,
            cbar_kws={"shrink": 0.6},
            linewidths=0.3,
        )
        plt.title(f"{label.upper()} — {method.title()} correlation")
        plt.tight_layout()
        png_target = out_dir / f"{stem}.png"
        plt.savefig(png_target, dpi=200)
        plt.close()
        print(f"[{label}] Saved {method} correlation heatmap to {png_target}")

# Run for each set separately.
# Each call writes corr_<label>_<method>.csv into out_dir, plus a PNG heatmap
# per method when plotting is enabled (PLOT_HEATMAP is the default).
compute_and_save_correlations(df_mmse_only, label="mmse_only", out_dir=out_dir)
compute_and_save_correlations(df_moca_only, label="moca_only", out_dir=out_dir)


## Correlation filter and Exploratory Factor Analysis (EFA)


In [None]:
from typing import List, Tuple, Dict
from sklearn.decomposition import FactorAnalysis

def _missing_rate_per_column(df: pd.DataFrame) -> pd.Series:
    return df.isna().mean()

def correlation_filter(df: pd.DataFrame,
                       method: str = "spearman",
                       threshold: float = 0.95,
                       prefer_keep: Tuple[str, ...] = (),
                       drop_cols: Tuple[str, ...] = ()) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
    """Filter out one variable from any pair with |corr| >= threshold.

    The input is first reduced with ``_prepare_numeric`` (``drop_cols`` is
    forwarded to it). Pairs are processed greedily from the strongest
    correlation down; a pair is skipped when one of its members has already
    been dropped.

    Tie-breaker rules:
      1) If exactly one member of the pair is listed in ``prefer_keep``, the
         other member is dropped.
      2) Otherwise (neither or both preferred) drop the one with the higher
         missingness rate.
      3) If equal, drop the one that is later alphabetically.

    Args:
        df: Input data; only numeric, informative columns take part.
        method: Correlation method passed to ``DataFrame.corr``.
        threshold: Absolute-correlation cutoff (inclusive).
        prefer_keep: Column names favoured by rule 1.
        drop_cols: Column names removed before any correlation is computed.

    Returns:
        (filtered_df, dropped_columns, corr_matrix_of_kept), where
        ``dropped_columns`` is sorted and ``corr_matrix_of_kept`` is the signed
        correlation matrix of the surviving columns.
    """
    data = _prepare_numeric(df, drop_cols=drop_cols)
    if data.shape[1] < 2:
        return data, [], data.corr(method=method)

    cols = list(data.columns)
    # Work on a private float array. Only the strict upper triangle is read, so
    # the diagonal never has to be overwritten; writing into ``corr.values``
    # fails when pandas hands out read-only arrays (Copy-on-Write).
    abs_corr = np.array(data.corr(method=method).abs(), dtype=float)
    miss = _missing_rate_per_column(data)
    keep_set: set = set(prefer_keep)

    # Strict upper triangle in row-major order: (0,1), (0,2), ..., (1,2), ...
    rows, cols_idx = np.triu_indices(len(cols), k=1)
    strengths = abs_corr[rows, cols_idx]
    with np.errstate(invalid="ignore"):
        # Undefined (NaN) correlations compare False and are never candidates.
        hits = np.flatnonzero(strengths >= threshold)
    pairs: List[Tuple[str, str, float]] = [
        (cols[rows[k]], cols[cols_idx[k]], float(strengths[k])) for k in hits
    ]

    # Strongest pairs first; the sort is stable, so equal strengths keep the
    # row-major order in which they were collected.
    pairs.sort(key=lambda pair: pair[2], reverse=True)

    to_drop: set = set()
    for a, b, _ in pairs:
        if a in to_drop or b in to_drop:
            continue

        a_preferred = a in keep_set
        b_preferred = b in keep_set
        if a_preferred != b_preferred:
            # Exactly one side is preferred: drop the other one.
            to_drop.add(b if a_preferred else a)
            continue

        # Neither or both preferred: decide on missingness, then on name.
        ma = float(miss.get(a, 0.0))
        mb = float(miss.get(b, 0.0))
        if ma > mb:
            to_drop.add(a)
        elif mb > ma:
            to_drop.add(b)
        else:
            # Alphabetical tiebreaker: drop later
            to_drop.add(max(a, b))

    kept_cols = [c for c in cols if c not in to_drop]
    filtered = data[kept_cols].copy()
    corr_kept = filtered.corr(method=method)
    return filtered, sorted(to_drop), corr_kept

def _standardize(df: pd.DataFrame) -> pd.DataFrame:
    return (df - df.mean()) / df.std(ddof=0)

def _eigenvalues_from_corr(df: pd.DataFrame) -> np.ndarray:
    c = df.corr(method="pearson").fillna(0)
    vals, _ = np.linalg.eigh(c.values)
    return np.sort(vals)[::-1]

def varimax(Phi: np.ndarray, gamma: float = 1.0, q: int = 20, tol: float = 1e-6) -> np.ndarray:
    """Rotate the loadings matrix ``Phi`` (features x factors) with the varimax scheme.

    An orthogonal rotation is refined for at most ``q`` iterations, each one
    taking the SVD of a criterion matrix. Iteration stops early once the sum of
    singular values grows by less than a relative factor of ``tol`` over the
    previous iteration. ``gamma`` weights the correction term subtracted from
    the cubed loadings. Returns ``Phi`` multiplied by the final rotation.
    """
    n_features, n_factors = Phi.shape
    rotation = np.eye(n_factors)
    criterion = 0
    for _ in range(q):
        previous = criterion
        rotated = Phi @ rotation
        # Diagonal matrix holding the column sums of squares of the rotated loadings.
        column_scale = np.diag(np.diag(rotated.T @ rotated))
        target = rotated**3 - (gamma / n_features) * (rotated @ column_scale)
        left, singular_values, right_t = np.linalg.svd(Phi.T @ target)
        rotation = left @ right_t
        criterion = singular_values.sum()
        # The first pass has no previous value to compare against.
        if previous != 0 and criterion / previous < 1 + tol:
            break
    return Phi @ rotation

def run_efa(df: pd.DataFrame,
            label: str,
            out_dir: Path,
            rotation: str = "varimax",
            kaiser: bool = True,
            n_factors: int = None) -> Dict[str, object]:
    """Run EFA on standardized numeric data with listwise deletion.

    Pipeline: reduce ``df`` with ``_prepare_numeric``, drop every row that has
    a missing value, drop columns that became constant through that deletion,
    standardize, choose the number of factors, fit scikit-learn's
    ``FactorAnalysis`` and optionally rotate the loadings.

    Args:
        df: Input data.
        label: Tag used in log messages and in the artifact file names.
        out_dir: Directory receiving the artifacts (prefix ``efa_{label}_*``).
        rotation: ``"varimax"`` rotates the loadings; any other value leaves
            them unrotated.
        kaiser: When ``n_factors`` is None, use the Kaiser criterion (number of
            correlation-matrix eigenvalues > 1). If that yields zero, or when
            ``kaiser`` is False, fall back to ``max(1, min(6, n_vars // 3))``.
        n_factors: Explicit number of factors; None means "choose automatically".

    Returns:
        A dict with keys ``n_factors``, ``loadings``, ``communalities`` and
        ``scree`` (path of the scree plot), or an empty dict when fewer than
        2 variables or fewer than 10 complete observations are available.
    """
    X = _prepare_numeric(df)
    # Listwise deletion
    X = X.dropna(axis=0, how="any")

    # Removing rows can leave a column with a single distinct value. Its
    # standard deviation is then zero, so it carries no information for the
    # factor model and would break standardization; drop it here.
    if X.shape[0] >= 10:
        varying = (X.nunique(dropna=True) > 1).to_numpy()
        if not varying.all():
            constant_cols = [str(c) for c in X.columns[~varying]]
            print(
                f"[{label}] Dropping columns that are constant after listwise deletion: "
                f"{', '.join(constant_cols)}"
            )
            X = X.loc[:, varying]

    if X.shape[1] < 2 or X.shape[0] < 10:
        print(f"[{label}] Not enough data for EFA. Observations={X.shape[0]}, Vars={X.shape[1]}")
        return {}

    # Standardize
    Z = _standardize(X)

    # Eigenvalues are needed both for the Kaiser rule and for the scree plot,
    # so compute them once.
    eigvals = _eigenvalues_from_corr(Z)

    # Determine number of factors
    if n_factors is None:
        if kaiser:
            n_factors = int((eigvals > 1.0).sum())
        if not kaiser or n_factors < 1:
            n_factors = max(1, min(6, X.shape[1] // 3))

    # Scree plot
    plt.figure(figsize=(6, 4))
    plt.plot(range(1, len(eigvals) + 1), eigvals, marker='o')
    plt.xlabel('Component')
    plt.ylabel('Eigenvalue')
    plt.title(f'{label.upper()} Scree Plot')
    plt.tight_layout()
    scree_path = out_dir / f'efa_{label}_scree.png'
    plt.savefig(scree_path, dpi=200)
    plt.close()
    print(f"[{label}] Saved scree plot to {scree_path}")

    # Fit FactorAnalysis (ML) and rotate. The explicit float conversion keeps
    # the input a plain numeric array even when the columns use pandas'
    # nullable dtypes.
    fa = FactorAnalysis(n_components=n_factors, rotation=None)
    fa.fit(Z.to_numpy(dtype=float))
    loadings = fa.components_.T  # features x factors
    if rotation == "varimax":
        loadings = varimax(loadings)

    # Summaries
    features = list(Z.columns)
    loadings_df = pd.DataFrame(
        loadings,
        index=features,
        columns=[f"F{i+1}" for i in range(loadings.shape[1])],
    )
    # Uniqueness is the fitted per-feature noise variance; the fallback derives
    # it from the loadings should the attribute be unavailable.
    uniqueness = getattr(fa, 'noise_variance_', np.maximum(0.0, 1.0 - (loadings**2).sum(axis=1)))
    communalities = 1.0 - uniqueness
    comm_df = pd.DataFrame({"communality": communalities, "uniqueness": uniqueness}, index=features)

    # Save artifacts
    loadings_csv = out_dir / f"efa_{label}_loadings.csv"
    loadings_df.to_csv(loadings_csv)
    print(f"[{label}] Saved loadings to {loadings_csv}")

    comm_csv = out_dir / f"efa_{label}_communalities.csv"
    comm_df.to_csv(comm_csv)
    print(f"[{label}] Saved communalities/uniqueness to {comm_csv}")

    # Heatmap of loadings
    plt.figure(figsize=(max(6, 0.5 * loadings_df.shape[1] + 4), max(6, 0.25 * loadings_df.shape[0] + 2)))
    sns.heatmap(loadings_df, cmap="coolwarm", center=0, cbar_kws={"shrink": 0.6})
    plt.title(f"{label.upper()} Factor Loadings ({n_factors} factors)")
    plt.tight_layout()
    loadings_png = out_dir / f"efa_{label}_loadings.png"
    plt.savefig(loadings_png, dpi=200)
    plt.close()
    print(f"[{label}] Saved loadings heatmap to {loadings_png}")

    return {
        "n_factors": n_factors,
        "loadings": loadings_df,
        "communalities": comm_df,
        "scree": scree_path,
    }

# Apply correlation filter with confirmed parameters, then EFA
# CF_METHOD: correlation method handed to correlation_filter(); it is also part
#            of the output file names written by run_correlation_filter_and_efa().
# CF_THRESHOLD: absolute-correlation cutoff (inclusive) above which one variable
#               of a pair is dropped.
CF_METHOD = "spearman"
CF_THRESHOLD = 0.95

def run_correlation_filter_and_efa(df: pd.DataFrame, label: str):
    """Apply the correlation filter to ``df``, save its outputs, then run EFA.

    Uses the module-level ``CF_METHOD`` / ``CF_THRESHOLD`` settings and writes
    into the module-level ``out_dir``:
      - ``corr_{label}_filtered_{CF_METHOD}.csv``: correlation matrix of the
        variables that survived the filter;
      - ``corr_{label}_filtered_{CF_METHOD}.png``: heatmap of that matrix
        (skipped when the matrix is empty, because an empty matrix cannot be
        drawn);
      - the ``efa_{label}_*`` artifacts produced by ``run_efa``.

    Returns:
        The dict returned by ``run_efa`` (empty when there was not enough data).
    """
    # Correlation filter
    filtered, dropped, corr_kept = correlation_filter(df, method=CF_METHOD, threshold=CF_THRESHOLD)

    # Save correlation after filtering
    out_csv = out_dir / f"corr_{label}_filtered_{CF_METHOD}.csv"
    corr_kept.to_csv(out_csv)
    print(f"[{label}] Saved filtered correlation matrix to {out_csv}")

    # Plot heatmap
    if corr_kept.empty:
        # No usable numeric column survived (e.g. the subset has no rows).
        print(f"[{label}] Filtered correlation matrix is empty; skipping heatmap.")
    else:
        plt.figure(figsize=(max(8, min(20, 0.35 * corr_kept.shape[1])), max(6, min(20, 0.35 * corr_kept.shape[0]))))
        sns.heatmap(corr_kept, cmap="vlag", center=0, square=True, cbar_kws={"shrink": 0.6}, linewidths=0.3)
        plt.title(f"{label.upper()} — filtered ({CF_METHOD}) correlation")
        plt.tight_layout()
        img_path = out_dir / f"corr_{label}_filtered_{CF_METHOD}.png"
        plt.savefig(img_path, dpi=200)
        plt.close()
        print(f"[{label}] Saved filtered correlation heatmap to {img_path}")

    if dropped:
        print(f"[{label}] Dropped due to high correlation (|r|>={CF_THRESHOLD}): {', '.join(dropped)}")

    # EFA on filtered data; hand the result back so callers can inspect it.
    return run_efa(filtered, label=label, out_dir=out_dir, rotation="varimax", kaiser=True, n_factors=None)

# Run the filter + EFA pipeline once per subset; `label` becomes part of every
# artifact file name written into out_dir.
run_correlation_filter_and_efa(df_mmse_only, label="mmse_only")
run_correlation_filter_and_efa(df_moca_only, label="moca_only")
