# UDS Analysis: Correlations and EFA (Post‑Cleaning)

This notebook focuses on analysis steps only. Please run the separate notebook
"uds_data_cleaning" first to build the variable catalog, clean/align the dataset,
and generate the parquet files used here.

Inputs expected from uds_data_cleaning:
- outputs/uds_extraction/mmse_only.parquet
- outputs/uds_extraction/moca_only.parquet

Outputs produced here include correlation matrices/plots, filtered correlation artifacts,
and EFA loadings/communalities and plots.


In [None]:
# Parameters
# Relative path of the directory that holds the parquet inputs read below
# (mmse_only.parquet / moca_only.parquet) and receives every artifact this
# notebook writes.
OUT_DIR = '../../outputs/uds_extraction'
# Default value of the `plot` argument of compute_and_save_correlations:
# when True a heatmap PNG is saved next to each correlation CSV.
PLOT_HEATMAP = True


In [None]:
from pathlib import Path
import pandas as pd
import sys
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Search the working directory and then each of its ancestors for a folder
# that contains 'src'; the first match is treated as the project root and is
# appended to sys.path so that modules under 'src' can be imported if needed.
_cwd = Path.cwd()
project_root = None
for _candidate in (_cwd, *_cwd.parents):
    if (_candidate / 'src').exists():
        project_root = _candidate
        break
if project_root is not None and str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Resolve the output directory, creating it (and any parents) when absent.
out_dir = Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)
out_dir.as_posix()


## Load pre-cleaned subsets


In [None]:
# Locations of the two pre-cleaned subsets inside out_dir.
mmse_only_path = out_dir / 'mmse_only.parquet'
moca_only_path = out_dir / 'moca_only.parquet'

# Fail early with an actionable message when an input is absent, instead of
# surfacing a generic reader error from pd.read_parquet.
_missing_inputs = [str(p) for p in (mmse_only_path, moca_only_path) if not p.is_file()]
if _missing_inputs:
    raise FileNotFoundError(
        "Missing pre-cleaned input(s): " + ", ".join(_missing_inputs)
        + ". Run the 'uds_data_cleaning' notebook first to generate them."
    )

df_mmse_only = pd.read_parquet(mmse_only_path)
df_moca_only = pd.read_parquet(moca_only_path)
print(f"Loaded: {mmse_only_path} ({df_mmse_only.shape}) | {moca_only_path} ({df_moca_only.shape})")


## Correlation analysis for MMSE-only and MOCA-only sets


In [None]:

def _prepare_numeric(df: pd.DataFrame, drop_cols=None) -> pd.DataFrame:
    """Return the numeric, informative columns of ``df``.

    Steps, in order:
      1. keep numeric-dtype columns only (on a copy);
      2. remove the columns named in ``drop_cols`` plus the indicator
         columns ``has_MMSE`` / ``has_MOCA`` when present;
      3. remove columns that are entirely NA or hold a single distinct value;
      4. remove near-zero-variance columns: population variance <= 1e-6, or
         a most frequent non-NA value covering >= 99% of non-NA entries.

    Parameters:
        df: input frame.
        drop_cols: optional iterable of column names to exclude.

    Returns:
        A DataFrame containing only the surviving columns (possibly none).
    """
    excluded = set(drop_cols or []) | {"has_MMSE", "has_MOCA"}

    numeric = df.select_dtypes(include=["number"]).copy()
    numeric = numeric.drop(
        columns=[name for name in numeric.columns if name in excluded],
        errors="ignore",
    )

    # All-NA and single-valued columns carry no information.
    informative = numeric.dropna(axis=1, how="all")
    informative = informative.loc[:, informative.nunique(dropna=True) > 1]
    if informative.shape[1] == 0:
        return informative

    # Near-zero-variance screen, intended to aid factor convergence.
    has_spread = informative.var(ddof=0) > 1e-6

    def _top_share(series: pd.Series) -> float:
        # Share of the most frequent non-NA value; 1.0 when nothing is left.
        shares = series.value_counts(normalize=True, dropna=True)
        return float(shares.iloc[0]) if len(shares) else 1.0

    not_dominated = pd.Series(
        [_top_share(informative[name]) < 0.99 for name in informative.columns],
        index=informative.columns,
    )
    return informative.loc[:, has_spread & not_dominated]

def compute_and_save_correlations(df: pd.DataFrame, label: str, out_dir: Path,
                                  methods=("pearson", "spearman"),
                                  plot=PLOT_HEATMAP) -> None:
    """Write one correlation matrix (and optionally a heatmap) per method.

    The frame is first reduced with ``_prepare_numeric``. For every entry of
    ``methods`` the correlation matrix is saved as
    ``corr_{label}_{method}.csv`` in ``out_dir``; when ``plot`` is truthy a
    heatmap is saved as ``corr_{label}_{method}.png`` as well.

    Parameters:
        df: data to correlate.
        label: tag used in file names, log lines and plot titles.
        out_dir: directory receiving the CSV / PNG files.
        methods: correlation methods passed to ``DataFrame.corr``.
        plot: whether heatmaps are drawn (defaults to PLOT_HEATMAP as it was
            when this function was defined).

    Returns:
        None. With fewer than two usable columns a notice is printed and
        nothing is written.
    """
    numeric = _prepare_numeric(df)
    n_vars = numeric.shape[1]
    if n_vars < 2:
        print(f"[{label}] Not enough numeric columns for correlation (found {n_vars}). Skipping.")
        return

    for method in methods:
        matrix = numeric.corr(method=method)
        csv_path = out_dir / f"corr_{label}_{method}.csv"
        matrix.to_csv(csv_path)
        print(f"[{label}] Saved {method} correlation matrix to {csv_path}")

        if not plot:
            continue

        # Figure size grows with the matrix but is clamped to [8, 20] x [6, 20].
        n_rows, n_cols = matrix.shape
        width = max(8, min(20, 0.35 * n_cols))
        height = max(6, min(20, 0.35 * n_rows))
        plt.figure(figsize=(width, height))
        sns.heatmap(matrix, cmap="vlag", center=0, square=True,
                    cbar_kws={"shrink": 0.6}, linewidths=0.3)
        plt.title(f"{label.upper()} — {method.title()} correlation")
        plt.tight_layout()
        png_path = out_dir / f"corr_{label}_{method}.png"
        plt.savefig(png_path, dpi=200)
        plt.close()
        print(f"[{label}] Saved {method} correlation heatmap to {png_path}")

# Export correlations once per instrument-specific subset
for subset_label, subset_df in (("mmse_only", df_mmse_only), ("moca_only", df_moca_only)):
    compute_and_save_correlations(subset_df, label=subset_label, out_dir=out_dir)


## Correlation filter and Exploratory Factor Analysis (EFA)


In [None]:
from typing import List, Tuple, Dict
from sklearn.decomposition import FactorAnalysis

def _missing_rate_per_column(df: pd.DataFrame) -> pd.Series:
    """Return, for each column of ``df``, the fraction of its entries that are NA."""
    na_flags = df.isna()
    return na_flags.mean()

def correlation_filter(df: pd.DataFrame,
                       method: str = "spearman",
                       threshold: float = 0.95,
                       prefer_keep: Tuple[str, ...] = (),
                       drop_cols: Tuple[str, ...] = ()) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
    """Filter out one variable from any pair with |corr| >= threshold.

    The input is first reduced with ``_prepare_numeric`` (honouring
    ``drop_cols``). Offending pairs are processed from the strongest
    correlation downward; a pair is skipped when one of its members has
    already been dropped.

    Tie-breaker rules for a pair (a, b):
      1) If exactly one of them is listed in ``prefer_keep``, drop the other.
      2) Otherwise (neither or both preferred) drop the one with the higher
         missingness rate.
      3) If the rates are equal, drop the one that compares greater
         (later alphabetically for string names).

    Parameters:
        df: input data.
        method: correlation method passed to ``DataFrame.corr``.
        threshold: absolute-correlation cut-off (inclusive).
        prefer_keep: column names to favour when a pair must lose a member.
        drop_cols: column names removed before any correlation is computed.

    Returns: (filtered_df, dropped_columns, corr_matrix_of_kept), where
        ``dropped_columns`` is sorted and the correlation matrix is signed.
    """
    data = _prepare_numeric(df, drop_cols=drop_cols)
    if data.shape[1] < 2:
        return data, [], data.corr(method=method)

    cols = list(data.columns)

    # Work on an owned, writable array. Zeroing the diagonal through
    # `corr.values` mutates the frame's backing array in place, which is
    # read-only when pandas Copy-on-Write is active.
    abs_corr = np.abs(data.corr(method=method).to_numpy(dtype=float, copy=True))
    np.fill_diagonal(abs_corr, 0.0)
    miss = _missing_rate_per_column(data)

    # Upper-triangle pairs at or above the threshold. NaN correlations compare
    # False and are therefore never selected. np.nonzero walks the matrix in
    # row-major order, i.e. the same order as a nested i < j loop, so the
    # stable sort below breaks equal-strength ties identically.
    with np.errstate(invalid="ignore"):
        above = np.triu(abs_corr >= threshold, k=1)
    row_idx, col_idx = np.nonzero(above)
    pairs: List[Tuple[str, str, float]] = [
        (cols[i], cols[j], float(abs_corr[i, j])) for i, j in zip(row_idx, col_idx)
    ]

    # Handle the strongest relationships first.
    pairs.sort(key=lambda pair: pair[2], reverse=True)

    keep_set: set = set(prefer_keep)
    to_drop: set = set()

    for a, b, _ in pairs:
        if a in to_drop or b in to_drop:
            continue  # already resolved by an earlier, stronger pair

        a_preferred = a in keep_set
        b_preferred = b in keep_set
        if a_preferred != b_preferred:
            # Exactly one side is protected: drop the other one.
            to_drop.add(b if a_preferred else a)
            continue

        ma = float(miss.get(a, 0.0))
        mb = float(miss.get(b, 0.0))
        if ma > mb:
            to_drop.add(a)
        elif mb > ma:
            to_drop.add(b)
        else:
            # Equal missingness: drop the name that sorts later.
            to_drop.add(max(a, b))

    kept_cols = [c for c in cols if c not in to_drop]
    filtered = data[kept_cols].copy()
    corr_kept = filtered.corr(method=method)
    return filtered, sorted(to_drop), corr_kept

def _standardize(df: pd.DataFrame) -> pd.DataFrame:
    """Z-score every column of ``df`` using the population std (ddof=0).

    Columns whose standard deviation is exactly zero are centred but not
    rescaled, so they come out as all zeros instead of NaN (0/0).

    Parameters:
        df: numeric frame to standardize.

    Returns:
        A frame of the same shape with column-wise standardized values.
    """
    scale = df.std(ddof=0)
    # Avoid dividing by zero for constant columns; NaN scales are left as-is.
    scale = scale.mask(scale == 0, 1.0)
    return (df - df.mean()) / scale

def _eigenvalues_from_corr(df: pd.DataFrame) -> np.ndarray:
    """Return the eigenvalues of the Pearson correlation matrix of ``df``.

    Undefined (NaN) correlations are replaced by 0 before the symmetric
    eigen-decomposition. The eigenvalues are returned largest first.
    """
    corr_matrix = df.corr(method="pearson").fillna(0).values
    eigenvalues = np.linalg.eigh(corr_matrix)[0]
    return np.flip(np.sort(eigenvalues))

def varimax(Phi: np.ndarray, gamma: float = 1.0, q: int = 20, tol: float = 1e-6) -> np.ndarray:
    """Rotate the loadings matrix ``Phi`` (features x factors) by varimax.

    An orthogonal rotation is refined iteratively from the SVD of the
    criterion gradient. Iteration stops after ``q`` rounds, or earlier once
    the sum of singular values grows by less than a factor of ``1 + tol``.

    Parameters:
        Phi: unrotated loadings, shape (features, factors).
        gamma: weight of the correction term (the default 1.0 is used here
            for varimax).
        q: maximum number of iterations.
        tol: relative-improvement threshold for early stopping.

    Returns:
        The rotated loadings ``Phi @ R``, same shape as ``Phi``.
    """
    n_rows, n_factors = Phi.shape
    rotation = np.eye(n_factors)
    criterion = 0
    for _ in range(q):
        previous = criterion
        rotated = Phi @ rotation
        # Diagonal matrix of the per-factor sums of squared loadings.
        column_ss = np.diag(np.diag(rotated.T @ rotated))
        gradient = Phi.T @ (rotated**3 - (gamma / n_rows) * (rotated @ column_ss))
        left, singular_values, right_t = np.linalg.svd(gradient)
        rotation = left @ right_t
        criterion = singular_values.sum()
        # The first pass has no previous value to compare against.
        if previous != 0 and criterion / previous < 1 + tol:
            break
    return Phi @ rotation

def promax(Phi: np.ndarray, kappa: float = 4.0) -> tuple:
    """Promax (oblique) rotation of loadings matrix Phi.

    Starting from the varimax solution L, a target U is built by raising the
    absolute loadings to the power ``kappa`` (signs kept), and L is regressed
    on U to obtain a transformation P. The columns of P are then rescaled so
    that the implied factors have unit variance; without that step the
    "factor correlation" matrix does not have ones on its diagonal.

    Parameters:
        Phi: unrotated loadings, shape (features, factors).
        kappa: power applied to the varimax loadings to form the target.

    Returns (Lambda, Phi_f), where Lambda are oblique-rotated (pattern)
    loadings and Phi_f is the factor correlation matrix (unit diagonal).
    """
    # Start from an orthogonal varimax solution
    L = varimax(Phi)
    # Target matrix U by raising to power kappa with sign
    U = np.sign(L) * (np.abs(L) ** kappa)
    # Regress U on L to find transformation P
    # P = (L'L)^{-1} L'U  (stable via pinv)
    P = np.linalg.pinv(L) @ U

    # With pattern loadings Lambda = L P, reproducing L L' requires
    # P Phi_f P' = I, i.e. Phi_f = (P'P)^{-1} (not (P P')^{-1}).
    gram = P.T @ P
    try:
        gram_inv = np.linalg.inv(gram)
    except np.linalg.LinAlgError:
        gram_inv = np.linalg.pinv(gram)

    # Normalize: scale column j of P by sqrt(diag((P'P)^{-1})_j) so that the
    # factor covariance becomes a correlation matrix. Non-positive diagonal
    # entries (degenerate pseudo-inverse) are left unscaled.
    diag = np.diag(gram_inv)
    scale = np.sqrt(np.where(diag > 0, diag, 1.0))
    P = P * scale  # same as P @ np.diag(scale)

    Lambda = L @ P
    Phi_f = gram_inv / np.outer(scale, scale)
    return Lambda, Phi_f

def run_efa(df: pd.DataFrame,
            label: str,
            out_dir: Path,
            rotation: str = "varimax",
            kaiser: bool = True,
            n_factors: int = None) -> Dict[str, object]:
    """Run EFA on standardized numeric data with listwise deletion.

    Pipeline: ``_prepare_numeric`` -> drop rows with any NA -> drop columns
    left constant by that deletion -> z-score -> choose the factor count ->
    save a scree plot -> fit ``FactorAnalysis`` -> optional rotation -> save
    loadings, communalities/uniqueness, the factor correlation matrix (oblique
    rotation only) and a loadings heatmap.

    Parameters:
        df: input data.
        label: tag used in file names, log lines and plot titles.
        out_dir: directory receiving artifacts, all prefixed 'efa_{label}_'.
        rotation: "varimax", "promax", or empty / "none" / None for no
            rotation. Any other value is reported and treated as no rotation.
        kaiser: when ``n_factors`` is None, count the correlation-matrix
            eigenvalues greater than 1 to choose the number of factors.
        n_factors: explicit number of factors. When None and the Kaiser rule
            is disabled or yields zero, ``max(1, min(6, n_vars // 3))`` is used.

    Returns:
        A dict with keys 'n_factors', 'loadings', 'communalities', 'scree'
        and 'factor_correlation', or an empty dict when fewer than 2
        variables or fewer than 10 complete observations remain.
    """
    X = _prepare_numeric(df)
    # Listwise deletion
    X = X.dropna(axis=0, how="any")

    # Removing rows can leave a column with a single distinct value. Such a
    # column has zero variance, turns into NaN when standardized and would
    # make the factor fit fail, so it is removed here.
    if X.shape[0] >= 10:
        varying = (X.nunique(dropna=True) > 1).to_numpy()
        if not varying.all():
            constant_cols = [str(c) for c, ok in zip(X.columns, varying) if not ok]
            print(f"[{label}] Dropped columns constant after listwise deletion: {', '.join(constant_cols)}")
            X = X.loc[:, varying]

    if X.shape[1] < 2 or X.shape[0] < 10:
        print(f"[{label}] Not enough data for EFA. Observations={X.shape[0]}, Vars={X.shape[1]}")
        return {}

    # Standardize
    Z = _standardize(X)

    # Eigenvalues drive both the Kaiser rule and the scree plot; compute once.
    eigvals = _eigenvalues_from_corr(Z)

    # Determine number of factors
    if n_factors is None:
        if kaiser:
            n_factors = int((eigvals > 1.0).sum())
        if not kaiser or n_factors < 1:
            n_factors = max(1, min(6, X.shape[1] // 3))

    # Scree plot
    plt.figure(figsize=(6,4))
    plt.plot(range(1, len(eigvals)+1), eigvals, marker='o')
    plt.xlabel('Component')
    plt.ylabel('Eigenvalue')
    plt.title(f'{label.upper()} Scree Plot')
    plt.tight_layout()
    scree_path = out_dir / f'efa_{label}_scree.png'
    plt.savefig(scree_path, dpi=200)
    plt.close()
    print(f"[{label}] Saved scree plot to {scree_path}")

    # Fit FactorAnalysis (ML) and rotate
    fa = FactorAnalysis(n_components=n_factors, rotation=None)
    fa.fit(Z.values)
    loadings = fa.components_.T  # features x factors
    factor_corr = None
    rot = (rotation or "").lower()
    if rot == "varimax":
        loadings = varimax(loadings)
    elif rot == "promax":
        loadings, factor_corr = promax(loadings)
    elif rot not in ("", "none"):
        # Make a misspelled rotation name visible instead of silently ignoring it.
        print(f"[{label}] Unknown rotation '{rotation}'; leaving loadings unrotated.")

    # Summaries
    features = list(Z.columns)
    loadings_df = pd.DataFrame(loadings, index=features, columns=[f"F{i+1}" for i in range(loadings.shape[1])])
    # Prefer the fitted noise variances; fall back to 1 - sum of squared loadings.
    uniqueness = getattr(fa, 'noise_variance_', np.maximum(0.0, 1.0 - (loadings**2).sum(axis=1)))
    communalities = 1.0 - uniqueness
    comm_df = pd.DataFrame({"communality": communalities, "uniqueness": uniqueness}, index=features)

    # Save artifacts
    loadings_csv = out_dir / f"efa_{label}_loadings.csv"
    loadings_df.to_csv(loadings_csv)
    print(f"[{label}] Saved loadings to {loadings_csv}")

    comm_csv = out_dir / f"efa_{label}_communalities.csv"
    comm_df.to_csv(comm_csv)
    print(f"[{label}] Saved communalities/uniqueness to {comm_csv}")

    # Save factor correlation matrix for oblique rotations
    if factor_corr is not None:
        phi_df = pd.DataFrame(factor_corr, index=loadings_df.columns, columns=loadings_df.columns)
        phi_csv = out_dir / f"efa_{label}_factor_correlation.csv"
        phi_df.to_csv(phi_csv)
        print(f"[{label}] Saved factor correlation matrix to {phi_csv}")

    # Heatmap of loadings
    plt.figure(figsize=(max(6, 0.5 * loadings_df.shape[1] + 4), max(6, 0.25 * loadings_df.shape[0] + 2)))
    sns.heatmap(loadings_df, cmap="coolwarm", center=0, cbar_kws={"shrink": 0.6})
    plt.title(f"{label.upper()} Factor Loadings ({n_factors} factors)")
    plt.tight_layout()
    loadings_png = out_dir / f"efa_{label}_loadings.png"
    plt.savefig(loadings_png, dpi=200)
    plt.close()
    print(f"[{label}] Saved loadings heatmap to {loadings_png}")

    return {
        "n_factors": n_factors,
        "loadings": loadings_df,
        "communalities": comm_df,
        "scree": scree_path,
        "factor_correlation": factor_corr,
    }


### Correlation filter (separate step)


In [None]:
# Settings for the redundancy filter: correlation method and the absolute
# correlation at or above which one variable of a pair is dropped.
CF_METHOD = "spearman"
CF_THRESHOLD = 0.95

filtered_mmse, dropped_mmse, corr_kept_mmse = correlation_filter(
    df_mmse_only, method=CF_METHOD, threshold=CF_THRESHOLD
)
filtered_moca, dropped_moca, corr_kept_moca = correlation_filter(
    df_moca_only, method=CF_METHOD, threshold=CF_THRESHOLD
)

# Save correlation of kept variables and plots
for label, corr_kept, dropped in [
    ("mmse_only", corr_kept_mmse, dropped_mmse),
    ("moca_only", corr_kept_moca, dropped_moca),
]:
    out_csv = out_dir / f"corr_{label}_filtered_{CF_METHOD}.csv"
    corr_kept.to_csv(out_csv)
    print(f"[{label}] Saved filtered correlation matrix to {out_csv}")
    if corr_kept.empty:
        # correlation_filter can return an empty matrix when no usable numeric
        # column remains; there is nothing to draw in that case.
        print(f"[{label}] No variables left to plot; skipping filtered correlation heatmap.")
    else:
        plt.figure(figsize=(max(8, min(20, 0.35 * corr_kept.shape[1])), max(6, min(20, 0.35 * corr_kept.shape[0]))))
        sns.heatmap(corr_kept, cmap="vlag", center=0, square=True, cbar_kws={"shrink": 0.6}, linewidths=0.3)
        plt.title(f"{label.upper()} — filtered ({CF_METHOD}) correlation")
        plt.tight_layout()
        img_path = out_dir / f"corr_{label}_filtered_{CF_METHOD}.png"
        plt.savefig(img_path, dpi=200)
        plt.close()
        print(f"[{label}] Saved filtered correlation heatmap to {img_path}")
    if dropped:
        print(f"[{label}] Dropped due to high correlation (|r|>={CF_THRESHOLD}): {', '.join(dropped)}")


### Exploratory Factor Analysis (separate step)


In [None]:
# Fit the EFA on each correlation-filtered frame. With n_factors=None and
# kaiser=True the number of factors is chosen by run_efa itself, and the
# loadings are rotated with the oblique promax rotation. Artifacts are written
# to out_dir under the 'efa_{label}_' prefix.
# NOTE: both calls are kept as bare expressions; the second one is the last
# expression of the cell, so its returned dict is presumably what the notebook
# shows as the cell output.
run_efa(filtered_mmse, label="mmse_only", out_dir=out_dir, rotation="promax", kaiser=True, n_factors=None)
run_efa(filtered_moca, label="moca_only", out_dir=out_dir, rotation="promax", kaiser=True, n_factors=None)
