# 03 — Missingness Analysis & Dataset Variants (HCS)

Input: PRE-CLEAN early-only dataset exported by Notebook 2
- Contains predictors + MSPH target
- Missing values preserved

Outputs:
- Missingness reports (global + by class)
- STRICT dataset (complete-case)
- IMPUTED dataset (median-imputed predictors; exploratory sensitivity analysis only)
- Data card JSON (metadata for downstream modeling)


In [1]:
from __future__ import annotations

from pathlib import Path
from datetime import datetime
import json

import numpy as np
import pandas as pd

# Raise both display limits to 200 so wide frames render without truncation.
for _option in ("display.max_columns", "display.width"):
    pd.set_option(_option, 200)

In [2]:
# ---- EDIT THESE to point to your latest PRE-CLEAN artifacts ----
PRE_CLEAN_PATH = Path("../../data/processed/hcs_early_only_preclean_20260129_111749.csv")
META_PATH      = Path("../../data/metadata/hcs_dataset_metadata_20260129_111749.json")

# Output folders (dataset variants and reports); created on demand.
OUT_DIR = Path("../../data/processed_variants")
REP_DIR = Path("../../data/reports")
for _target_dir in (OUT_DIR, REP_DIR):
    _target_dir.mkdir(parents=True, exist_ok=True)

# Timestamp that tags every artifact written by this run.
RUN_TAG = f"{datetime.now():%Y%m%d_%H%M%S}"

# Stop here if either input artifact is absent.
assert PRE_CLEAN_PATH.exists(), f"Missing PRE-CLEAN dataset: {PRE_CLEAN_PATH}"
assert META_PATH.exists(), f"Missing metadata JSON: {META_PATH}"


In [3]:
# Read the PRE-CLEAN table and parse its companion metadata JSON.
df = pd.read_csv(PRE_CLEAN_PATH)
with META_PATH.open(encoding="utf-8") as _meta_file:
    meta = json.load(_meta_file)

# Optional lookup tables; each falls back to an empty mapping when the key is absent.
pretty, roles, units = (
    meta.get(_key, {}) for _key in ("pretty_columns", "column_roles", "units")
)

TARGET = meta.get("target", {}).get("name", "MSPH")
PREDICTORS = meta.get("predictors", [])
ID_COLS = ["row_id", "_sheet", "LocalID"]  # expected from Notebook 2

print("Loaded df:", df.shape)
print("TARGET:", TARGET)
print("n predictors:", len(PREDICTORS))
df.head()


Loaded df: (160, 15)
TARGET: MSPH
n predictors: 11


Unnamed: 0,row_id,_sheet,LocalID,Age,Weight,Height,BMI_final,Glycemia,SBP_1T,DBP_1T,TC_1T,TG_1T,HDL_1T,LDL_1T,MSPH
0,cohorte_and_000000,cohorte_and,1,33,68.0,169.0,23.8,76.2,101.0,60.0,239,85.1,69.0,153.0,1
1,cohorte_and_000001,cohorte_and,2,37,58.8,162.0,22.4,71.9,101.0,58.0,222,97.0,65.6,137.0,1
2,cohorte_and_000002,cohorte_and,3,30,63.0,160.0,24.6,79.8,113.0,58.0,197,100.0,56.3,120.7,1
3,cohorte_and_000003,cohorte_and,4,26,49.0,154.0,20.7,81.3,98.0,53.0,228,67.0,80.4,134.2,1
4,cohorte_and_000004,cohorte_and,5,25,62.0,163.0,23.3,82.5,106.0,61.0,215,130.0,52.6,136.4,1


In [4]:
# Fail fast if the loaded table lacks any column this notebook relies on.
required = ID_COLS + PREDICTORS + [TARGET]
missing_cols = [c for c in required if c not in df.columns]
if missing_cols:
    raise ValueError(f"Missing required columns: {missing_cols}")

# Coerce predictors and target to numeric. Unparseable entries become NaN and
# would then be counted as "missing" by every report below, so record how many
# values each column lost to coercion instead of letting them blend silently
# into the genuine missingness.
coerced_counts = {}
for c in PREDICTORS + [TARGET]:
    n_null_before = int(df[c].isna().sum())
    df[c] = pd.to_numeric(df[c], errors="coerce")
    n_coerced = int(df[c].isna().sum()) - n_null_before
    if n_coerced:
        coerced_counts[c] = n_coerced
if coerced_counts:
    print("WARNING: non-numeric values coerced to NaN:", coerced_counts)

# Ensure target is binary-ish
target_counts = df[TARGET].value_counts(dropna=False)
print(target_counts)

# Quick check for unexpected labels (non-null values other than 0/1)
bad_labels = df.loc[~df[TARGET].isin([0, 1]) & df[TARGET].notna(), TARGET].unique()
print("Unexpected target labels:", bad_labels)


MSPH
0    116
1     44
Name: count, dtype: int64
Unexpected target labels: []


In [5]:
def missingness_summary(d: pd.DataFrame) -> pd.DataFrame:
    """Build a per-column missingness report for ``d``.

    Each output row describes one column of ``d``: its dtype, the count and
    percentage (rounded to 2 decimals) of null entries, the number of distinct
    non-null values, and the display name, unit and role looked up in the
    notebook-level ``pretty``, ``units`` and ``roles`` mappings (falling back to
    the column name, or to an empty string, when a column is not listed).

    Rows are ordered by descending ``pct_null``, then descending ``n_null``, and
    the index is reset to 0..n-1.
    """
    null_mask = d.isna()
    names = d.columns
    report = pd.DataFrame({
        "column": names,
        "dtype": list(map(str, d.dtypes)),
        "n_null": null_mask.sum().to_numpy(),
        "pct_null": (null_mask.mean().to_numpy() * 100).round(2),
        "n_unique": [d[name].nunique(dropna=True) for name in names],
        "pretty": [pretty.get(name, name) for name in names],
        "unit": [units.get(name, "") for name in names],
        "role": [roles.get(name, "") for name in names],
    })
    ordered = report.sort_values(by=["pct_null", "n_null"], ascending=False)
    return ordered.reset_index(drop=True)

# Two views: every required column, and only the modelling columns.
modelling_cols = PREDICTORS + [TARGET]
miss_all = missingness_summary(df[ID_COLS + modelling_cols])
miss_pred = missingness_summary(df[modelling_cols])

miss_pred


Unnamed: 0,column,dtype,n_null,pct_null,n_unique,pretty,unit,role
0,Glycemia,float64,8,5.0,96,Fasting glycemia (mg/dL),mg/dL,predictor
1,SBP_1T,float64,4,2.5,38,"Systolic blood pressure (1st trimester, mmHg)",mmHg,predictor
2,DBP_1T,float64,4,2.5,38,"Diastolic blood pressure (1st trimester, mmHg)",mmHg,predictor
3,Height,float64,2,1.25,30,Height (cm),cm,predictor
4,BMI_final,float64,2,1.25,93,Body mass index (kg/m²),kg/m²,predictor
5,Weight,float64,1,0.62,52,Weight (kg),kg,predictor
6,Age,int64,0,0.0,26,Age (years),years,predictor
7,TC_1T,int64,0,0.0,83,"Total cholesterol (1st trimester, mg/dL)",mg/dL,predictor
8,TG_1T,float64,0,0.0,109,"Triglycerides (1st trimester, mg/dL)",mg/dL,predictor
9,HDL_1T,float64,0,0.0,106,"HDL cholesterol (1st trimester, mg/dL)",mg/dL,predictor


In [6]:
# Destination files for the two summaries, tagged with this run's timestamp.
miss_all_out = REP_DIR.joinpath(f"{RUN_TAG}_missingness_all_columns.csv")
miss_pred_out = REP_DIR.joinpath(f"{RUN_TAG}_missingness_predictors_target.csv")

# Write both files first, then report both paths.
for _report, _dest in ((miss_all, miss_all_out), (miss_pred, miss_pred_out)):
    _report.to_csv(_dest, index=False)

for _dest in (miss_all_out, miss_pred_out):
    print("Saved:", _dest)


Saved: ../../data/reports/20260129_114259_missingness_all_columns.csv
Saved: ../../data/reports/20260129_114259_missingness_predictors_target.csv


In [7]:
def missingness_by_group(d: pd.DataFrame, group_col: str, cols: list[str]) -> pd.DataFrame:
    """Percentage of missing values per column, split by the levels of ``group_col``.

    Parameters
    ----------
    d : pd.DataFrame
        Frame containing ``group_col`` and every name in ``cols``.
    group_col : str
        Column whose distinct non-null values define the groups.
    cols : list[str]
        Columns whose missingness is measured.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``cols`` (sorted by name) with a ``column`` field
        and one ``pct_null_group_<label>`` field per group. Integer-valued
        labels (``0``, ``1.0``, ``"1"``) are rendered as integers; any other
        label is rendered with ``str``, so text labels no longer raise and
        fractional labels are no longer truncated into colliding names.
        When ``d`` yields no groups, only the ``column`` field is returned.
    """

    def _label(group) -> str:
        # Keep the historical "pct_null_group_0" / "pct_null_group_1" naming for
        # integer-valued labels; fall back to the plain string form otherwise.
        try:
            as_float = float(group)
        except (TypeError, ValueError):
            return str(group)
        return str(int(as_float)) if as_float.is_integer() else str(group)

    # groupby iterates groups in sorted order and skips null keys.
    pct_by_group = {g: dg[cols].isna().mean() * 100 for g, dg in d.groupby(group_col)}
    if not pct_by_group:
        # Nothing to split on (e.g. empty frame or all-null group column).
        return pd.DataFrame({"column": sorted(cols)})

    # Rows = measured columns (sorted by name), columns = group labels.
    table = pd.DataFrame(pct_by_group).sort_index()
    group_labels = list(table.columns)

    result = table.reset_index()
    # normalize column names
    result.columns = ["column"] + [f"pct_null_group_{_label(g)}" for g in group_labels]
    return result

# Only use rows where target is known (0/1) for group analysis
known_mask = df[TARGET].isin([0, 1])
df_known = df.loc[known_mask].copy()

miss_by_class = missingness_by_group(df_known, TARGET, PREDICTORS)

# The between-class gap is only defined when both classes produced a column.
class_cols = ("pct_null_group_0", "pct_null_group_1")
if all(col in miss_by_class.columns for col in class_cols):
    gap = miss_by_class["pct_null_group_1"] - miss_by_class["pct_null_group_0"]
    miss_by_class["abs_diff_pct_points"] = gap.abs().round(2)
else:
    miss_by_class["abs_diff_pct_points"] = np.nan

# Attach display names and units from the metadata lookups.
miss_by_class = miss_by_class.assign(
    pretty=miss_by_class["column"].map(lambda name: pretty.get(name, name)),
    unit=miss_by_class["column"].map(lambda name: units.get(name, "")),
)

miss_by_class.sort_values("abs_diff_pct_points", ascending=False).head(30)


Unnamed: 0,column,pct_null_group_0,pct_null_group_1,abs_diff_pct_points,pretty,unit
3,Glycemia,6.034483,2.272727,3.76,Fasting glycemia (mg/dL),mg/dL
5,Height,1.724138,0.0,1.72,Height (cm),cm
1,BMI_final,1.724138,0.0,1.72,Body mass index (kg/m²),kg/m²
10,Weight,0.862069,0.0,0.86,Weight (kg),kg
2,DBP_1T,2.586207,2.272727,0.31,"Diastolic blood pressure (1st trimester, mmHg)",mmHg
7,SBP_1T,2.586207,2.272727,0.31,"Systolic blood pressure (1st trimester, mmHg)",mmHg
0,Age,0.0,0.0,0.0,Age (years),years
6,LDL_1T,0.0,0.0,0.0,"LDL cholesterol (1st trimester, mg/dL)",mg/dL
4,HDL_1T,0.0,0.0,0.0,"HDL cholesterol (1st trimester, mg/dL)",mg/dL
8,TC_1T,0.0,0.0,0.0,"Total cholesterol (1st trimester, mg/dL)",mg/dL


In [8]:
# Persist the class-wise missingness table under this run's tag.
mbc_out = REP_DIR.joinpath(f"{RUN_TAG}_missingness_by_class.csv")
miss_by_class.to_csv(path_or_buf=mbc_out, index=False)
print("Saved:", mbc_out)

Saved: ../../data/reports/20260129_114259_missingness_by_class.csv


In [9]:
# Does missingness of each predictor differ between target classes?
# The class masks do not depend on the predictor, so build them once.
is_pos = df_known[TARGET] == 1
is_neg = df_known[TARGET] == 0
has_pos = bool(is_pos.any())
has_neg = bool(is_neg.any())

rows = []
for c in PREDICTORS:
    miss_ind = df_known[c].isna()
    # Share of rows with a missing value in each class (NaN if the class is absent).
    p1 = float(miss_ind[is_pos].mean()) if has_pos else np.nan
    p0 = float(miss_ind[is_neg].mean()) if has_neg else np.nan

    # Explicit NaN checks: a membership test such as `p0 in [0, np.nan]` only
    # detects NaN through object identity and silently fails for any other NaN.
    p1_known = not np.isnan(p1)
    p0_known = not np.isnan(p0)

    diff = (p1 - p0) if (p1_known and p0_known) else np.nan
    # The ratio is left undefined when the reference (class 0) rate is zero or unknown.
    rr = (p1 / p0) if (p1_known and p0_known and p0 != 0) else np.nan

    rows.append({
        "feature": c,
        "pretty": pretty.get(c, c),
        "missing_%_MSPH1": round(p1 * 100, 2) if p1_known else np.nan,
        "missing_%_MSPH0": round(p0 * 100, 2) if p0_known else np.nan,
        "diff_pct_points": round(diff * 100, 2) if not np.isnan(diff) else np.nan,
        "risk_ratio": round(rr, 3) if not np.isnan(rr) else np.nan,
        "unit": units.get(c, ""),
    })

# Sorted by the signed difference (class 1 minus class 0), largest first.
miss_dep = pd.DataFrame(rows).sort_values("diff_pct_points", ascending=False)
miss_dep.head(30)


Unnamed: 0,feature,pretty,missing_%_MSPH1,missing_%_MSPH0,diff_pct_points,risk_ratio,unit
0,Age,Age (years),0.0,0.0,0.0,,years
9,HDL_1T,"HDL cholesterol (1st trimester, mg/dL)",0.0,0.0,0.0,,mg/dL
10,LDL_1T,"LDL cholesterol (1st trimester, mg/dL)",0.0,0.0,0.0,,mg/dL
8,TG_1T,"Triglycerides (1st trimester, mg/dL)",0.0,0.0,0.0,,mg/dL
7,TC_1T,"Total cholesterol (1st trimester, mg/dL)",0.0,0.0,0.0,,mg/dL
5,SBP_1T,"Systolic blood pressure (1st trimester, mmHg)",2.27,2.59,-0.31,0.879,mmHg
6,DBP_1T,"Diastolic blood pressure (1st trimester, mmHg)",2.27,2.59,-0.31,0.879,mmHg
1,Weight,Weight (kg),0.0,0.86,-0.86,0.0,kg
2,Height,Height (cm),0.0,1.72,-1.72,0.0,cm
3,BMI_final,Body mass index (kg/m²),0.0,1.72,-1.72,0.0,kg/m²


In [10]:
# Persist the class-dependency table under this run's tag.
md_out = REP_DIR.joinpath(f"{RUN_TAG}_missingness_dependency_simple.csv")
miss_dep.to_csv(path_or_buf=md_out, index=False)
print("Saved:", md_out)

Saved: ../../data/reports/20260129_114259_missingness_dependency_simple.csv


In [11]:
# STRICT variant: rows where every predictor and the target are observed,
# and the target carries a known label (0/1).
complete_rows = df[PREDICTORS + [TARGET]].notna().all(axis=1)
known_label = df[TARGET].isin([0, 1])
df_strict = df.loc[complete_rows & known_label].copy()

print("STRICT shape:", df_strict.shape)
print("STRICT class balance:")
print(df_strict[TARGET].value_counts(normalize=True).round(3))


STRICT shape: (148, 15)
STRICT class balance:
MSPH
0    0.716
1    0.284
Name: proportion, dtype: float64


In [12]:
# Persist the complete-case variant under this run's tag.
strict_out = OUT_DIR.joinpath(f"hcs_strict_complete_case_{RUN_TAG}.csv")
df_strict.to_csv(path_or_buf=strict_out, index=False)
print("Saved:", strict_out)

Saved: ../../data/processed_variants/hcs_strict_complete_case_20260129_114259.csv


In [13]:
# IMPUTED variant: keep rows whose target is known (0/1) ...
df_imp = df.loc[df[TARGET].isin([0, 1])].copy()

# ... then fill each predictor's gaps with that predictor's own median
# (computed over the retained rows, ignoring NaNs).
df_imp = df_imp.assign(
    **{name: df_imp[name].fillna(df_imp[name].median()) for name in PREDICTORS}
)

print("IMPUTED shape:", df_imp.shape)
print("Remaining NaNs in predictors:", int(df_imp[PREDICTORS].isna().sum().sum()))


IMPUTED shape: (160, 15)
Remaining NaNs in predictors: 0


In [14]:
# Persist the median-imputed variant under this run's tag.
imp_out = OUT_DIR.joinpath(f"hcs_imputed_exploratory_{RUN_TAG}.csv")
df_imp.to_csv(path_or_buf=imp_out, index=False)
print("Saved:", imp_out)


Saved: ../../data/processed_variants/hcs_imputed_exploratory_20260129_114259.csv


In [15]:
# Assemble the data card section by section, then serialise it next to the reports.
card_inputs = {
    "pre_clean_dataset": str(PRE_CLEAN_PATH),
    "metadata_json": str(META_PATH),
}
card_reports = {
    "all_columns": str(miss_all_out),
    "predictors_target": str(miss_pred_out),
    "by_class": str(mbc_out),
    "dependency_simple": str(md_out),
}
card_variants = {
    "strict_complete_case": str(strict_out),
    "imputed_exploratory": str(imp_out),
}
# Row totals and positive/negative label counts for each dataset variant.
card_counts = {
    "preclean_n": int(df.shape[0]),
    "preclean_pos": int(df[TARGET].eq(1).sum()),
    "preclean_neg": int(df[TARGET].eq(0).sum()),
    "strict_n": int(df_strict.shape[0]),
    "strict_pos": int(df_strict[TARGET].eq(1).sum()),
    "strict_neg": int(df_strict[TARGET].eq(0).sum()),
    "imputed_n": int(df_imp.shape[0]),
    "imputed_pos": int(df_imp[TARGET].eq(1).sum()),
    "imputed_neg": int(df_imp[TARGET].eq(0).sum()),
}
card_notes = [
    "STRICT is complete-case baseline.",
    "IMPUTED is exploratory; proper imputation should be done within CV folds to avoid leakage.",
    "This notebook quantifies missingness and whether it differs by MSPH status.",
]

data_card = {
    "run_tag": RUN_TAG,
    "input": card_inputs,
    "target": TARGET,
    "predictors": PREDICTORS,
    "pretty_columns": pretty,
    "units": units,
    "roles": roles,
    "missingness_reports": card_reports,
    "variants": card_variants,
    "counts": card_counts,
    "notes": card_notes,
}

card_out = REP_DIR.joinpath(f"{RUN_TAG}_data_card.json")
with card_out.open("w", encoding="utf-8") as _card_file:
    json.dump(data_card, _card_file, indent=2)
print("Saved data card:", card_out)


Saved data card: ../../data/reports/20260129_114259_data_card.json


## Next (Notebook 4 — Modeling & Validation)
- Repeated stratified CV (avoid tiny hold-out test)
- Scaling sensitivity (Standard/Robust/None)
- Balancing strategies (none, class_weight, undersampling, SMOTE)
- Calibration (Brier + reliability)
- Learning curves
- SHAP stability across folds

In [16]:
# Re-display the first 15 rows of the predictor/target missingness summary.
miss_pred.head(15)

Unnamed: 0,column,dtype,n_null,pct_null,n_unique,pretty,unit,role
0,Glycemia,float64,8,5.0,96,Fasting glycemia (mg/dL),mg/dL,predictor
1,SBP_1T,float64,4,2.5,38,"Systolic blood pressure (1st trimester, mmHg)",mmHg,predictor
2,DBP_1T,float64,4,2.5,38,"Diastolic blood pressure (1st trimester, mmHg)",mmHg,predictor
3,Height,float64,2,1.25,30,Height (cm),cm,predictor
4,BMI_final,float64,2,1.25,93,Body mass index (kg/m²),kg/m²,predictor
5,Weight,float64,1,0.62,52,Weight (kg),kg,predictor
6,Age,int64,0,0.0,26,Age (years),years,predictor
7,TC_1T,int64,0,0.0,83,"Total cholesterol (1st trimester, mg/dL)",mg/dL,predictor
8,TG_1T,float64,0,0.0,109,"Triglycerides (1st trimester, mg/dL)",mg/dL,predictor
9,HDL_1T,float64,0,0.0,106,"HDL cholesterol (1st trimester, mg/dL)",mg/dL,predictor


In [18]:
# Re-display class-wise missingness, ordered by ascending missingness in class 0 (first 10 rows).
miss_by_class.sort_values(by="pct_null_group_0").head(10)

Unnamed: 0,column,pct_null_group_0,pct_null_group_1,abs_diff_pct_points,pretty,unit
0,Age,0.0,0.0,0.0,Age (years),years
4,HDL_1T,0.0,0.0,0.0,"HDL cholesterol (1st trimester, mg/dL)",mg/dL
6,LDL_1T,0.0,0.0,0.0,"LDL cholesterol (1st trimester, mg/dL)",mg/dL
8,TC_1T,0.0,0.0,0.0,"Total cholesterol (1st trimester, mg/dL)",mg/dL
9,TG_1T,0.0,0.0,0.0,"Triglycerides (1st trimester, mg/dL)",mg/dL
10,Weight,0.862069,0.0,0.86,Weight (kg),kg
5,Height,1.724138,0.0,1.72,Height (cm),cm
1,BMI_final,1.724138,0.0,1.72,Body mass index (kg/m²),kg/m²
2,DBP_1T,2.586207,2.272727,0.31,"Diastolic blood pressure (1st trimester, mmHg)",mmHg
7,SBP_1T,2.586207,2.272727,0.31,"Systolic blood pressure (1st trimester, mmHg)",mmHg
