# 02 — Cohort Definition & Dataset Build (HCS)

This notebook:
- Loads and concatenates `cohorte_and` + `cohorte_rs`
- Keeps relevant variables (1st trimester predictors + outcome definition variables)
- Renames columns to English
- Defines MSPH label from 3rd trimester total cholesterol (ct3)
- Validates and (optionally) corrects BMI using Weight/Height
- Exports an early-only modeling dataset + reproducibility metadata

Key rule:
- Third trimester variables are excluded from predictors (ct3/tg3/hdl3/ldl3).
- ct3 is used ONLY to define the outcome label.

In [38]:
from __future__ import annotations

from pathlib import Path
from datetime import datetime
import json
import hashlib

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

In [39]:
RAW_PATH = Path("../../data/raw_data/raw_data.xlsx")
SHEETS = ["cohorte_and", "cohorte_rs"]

OUT_DIR = Path("../../data/processed")
META_DIR = Path("../../data/metadata")
OUT_DIR.mkdir(parents=True, exist_ok=True)
META_DIR.mkdir(parents=True, exist_ok=True)

RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")

# Outcome threshold (keep consistent with manuscript)
CT3_THRESHOLD = 290  # MSPH = 1 if ct3 > 290
LABEL_RULE = "ct3 > 290"


In [40]:
def file_sha256(path: Path, chunk_size: int = 1024 * 1024) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()

assert RAW_PATH.exists(), f"Raw file not found: {RAW_PATH}"
raw_hash = file_sha256(RAW_PATH)
print("RAW_PATH:", RAW_PATH)
print("SHA256:", raw_hash)

RAW_PATH: ../../data/raw_data/raw_data.xlsx
SHA256: 34e75e75253dc896f736050ee627730beb9a9f2f33c0ffcf8bb8f10f25ecc38e


In [41]:
def read_sheet(sheet_name: str) -> pd.DataFrame:
    df = pd.read_excel(RAW_PATH, sheet_name=sheet_name, dtype_backend="numpy_nullable")
    df.columns = [str(c).strip() for c in df.columns]
    return df

df_and = read_sheet("cohorte_and")
df_rs  = read_sheet("cohorte_rs")

print("cohorte_and:", df_and.shape)
print("cohorte_rs :", df_rs.shape)

cohorte_and: (83, 16)
cohorte_rs : (77, 19)


In [42]:
# Raw columns (as in the Excel)
RAW_ID = "id"

RAW_EARLY = [
    "edad", "peso", "talla", "imc", "glicemia",
    "pas1t", "pad1t",
    "ct1", "tg1", "hdl1", "ldl1",
]

# Third trimester vars exist in raw, but must NOT be used as predictors
RAW_T3_FOR_LABEL = ["ct3"]  # only ct3 needed for label
RAW_T3_EXCLUDE = ["tg3", "hdl3", "ldl3"]  # explicitly exclude from predictors if present

RAW_KEEP = [RAW_ID] + RAW_EARLY + RAW_T3_FOR_LABEL + RAW_T3_EXCLUDE  # keep tg3/hdl3/ldl3 only for auditing (optional)


In [43]:
def require_cols(df: pd.DataFrame, cols: list[str], name: str) -> None:
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"[{name}] Missing columns: {missing}")

require_cols(df_and, [RAW_ID] + RAW_EARLY + RAW_T3_FOR_LABEL, "cohorte_and")
require_cols(df_rs,  [RAW_ID] + RAW_EARLY + RAW_T3_FOR_LABEL, "cohorte_rs")

# Optional: tg3/hdl3/ldl3 might not exist; that's fine.
print("Required columns present")


Required columns present


In [44]:
# Keep only columns that exist (including optional t3 biomarkers)
def intersect_keep(df: pd.DataFrame, cols: list[str]) -> list[str]:
    return [c for c in cols if c in df.columns]

keep_and = intersect_keep(df_and, RAW_KEEP)
keep_rs  = intersect_keep(df_rs,  RAW_KEEP)

df_and2 = df_and[keep_and].copy()
df_rs2  = df_rs[keep_rs].copy()

df_and2["_sheet"] = "cohorte_and"
df_rs2["_sheet"]  = "cohorte_rs"

df_raw = pd.concat([df_and2, df_rs2], ignore_index=True)

print("Concatenated:", df_raw.shape)
df_raw["_sheet"].value_counts()

Concatenated: (160, 17)


_sheet
cohorte_and    83
cohorte_rs     77
Name: count, dtype: int64

In [45]:
df_raw["row_id"] = (
    df_raw["_sheet"].astype(str)
    + "_"
    + df_raw.groupby("_sheet").cumcount().astype(str).str.zfill(6)
)

df_raw[["row_id", "_sheet", RAW_ID]].head()

Unnamed: 0,row_id,_sheet,id
0,cohorte_and_000000,cohorte_and,1
1,cohorte_and_000001,cohorte_and,2
2,cohorte_and_000002,cohorte_and,3
3,cohorte_and_000003,cohorte_and,4
4,cohorte_and_000004,cohorte_and,5


In [46]:
# Programmatic column names (clean, snake_case)
RENAME_MAP = {
    "id": "LocalID",
    "edad": "Age",
    "peso": "Weight",
    "talla": "Height",
    "imc": "BMI",
    "glicemia": "Glycemia",
    "pas1t": "SBP_1T",   # systolic blood pressure 1st trimester
    "pad1t": "DBP_1T",   # diastolic blood pressure 1st trimester
    "ct1": "TC_1T",      # total cholesterol 1st trimester
    "tg1": "TG_1T",
    "hdl1": "HDL_1T",
    "ldl1": "LDL_1T",
    "ct3": "TC_3T",      # used only for MSPH definition
    "tg3": "TG_3T",
    "hdl3": "HDL_3T",
    "ldl3": "LDL_3T",
}

# Pretty labels (for figures/tables)
PRETTY_LABELS = {
    "Age": "Age",
    "Weight": "Weight",
    "Height": "Height",
    "BMI": "BMI",
    "Glycemia": "Glycemia",
    "SBP_1T": "Systolic blood pressure (1st trimester)",
    "DBP_1T": "Diastolic blood pressure (1st trimester)",
    "TC_1T": "Total cholesterol (1st trimester)",
    "TG_1T": "Triglycerides (1st trimester)",
    "HDL_1T": "HDL (1st trimester)",
    "LDL_1T": "LDL (1st trimester)",
    "MSPH": "Maternal supraphysiological hypercholesterolemia (MSPH)",
}

COLUMN_UNITS = {
    "Age": "years",
    "Weight": "kg",
    "Height": "cm",
    "BMI_final": "kg/m²",
    "Glycemia": "mg/dL",
    "SBP_1T": "mmHg",
    "DBP_1T": "mmHg",
    "TC_1T": "mg/dL",
    "TG_1T": "mg/dL",
    "HDL_1T": "mg/dL",
    "LDL_1T": "mg/dL",
}

COLUMN_ROLES = {
    "row_id": "identifier",
    "LocalID": "identifier_local",
    "Age": "predictor",
    "Weight": "predictor",
    "Height": "predictor",
    "BMI_final": "predictor",
    "Glycemia": "predictor",
    "SBP_1T": "predictor",
    "DBP_1T": "predictor",
    "TC_1T": "predictor",
    "TG_1T": "predictor",
    "HDL_1T": "predictor",
    "LDL_1T": "predictor",
    "MSPH": "target",
}

df = df_raw.rename(columns={k: v for k, v in RENAME_MAP.items() if k in df_raw.columns}).copy()
df.columns


Index(['LocalID', 'Age', 'Weight', 'Height', 'BMI', 'Glycemia', 'SBP_1T', 'DBP_1T', 'TC_1T', 'TG_1T', 'HDL_1T', 'LDL_1T', 'TC_3T', 'TG_3T', 'HDL_3T', 'LDL_3T', '_sheet', 'row_id'], dtype='str')

In [47]:
# Ensure numeric coercion
df["TC_3T"] = pd.to_numeric(df["TC_3T"], errors="coerce")

df["MSPH"] = (df["TC_3T"] > CT3_THRESHOLD).astype("Int64")

df["MSPH"].value_counts(dropna=False)


MSPH
0    116
1     44
Name: count, dtype: Int64

In [48]:
# Coerce types
for c in ["Weight", "Height", "BMI"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Heuristic:
# - if typical Height values are ~150-190 -> cm
# - if typical Height values are ~1.50-1.90 -> meters
h = df["Height"].dropna()
if len(h) == 0:
    raise ValueError("Height has no non-null values.")

p50 = float(h.quantile(0.50))
p95 = float(h.quantile(0.95))
print("Height p50:", p50, "| p95:", p95)

height_unit_guess = "cm" if p50 > 10 else "m"
print("Guessed height unit:", height_unit_guess)

Height p50: 161.0 | p95: 171.0
Guessed height unit: cm


In [49]:
def compute_bmi(weight_kg: pd.Series, height: pd.Series, unit: str) -> pd.Series:
    if unit == "cm":
        h_m = height / 100.0
    elif unit == "m":
        h_m = height
    else:
        raise ValueError("unit must be 'cm' or 'm'")
    bmi = weight_kg / (h_m ** 2)
    return bmi

BMI_calc = compute_bmi(df["Weight"], df["Height"], height_unit_guess)
df["BMI_calc"] = BMI_calc
df[["Weight", "Height", "BMI", "BMI_calc"]].head(10)


Unnamed: 0,Weight,Height,BMI,BMI_calc
0,68.0,169,23.8,23.80869
1,58.8,162,22.4,22.405121
2,63.0,160,24.6,24.609375
3,49.0,154,20.7,20.661157
4,62.0,163,23.3,23.335466
5,81.0,170,28.0,28.027682
6,71.0,164,26.4,26.397977
7,53.0,167,19.0,19.003908
8,65.0,173,21.7,21.718066
9,59.0,153,25.2,25.203981


In [50]:
# Difference between stored BMI and computed BMI
df["BMI_diff"] = df["BMI"] - df["BMI_calc"]

# Flag suspicious differences (tune thresholds if needed)
# Here: abs diff > 1.0 BMI unit OR computed BMI out of plausible range
df["BMI_flag"] = (
    df["BMI"].isna()
    | df["BMI_calc"].isna()
    | (df["BMI_calc"] < 10)
    | (df["BMI_calc"] > 80)
    | (df["BMI_diff"].abs() > 1.0)
)

print("BMI flagged rows:", int(df["BMI_flag"].sum()), "/", len(df))
df.loc[df["BMI_flag"], ["row_id", "_sheet", "LocalID", "Weight", "Height", "BMI", "BMI_calc", "BMI_diff"]].head(20)


BMI flagged rows: 5 / 160


Unnamed: 0,row_id,_sheet,LocalID,Weight,Height,BMI,BMI_calc,BMI_diff
53,cohorte_and_000053,cohorte_and,54,57.0,158.0,21.7,22.832879,-1.132879
54,cohorte_and_000054,cohorte_and,55,82.0,176.0,27.7,26.472107,1.227893
113,cohorte_rs_000030,cohorte_rs,21,59.0,163.0,27.9,22.206331,5.693669
115,cohorte_rs_000032,cohorte_rs,43,,,,,
133,cohorte_rs_000050,cohorte_rs,55,61.0,,,,


In [51]:
# Recommended: replace BMI only when flagged
df["BMI_final"] = df["BMI"]
df.loc[df["BMI_flag"], "BMI_final"] = df.loc[df["BMI_flag"], "BMI_calc"]

# Optional: keep a note column
df["BMI_source"] = np.where(df["BMI_flag"], "computed_from_weight_height", "as_recorded")

df[["BMI", "BMI_calc", "BMI_final", "BMI_source"]].head(10)


Unnamed: 0,BMI,BMI_calc,BMI_final,BMI_source
0,23.8,23.80869,23.8,as_recorded
1,22.4,22.405121,22.4,as_recorded
2,24.6,24.609375,24.6,as_recorded
3,20.7,20.661157,20.7,as_recorded
4,23.3,23.335466,23.3,as_recorded
5,28.0,28.027682,28.0,as_recorded
6,26.4,26.397977,26.4,as_recorded
7,19.0,19.003908,19.0,as_recorded
8,21.7,21.718066,21.7,as_recorded
9,25.2,25.203981,25.2,as_recorded


In [52]:
bmi_report = df[[
    "row_id", "_sheet", "LocalID",
    "Weight", "Height", "BMI", "BMI_calc", "BMI_final", "BMI_diff", "BMI_flag", "BMI_source"
]].copy()

bmi_out = META_DIR / f"hcs_bmi_qc_{RUN_TAG}.csv"
bmi_report.to_csv(bmi_out, index=False)
print("Saved BMI QC report:", bmi_out)


Saved BMI QC report: ../../data/metadata/hcs_bmi_qc_20260129_104323.csv


In [53]:
# Early predictors (English names)
EARLY_PREDICTORS = [
    "Age",
    "Weight",
    "Height",
    "BMI_final",     # use corrected BMI
    "Glycemia",
    "SBP_1T",
    "DBP_1T",
    "TC_1T",
    "TG_1T",
    "HDL_1T",
    "LDL_1T",
]

TARGET = "MSPH"

# Require cols
missing = [c for c in EARLY_PREDICTORS + [TARGET] if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns after renaming/BMI: {missing}")

df_early = df.dropna(subset=EARLY_PREDICTORS + ["TC_3T"]).copy()  # TC_3T needed only for label integrity
df_early = df_early.dropna(subset=[TARGET]).copy()

df_model = df_early[["row_id", "_sheet", "LocalID"] + EARLY_PREDICTORS + [TARGET]].copy()

print("Model dataset:", df_model.shape)
print("Class balance:", df_model[TARGET].value_counts(normalize=True).round(3))
df_model.head()


Model dataset: (148, 15)
Class balance: MSPH
0    0.716
1    0.284
Name: proportion, dtype: Float64


Unnamed: 0,row_id,_sheet,LocalID,Age,Weight,Height,BMI_final,Glycemia,SBP_1T,DBP_1T,TC_1T,TG_1T,HDL_1T,LDL_1T,MSPH
0,cohorte_and_000000,cohorte_and,1,33,68.0,169,23.8,76.2,101,60,239,85.1,69.0,153.0,1
1,cohorte_and_000001,cohorte_and,2,37,58.8,162,22.4,71.9,101,58,222,97.0,65.6,137.0,1
2,cohorte_and_000002,cohorte_and,3,30,63.0,160,24.6,79.8,113,58,197,100.0,56.3,120.7,1
3,cohorte_and_000003,cohorte_and,4,26,49.0,154,20.7,81.3,98,53,228,67.0,80.4,134.2,1
4,cohorte_and_000004,cohorte_and,5,25,62.0,163,23.3,82.5,106,61,215,130.0,52.6,136.4,1


In [54]:
flow = []

def add_flow(step: str, d: pd.DataFrame) -> None:
    flow.append({
        "step": step,
        "n_rows": int(len(d)),
        "n_pos": int((d[TARGET] == 1).sum()) if TARGET in d.columns else np.nan,
        "n_neg": int((d[TARGET] == 0).sum()) if TARGET in d.columns else np.nan,
        "pct_pos": float((d[TARGET] == 1).mean() * 100) if TARGET in d.columns else np.nan,
    })

add_flow("01_concat_raw_sheets", df)
add_flow("02_after_label_definition", df)
add_flow("03_after_bmi_qc", df)
add_flow("04_strict_complete_cases_for_model", df_model)

flow_df = pd.DataFrame(flow)
flow_df


Unnamed: 0,step,n_rows,n_pos,n_neg,pct_pos
0,01_concat_raw_sheets,160,44,116,27.5
1,02_after_label_definition,160,44,116,27.5
2,03_after_bmi_qc,160,44,116,27.5
3,04_strict_complete_cases_for_model,148,42,106,28.378378


In [55]:
# Ensure no 3rd trimester biomarkers are in predictors
for forbidden in ["TC_3T", "TG_3T", "HDL_3T", "LDL_3T"]:
    assert forbidden not in EARLY_PREDICTORS, f"Leakage: {forbidden} is in predictors!"

print("Anti-leakage checks passed")


Anti-leakage checks passed


In [56]:
dataset_out = OUT_DIR / f"hcs_early_only_dataset_{RUN_TAG}.csv"
df_model.to_csv(dataset_out, index=False)

flow_out = META_DIR / f"hcs_flow_{RUN_TAG}.csv"
flow_df.to_csv(flow_out, index=False)

metadata = {
    "run_tag": RUN_TAG,
    "raw_path": str(RAW_PATH),
    "raw_sha256": raw_hash,
    "sheets_used": SHEETS,
    "strategy": "append/concat both sheets; no patient-level merge assumed",

    "columns_raw": {
        "id": RAW_ID,
        "early": RAW_EARLY,
        "t3_label": RAW_T3_FOR_LABEL,
        "t3_excluded": RAW_T3_EXCLUDE,
    },

    "renaming_map_used": {k: v for k, v in RENAME_MAP.items() if k in df_raw.columns},

    "pretty_columns": PRETTY_LABELS,
    "units": COLUMN_UNITS,
    "column_roles": COLUMN_ROLES,

    "bmi_qc": {
        "height_unit_guess": height_unit_guess,
        "rule_flag": "abs(BMI - BMI_calc) > 1.0 OR BMI_calc outside [10,80] OR BMI missing",
        "final_bmi_column": "BMI_final",
        "bmi_report_csv": str(bmi_out),
    },

    "target": {
        "name": TARGET,
        "rule": LABEL_RULE,
        "threshold": CT3_THRESHOLD,
        "positive_definition": "MSPH = 1",
        "negative_definition": "MSPH = 0",
    },

    "predictors": EARLY_PREDICTORS,
    "excluded_predictors": ["TC_3T", "TG_3T", "HDL_3T", "LDL_3T"],

    "final_counts": {
        "n_rows": int(len(df_model)),
        "n_pos": int((df_model[TARGET] == 1).sum()),
        "n_neg": int((df_model[TARGET] == 0).sum()),
        "pct_pos": float((df_model[TARGET] == 1).mean() * 100),
    },

    "notes": [
        "Third trimester variables are excluded from predictors; ct3 is used only to define the outcome label.",
        "LocalID may repeat across sheets; row_id is used for stable bookkeeping.",
        "Pretty column names are provided for figures and tables; code uses snake/camel case identifiers.",
    ],
}

meta_out = META_DIR / f"hcs_dataset_metadata_{RUN_TAG}.json"
meta_out.write_text(json.dumps(metadata, indent=2), encoding="utf-8")

print("Saved metadata with pretty columns:", meta_out)

print("Saved dataset:", dataset_out)
print("Saved flow:", flow_out)
print("Saved metadata:", meta_out)

Saved metadata with pretty columns: ../../data/metadata/hcs_dataset_metadata_20260129_104323.json
Saved dataset: ../../data/processed/hcs_early_only_dataset_20260129_104323.csv
Saved flow: ../../data/metadata/hcs_flow_20260129_104323.csv
Saved metadata: ../../data/metadata/hcs_dataset_metadata_20260129_104323.json


## Outputs
- `data/processed/hcs_early_only_dataset_<RUN_TAG>.csv`
- `data/metadata/hcs_bmi_qc_<RUN_TAG>.csv`
- `data/metadata/hcs_flow_<RUN_TAG>.csv`
- `data/metadata/hcs_dataset_metadata_<RUN_TAG>.json`

Next:
- Notebook 03: missingness patterns + sensitivity dataset (imputation vs strict) + start validation design
