# 02 · Description of the population — Table 1 & missing data

> **Purpose**: produce a defensible baseline description of the cohort, and explore missingness.

> **Learning objectives**
- Create a “Table 1” summary of baseline characteristics by outcome.
- Handle numeric vs categorical variables appropriately.
- Explore missing data (MCAR/MAR/MNAR patterns) visually and descriptively.
- Recognise the limits of simple imputation.

---

In [None]:
# Bootstrap: ensure repo root on path, then import init
import sys, pathlib
# Append the parent of the current working directory to the import path.
# NOTE(review): this assumes the notebook is launched from a sub-folder that
# sits directly under the repo root — confirm before running from elsewhere.
sys.path.append(str(pathlib.Path.cwd().parent))
# Must come after the path tweak above, otherwise `scripts` may not be importable.
from scripts.bootstrap import init
# init() returns two objects; `df` is used as a pandas DataFrame by the cells
# below. What `ctx` holds is not visible from this notebook.
df, ctx = init()
# Last expression of the cell: display the first three rows as a sanity check.
df.head(3)

## 1. Baseline description
A classic “Table 1” shows demographics, lifestyle, and clinical measures overall and by outcome group.

Here we’ll look at **CVD_incident** as the stratifier; the same pipeline works for cancer.

In [None]:
import pandas as pd, numpy as np

def table1(data, groupvar, cont, cat):
    """Return the pieces of a quick, untidy 'Table 1' as a list of DataFrames.

    Parameters
    ----------
    data : DataFrame
        Source data.
    groupvar : str
        Column used to stratify every summary.
    cont : list[str]
        Continuous variables; each yields one frame of mean / std / median
        per group, rounded to 2 decimals.
    cat : list[str]
        Categorical variables; each yields one frame (groups as rows, levels
        as columns) of within-group proportions, rounded to 3 decimals.

    Returns
    -------
    list of DataFrame
        Continuous summaries first (in the order of ``cont``), followed by the
        categorical summaries (in the order of ``cat``).
    """
    numeric_parts = [
        data.groupby(groupvar)[name].agg(['mean', 'std', 'median']).round(2)
        for name in cont
    ]

    share_parts = []
    for name in cat:
        # Cross-tabulate group x level; combinations never seen count as 0.
        counts = data.groupby([groupvar, name]).size().unstack(fill_value=0)
        # Divide each row by its own total so proportions sum to 1 within a group.
        row_totals = counts.sum(axis=1)
        share_parts.append(counts.div(row_totals, axis=0).round(3))

    return numeric_parts + share_parts

# Variables to summarise, split by how they should be described.
cont_vars = [
    'age', 'BMI', 'SBP', 'energy_kcal',
    'fruit_veg_g_d', 'red_meat_g_d', 'salt_g_d',
]
cat_vars = [
    'sex', 'smoking_status', 'physical_activity',
    'SES_class', 'IMD_quintile', 'menopausal_status',
]

# One frame per variable, stratified by the CVD outcome.
tbls = table1(df, groupvar='CVD_incident', cont=cont_vars, cat=cat_vars)
tbls[0].head()  # just show first piece

In [None]:


def make_table1(
    df: pd.DataFrame,
    group: str | None = None,
    continuous: list[str] = None,
    categorical: list[str] = None,
    digits: int = 2,
    include_overall: bool = True,
    dropna_group: bool = True,
):
    """
    Build a single tidy 'Table 1' DataFrame.

    Parameters
    ----------
    df : DataFrame
        Source data.
    group : str | None
        Optional stratifier (e.g., 'CVD_incident'). If None, only overall is returned.
    continuous : list[str]
        Names of continuous variables to summarise.
    categorical : list[str]
        Names of categorical variables to summarise.
    digits : int
        Rounding for numeric statistics.
    include_overall : bool
        Include an 'Overall' column alongside group columns.
    dropna_group : bool
        Drop rows with missing group when grouping.

    Returns
    -------
    DataFrame with MultiIndex rows:
      (variable, stat_or_level)  × columns = group(s) (plus Overall if requested).
    """

    continuous = continuous or []
    categorical = categorical or []
    data = df.copy()

    # Ensure categoricals are categorical dtype for stable level order
    for c in categorical:
        if c in data.columns and data[c].dtype.name not in ("category",):
            data[c] = data[c].astype("category")

    # Helper: numeric summary
    def cont_summary(d: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
        if not cols:
            return pd.DataFrame()
        s = d[cols].agg(["mean", "std", "median", "count"]).T
        s = s.rename(columns={"std": "sd", "count": "n"})
        return s

    # Helper: categorical counts/percents
    def cat_summary(d: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
        pieces = []
        for c in cols:
            ct = d[c].value_counts(dropna=False)
            n = ct.astype(int)
            p = (n / n.sum()).astype(float)
            tmp = pd.DataFrame({"n": n, "%": p})
            tmp.index.name = "level"
            tmp["variable"] = c
            pieces.append(tmp.reset_index())
        if pieces:
            out = pd.concat(pieces, axis=0, ignore_index=True)
            # MultiIndex rows: (variable, level) with columns n, %
            out = out.set_index(["variable", "level"]).sort_index()
        else:
            out = pd.DataFrame()
        return out

    # Build overall
    cols = []
    if include_overall:
        cont_overall = cont_summary(data, continuous).round(digits)
        cat_overall = cat_summary(data, categorical)
        # unify into same row space using MultiIndex
        cont_overall.index = pd.MultiIndex.from_product([cont_overall.index, ["mean","sd","median","n"]])
        cont_overall = cont_overall.stack().unstack(-1)  # to have a single column we’ll rename
        cont_overall.columns = ["Overall"]  # single stat stacked per row
        # For continuous we want separate rows for each stat:
        # (variable, stat) -> value in "Overall"
        cont_overall = cont_overall["Overall"].unstack(level=1).round(digits)

        if not cat_overall.empty:
            cat_overall = cat_overall.copy()
            cat_overall["Overall_n"] = cat_overall["n"].astype("Int64")
            cat_overall["Overall_%"] = (100 * cat_overall["%"]).round(1)
            cat_overall = cat_overall.drop(columns=["n", "%"])

    # Build by group
    group_cols = []
    if group and group in data:
        if dropna_group:
            gdata = data.dropna(subset=[group])
        else:
            gdata = data.copy()
        grouped = gdata.groupby(group, observed=True)

        # Continuous by group
        cont_by = []
        if continuous:
            for g, d in grouped:
                s = cont_summary(d, continuous)
                s.index = pd.MultiIndex.from_product([s.index, ["mean","sd","median","n"]])
                s = s.stack().unstack(-1)
                s.columns = [str(g)]
                s = s[str(g)].unstack(level=1).round(digits)
                cont_by.append(s)
            cont_by = pd.concat(cont_by, axis=1) if cont_by else pd.DataFrame()

        # Categorical by group
        cat_by = []
        if categorical:
            for g, d in grouped:
                cs = cat_summary(d, categorical)
                if cs.empty:
                    continue
                cs = cs.copy()
                cs[f"{g}_n"] = cs["n"].astype("Int64")
                cs[f"{g}_%"] = (100 * cs["%"]).round(1)
                cs = cs.drop(columns=["n", "%"])
                cat_by.append(cs)
            cat_by = pd.concat(cat_by, axis=1) if cat_by else pd.DataFrame()

    # Combine continuous overall + by-group
    blocks = []
    if include_overall:
        blocks.append(cont_overall)
    if group and continuous:
        blocks.append(cont_by)
    cont_block = pd.concat(blocks, axis=1) if blocks else pd.DataFrame()
    if not cont_block.empty:
        cont_block.index.names = ["variable", "stat"]

    # Combine categorical overall + by-group
    if include_overall:
        cat_all = cat_overall if group is None else pd.concat([cat_overall, cat_by], axis=1)
    else:
        cat_all = cat_by if group else pd.DataFrame()

    # Stack results with a blank row between continuous and categorical (nicer display)
    if cont_block.empty and (cat_all is None or cat_all.empty):
        return pd.DataFrame()

    parts = []
    if not cont_block.empty:
        parts.append(cont_block)
    if cat_all is not None and not cat_all.empty:
        # align column order: Overall, then groups interleaving n/% for categoricals
        # leave as-is; it’s readable
        parts.append(cat_all)

    out = pd.concat(parts, axis=0)
    return out


# ---- Use it on this cohort ----
# Same variable lists as the quick version above, now fed to the tidy builder.
continuous = [
    'age', 'BMI', 'SBP', 'energy_kcal',
    'fruit_veg_g_d', 'red_meat_g_d', 'salt_g_d',
]
categorical = [
    'sex', 'smoking_status', 'physical_activity',
    'SES_class', 'IMD_quintile', 'menopausal_status',
]

t1 = make_table1(
    df, group="CVD_incident",
    continuous=continuous, categorical=categorical,
    digits=2, include_overall=True,
)

# Last expression of the cell: preview the first 20 rows.
t1.head(20)


### Discussion
- Which variables differ most clearly between incident vs non-incident CVD?
- How would you decide which variables to adjust for in regression later?
- What are the limits of “Table 1 p-values”?

## 2. Missing data overview
Missingness can be:
- **MCAR** (completely at random).
- **MAR** (depends on observed variables, e.g. age, SES).
- **MNAR** (depends on unobserved values — hard to detect).

The synthetic data were generated with roughly 2–3% MCAR and 5–8% MAR missingness, plus a small MNAR component. Let’s explore.

In [None]:
# Share of missing values per variable, as a fraction between 0 and 1
# (multiply by 100 for a percentage), sorted from most to least missing.
missing = (
    df.isna()
    .mean()
    .sort_values(ascending=False)
    .round(3)
)
missing.head(12)

In [None]:
# Heatmap of missingness (simple)
import matplotlib.pyplot as plt
plt.figure(figsize=(8,4))
# Rows are individuals, columns are the 20 variables with the highest share of
# missing values (`missing` is sorted in descending order).
# 'gray_r' runs from white (low) to black (high), so observed cells (False -> 0)
# are white and missing cells (True -> 1) are black. vmin/vmax pin that mapping
# even if the selected block happens to contain no missing values at all.
plt.imshow(
    df[missing.index[:20]].isnull(),
    aspect='auto',
    interpolation='none',
    cmap='gray_r',
    vmin=0,
    vmax=1,
)
plt.xlabel('Variables (top 20 by missingness)')
plt.ylabel('Individuals')
# Title corrected: with 'gray_r' the missing cells are drawn in black, not white.
plt.title('Missingness pattern (black=missing)')
plt.show()

### Simple associations with missingness
Check whether missingness correlates with observed variables — a clue for MAR.

In [None]:
# Example: is plasma vitamin C more often missing in some SES classes?
# 1 = value missing, 0 = value observed.
miss_vitC = df['plasma_vitC_umol_L'].isna().astype(int)
# Row-normalised: each SES class sums to 1 across the observed/missing columns.
pd.crosstab(
    index=df['SES_class'],
    columns=miss_vitC,
    normalize='index',
).round(3)

## 3. Handling missing data (teaching only)
- **Complete case analysis**: drop missing rows.
- **Simple imputation**: mean/median — acceptable for teaching but not best practice.
- **Better**: multiple imputation (beyond this course).

⚠️ We use only crude strategies here to highlight the issues.

In [None]:
# Complete-case analysis: keep only rows where both listed measurements are observed.
df_cc = df.dropna(
    axis=0,
    how='any',
    subset=['plasma_vitC_umol_L', 'urinary_sodium_mmol_L'],
)
print("Original n=", len(df), "; complete-case n=", len(df_cc))

In [None]:
# Crude single imputation: every missing BMI is replaced by the observed median.
bmi_imp = df['BMI'].fillna(value=df['BMI'].median())
print(
    "BMI: before missing=", df['BMI'].isna().sum(),
    "; after=", bmi_imp.isna().sum(),
)

> ## Key takeaways
>
> - A reproducible “Table 1” builds the foundation for all later analyses.
> - Always check **missingness patterns**; do not assume MCAR — it is rare in practice.
> - Simple imputations are for teaching — in practice, use multiple imputation.
> - Missingness itself may hold **epidemiological meaning** (e.g. low SES → more missing biomarker data).

> **Next:** analyse exposures — distributions and how biomarkers track with intake.