# FB2NEP Workbook 9 – Regression and Modelling (Part 2)

This workbook:

- Continues regression modelling.
- Introduces missing data in the context of regression.
- Shows how complete‑case analysis can bias results.
- Provides a preview of imputation.

In [None]:
from __future__ import annotations

import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm

%matplotlib inline

# Relative location of the synthetic dataset CSV (resolved against the working directory).
DATA_PATH = pathlib.Path("data", "fb2nep_synthetic.csv")

# Read the whole file into memory; the cells below derive their frames from `df`.
df = pd.read_csv(DATA_PATH)

# Show the first rows as the cell output.
df.head()

## 1. Revisiting the SBP regression model with missing data

In [None]:
# Store sex as a categorical variable when the column exists.
if "sex" in df.columns:
    df["sex"] = df["sex"].astype("category")

# Keep only the modelling variables that are actually present, in a fixed order.
cols = [name for name in ("sbp", "bmi", "age", "sex") if name in df.columns]

# Work on an independent copy so later edits do not touch `df`.
df_model = df.loc[:, cols].copy()

# Fraction of missing values per column (cell output).
df_model.isna().mean()

In [None]:
# For teaching: induce missingness in age if it is currently complete.
# The fixed seed keeps the set of blanked rows identical between runs.

if "age" in df_model.columns and not df_model["age"].isna().any():
    # NaN cannot be stored in an integer column. Assigning it would rely on an
    # implicit int -> float upcast, which pandas has deprecated for setitem-like
    # operations, so convert explicitly first. For a float column this is a no-op.
    df_model["age"] = df_model["age"].astype(float)

    np.random.seed(11088)
    n_to_blank = int(0.2 * len(df_model))  # 20% of the rows, rounded down
    idx = np.random.choice(df_model.index, size=n_to_blank, replace=False)
    df_model.loc[idx, "age"] = np.nan

# Fraction of missing values per column after the step above (cell output).
df_model.isna().mean()

In [None]:
# Complete-case analysis: discard every row that has a missing value in any
# column of df_model, then fit the linear model on what remains.

df_complete = df_model.dropna(axis=0, how="any")
print(f"Number of observations – complete cases: {df_complete.shape[0]}")

model_cc = smf.ols(formula="sbp ~ bmi + age + C(sex)", data=df_complete).fit()

# Estimated coefficients (cell output).
model_cc.params

In [None]:
# Simple mean imputation for age (illustrative only): every missing age is
# replaced by the mean of the observed ages, stored in a new column `age_imp`.

df_imp = df_model.copy()
if "age" in df_imp.columns:
    mean_age = df_imp["age"].mean(skipna=True)
    df_imp["age_imp"] = df_imp["age"].fillna(mean_age)

# Rows still missing any model variable are dropped before fitting.
model_imp = smf.ols(
    formula="sbp ~ bmi + age_imp + C(sex)",
    data=df_imp.dropna(subset=["sbp", "bmi", "age_imp", "sex"]),
).fit()

# Side-by-side coefficients from the complete-case and mean-imputed fits.
comparison = pd.DataFrame(
    {
        "complete_case": model_cc.params,
        "simple_impute": model_imp.params,
    }
)
comparison

## 2. Logistic regression with a missing confounder

In [None]:
# Build a binary outcome `high_upf` and compare the BMI odds ratio from a model
# adjusted for age and sex with one adjusted for sex only.
df_log = df_model.copy()

# NOTE(review): df_log is copied from df_model, so this check only sees the
# columns df_model carries — confirm whether an existing df["high_upf"] was
# meant to be reused here.
if "high_upf" not in df_log.columns:
    if "energy_kcal" in df.columns:
        median_energy = df["energy_kcal"].median()
        energy_known = df["energy_kcal"].notna()
        high_upf = (df["energy_kcal"] > median_energy).astype(int)
        if not energy_known.all():
            # `NaN > median` evaluates to False, which would code participants
            # with unknown energy intake as 0. Keep them missing instead so the
            # dropna() below excludes them rather than misclassifying them.
            high_upf = high_upf.where(energy_known)
        df_log["high_upf"] = high_upf
    else:
        # No energy variable available: fall back to a reproducible random outcome.
        np.random.seed(11088)
        df_log["high_upf"] = np.random.randint(0, 2, size=len(df_log))

# Both models are fitted on the same complete-case rows so that the two odds
# ratios differ only in the adjustment set, not in the sample.
df_cc_log = df_log.dropna()
logit_cc = smf.logit("high_upf ~ bmi + age + C(sex)", data=df_cc_log).fit()
logit_naive = smf.logit("high_upf ~ bmi + C(sex)", data=df_cc_log).fit()

# Exponentiated coefficients are odds ratios per unit of BMI.
or_cc = np.exp(logit_cc.params["bmi"])
or_naive = np.exp(logit_naive.params["bmi"])
print(f"OR for BMI (complete case, adjusted): {or_cc:.2f}")
print(f"OR for BMI (adjusted only for sex): {or_naive:.2f}")

## 3. Toy multiple imputation example

In [None]:
# Toy multiple imputation: create m completed datasets by drawing each missing
# age from a normal distribution with the observed mean and standard deviation,
# refit the linear model on each, and collect the coefficients side by side.

imputed_params = None  # stays None when there is no age column to impute
if "age" in df_model.columns:
    m = 5
    # Local, seeded generator: the draws are reproducible between runs and the
    # global NumPy random state is left untouched.
    rng = np.random.default_rng(11088)

    mean_age = df_model["age"].mean(skipna=True)
    sd_age = df_model["age"].std(skipna=True)

    # The missingness pattern is the same for every imputation, so compute it once.
    mask = df_model["age"].isna()
    n_miss = int(mask.sum())

    results = []
    for _ in range(m):
        tmp = df_model.copy()
        # Start from the observed ages (as float), then overwrite only the gaps.
        tmp["age_imp"] = tmp["age"].astype(float)
        tmp.loc[mask, "age_imp"] = rng.normal(mean_age, sd_age, size=n_miss)
        res = smf.ols(
            "sbp ~ bmi + age_imp + C(sex)",
            data=tmp.dropna(subset=["sbp", "bmi", "age_imp", "sex"]),
        ).fit()
        results.append(res.params)

    # One column of coefficients per imputed dataset: imp_1 ... imp_m.
    imputed_params = pd.concat(
        results, axis=1, keys=[f"imp_{k + 1}" for k in range(m)]
    )

# Kept at the top level of the cell: a notebook only displays a trailing
# expression when it is not nested inside an `if` block.
imputed_params