# FB2NEP Workbook 11 – Missing Data and Sensitivity Analysis

This workbook:

- Compares complete‑case analysis with simple and multiple imputation.
- Introduces sensitivity analyses.
- Briefly mentions Bayesian approaches.

In [None]:
from __future__ import annotations

import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm

%matplotlib inline

# Relative location of the synthetic FB2NEP dataset.
DATA_PATH = pathlib.Path("data", "fb2nep_synthetic.csv")

# Read the dataset once; the analysis cells below work on subsets of `df`.
df = pd.read_csv(DATA_PATH)

# Preview the first rows as a quick check that the file loaded.
df.head()

## 1. Overview of missingness

In [None]:
# Keep only those analysis variables that actually exist in the loaded data.
candidate_vars = ("sbp", "bmi", "age", "sex", "smoking_cat")
vars_of_interest = [name for name in candidate_vars if name in df.columns]

# Of the selected variables, these are stored as pandas categoricals.
categorical_vars = [name for name in ("sex", "smoking_cat") if name in vars_of_interest]

# Build the analysis frame as a new object (astype returns a copy), leaving `df` untouched.
df_an = df[vars_of_interest].astype({name: "category" for name in categorical_vars})

# Proportion of missing values per analysis variable.
df_an.isna().mean()

## 2. Complete‑case analysis

In [None]:
# Listwise deletion: keep only the rows in which every analysis variable is observed.
df_cc = df_an.dropna(how="any")
print(f"Number of complete cases: {len(df_cc)}")

# Assemble the right-hand side term by term; the categorical covariates are
# optional and only enter the model when they are present in the data.
rhs_terms = ["bmi", "age"]
rhs_terms += [f"C({name})" for name in ("sex", "smoking_cat") if name in df_cc.columns]
formula = "sbp ~ " + " + ".join(rhs_terms)

# Ordinary least squares on the complete cases; show the fitted coefficients.
model_cc = smf.ols(formula=formula, data=df_cc).fit()
model_cc.params

## 3. Simple single imputation

In [None]:
# Single imputation: every missing entry is replaced by one value per column —
# the mean for numeric columns, the most frequent value for all other columns.
df_si = df_an.copy()
for col, values in df_an.items():
    fill_value = values.mean() if values.dtype.kind in "biufc" else values.mode().iloc[0]
    df_si[col] = values.fillna(fill_value)

# Refit the same model on the filled-in data and compare with the complete-case fit.
model_si = smf.ols(formula, data=df_si).fit()
comparison = pd.DataFrame(
    {
        "complete_case": model_cc.params,
        "single_impute": model_si.params,
    }
)
comparison

## 4. Multiple imputation with MICE (basic example)

In [None]:
from statsmodels.imputation.mice import MICEData, MICE

# Multiple imputation is stochastic; fix the seed so that Restart & Run All
# reproduces the same pooled estimates.
# NOTE(review): assumes statsmodels' MICE draws from NumPy's global random state — confirm.
MICE_SEED = 11
np.random.seed(MICE_SEED)

# Small values keep the teaching example fast.
N_BURNIN = 10       # imputation cycles run before the first analysis model is fitted
N_IMPUTATIONS = 10  # number of imputed data sets whose model fits are pooled

# Dummy-code each categorical variable separately so that missingness survives.
# By default pd.get_dummies encodes a missing category as all-zero dummies, i.e.
# as the reference level, which would hide those values from the imputation.
# dtype=float gives numeric 0/1 columns that are able to hold NaN.
categorical_cols = list(df_an.select_dtypes(include=["object", "category"]).columns)
mice_parts = [df_an.drop(columns=categorical_cols)]
for col in categorical_cols:
    dummies = pd.get_dummies(df_an[col], prefix=col, drop_first=True, dtype=float)
    dummies.loc[df_an[col].isna(), :] = np.nan
    mice_parts.append(dummies)
df_mice = pd.concat(mice_parts, axis=1)

# Column names are pasted into model formulas below, so replace any character
# that is not valid in an identifier (for example spaces in category labels).
df_mice.columns = df_mice.columns.str.replace(r"\W", "_", regex=True)

mice_data = MICEData(df_mice)
endog = "sbp"
predictors = [c for c in df_mice.columns if c != endog]
formula_mice = endog + " ~ " + " + ".join(predictors)
mice = MICE(smf.ols, formula_mice, mice_data)

# Pass both arguments by name: the first positional argument of MICE.fit is the
# number of burn-in cycles, not the number of imputations.
result_mice = mice.fit(n_burnin=N_BURNIN, n_imputations=N_IMPUTATIONS)
result_mice.summary()

## 5. Sensitivity analysis example

In [None]:
# Sensitivity analysis: refit the complete-case model after excluding rows with
# a BMI at or above the cut-off, then compare the coefficients of the two fits.
BMI_UPPER_LIMIT = 40

sensitivity_comparison = None  # stays None (nothing displayed) when bmi is unavailable
if "bmi" in df_cc.columns:
    df_cc_restricted = df_cc[df_cc["bmi"] < BMI_UPPER_LIMIT]
    model_cc_rest = smf.ols(formula, data=df_cc_restricted).fit()
    sensitivity_comparison = pd.DataFrame(
        {"original_cc": model_cc.params, "restricted_cc": model_cc_rest.params}
    )

# Keep this as the final top-level expression: Jupyter only auto-displays the
# last top-level expression of a cell, not one nested inside the `if` block.
sensitivity_comparison

## 6. Brief Bayesian note

Bayesian models treat parameters (and even missing values) as random variables with prior distributions.
Implementation requires specialised tools (for example, PyMC or Stan) and is beyond the scope of this module.

Conceptually, Bayesian approaches can:

- Integrate prior knowledge with data.
- Handle complex missing data structures.
- Provide full posterior distributions for quantities of interest.