# 🥗📊 Epidemiology in Nutrition: Cross-Sectional & Prospective Analyses

**Design**: large observational cohort (n≈25,000; age 45–85) with baseline + 2/4/6-year follow-up.

**Variables**: sex, smoking, physical activity, UK social class (A/B/C1/C2/D/E), BMI (baseline, 2y, 4y, 6y), BP, sugar intake, SFA intake, time-to-CVD and CVD incidence (event), random missingness.

**Endpoints**
- Cross-sectional: baseline BMI.
- Survival: incident CVD.
- Prospective: BMI trajectories and prospective CVD prediction.

This notebook **loads a CSV if present** or **simulates** a realistic dataset (seed=11088). It then runs frequentist and Bayesian models with compact explanations suitable for reuse in student projects.

## Environment
The `%pip install` line below runs as written; comment it out if the packages are already installed in your environment.

In [None]:
%pip install pandas numpy matplotlib seaborn scipy statsmodels lifelines pymc arviz lifelines
from pathlib import Path
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from lifelines import CoxPHFitter
from lifelines.statistics import proportional_hazard_test
import pymc as pm
import arviz as az
import pytensor.tensor as pt

# Plot styling applied to the figures drawn below.
sns.set_context('notebook')
sns.set_style('whitegrid')

# Single seed reused for the simulation default and the sampler calls.
RANDOM_SEED = 11088
rng = np.random.default_rng(RANDOM_SEED)

# The dataset is read from / written to ./data (created if it is missing).
DATA_DIR = Path('data')
DATA_DIR.mkdir(exist_ok=True)
DATA_PATH = DATA_DIR / 'epidemiological_study.csv'

## Load or Simulate Data
If `data/epidemiological_study.csv` is absent, we simulate a cohort with plausible associations:
- Higher sugar → BMI up over time; higher SFA → higher CVD hazard; smoking/age → higher CVD hazard; activity → lower BMI/CVD.
- Missingness: MCAR ~ 5–10% per variable.

You can reduce `n` (e.g. 10_000) if RAM is tight.

In [None]:
def simulate_epidemiology(n=25_000, seed=RANDOM_SEED):
    """Simulate a cohort of ``n`` participants with built-in associations.

    Covariates are drawn first, then baseline BMI and blood pressure, three
    follow-up BMI waves, and a Weibull time-to-CVD that is censored at
    72 months. Finally, values in selected columns are blanked out at random.

    Parameters
    ----------
    n : int
        Number of participants (rows) to generate.
    seed : int
        Seed for the NumPy generator; the same seed reproduces the same cohort.

    Returns
    -------
    pandas.DataFrame
        One row per participant with identifier, covariates, the four BMI
        measurements, baseline BP, ``Time_to_CVD`` and ``CVD_Incidence``.
    """
    # NOTE: the generator is consumed sequentially, so the order of the draws
    # below determines the simulated values. Keep it stable.
    gen = np.random.default_rng(seed)

    # --- Baseline covariates ---
    age = gen.normal(62, 8.5, n).clip(45, 85)
    sex = gen.choice(['Female', 'Male'], n)
    smoking = gen.choice(['Non-smoker', 'Smoker'], n, p=[0.7, 0.3])
    activity = gen.choice(['Low', 'Medium', 'High'], n, p=[0.3, 0.45, 0.25])
    social = gen.choice(
        ['A', 'B', 'C1', 'C2', 'D', 'E'], n,
        p=[0.08, 0.15, 0.22, 0.22, 0.2, 0.13],
    )
    sugar = gen.normal(90, 35, n).clip(10, 250)  # g/day
    sfa = gen.normal(28, 10, n).clip(5, 80)      # g/day

    # Numeric codings used in the linear predictors below.
    is_smoker = (smoking == 'Smoker').astype(int)
    is_male = (sex == 'Male').astype(int)
    act_level = pd.Series(activity).map({'Low': 0, 'Medium': 1, 'High': 2}).values

    # --- Baseline BMI (age/sex/activity/sugar) and blood pressure ---
    bmi_0 = (gen.normal(27, 3.5, n)
             + 0.08*(age-62) + 0.6*is_male - 0.5*act_level
             + 0.015*(sugar-90))
    bp_0 = gen.normal(132, 14, n) + 0.12*(age-62) + 3*is_smoker + 1.2*(bmi_0-27)

    # --- Follow-up BMI: each wave adds a drift (up with sugar, down with
    # activity) plus fresh noise to the previous wave ---
    trend = 0.04*(sugar-90) - 0.25*act_level
    waves = [bmi_0]
    for _ in range(3):
        waves.append(waves[-1] + (trend + gen.normal(0, 0.9, n)))
    bmi_2, bmi_4, bmi_6 = waves[1:]

    # --- Time to CVD in months: Weibull(shape, scale) with a log-linear scale.
    # A larger scale means longer survival, so negative coefficients (age,
    # smoking, SFA, BMI) shorten time-to-event and activity lengthens it. ---
    shape = 1.4
    log_scale = (3.9  # baseline log-scale
                 - 0.025*(age-62)
                 - 0.30*is_smoker
                 - 0.018*(sfa-28)
                 + 0.10*act_level
                 - 0.02*(bmi_0-27))
    scale = np.exp(log_scale)
    unif = gen.uniform(0, 1, n)
    t_event = scale * (-np.log(unif))**(1/shape)  # inverse-CDF draw
    horizon = 72.0  # administrative censoring at 6 years
    followup = np.minimum(t_event, horizon)
    had_event = (t_event <= horizon).astype(int)

    cohort = pd.DataFrame({
        'ID': np.arange(1, n+1),
        'Age': age.round(1),
        'Sex': sex,
        'Smoking': smoking,
        'Physical_Activity': activity,
        'Social_Class': social,
        'Sugar_Intake': sugar.round(1),
        'SFA_Intake': sfa.round(1),
        'BMI_Baseline': bmi_0.round(2),
        'BMI_Year2': bmi_2.round(2),
        'BMI_Year4': bmi_4.round(2),
        'BMI_Year6': bmi_6.round(2),
        'BP_Baseline': bp_0.round(1),
        'Time_to_CVD': followup.round(2),
        'CVD_Incidence': had_event.astype(int),
    })

    # --- MCAR missingness: each listed column gets its own rate in [5%, 10%) ---
    sparse_cols = ('Sugar_Intake', 'SFA_Intake', 'Physical_Activity', 'Social_Class',
                   'BMI_Baseline', 'BMI_Year2', 'BMI_Year4', 'BMI_Year6', 'BP_Baseline')
    for col in sparse_cols:
        draws = gen.random(n)
        rate = gen.uniform(0.05, 0.10)
        cohort.loc[draws < rate, col] = np.nan
    return cohort

# Reuse the cached CSV when it exists; otherwise simulate once and cache it.
if DATA_PATH.exists():
    data = pd.read_csv(DATA_PATH)
else:
    data = simulate_epidemiology()
    data.to_csv(DATA_PATH, index=False)
    print(f"Simulated and saved to {DATA_PATH}")

# Light QA: every column used by the analyses below must be present.
# An explicit raise (rather than assert) still runs under `python -O`
# and reports which columns are missing.
REQUIRED_COLUMNS = {
    'ID', 'Age', 'Sex', 'Smoking', 'Physical_Activity', 'Social_Class',
    'Sugar_Intake', 'SFA_Intake',
    'BMI_Baseline', 'BMI_Year2', 'BMI_Year4', 'BMI_Year6',
    'BP_Baseline', 'Time_to_CVD', 'CVD_Incidence',
}
missing_columns = sorted(REQUIRED_COLUMNS - set(data.columns))
if missing_columns:
    raise ValueError(f"Dataset is missing required columns: {missing_columns}")
data.head()

## Table 1 — Baseline Characteristics
Overall and (optionally) stratified summaries.

In [None]:
def table1(df, group=None):
    """Summarise baseline characteristics, overall or stratified by ``group``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns ``Age``, ``BMI_Baseline``,
        ``BP_Baseline``, ``Sugar_Intake``, ``SFA_Intake`` and the categorical
        columns ``Sex``, ``Smoking``, ``Physical_Activity``, ``Social_Class``.
    group : str, optional
        Name of the column to stratify by. ``None`` gives the overall table.

    Returns
    -------
    tuple
        ``(cont, cats)`` when ``group`` is ``None``: ``cont`` has one row per
        numeric variable, ``cats`` maps each categorical variable to a frame
        of counts (``n``) and percentages (``%``), missing values included.
        ``(cont, cats, cats_pct)`` otherwise: per-group mean/std/count, the
        variable-by-group crosstabs, and their column percentages.
    """
    num_cols = ['Age', 'BMI_Baseline', 'BP_Baseline', 'Sugar_Intake', 'SFA_Intake']
    cat_cols = ['Sex', 'Smoking', 'Physical_Activity', 'Social_Class']

    if group is None:
        cont = df[num_cols].agg(['mean', 'std', 'median', 'min', 'max', 'count']).T.round(2)
        cats = {}
        for c in cat_cols:
            vc = df[c].value_counts(dropna=False)
            cats[c] = pd.DataFrame({'n': vc, '%': (vc / vc.sum() * 100).round(1)})
        return cont, cats

    cont = df.groupby(group)[num_cols].agg(['mean', 'std', 'count']).round(2)
    cats = {c: pd.crosstab(df[c], df[group], dropna=False) for c in cat_cols}
    # Percentages are taken within each group (column), not across groups.
    cats_pct = {c: (tab / tab.sum(axis=0) * 100).round(1) for c, tab in cats.items()}
    return cont, cats, cats_pct

# Overall (unstratified) Table 1: continuous summaries plus a dict of
# per-variable categorical count tables.
cont_overall, cats_overall = table1(data)
# Trailing expression so the notebook renders the continuous summary.
cont_overall

## Missing Data — Extent & Pattern

In [None]:
# Percentage of missing values per column, largest first; fully observed
# columns are left out of the displayed table.
miss_pct = data.isna().mean().sort_values(ascending=False)*100
miss_df = miss_pct[miss_pct>0].round(2).to_frame('Missing_%')
display(miss_df)

# The heatmap uses at most 2000 rows. The subsample is seeded so that
# re-running the cell shows the same participants.
heatmap_rows = data.sample(min(2000, len(data)), random_state=RANDOM_SEED)
plt.figure(figsize=(10,5))
sns.heatmap(heatmap_rows.isna(), cbar=False)
plt.title('Missingness heatmap (sample rows)')
plt.xlabel('Variables')
plt.ylabel('Participants')
plt.tight_layout()
plt.show()

## Cross-Sectional: Baseline BMI
Predictors: age, sex, smoking, physical activity, social class, sugar, SFA.

Encoding via one-hot (reference: Female, Non-smoker, Medium activity, Social class C1). Simple **mean imputation** for teaching clarity; consider **multiple imputation** in real analyses.

In [None]:
# Analysis frame: the outcome plus the seven baseline predictors.
xsec = data[[
    'BMI_Baseline', 'Age', 'Sex', 'Smoking', 'Physical_Activity',
    'Social_Class', 'Sugar_Intake', 'SFA_Intake',
]].copy()

# Declare the full level set of the two multi-level factors up front.
factor_levels = {
    'Physical_Activity': ['Low', 'Medium', 'High'],
    'Social_Class': ['A', 'B', 'C1', 'C2', 'D', 'E'],
}
for col, levels in factor_levels.items():
    xsec[col] = xsec[col].astype(pd.CategoricalDtype(levels))

# Single imputation: column mean for numeric variables, most frequent level
# for categorical ones.
fill_values = {c: xsec[c].mean() for c in ['Age', 'Sugar_Intake', 'SFA_Intake', 'BMI_Baseline']}
fill_values.update(
    {c: xsec[c].mode().iloc[0] for c in ['Sex', 'Smoking', 'Physical_Activity', 'Social_Class']}
)
xsec = xsec.fillna(fill_values)

# OLS; the reference levels for activity and social class are set in the formula.
formula = (
    'BMI_Baseline ~ Age + C(Sex) + C(Smoking)'
    ' + C(Physical_Activity, Treatment(reference="Medium"))'
    ' + C(Social_Class, Treatment(reference="C1"))'
    ' + Sugar_Intake + SFA_Intake'
)
ols_xsec = smf.ols(formula, data=xsec).fit()
print(ols_xsec.summary())

# Tidy coefficient table: point estimate with its 95% confidence limits.
ci = ols_xsec.conf_int().set_axis(['2.5%', '97.5%'], axis=1)
est = ols_xsec.params.to_frame('coef').join(ci).round(3)
est

### Bayesian Linear Regression (vectorised)
Weakly-informative priors; standardise numeric predictors for better geometry.

In [None]:
# --- Design matrix: dummies + standardise numeric cols (only if present) ---
# get_dummies(drop_first=True) drops the FIRST category of each factor, so the
# reference levels are moved to the front. This gives the Bayesian model the
# same baselines as the OLS formula (Medium activity, social class C1) and
# makes the two sets of coefficients directly comparable.
design = xsec.drop(columns=['BMI_Baseline']).copy()
design['Physical_Activity'] = pd.Categorical(
    design['Physical_Activity'], categories=['Medium', 'Low', 'High'])
design['Social_Class'] = pd.Categorical(
    design['Social_Class'], categories=['C1', 'A', 'B', 'C2', 'D', 'E'])
x_dum = pd.get_dummies(design, drop_first=True)

num_cols = ['Age', 'Sugar_Intake', 'SFA_Intake']
present = [c for c in num_cols if c in x_dum.columns]
if present:
    sub = x_dum[present]
    # standardise safely (avoid chained indexing)
    x_dum.loc[:, present] = (sub - sub.mean()) / sub.std(ddof=0)

names = x_dum.columns.tolist()
X = x_dum.astype(float).to_numpy()
y = xsec['BMI_Baseline'].astype(float).to_numpy()

coords = {
    "obs": np.arange(X.shape[0]),
    "predictor": names,
}

with pm.Model(coords=coords) as blm:
    X_data = pm.Data('X', X, dims=('obs', 'predictor'))
    y_data = pm.Data('y', y, dims=('obs',))

    # The outcome is not centred, so the intercept prior is centred on the
    # sample mean. A prior at 0 with sd 5 would put typical BMI values
    # several prior SDs away and would not be weakly informative.
    intercept = pm.Normal('Intercept', mu=float(y.mean()), sigma=5)
    beta = pm.Normal('Beta', 0, 1.5, dims=('predictor',))
    sigma = pm.HalfNormal('Sigma', 3)

    mu = intercept + pm.math.dot(X_data, beta)
    pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y_data)

    trace_blm = pm.sample(
        draws=1500, tune=1000, target_accept=0.9,
        random_seed=RANDOM_SEED, return_inferencedata=True
    )

# Coefficient forest plot with proper names. hdi_prob is passed explicitly so
# the intervals drawn match the 95% stated in the title and in the summary.
az.plot_forest(trace_blm, var_names=['Beta'], combined=True, hdi_prob=0.95)
plt.title('Cross-sectional BMI: posterior 95% HDIs')
plt.show()

print(az.summary(trace_blm, var_names=['Intercept','Beta','Sigma'], hdi_prob=0.95))

## Survival: Incident CVD (Frequentist & Bayesian)
We treat `Time_to_CVD` (months) with event indicator `CVD_Incidence` (1=event, 0=censored).

In [None]:
# Survival frame: follow-up time, event flag and the baseline covariates.
surv = data[[
    'Time_to_CVD', 'CVD_Incidence', 'Age', 'Sex', 'Smoking',
    'Physical_Activity', 'Social_Class', 'Sugar_Intake', 'SFA_Intake',
    'BMI_Baseline',
]].copy()

# Single imputation: mean for numeric covariates, most frequent level for
# categorical ones.
surv_fill = {c: surv[c].mean() for c in ['Age', 'Sugar_Intake', 'SFA_Intake', 'BMI_Baseline']}
surv_fill.update(
    {c: surv[c].mode().iloc[0] for c in ['Sex', 'Smoking', 'Physical_Activity', 'Social_Class']}
)
surv = surv.fillna(surv_fill)

# Dummy-code the factors (first level of each is the reference) and fit Cox PH.
surv_d = pd.get_dummies(surv, drop_first=True)
cph = CoxPHFitter().fit(surv_d, duration_col='Time_to_CVD', event_col='CVD_Incidence')
cph.print_summary()

# Proportional-hazards check on the rank-transformed time scale.
pht = proportional_hazard_test(cph, surv_d, time_transform='rank')
print(pht.summary)

### Bayesian Weibull AFT with Censoring (PyMC)
We specify a Weibull(shape=α, scale=λ_i), where log(λ_i) = β·x_i. For **events** we use log f(t_i), for **censored** we use log S(t_i).

In [None]:

# --- Design matrix: dummies + standardise continuous predictors ---
bx = surv.copy()

bx_dum = pd.get_dummies(
    bx[["Age","Sex","Smoking","Physical_Activity","Social_Class",
        "Sugar_Intake","SFA_Intake","BMI_Baseline"]],
    drop_first=True
)

conts = ["Age","Sugar_Intake","SFA_Intake","BMI_Baseline"]
present = [c for c in conts if c in bx_dum.columns]
if present:
    sub = bx_dum[present]
    bx_dum.loc[:, present] = (sub - sub.mean()) / sub.std(ddof=0)

names = bx_dum.columns.tolist()
X = bx_dum.astype(float).to_numpy()
# Times are stored rounded, so a very early event can be recorded as exactly 0.
# log(0) would make the log-likelihood infinite, hence the small positive floor.
t = np.clip(surv["Time_to_CVD"].astype(float).to_numpy(), 1e-3, None)
d = surv["CVD_Incidence"].astype(int).to_numpy()

# Coords for nice labels in the posterior
coords = {"obs": np.arange(X.shape[0]), "predictor": names}

with pm.Model(coords=coords) as wb_aft:
    # v5-style data containers (no MutableData, no get_data)
    X_data = pm.Data("X", X, dims=("obs","predictor"))
    t_data = pm.Data("t", t, dims=("obs",))
    d_data = pm.Data("d", d, dims=("obs",))

    # Priors
    intercept = pm.Normal("Intercept", 0, 2)
    beta = pm.Normal("Beta", 0, 1.5, dims=("predictor",))
    alpha = pm.LogNormal("Alpha", mu=np.log(1.2), sigma=0.4)   # shape > 0

    # AFT linear predictor: log(lambda) = intercept + X beta.
    # lambda is deliberately NOT wrapped in pm.Deterministic. A per-observation
    # deterministic is saved for every draw (n_obs x draws x chains floats),
    # which costs on the order of a gigabyte for the full cohort and is not
    # used by any summary below.
    log_lambda = intercept + pm.math.dot(X_data, beta)

    # Weibull log-likelihood, written in log space (no exp followed by log):
    #   log f(t) = log(alpha) - log(lam) + (alpha-1)[log t - log lam] - (t/lam)^alpha
    #   log S(t) = -(t/lam)^alpha
    # Events (d=1) contribute log f(t); right-censored rows (d=0) log S(t).
    log_ratio = pt.log(t_data) - log_lambda          # log(t / lam)
    z = pt.exp(alpha * log_ratio)                    # (t / lam) ** alpha
    logf = pt.log(alpha) - log_lambda + (alpha - 1.0) * log_ratio - z
    logS = -z
    ll = pt.sum(d_data * logf + (1 - d_data) * logS)
    pm.Potential("loglike", ll)

    trace_wb = pm.sample(
        draws=1500, tune=1000, target_accept=0.9,
        random_seed=RANDOM_SEED, return_inferencedata=True
    )

# Coefficients come out already named by "predictor". hdi_prob is passed
# explicitly so the plotted intervals match the 95% stated in the title.
az.plot_forest(trace_wb, var_names=["Beta"], combined=True, hdi_prob=0.95)
plt.title("Weibull AFT (CVD): posterior 95% HDIs")
plt.show()

print(az.summary(trace_wb, var_names=["Intercept","Beta","Alpha"], hdi_prob=0.95))

## Prospective: BMI Trajectories
Long format with random intercepts by participant.

In [None]:
# Wide -> long: one row per participant and BMI wave, with the wave label
# mapped to years since baseline.
wave_years = {'BMI_Baseline': 0, 'BMI_Year2': 2, 'BMI_Year4': 4, 'BMI_Year6': 6}
long = data.melt(
    id_vars=['ID', 'Age', 'Sex', 'Smoking', 'Physical_Activity',
             'Social_Class', 'Sugar_Intake', 'SFA_Intake'],
    value_vars=list(wave_years),
    var_name='Time',
    value_name='BMI',
)
long['Time'] = long['Time'].map(wave_years)
long = long.astype({c: float for c in ['Age', 'Sugar_Intake', 'SFA_Intake', 'BMI', 'Time']})

# Single imputation: most frequent level for factors, column mean for numerics
# (the BMI outcome included).
long_fill = {c: long[c].mode().iloc[0] for c in ['Sex', 'Smoking', 'Physical_Activity', 'Social_Class']}
long_fill.update({c: long[c].mean() for c in ['Age', 'Sugar_Intake', 'SFA_Intake', 'BMI']})
long = long.fillna(long_fill)

# Frequentist mixed-effects model with a random intercept per participant.
mix = smf.mixedlm(
    'BMI ~ Time + Age + C(Sex) + C(Smoking) + C(Physical_Activity)'
    ' + C(Social_Class) + Sugar_Intake + SFA_Intake',
    long,
    groups=long['ID'],
)
mix_res = mix.fit()
print(mix_res.summary())

### Bayesian Mixed-Effects (Random Intercepts)
Efficient hierarchical model with per-participant intercepts.

In [None]:
# --- Participant index for the random intercepts ---
ids = long['ID'].astype('category')
id_levels = ids.cat.categories.tolist()
id_idx = ids.cat.codes.to_numpy()   # row -> position of its participant in id_levels
N_id = len(id_levels)

# --- Fixed-effect design: z-scored continuous columns, then dummy-coded factors ---
Xc = long[['Time', 'Age', 'Sugar_Intake', 'SFA_Intake']].astype(float)
Xc = Xc.sub(Xc.mean()).div(Xc.std(ddof=0))   # population SD (ddof=0)
Xd = pd.get_dummies(
    long[['Sex', 'Smoking', 'Physical_Activity', 'Social_Class']],
    drop_first=True,
)

X_all = pd.concat([Xc, Xd], axis=1).astype(float)
predictors = X_all.columns.tolist()
X_mat = X_all.to_numpy()
y_bmi = long['BMI'].astype(float).to_numpy()

# Labelled dimensions so posterior arrays carry participant / predictor names.
coords = {
    "obs": np.arange(X_mat.shape[0]),
    "id": id_levels,
    "predictor": predictors,
}

with pm.Model(coords=coords) as hm:
    # Observed data
    X_data = pm.Data("X", X_mat, dims=("obs", "predictor"))
    id_data = pm.Data("id_idx", id_idx, dims=("obs",))
    y_data = pm.Data("y", y_bmi, dims=("obs",))

    # Population-level (fixed) slopes
    Beta = pm.Normal("Beta", 0, 1.5, dims=("predictor",))

    # Participant intercepts, non-centred: a = mu_a + sigma_a * standard-normal offset
    mu_a = pm.Normal("mu_a", 0, 5)
    sigma_a = pm.HalfNormal("sigma_a", 5)
    a_offset = pm.Normal("a_offset", 0, 1, dims=("id",))
    a = pm.Deterministic("a", mu_a + a_offset * sigma_a, dims=("id",))

    # Residual SD
    Sigma = pm.HalfNormal("Sigma", 5)

    # Each row picks up its participant's intercept plus the fixed-effect part.
    mu = a[id_data] + pm.math.dot(X_data, Beta)

    pm.Normal("y_obs", mu=mu, sigma=Sigma, observed=y_data)

    trace_hm = pm.sample(
        draws=1500,
        tune=1000,
        target_accept=0.9,
        random_seed=RANDOM_SEED,
        return_inferencedata=True,
    )

# Fixed effects are already labelled through the 'predictor' coordinate.
az.plot_forest(trace_hm, var_names=["Beta"], combined=True)
plt.title("BMI trajectory: fixed-effect posteriors")
plt.show()

print(az.summary(trace_hm, var_names=["mu_a","sigma_a","Beta","Sigma"], hdi_prob=0.95))


## Prospective CVD (Logistic)
Aggregate BMI over follow-up and predict CVD incidence (teaching-oriented simplification).

In [None]:
# One row per participant: mean BMI across the four waves plus the (constant)
# baseline covariates, then attach the CVD outcome from the wide table.
covariate_cols = ['Age', 'Sex', 'Smoking', 'Physical_Activity',
                  'Social_Class', 'Sugar_Intake', 'SFA_Intake']
agg_spec = {'BMI': 'mean', **{c: 'first' for c in covariate_cols}}
per_person = long.groupby('ID').agg(agg_spec)
agg = per_person.merge(data[['ID', 'CVD_Incidence']], on='ID', how='left')

# Impute any residual missing values (mean for numeric, mode for categorical).
agg_fill = {c: agg[c].mean() for c in ['BMI', 'Age', 'Sugar_Intake', 'SFA_Intake']}
agg_fill.update(
    {c: agg[c].mode().iloc[0] for c in ['Sex', 'Smoking', 'Physical_Activity', 'Social_Class']}
)
agg = agg.fillna(agg_fill)

# Dummy-code, split outcome from predictors, add an intercept column and fit.
logit_df = pd.get_dummies(agg, drop_first=True)
y = logit_df['CVD_Incidence'].values
X = sm.add_constant(logit_df.drop(columns=['CVD_Incidence', 'ID']).astype(float))
logit = sm.Logit(y, X).fit(disp=False)

# Odds ratios with 95% confidence limits (exponentiated log-odds).
logit_ci = logit.conf_int()
or_tab = pd.DataFrame({
    'OR': np.exp(logit.params),
    '2.5%': np.exp(logit_ci[0]),
    '97.5%': np.exp(logit_ci[1]),
}).round(3)
or_tab

### Bayesian Logistic (Prospective CVD)

In [None]:
# Standardised design matrix built from the same frame as the frequentist logit.
Xb = logit_df.drop(columns=['CVD_Incidence','ID']).astype(float)
Xb = (Xb - Xb.mean())/Xb.std()
X_mat = Xb.values
# The outcome is taken from logit_df directly, so this cell does not depend on
# a global `y` that other cells also assign.
y_vec = logit_df['CVD_Incidence'].to_numpy()

# Named dimensions label the coefficients in the posterior, as in the other
# Bayesian cells.
coords = {"obs": np.arange(X_mat.shape[0]), "predictor": Xb.columns.tolist()}

with pm.Model(coords=coords) as blog:
    # pm.Data returns the data variable to use in the model. pm.get_data()
    # loads a packaged example file, so it cannot fetch "X"/"y" from the model,
    # and pm.MutableData is deprecated in favour of pm.Data.
    X_data = pm.Data('X', X_mat, dims=('obs', 'predictor'))
    y_data = pm.Data('y', y_vec, dims=('obs',))

    beta0 = pm.Normal('Intercept', 0, 2)
    beta = pm.Normal('Beta', 0, 1.5, dims=('predictor',))

    logit_p = beta0 + pm.math.dot(X_data, beta)
    pm.Bernoulli('y_obs', logit_p=logit_p, observed=y_data)

    trace_blog = pm.sample(1500, tune=1000, target_accept=0.9, random_seed=RANDOM_SEED, return_inferencedata=True)

# Coefficients are already named through the "predictor" coordinate.
az.plot_forest(trace_blog, var_names=['Beta'], combined=True)
plt.title('Prospective CVD: log-odds posteriors')
plt.show()
az.summary(trace_blog, var_names=['Intercept','Beta'], hdi_prob=0.95)

## Take-aways (from simulated data)
- **Cross-sectional BMI**: increases with sugar, male sex; lower with higher activity; effect sizes align with data-generating process.
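- **CVD risk**: age, smoking, SFA raise risk; activity protective. Cox and Bayesian AFT agree qualitatively — note the mirrored signs: a risk factor has a positive Cox log-hazard coefficient but a negative AFT coefficient, because the AFT model acts on survival time (larger λ means longer survival).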
- **BMI trajectories**: rise with sugar; participant heterogeneity captured via random intercepts.

**Teaching notes**: Replace mean/mode imputation with **multiple imputation** for realism; examine **interactions** (e.g., age×SFA), **non-linearity** (splines), PH diagnostics, and **prior sensitivity**.