# Assessment 1 — Nutritional Epidemiology (Template)

**Task**: Analyse the provided dataset to inform a public-health recommendation.

**Deliverables**
- A clear diagram (DAG/workflow) showing reasoning and adjustment set.
- ~500 words: succinct methods, key results, interpretation, limitations.
- Minimum analyses: defensible Table 1; one primary logistic regression on chosen outcome (unadjusted + adjusted); justification of adjustment set.

> Tip: run all cells top-to-bottom. Use British English. Keep your code and prose clear.

## 0) Setup (works in Colab or locally)

In [None]:
# Robust bootstrap loader (no assumptions about CWD).
# Tries the known locations of _bootstrap.py in order; if none exists, falls
# back to reading the synthetic dataset from its relative path.
import pathlib, runpy, sys, os
candidates = [
    pathlib.Path("notebooks/_bootstrap.py"),
    pathlib.Path("_bootstrap.py"),
    pathlib.Path("../notebooks/_bootstrap.py"),
]
for p in candidates:
    if p.exists():
        print(f"Bootstrapping via: {p}")
        # runpy.run_path executes the script in a fresh namespace and returns
        # that namespace; it does not write into this notebook's globals.
        # Copy across the names that later cells rely on.
        # NOTE(review): assumes _bootstrap.py defines `df` (and optionally
        # `IN_COLAB`) at module level — confirm against the script.
        _bootstrap_ns = runpy.run_path(str(p))
        for _name in ("df", "IN_COLAB"):
            if _name in _bootstrap_ns:
                globals()[_name] = _bootstrap_ns[_name]
        break
else:
    # Minimal fallback: no bootstrap script found, load the dataset directly.
    import pandas as pd
    CSV_REL = "data/synthetic/fb2nep.csv"
    if not os.path.exists(CSV_REL):
        raise FileNotFoundError("Dataset missing and _bootstrap.py not found.")
    df = pd.read_csv(CSV_REL)

# Make sure IN_COLAB exists whichever path was taken above.
globals().setdefault("IN_COLAB", "google.colab" in sys.modules)

# Fail with an actionable message if the bootstrap script did not provide `df`.
if "df" not in globals():
    raise NameError("Bootstrap finished but `df` is not defined; check _bootstrap.py.")

import pandas as pd, numpy as np
print(df.shape, "— dataset ready")

## 1) Choose your analysis
Pick one **primary outcome** and one **primary exposure**. Suggested: `Cancer_incident` (outcome) and `red_meat_g_d` (exposure).

In [None]:
# >>>> EDIT THESE <<<<
OUTCOME = "Cancer_incident"   # or "CVD_incident"
EXPOSURE = "red_meat_g_d"     # e.g. fruit_veg_g_d, salt_g_d, alcohol_units_wk, BMI

# Proposed adjustment set (edit as justified by your DAG)
ADJUST_VARS = [
    "age", "BMI", "IMD_quintile", "SES_class", "sex", "smoking_status"
]

if OUTCOME == EXPOSURE:
    raise ValueError("OUTCOME and EXPOSURE must be different columns.")

# Drop repeated entries while keeping the order in which they were listed.
ADJUST_VARS = list(dict.fromkeys(ADJUST_VARS))

# The exposure/outcome must not also be covariates (e.g. EXPOSURE = "BMI" with
# "BMI" still listed above): the model data would then contain the same column
# twice and the exposure would enter the adjusted model twice.
_overlap = [v for v in ADJUST_VARS if v in (OUTCOME, EXPOSURE)]
if _overlap:
    print("Removed from adjustment set (already exposure/outcome):", _overlap)
    ADJUST_VARS = [v for v in ADJUST_VARS if v not in _overlap]

# Validate every chosen column up front and report all problems at once.
_missing = [v for v in [OUTCOME, EXPOSURE] + ADJUST_VARS if v not in df.columns]
if _missing:
    raise KeyError(f"Column(s) not found in dataset: {_missing}")

print("Outcome:", OUTCOME, "Exposure:", EXPOSURE)
print("Adjustment:", ADJUST_VARS)

## 2) Table 1 (baseline characteristics)
Simple, defensible table — impute *only* for table display (do not use imputed data for models). Provide means/SDs or medians/IQR; counts/percentages for categorical variables.

In [None]:
import pandas as pd

# Table 1 is stratified by the chosen outcome.
by = OUTCOME

# Variables summarised in the table (only those present in the dataset).
cont = ["age","BMI","SBP","energy_kcal","fruit_veg_g_d","red_meat_g_d","ssb_ml_d","fibre_g_d","salt_g_d"]
cont = [c for c in cont if c in df.columns]
cat  = ["sex","smoking_status","physical_activity","IMD_quintile","SES_class"]
cat = [c for c in cat if c in df.columns]

# Columns to median-impute. The stratifying outcome is never imputed (that
# would invent group membership), and variables tabulated as categorical are
# excluded even when they are stored as numbers: a median can fall between two
# codes and create a level that does not exist.
num_cols = [
    c for c in df.select_dtypes(include=["float64","int64"]).columns
    if c not in ["id", by] and c not in cat
]

# Columns to mode-impute: text/categorical columns (dates excluded) plus the
# numerically coded categorical variables listed in `cat`.
cat_cols = [c for c in df.columns if df[c].dtype=="object" or str(df[c].dtype).startswith("category")]
cat_cols = [c for c in cat_cols if c not in ["CVD_date","Cancer_date","baseline_date", by]]
cat_cols += [c for c in cat if c not in cat_cols and c != by]

# Imputed copy used for table display only; `df` itself is left untouched.
df_imp = df.copy()
for c in num_cols:
    df_imp[c] = df_imp[c].fillna(df_imp[c].median())
for c in cat_cols:
    mode = df_imp[c].mode(dropna=True)
    if len(mode):  # an all-missing column has no mode; leave it as it is
        df_imp[c] = df_imp[c].fillna(mode.iloc[0])

# Continuous variables: mean, SD, median and n per outcome group.
# Rows with a missing outcome are left out by groupby's default dropna=True.
blocks = []
for c in cont:
    g = df_imp.groupby(by)[c].agg(["mean","std","median","count"]).round(2)
    g.columns = pd.MultiIndex.from_product([[c], g.columns])
    blocks.append(g)
cont_block = pd.concat(blocks, axis=1) if blocks else pd.DataFrame()

# Categorical variables: counts per level within each outcome group.
cat_blocks = []
for c in cat:
    g = df_imp.groupby([by, c]).size().unstack(fill_value=0)
    g.columns = pd.MultiIndex.from_product([[c], g.columns])
    cat_blocks.append(g)
cat_block = pd.concat(cat_blocks, axis=1) if cat_blocks else pd.DataFrame()

# One row per outcome group; columns are (variable, statistic-or-level).
table1 = pd.concat([cont_block, cat_block], axis=1)
table1.head(10)

### Save Table 1 (CSV) for submission bundle

In [None]:
# Write Table 1 into the submission folder, creating the folder on first use.
import os

_out_dir = "submission"
os.makedirs(_out_dir, exist_ok=True)

table1.to_csv("submission/table1.csv")
print("Saved submission/table1.csv")

## 3) Primary analysis — Logistic regression (unadjusted & adjusted)
Unadjusted: `OUTCOME ~ EXPOSURE`  
Adjusted: `OUTCOME ~ EXPOSURE + ADJUST_VARS` (justify via DAG/epidemiological reasoning).

In [None]:
import statsmodels.api as sm
from patsy import dmatrices


def _or_table(fit):
    """Return odds ratios with their confidence limits from a fitted Logit result."""
    limits = np.exp(fit.conf_int()).rename(columns={0:"2.5%",1:"97.5%"})
    odds = np.exp(fit.params).rename("OR").to_frame()
    return odds.join(limits)


def fmt_term(v):
    """Wrap text/categorical columns in patsy's C(); leave other columns as they are."""
    if model_df[v].dtype=="object" or str(model_df[v].dtype).startswith("category"):
        return f"C({v})"
    return v


# Complete-case data: both models are fitted on exactly the same rows.
model_df = df[[OUTCOME, EXPOSURE] + ADJUST_VARS].dropna().copy()
print("Complete-case rows:", model_df.shape[0])

# Unadjusted model: outcome on exposure alone.
y_u, X_u = dmatrices(f"{OUTCOME} ~ {EXPOSURE}", data=model_df, return_type="dataframe")
fit_u = sm.Logit(y_u, X_u).fit(disp=False)
OR_u = _or_table(fit_u)

# Adjusted model: exposure plus every variable in the adjustment set.
rhs_terms = [fmt_term(v) for v in [EXPOSURE] + ADJUST_VARS]
formula_a = f"{OUTCOME} ~ " + " + ".join(rhs_terms)
y_a, X_a = dmatrices(formula_a, data=model_df, return_type="dataframe")
fit_a = sm.Logit(y_a, X_a).fit(disp=False)
OR_a = _or_table(fit_a)

# Display both tables as the cell output.
(OR_u.round(3), OR_a.round(3))

### Save model outputs for submission bundle

In [None]:
# Persist both odds-ratio tables and the adjusted-model formula.
import os

# Create the folder here as well, so this cell also works when the
# Table 1 save cell has not been run.
os.makedirs("submission", exist_ok=True)

OR_u.round(4).to_csv("submission/or_unadjusted.csv")
OR_a.round(4).to_csv("submission/or_adjusted.csv")
# Explicit encoding keeps the file identical across platforms.
with open("submission/formula.txt", "w", encoding="utf-8") as f:
    f.write(formula_a + "\n")
print("Saved submission/or_*.csv and submission/formula.txt")

## 4) DAG (optional but recommended)
Provide a simple DAG for your question (e.g., `red_meat_g_d → Cancer_incident`).

In [None]:
# Draw the assumed causal diagram: every adjustment variable points at both the
# exposure and the outcome, and the exposure points at the outcome.
# The whole cell is best-effort: any failure (for example networkx or
# matplotlib not being installed) is reported and the notebook carries on.
try:
    import os
    import networkx as nx, matplotlib.pyplot as plt
    G = nx.DiGraph()
    exposure = EXPOSURE
    outcome  = OUTCOME
    # >>> EDIT: propose your confounders here (mirrors ADJUST_VARS)
    # The exposure and outcome are skipped: keeping them would add a self-loop
    # or an outcome -> exposure edge, and the graph would no longer be acyclic.
    conf = [v for v in ADJUST_VARS if v not in (exposure, outcome)]
    for c in conf:
        G.add_edge(c, exposure)
        G.add_edge(c, outcome)
    G.add_edge(exposure, outcome)
    # Fixed seed so the layout is the same on every run.
    pos = nx.spring_layout(G, seed=11088)
    plt.figure(figsize=(6,4))
    nx.draw_networkx(G, pos=pos, node_size=1200, font_size=9)
    plt.axis('off')
    os.makedirs("submission", exist_ok=True)
    plt.tight_layout()
    plt.savefig("submission/dag.png", dpi=200)
    plt.show()
    print("Saved submission/dag.png")
except Exception as e:
    print("DAG skipped:", e)

## 5) 500-word write-up (methods, results, interpretation, limitations)
Write **~500 words**. The cell below checks the length (350–650 words are accepted) and saves `submission/summary.txt`.

In [None]:
# Paste your write-up between the triple quotes; the text below is only a prompt.
TEXT = r'''
Replace this with ~500 words covering:
- **Question** & rationale (exposure, outcome)
- **Data & methods** (cohort; Table 1 approach; model; adjustment set; missingness handling)
- **Results** (OR [95% CI] for exposure; any sensitivity observation)
- **Interpretation** (direction/size; plausibility; comparison to literature if relevant)
- **Limitations** (measurement error, residual confounding, selection, model simplifications)
'''

# Words are counted as whitespace-separated tokens.
_lo, _hi = 350, 650  # relaxed band
wc = len(TEXT.split())
print(f"Word count: {wc}")
assert _lo <= wc <= _hi, "Aim for roughly 500 words (350–650 permitted)."

# Only reached when the length check passes: save the trimmed text.
os.makedirs("submission", exist_ok=True)
_summary = TEXT.strip()+"\n"
with open("submission/summary.txt","w", encoding="utf-8") as f:
    f.write(_summary)
print("Saved submission/summary.txt")

## 6) Export submission bundle (zip)
Creates `submission_<exposure>_to_<outcome>.zip` containing whichever of the following exist: Table 1 CSV, OR tables, formula, DAG PNG, 500-word text, and an HTML export of this notebook (best-effort; skipped if nbconvert is unavailable).

In [None]:
import time, zipfile, pathlib, subprocess, shlex
import os

stem = f"submission_{EXPOSURE}_to_{OUTCOME}"
zip_path = f"{stem}.zip"

# nbconvert writes into this folder, so make sure it exists first.
os.makedirs("submission", exist_ok=True)

# HTML export (best-effort; needs nbconvert and the notebook file on disk).
# The notebook is converted as last saved — save it before running this cell.
# It is deliberately NOT re-executed: executing this notebook from within
# itself would run this export cell again.
try:
    # Optional override: define NOTEBOOK_PATH in an earlier cell to name the file.
    _nb_override = globals().get("NOTEBOOK_PATH")
    if _nb_override:
        _nb_path = pathlib.Path(_nb_override)
    else:
        # NOTE(review): assumes the notebook lives in the current working
        # directory; with zero or several .ipynb files the export is skipped.
        _found = sorted(pathlib.Path(".").glob("*.ipynb"))
        if len(_found) != 1:
            raise RuntimeError(
                "could not identify this notebook's .ipynb file; "
                "set NOTEBOOK_PATH to enable the HTML export."
            )
        _nb_path = _found[0]
    # Argument list (no shell): --output is the base name, nbconvert adds ".html".
    cmd = [
        "jupyter", "nbconvert", "--to", "html",
        "--output", "notebook",
        "--output-dir", "submission",
        str(_nb_path),
    ]
    r = subprocess.run(cmd, check=False)
    if r.returncode != 0:
        raise RuntimeError("nbconvert failed or missing; continuing without HTML export.")
except Exception as e:
    # Deliberately broad: the HTML export must never block the zip creation.
    print("Notebook HTML export skipped:", e)

# Bundle whichever of the expected artefacts were actually produced.
files = [
    "submission/table1.csv",
    "submission/or_unadjusted.csv",
    "submission/or_adjusted.csv",
    "submission/formula.txt",
    "submission/summary.txt",
    "submission/notebook.html",
    "submission/dag.png",
]
files = [f for f in files if pathlib.Path(f).exists()]

with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
    for f in files:
        z.write(f)
print("Created:", zip_path)
zip_path