# FB2NEP — Data Analysis Notebook (Assessment)

This notebook supports the **FB2NEP assessment**. Complete the **Data mapping** cell and use the provided code to answer the question paper. The notebook allows:
- Describing the study population (Table 1) with comparisons by factors like sex, deprivation, or disease incidence.
- Comparing groups with/without cancer or CVD.
- Assessing whether data are missing at random.
- Analysing associations between nutrient intake and blood pressure (BP) or disease (via logistic or Cox regression).
- Exploring different models and data transformations.
- Drawing conclusions based on results.

Set `ADD_JITTER` to `True` to add random noise to continuous variables (for varied results). If running on Google Colab, the repository will be cloned automatically.


In [None]:
# Imports
import os, sys, math, json, textwrap, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from lifelines import CoxPHFitter  # For Cox regression

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings emitted by the statistics libraries
# (e.g. during model fitting) — confirm that is intended.
warnings.filterwarnings("ignore")

# Display settings: show up to 120 columns and allow 200-character-wide output
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 200)

# Print library versions so results can be tied to an environment
print("Versions:")
print("pandas", pd.__version__)
print("statsmodels", sm.__version__)
print("numpy", np.__version__)

# Detect if running in Google Colab: a successful import of google.colab is
# used as the signal (presumably the package is only available on Colab).
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Option to add jitter to continuous variables
ADD_JITTER = False  # Set to True to add random noise
JITTER_SCALE = 0.05  # Standard deviation of noise as proportion of variable std


## Data mapping (REQUIRED)

Assign the correct column names from `fb2nep.csv` to the variables below. Run the next cell to list columns if unsure.


In [None]:
# Clone repository if in Colab
if IN_COLAB:
    !git clone https://github.com/ggkuhnle/fb2nep-epi.git
    %cd fb2nep-epi

# Imported here rather than in the imports cell; presumably because on Colab the
# `scripts` package only becomes importable after the clone/cd above.
from scripts.bootstrap import init
df, ctx = init()

# Add jitter to continuous variables if enabled
# NOTE(review): noise is added to every column selected by
# select_dtypes(np.number), which would include numeric identifier or 0/1
# columns if the dataset has any, and np.random is not seeded, so each run
# gives different values.
# NOTE(review): the "Load and preprocess data" cell later reassigns `df` from
# load_fb2nep(), so changes made to this frame do not carry over to it.
if ADD_JITTER:
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        # Noise SD is JITTER_SCALE times the column's own standard deviation
        noise = np.random.normal(0, df[col].std() * JITTER_SCALE, size=len(df))
        df[col] = df[col] + noise
    print("Jitter added to numeric columns")

print(df.shape, "— dataset ready")
df.head()


In [None]:
# === EDIT THIS CELL ===
# Map dataset columns here (strings). Use exact column names from fb2nep.csv.
# validate_mapping() (run in a later cell) raises ValueError while any value
# still starts with "<", and KeyError for a value that is not a dataset column.

MAPPING = {
    # Primary outcome (binary recommended: 0/1). Example: "CVD_Incidence"
    "outcome": "<OUTCOME_COL>",
    # Time-to-event for Cox regression (if applicable). Example: "time_to_event"
    "time": "<TIME_COL>",
    # Primary exposure: biomarker variable (continuous). Example: "flavanol_biomarker"
    "exposure_biomarker": "<BIOMARKER_COL>",
    # Secondary exposure: diet diary (DD) variable (continuous). Example: "flavanol_dd"
    "exposure_dd": "<DD_COL>",
    # Blood pressure (continuous). Example: "systolic_bp"
    "bp": "<BP_COL>",
    # Demographics/covariates
    "id": "<ID_COL>",                 # e.g., "ID" or "participant_id"
    "age": "<AGE_COL>",               # e.g., "age"
    "sex": "<SEX_COL>",               # e.g., "sex" coded as 0/1 or 'M'/'F'
    "bmi": "<BMI_COL>",               # e.g., "BMI"
    "smoking": "<SMOKING_COL>",       # e.g., 'never','former','current'
    "ses": "<SES_COL>",               # socioeconomic status
    # Add other confounders as needed:
    # "physical_activity": "<PA_COL>",
    # "energy_intake": "<ENERGY_COL>",
}

# Candidate confounders for models (MAPPING keys, not column names)
CANDIDATE_CONFOUNDERS = ["age", "sex", "bmi", "smoking", "ses"]

# Outcome type
OUTCOME_IS_BINARY = True  # Set to False if outcome is continuous
USE_COX = False  # Set to True for Cox regression (requires 'time' in MAPPING)

# Data transformation options
# When set, the transform is applied to the exposure_biomarker, exposure_dd and
# bp columns in the load cell; 'log' is implemented there as np.log1p.
TRANSFORM = None  # Options: None, 'log', 'sqrt', or lambda x: <custom>


## Load and preprocess data

In [None]:
# Load fb2nep.csv
def load_fb2nep():
    """Read fb2nep.csv from the first candidate location that exists.

    The locations are tried in this order: ./data, the working directory,
    then ../data.

    Returns:
        pandas.DataFrame parsed from the CSV file.

    Raises:
        FileNotFoundError: if the file is in none of the candidate locations.
    """
    candidates = ("./data/fb2nep.csv", "./fb2nep.csv", "../data/fb2nep.csv")
    # next() stops at the first existing path, preserving the search order
    found = next((Path(c) for c in candidates if Path(c).exists()), None)
    if found is None:
        raise FileNotFoundError("fb2nep.csv not found. Place it in ./data, ./, or ../data")
    return pd.read_csv(found)

df = load_fb2nep()

# Reloading from CSV replaces the frame produced by the bootstrap cell, including
# any jitter added to it, so ADD_JITTER is honoured here on the frame that the
# rest of the notebook actually analyses.
if ADD_JITTER:
    # Only the mapped variables documented as continuous are perturbed: noise on
    # an identifier or a 0/1 outcome would invalidate the logistic/Cox models.
    jitter_cols = dict.fromkeys(
        MAPPING.get(key) for key in ("exposure_biomarker", "exposure_dd", "bp", "age", "bmi")
    )
    for col in jitter_cols:
        # Unmapped keys (None) and unfilled placeholders are not dataset columns
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            # Noise SD is JITTER_SCALE times the column's own standard deviation
            noise = np.random.normal(0, df[col].std() * JITTER_SCALE, size=len(df))
            df[col] = df[col] + noise
    print("Jitter added to mapped continuous columns")

# Apply transformations
def apply_transform(data, col, transform):
    """Return column `col` of `data` after applying `transform`.

    Args:
        data: DataFrame holding the column.
        col: Name of the column to transform.
        transform: None (values returned unchanged), 'log' (np.log1p, i.e.
            log(1 + x), so zeros stay finite), 'sqrt' (np.sqrt), or a callable
            that receives the whole Series.

    Returns:
        The transformed Series. This function does not assign into `data`.

    Raises:
        ValueError: if `transform` is not one of the supported options.
            Returning the raw values instead would let the caller report a
            transformation that never happened.
    """
    values = data[col]
    if transform is None:
        return values
    if callable(transform):
        return transform(values)
    if transform == 'log':
        return np.log1p(values)
    if transform == 'sqrt':
        return np.sqrt(values)
    raise ValueError(
        f"Unsupported transform {transform!r}; use None, 'log', 'sqrt', or a callable."
    )

# When a transform is configured, overwrite the exposure and blood-pressure
# columns with their transformed values (keys that are not mapped to a dataset
# column are skipped).
if TRANSFORM:
    for mapping_key in ('exposure_biomarker', 'exposure_dd', 'bp'):
        target = MAPPING.get(mapping_key)
        if target not in df.columns:
            continue
        df[target] = apply_transform(df, target, TRANSFORM)
        print(f"Applied {TRANSFORM} transformation to {target}")

print(df.shape)
df.head()


In [None]:
# Validate mapping and coerce types
def validate_mapping(df, mapping):
    """Check that `mapping` is filled in and refers to columns of `df`.

    Args:
        df: The loaded dataset.
        mapping: Dict of logical name -> dataset column name.

    Returns:
        True when every check passes.

    Raises:
        ValueError: if any value is still a "<...>" placeholder, or if the
            module-level USE_COX flag is set and 'time' is unmapped or not a
            column of `df`.
        KeyError: if a non-empty mapped value is not a column of `df`.
    """
    # Placeholders are reported together so they can all be fixed in one pass
    unfilled = []
    for key, column in mapping.items():
        if isinstance(column, str) and column.startswith("<"):
            unfilled.append(key)
    if unfilled:
        raise ValueError(f"Please fill in MAPPING for: {unfilled}")

    for key, column in mapping.items():
        if not column:
            # Empty/None entries are treated as "not mapped" rather than as errors
            continue
        if column not in df.columns:
            raise KeyError(f"MAPPING[{key}] refers to '{column}', which is not a column in the dataset.")

    if USE_COX:
        time_col = mapping.get("time")
        if not time_col or time_col not in df.columns:
            raise ValueError("Cox regression requires 'time' column in MAPPING.")
    return True

validate_mapping(df, MAPPING)

def coerce_types(d, mapping, outcome_is_binary=True):
    """Return a copy of `d` with the sex and outcome columns coded numerically.

    Args:
        d: Input DataFrame; it is copied, not modified.
        mapping: Dict with optional 'sex' and 'outcome' entries naming columns.
        outcome_is_binary: When True, the outcome column is converted to numbers.

    Returns:
        A new DataFrame in which:
        * a text sex column is coded from its first letter (M -> 0, F -> 1,
          anything else -> NaN); numeric sex columns are left untouched;
        * a text outcome column accepts "no"/"yes" (any case, surrounding
          whitespace ignored) as 0/1 and also numeric strings such as "0"/"1";
          anything else becomes NaN. A non-text outcome goes through
          pd.to_numeric(errors="coerce").
    """
    def _is_text(series):
        # Covers classic object columns as well as pandas' dedicated string dtype
        return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)

    d = d.copy()

    sex_col = mapping.get("sex")
    if sex_col in d and _is_text(d[sex_col]):
        initials = d[sex_col].astype(str).str.strip().str[0].str.upper()
        d[sex_col] = initials.map({"M": 0, "F": 1})

    outcome_col = mapping.get("outcome")
    if outcome_is_binary and outcome_col in d:
        raw = d[outcome_col]
        if _is_text(raw):
            text = raw.astype(str).str.strip().str.lower()
            from_labels = pd.to_numeric(text.map({"no": 0, "yes": 1}), errors="coerce")
            # Values such as "1"/"0" are not yes/no labels; parse them as numbers
            # instead of letting the label map turn them into NaN.
            from_digits = pd.to_numeric(text, errors="coerce")
            d[outcome_col] = from_labels.fillna(from_digits)
        else:
            d[outcome_col] = pd.to_numeric(raw, errors="coerce")
    return d

# coerce_types works on a copy, so `df` is rebound to the coerced result;
# the preview shows the recoded columns.
df = coerce_types(df, MAPPING, OUTCOME_IS_BINARY)
df.head()


## Table 1: Describe study population

In [None]:
# Summarise variables
def summarise_series(s):
    """Summarise one column for a descriptive table.

    Args:
        s: pandas Series.

    Returns:
        For numeric data, a Series with n (non-missing count), mean, sd,
        median and iqr (75th minus 25th percentile). For any other dtype, a
        Series with one "<level> (n,%)" entry per distinct value (missing
        values counted as their own level), formatted as "count (percent%)"
        of all rows.
    """
    if not pd.api.types.is_numeric_dtype(s):
        n_rows = len(s)
        labelled = {}
        for level, count in s.value_counts(dropna=False).items():
            labelled[f"{level} (n,%)"] = f"{count} ({count/n_rows*100:.1f}%)"
        return pd.Series(labelled)

    lower_quartile = s.quantile(0.25)
    upper_quartile = s.quantile(0.75)
    numeric_summary = {
        "n": s.notna().sum(),
        "mean": s.mean(),
        "sd": s.std(),
        "median": s.median(),
        "iqr": upper_quartile - lower_quartile,
    }
    return pd.Series(numeric_summary)

def table1(df, cols, by=None):
    """Build a descriptive table for `cols`, optionally split by a grouping column.

    Args:
        df: Dataset.
        cols: Column names to summarise (each via summarise_series).
        by: Grouping column. If None or not a column of `df`, a single
            ungrouped table is returned.

    Returns:
        DataFrame with one column per summarised variable. When grouped, the
        columns carry a two-level index whose first level is "<by>=<level>";
        rows with a missing grouping value form their own group.
    """
    def _summary_frame(frame):
        # One column per variable; rows are the union of the summary labels
        return pd.concat({c: summarise_series(frame[c]) for c in cols}, axis=1)

    if by is None or by not in df.columns:
        return _summary_frame(df)

    blocks = []
    for level, subset in df.groupby(by, dropna=False):
        block = _summary_frame(subset)
        block.columns = pd.MultiIndex.from_product([[f"{by}={level}"], block.columns])
        blocks.append(block)
    return pd.concat(blocks, axis=1)

# Columns for Table 1: the mapped demographic, exposure and BP variables that
# actually exist in the dataset, in this fixed order.
cols = [
    MAPPING[key]
    for key in ("age", "sex", "bmi", "smoking", "ses", "exposure_biomarker", "exposure_dd", "bp")
    if MAPPING.get(key) in df.columns
]

# Grouping factor, given as a MAPPING key (e.g. 'sex', 'ses', 'outcome')
COMPARE_BY = "sex"  # Change to MAPPING key (e.g., 'ses', 'outcome') to compare groups
tbl1 = table1(df, cols, by=MAPPING.get(COMPARE_BY))
tbl1


## Missingness audit: Are data missing at random?

In [None]:
# Missingness summary
def missingness_summary(d):
    """Tabulate missing values per column, most-missing first.

    Args:
        d: DataFrame to audit.

    Returns:
        DataFrame indexed by column name with missing_prop (fraction missing),
        missing_% (percentage, one decimal), n_missing (count) and n (total
        number of rows).
    """
    is_missing = d.isna()
    proportion = is_missing.mean().sort_values(ascending=False)
    summary = pd.DataFrame({
        "missing_prop": proportion,
        "missing_%": (proportion * 100).round(1),
    })
    # Counts are aligned on the column-name index, so the sort order is kept
    return summary.assign(n_missing=is_missing.sum(), n=len(d))

miss = missingness_summary(df)
# A bare expression is only rendered when it is the last statement of a cell;
# more code follows here, so the table is displayed explicitly.
display(miss.head(20))

# Test for missing at random (example: compare missingness of exposure_biomarker by outcome)
def test_mar(df, var, group_by):
    if var not in df.columns or group_by not in df.columns:
        return "Invalid column names"
    miss = df[var].isna()
    if df[group_by].dtype == object or pd.api.types.is_categorical_dtype(df[group_by]):
        contingency = pd.crosstab(miss, df[group_by])
        chi2, p = stats.chi2_contingency(contingency)[:2]
        return {"chi2": chi2, "p_value": p, "contingency": contingency}
    else:
        miss_val = df.loc[miss, group_by]
        not_miss_val = df.loc[~miss, group_by]
        t_stat, p = stats.ttest_ind(miss_val.dropna(), not_miss_val.dropna(), equal_var=False)
        return {"t_stat": t_stat, "p_value": p}

# Example run: is missingness of the biomarker exposure related to the outcome?
# The result (a dict of statistics, or an error string for invalid columns)
# is printed as returned.
mar_test = test_mar(df, MAPPING.get("exposure_biomarker"), MAPPING.get("outcome"))
print("MAR test (exposure_biomarker by outcome):")
print(mar_test)


In [None]:
# Visualise missingness matrix
def plot_missingness_matrix(d, max_cols=30):
    """Draw a rows-by-columns image of where values are missing.

    Args:
        d: DataFrame to visualise.
        max_cols: Maximum number of columns drawn; wider frames are cut to
            their first `max_cols` columns and a note is printed.
    """
    shown = d.copy()
    too_wide = shown.shape[1] > max_cols
    if too_wide:
        shown = shown.iloc[:, :max_cols]
        print(f"(Showing first {max_cols} columns)")

    missing_mask = shown.isna()
    plt.figure(figsize=(10, 6))
    # 'nearest' keeps each cell a crisp block instead of blending neighbours
    plt.imshow(missing_mask, aspect='auto', interpolation='nearest')
    plt.title("Missingness matrix")
    plt.xlabel("Columns (subset if large)")
    plt.ylabel("Rows")
    plt.show()

plot_missingness_matrix(df)


## Biomarker vs Diet Diary (DD) comparison

In [None]:
# Column names of the two exposure measurements being compared
biom = MAPPING["exposure_biomarker"]
dd = MAPPING["exposure_dd"]

# Cast to float for plotting; missing values are kept here
biom_valid = df[biom].astype(float)
dd_valid = df[dd].astype(float)

# Scatter & correlation
plt.figure(figsize=(6, 5))
plt.scatter(dd_valid, biom_valid, alpha=0.6)
plt.xlabel(f"{dd}")
plt.ylabel(f"{biom}")
plt.title("Biomarker vs DD: scatter")
plt.show()

# Correlation uses only rows where both measurements are present;
# `valid` is reused below for the Bland–Altman analysis.
valid = df[[biom, dd]].dropna()
r = valid[biom].corr(valid[dd])
print(f"Pearson r = {r:.3f} (n={len(valid)})")

# Bland–Altman
def bland_altman(a, b):
    """Compute the quantities needed for a Bland–Altman plot.

    Args:
        a, b: Paired measurements (array-likes of equal length).

    Returns:
        Tuple (mean, diff, mdiff, loa): per-pair mean of a and b, per-pair
        difference a - b, the mean difference, and a (lower, upper) tuple at
        mean difference ± 1.96 sample standard deviations of the differences.
    """
    first = np.asarray(a)
    second = np.asarray(b)
    diff = first - second
    mean = (first + second) / 2
    bias = diff.mean()
    # ddof=1: sample (n - 1) standard deviation
    half_width = 1.96 * diff.std(ddof=1)
    return mean, diff, bias, (bias - half_width, bias + half_width)

# Bland–Altman statistics on the complete pairs held in `valid`
mean_ab, diff_ab, mdiff, loa = bland_altman(valid[biom], valid[dd])
plt.figure(figsize=(6, 5))
plt.scatter(mean_ab, diff_ab, alpha=0.6)
# Horizontal reference lines: mean difference and the two limits of agreement
plt.axhline(mdiff)
plt.axhline(loa[0])
plt.axhline(loa[1])
plt.xlabel("Mean of biomarker and DD")
plt.ylabel("Difference (biomarker − DD)")
plt.title("Bland–Altman plot")
plt.show()

print(f"Mean difference: {mdiff:.3f}; 95% LoA: [{loa[0]:.3f}, {loa[1]:.3f}]")


## Association: Nutrient intake and blood pressure

In [None]:
# Linear regression: nutrient intake vs BP
def fit_linear_model(df, outcome, exposure, covars=None):
    """Fit an ordinary least squares model of `outcome` on `exposure` (+ covariates).

    Args:
        df: Dataset.
        outcome: Outcome column name.
        exposure: Exposure column name.
        covars: Optional list of covariate column names.

    Returns:
        (fitted statsmodels results, formula string). The model is fitted on
        rows that are complete for the outcome, exposure and all covariates.
    """
    covars = covars or []
    terms = [f"{exposure}"] + [f"{c}" for c in covars]
    formula = f"{outcome} ~ " + " + ".join(terms)
    complete_cases = df[[outcome, exposure] + covars].dropna()
    fitted = smf.ols(formula, data=complete_cases).fit()
    return fitted, formula

bp = MAPPING.get("bp")
exposure = MAPPING.get("exposure_biomarker")
# Membership is tested for each column separately: `bp and exposure in df.columns`
# would only check bp for truthiness, not that it is a dataset column.
if bp in df.columns and exposure in df.columns:
    # Minimal model
    bp_model, bp_formula = fit_linear_model(df, bp, exposure)
    print(f"Minimal model: {bp_formula}")
    print(bp_model.summary())

    # Adjusted model: all candidate confounders that are mapped to existing columns
    prespec_covars = [MAPPING[k] for k in CANDIDATE_CONFOUNDERS if MAPPING.get(k) in df.columns]
    bp_adj_model, bp_adj_formula = fit_linear_model(df, bp, exposure, prespec_covars)
    print(f"Adjusted model: {bp_adj_formula}")
    print(bp_adj_model.summary())


## Association: Nutrient intake and disease

In [None]:
# Logistic or Cox regression
def make_formula(outcome, exposure, covars=None, binary=True):
    """Build an "outcome ~ exposure + cov1 + ..." formula string.

    Args:
        outcome: Outcome column name.
        exposure: Exposure column name (always the first right-hand term).
        covars: Optional sequence of covariate names appended after the exposure.
        binary: Accepted for call-site symmetry; it does not affect the result.

    Returns:
        The formula string.
    """
    if covars is None or len(covars) == 0:
        return f"{outcome} ~ {exposure}"
    adjustment = " + ".join(covars)
    return f"{outcome} ~ {exposure} + {adjustment}"

def fit_model(df, formula, covars=None, binary=True, use_cox=False):
    """Fit a Cox, logistic or linear model on complete cases.

    Args:
        df: Dataset.
        formula: "outcome ~ term1 + term2 ..." string (as built by make_formula).
        covars: Optional list of covariate columns; merged with the formula's
            right-hand terms when deciding which rows are complete.
        binary: If True (and not Cox) fit a logistic model, otherwise OLS.
        use_cox: If True fit a Cox proportional hazards model, using the
            formula's outcome as the event indicator, its right-hand terms
            (plus `covars`) as predictors and MAPPING["time"] as the duration.

    Returns:
        (fitted model, DataFrame the model was fitted on). For Cox models the
        frame holds only the event, duration and predictor columns, with
        non-numeric predictors replaced by 0/1 indicator columns.
    """
    lhs, rhs = formula.split("~", 1)
    outcome_col = lhs.strip()
    # Terms must be stripped: "y ~ x + z" otherwise yields " x" (leading space),
    # which is not a column and makes dropna(subset=...) raise KeyError.
    terms = [term.strip() for term in rhs.split("+") if term.strip()]
    predictors = list(dict.fromkeys(terms + list(covars or [])))

    required = [outcome_col] + predictors
    if use_cox:
        time_col = MAPPING["time"]
        required.append(time_col)
    # Only real columns can be used for complete-case filtering; anything else
    # in the formula is left to the model's own missing-value handling.
    complete_case_cols = [c for c in dict.fromkeys(required) if c in df.columns]
    d = df.dropna(subset=complete_case_cols)

    if use_cox:
        d = d[[outcome_col, time_col] + predictors]
        # The Cox fitter is given a purely numeric frame, so text predictors
        # (e.g. smoking categories) are dummy-coded with a dropped reference level.
        text_cols = [c for c in predictors if not pd.api.types.is_numeric_dtype(d[c])]
        if text_cols:
            d = pd.get_dummies(d, columns=text_cols, drop_first=True, dtype=float)
        model = CoxPHFitter()
        model.fit(d, duration_col=time_col, event_col=outcome_col)
        return model, d
    if binary:
        model = smf.logit(formula, data=d).fit(disp=False)
        return model, d
    model = smf.ols(formula, data=d).fit()
    return model, d

# Outcome and primary exposure for the disease models; both names are reused
# by the change-in-estimate and model-exploration cells.
outcome = MAPPING["outcome"]
exposure = MAPPING["exposure_biomarker"]

# Minimal model
min_formula = make_formula(outcome, exposure, covars=[], binary=OUTCOME_IS_BINARY)
min_model, min_data = fit_model(df, min_formula, binary=OUTCOME_IS_BINARY, use_cox=USE_COX)
print(f"Minimal model: {min_formula}")
# The Cox fitter prints its own summary; statsmodels results return one to print
if USE_COX:
    min_model.print_summary()
else:
    print(min_model.summary())

# Adjusted model: all candidate confounders that are mapped to existing columns
prespec_covars = [MAPPING[k] for k in CANDIDATE_CONFOUNDERS if MAPPING.get(k) in df.columns]
adj_formula = make_formula(outcome, exposure, covars=prespec_covars, binary=OUTCOME_IS_BINARY)
adj_model, adj_data = fit_model(df, adj_formula, covars=prespec_covars, binary=OUTCOME_IS_BINARY, use_cox=USE_COX)
print(f"Adjusted model: {adj_formula}")
if USE_COX:
    adj_model.print_summary()
else:
    print(adj_model.summary())


### Change-in-estimate (≥10%) procedure

In [None]:
def get_effect(model, exposure, binary=True, use_cox=False):
    """Extract the exposure effect and a 95% Wald interval from a fitted model.

    Args:
        model: Fitted model. Cox models are read through `params_` and
            `standard_errors_`; other models through `params` and `bse`.
        exposure: Name of the exposure coefficient.
        binary: For non-Cox models, report an odds ratio (True) or the raw
            coefficient (False).
        use_cox: Report a hazard ratio from a Cox model.

    Returns:
        Dict with "effect", "lo", "hi" and "scale" ("HR", "OR" or "beta").
        HR and OR are exponentiated coefficients; the interval is
        coefficient ± 1.96 standard errors, exponentiated on ratio scales.
    """
    if use_cox:
        coef = model.params_[exposure]
        se = model.standard_errors_[exposure]
        scale = "HR"
    else:
        coef = model.params[exposure]
        se = model.bse[exposure]
        scale = "OR" if binary else "beta"

    lower = coef - 1.96*se
    upper = coef + 1.96*se
    if scale == "beta":
        return {"effect": coef, "lo": lower, "hi": upper, "scale": scale}
    # Ratio scales: back-transform the estimate and both interval bounds
    return {"effect": np.exp(coef), "lo": np.exp(lower), "hi": np.exp(upper), "scale": scale}

# Effect from the minimal (exposure-only) model is the reference for every comparison
base_eff = get_effect(min_model, exposure, OUTCOME_IS_BINARY, USE_COX)["effect"]

results = []
for k in CANDIDATE_CONFOUNDERS:
    cov = MAPPING.get(k)
    # Skip candidates that are unmapped or absent from the dataset
    if not cov or cov not in df.columns:
        continue
    # Refit with exactly one candidate added to the exposure
    f = make_formula(outcome, exposure, covars=[cov], binary=OUTCOME_IS_BINARY)
    m, _ = fit_model(df, f, covars=[cov], binary=OUTCOME_IS_BINARY, use_cox=USE_COX)
    eff = get_effect(m, exposure, OUTCOME_IS_BINARY, USE_COX)["effect"]
    # Relative change on the scale get_effect reports (HR, OR or beta);
    # left as NaN when the reference effect is exactly 0
    change = 100 * (eff - base_eff) / base_eff if base_eff != 0 else np.nan
    results.append({"added": cov, "effect": eff, "% change vs minimal": change})

# Largest absolute change first
cei = pd.DataFrame(results).sort_values("% change vs minimal", key=lambda s: s.abs(), ascending=False)
cei


## Diagnostics

In [None]:
# Standardised residuals
def standardised_residuals(model, use_cox=False, training_data=None):
    """Return residuals suited to the fitted model type.

    Args:
        model: Fitted model (Cox fitter, logistic or OLS results).
        use_cox: True when `model` is a Cox model.
        training_data: Frame the Cox model was fitted on. Required for Cox
            models, whose residuals are computed from the data on request
            rather than stored on the fitted object; ignored otherwise.

    Returns:
        pandas Series: martingale residuals for Cox models, Pearson residuals
        when the module-level OUTCOME_IS_BINARY flag is set, otherwise
        internally studentised residuals.

    Raises:
        ValueError: if `use_cox` is True and `training_data` is not given.
    """
    if use_cox:
        if training_data is None:
            raise ValueError("Cox residuals require training_data (the frame the model was fitted on).")
        # compute_residuals returns a frame; the residuals sit in its "martingale" column
        martingale = model.compute_residuals(training_data, kind="martingale")["martingale"]
        return pd.Series(martingale, name="martingale_resid")
    elif OUTCOME_IS_BINARY:
        return pd.Series(model.resid_pearson, name="std_resid")
    else:
        return pd.Series(model.get_influence().resid_studentized_internal, name="std_resid")

# adj_data is the complete-case frame returned alongside adj_model by fit_model
resid = standardised_residuals(adj_model, USE_COX, training_data=adj_data)
plt.figure(figsize=(6, 4))
plt.plot(resid.values, marker='o', linestyle='none', alpha=0.6)
# Reference lines at ±3 to flag unusually large residuals
plt.axhline(3, color='red', linestyle='--')
plt.axhline(-3, color='red', linestyle='--')
plt.title("Residuals (approx.)")
plt.xlabel("Observation (index in complete-case sample)")
plt.ylabel("Residual")
plt.show()


## Model exploration helpers

In [None]:
# Run custom models
def run_model_with_covars(outcome, exposure, covars, use_cox=False, binary=None):
    """Fit and print a model for an arbitrary outcome/exposure/covariate set.

    Args:
        outcome: Outcome column name.
        exposure: Exposure column name.
        covars: List of covariate column names (may be empty).
        use_cox: Fit a Cox model instead of a logistic/linear one.
        binary: True for logistic regression, False for linear regression.
            None (default) falls back to the notebook-wide OUTCOME_IS_BINARY,
            which describes the primary outcome only — pass False explicitly
            for a continuous outcome such as blood pressure.
    """
    if binary is None:
        binary = OUTCOME_IS_BINARY
    f = make_formula(outcome, exposure, covars=covars, binary=binary)
    m, _ = fit_model(df, f, covars=covars, binary=binary, use_cox=use_cox)
    print(f"Formula: {f}")
    if use_cox:
        m.print_summary()
    else:
        print(m.summary())

# Example: try different exposure or outcome
print("Use run_model_with_covars(outcome, exposure, covars, use_cox=False, binary=None) to experiment.")
print("Example: run_model_with_covars(MAPPING['bp'], MAPPING['exposure_dd'], prespec_covars, binary=False)")
print("Set use_cox=True for Cox regression (requires 'time' in MAPPING).")


## Save mapping snapshot

In [None]:
# Record the analysis settings so a set of results can be traced back to them
snapshot = dict(
    mapping=MAPPING,
    candidates=CANDIDATE_CONFOUNDERS,
    binary_outcome=OUTCOME_IS_BINARY,
    use_cox=USE_COX,
    transform=str(TRANSFORM),  # stringified so a callable transform stays JSON-serialisable
    jitter=ADD_JITTER,
)

Path("./artifacts").mkdir(parents=True, exist_ok=True)
with open("./artifacts/data_mapping_snapshot.json", "w") as f:
    f.write(json.dumps(snapshot, indent=2))
print("Saved ./artifacts/data_mapping_snapshot.json")
