# FB2NEP — Data Analysis Notebook (Assessment)

This notebook supports the **FB2NEP assessment**. Fill in the **Data mapping** cell before running the analysis.


In [None]:

# Imports
import os, sys, math, json, textwrap, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any warning a library emits while the
# models below are being fitted is hidden as well — consider narrowing it.
warnings.filterwarnings("ignore")

# Display settings: allow wide frames to print without being truncated
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 200)

# Record the library versions used for this run
print("Versions:")
print("pandas", pd.__version__)
print("statsmodels", sm.__version__)
print("numpy", np.__version__)


## Data mapping (REQUIRED)

Assign the correct column names from `fb2nep.csv` to the variables below.  
If unsure, run the next cell to list columns.


In [None]:

# List columns helper (run this first if needed)
from pathlib import Path
# Locations searched for the dataset, in priority order.
DATA_PATHS = [
    Path("./data/fb2nep.csv"),
    Path("./fb2nep.csv"),
    Path("../data/fb2nep.csv"),
]

# Take the first candidate that exists on disk (None when there is no match).
found_path = next((candidate for candidate in DATA_PATHS if candidate.exists()), None)

if found_path is None:
    print("fb2nep.csv not found in ./data, ./, or ../data — please place it accordingly.")
else:
    # Only a handful of rows are read: the column names are all that is needed here.
    df_head = pd.read_csv(found_path, nrows=5)
    print(f"Found: {found_path} — columns:")
    print(list(df_head.columns))


In [None]:

# === EDIT THIS CELL ===
# Map dataset columns here (strings). Use exact column names from fb2nep.csv.

# Maps an analysis role (key) to the dataset column that fills it (value).
# validate_mapping() rejects any value still written as "<...>", and skips values
# that are None/empty — so set a key to None if the dataset has no such column.
MAPPING = {
    # Primary outcome (binary recommended: 0/1). Example: "CVD_Incidence"
    "outcome": "<OUTCOME_COL>",

    # Primary exposure: biomarker variable (continuous). Example: "flavanol_biomarker"
    "exposure_biomarker": "<BIOMARKER_COL>",

    # Secondary exposure: diet diary (DD) variable (continuous). Example: "flavanol_dd"
    "exposure_dd": "<DD_COL>",

    # Demographics/covariates (edit as applicable)
    "id": "<ID_COL>",                 # e.g., "ID" or "participant_id"; use None if there is no ID column
    "age": "<AGE_COL>",               # e.g., "age"
    "sex": "<SEX_COL>",               # e.g., "sex" coded as 0/1 or 'M'/'F' (text is recoded M -> 0, F -> 1)
    "bmi": "<BMI_COL>",               # e.g., "BMI"
    "smoking": "<SMOKING_COL>",       # e.g., 'never','former','current' (categorical)
    "ses": "<SES_COL>",               # socioeconomic status (categorical or continuous)
    # Add other candidate confounders as needed:
    # "physical_activity": "<PA_COL>",
    # "energy_intake": "<ENERGY_COL>",
}

# Candidate confounders used in change-in-estimate search (a list of keys from MAPPING).
# Keys whose mapped column is absent from the dataset are skipped by the analysis cells.
CANDIDATE_CONFOUNDERS = ["age", "sex", "bmi", "smoking", "ses"]

# Outcome scale: True fits a logistic model and reports odds ratios;
# False fits ordinary least squares and reports the regression coefficient.
OUTCOME_IS_BINARY = True  # set to False if outcome is continuous


## Load data

In [None]:

# Load fb2nep.csv
def load_fb2nep():
    """Read fb2nep.csv from the first entry of DATA_PATHS that exists.

    Returns:
        pandas.DataFrame with the full contents of the file.

    Raises:
        FileNotFoundError: if none of the candidate paths exists.
    """
    first_hit = next((candidate for candidate in DATA_PATHS if candidate.exists()), None)
    if first_hit is None:
        raise FileNotFoundError("fb2nep.csv not found. Place it in ./data, ./, or ../data")
    return pd.read_csv(first_hit)

# Load the dataset into `df`, then show its (rows, columns) shape and first rows.
df = load_fb2nep()
print(df.shape)
df.head()


In [None]:

# Utilities to validate mapping and coerce variable types
def validate_mapping(df, mapping):
    """Check that `mapping` is filled in and only names columns present in `df`.

    Args:
        df: DataFrame whose `.columns` the mapped names are checked against.
        mapping: dict of role -> column name. Falsy values (None, "") are skipped.

    Returns:
        True when every check passes.

    Raises:
        ValueError: if any value is still a "<...>" placeholder string.
        KeyError: if a non-empty value is not a column of `df`.
    """
    # Pass 1: collect every key that still carries its template placeholder.
    unfilled = []
    for key, column in mapping.items():
        if isinstance(column, str) and column.startswith("<"):
            unfilled.append(key)
    if unfilled:
        raise ValueError(f"Please fill in MAPPING for: {unfilled}")

    # Pass 2: every remaining non-empty value must be a real column.
    for key, column in mapping.items():
        if not column:
            continue
        if column not in df.columns:
            raise KeyError(f"MAPPING[{key}] refers to '{column}', which is not a column in the dataset.")
    return True

# Fail fast: raises if MAPPING still has "<...>" placeholders or names a column not in df.
validate_mapping(df, MAPPING)

# Coerce common types
def _is_text_like(s):
    """Return True when Series `s` has object dtype or a pandas string dtype."""
    return pd.api.types.is_object_dtype(s) or pd.api.types.is_string_dtype(s)


def coerce_types(d, mapping, outcome_is_binary=True):
    """Return a copy of `d` with the sex and outcome columns coerced to numeric codes.

    Args:
        d: input DataFrame (not modified).
        mapping: dict of role -> column name; only the "sex" and "outcome"
            entries are used, and an entry that is missing or not a column of
            `d` is ignored.
        outcome_is_binary: when True the outcome column is converted to numeric
            0/1 codes; when False the outcome column is left untouched.

    Returns:
        A new DataFrame in which
        * a text-coded sex column is recoded from its first letter
          (M -> 0, F -> 1, anything else -> NaN);
        * a binary outcome is numeric: booleans become 1/0, text labels
          yes/no, y/n, true/false and "1"/"0" (case and surrounding whitespace
          ignored) become 1/0, and anything unrecognised becomes NaN.
    """
    d = d.copy()

    sex_col = mapping.get("sex")
    # Text detection covers both object columns and pandas string dtypes, so the
    # recode still happens when read_csv yields a dedicated string dtype.
    if sex_col in d and _is_text_like(d[sex_col]):
        initials = d[sex_col].astype(str).str.strip().str[0].str.upper()
        d[sex_col] = initials.map({"M": 0, "F": 1})

    outcome_col = mapping.get("outcome")
    if outcome_is_binary and outcome_col in d:
        values = d[outcome_col]
        if pd.api.types.is_bool_dtype(values):
            # A bool column would otherwise stay bool after to_numeric; the
            # models downstream need a plain numeric 0/1 outcome.
            values = values.astype(float)
        elif _is_text_like(values):
            binary_labels = {
                "no": 0, "yes": 1,
                "n": 0, "y": 1,
                "false": 0, "true": 1,
                "0": 0, "1": 1,
            }
            labels = values.astype(str).str.strip().str.lower()
            values = labels.map(binary_labels)
        d[outcome_col] = pd.to_numeric(values, errors="coerce")
    return d

# Rebind `df` to the coerced copy returned by coerce_types; the cells below use it.
df = coerce_types(df, MAPPING, OUTCOME_IS_BINARY)
df.head()


## Table 1

In [None]:

def summarise_series(s):
    """Summarise one column for a descriptive table.

    Numeric input yields n (non-missing count), mean, sd, median and iqr.
    Any other input yields one "<level> (n,%)" entry per distinct value
    (missing values included), formatted as "count (percent%)" where the
    percentage is taken over all rows of `s`.
    """
    if not pd.api.types.is_numeric_dtype(s):
        counts = s.value_counts(dropna=False)
        n_rows = len(s)
        cells = {}
        for level, count in counts.items():
            cells[f"{level} (n,%)"] = f"{count} ({count/n_rows*100:.1f}%)"
        return pd.Series(cells)

    upper_quartile = s.quantile(0.75)
    lower_quartile = s.quantile(0.25)
    return pd.Series({
        "n": s.notna().sum(),
        "mean": s.mean(),
        "sd": s.std(),
        "median": s.median(),
        "iqr": upper_quartile - lower_quartile,
    })

def table1(df, cols, by=None):
    """Build a descriptive table with one summarise_series() column per variable.

    Args:
        df: source DataFrame.
        cols: column names to summarise.
        by: optional grouping column. When given, the summaries are computed
            per group (missing group values included) and placed side by side
            under a top-level "<by>=<level>" column label.

    Returns:
        DataFrame whose columns are the summarised variables (a two-level
        column index when `by` is used).
    """
    def _summaries(frame):
        # One summary Series per requested column, aligned on the statistic names.
        return pd.concat({c: summarise_series(frame[c]) for c in cols}, axis=1)

    if by is None:
        return _summaries(df)

    blocks = []
    for level, subset in df.groupby(by, dropna=False):
        block = _summaries(subset)
        block.columns = pd.MultiIndex.from_product([[f"{by}={level}"], block.columns])
        blocks.append(block)
    return pd.concat(blocks, axis=1)

# Roles shown in Table 1; a role is dropped when it is unmapped or its column is absent.
table1_roles = ("age", "sex", "bmi", "smoking", "ses", "exposure_biomarker", "exposure_dd")

cols = []
for role in table1_roles:
    column = MAPPING.get(role)
    if column and column in df.columns:
        cols.append(column)

tbl1_overall = table1(df, cols)
tbl1_overall


## Missingness audit

In [None]:

def missingness_summary(d):
    """Tabulate missing values per column of `d`, most-missing column first.

    Returns:
        DataFrame indexed by column name with
        missing_prop (fraction missing), missing_% (same, as a percentage
        rounded to 1 decimal), n_missing (count missing) and n (rows in `d`).
    """
    is_missing = d.isna()
    proportion = is_missing.mean().sort_values(ascending=False)
    summary = pd.DataFrame({
        "missing_prop": proportion,
        "missing_%": (proportion * 100).round(1),
    })
    # The counts are aligned on the column-name index, so the sort order is kept.
    return summary.assign(n_missing=is_missing.sum(), n=len(d))

# Per-column missingness, sorted most-missing first; show the 20 worst columns.
miss = missingness_summary(df)
miss.head(20)


In [None]:

# Visualise missingness matrix (simple)
def plot_missingness_matrix(d, max_cols=30):
    """Draw a rows-by-columns image of where `d` has missing values.

    Args:
        d: DataFrame to inspect (not modified).
        max_cols: only the first `max_cols` columns are drawn when `d` has
            more; a note is printed when that truncation happens.
    """
    shown = d.copy()
    if shown.shape[1] > max_cols:
        shown = shown.iloc[:, :max_cols]
        print(f"(Showing first {max_cols} columns)")

    fig, ax = plt.subplots(figsize=(10, 6))
    # One image cell per value of the isna() mask.
    ax.imshow(shown.isna(), aspect='auto', interpolation='nearest')
    ax.set_xlabel("Columns (subset if large)")
    ax.set_ylabel("Rows")
    ax.set_title("Missingness matrix")
    plt.show()

# Draw the missingness image for the loaded data (first 30 columns by default).
plot_missingness_matrix(df)


## Biomarker vs Diet Diary (DD) comparison

In [None]:

# Column names of the two exposure measurements being compared
biom = MAPPING["exposure_biomarker"]
dd   = MAPPING["exposure_dd"]

# Float copies of both columns, used for the scatter plot only
biom_valid = df[biom].astype(float)
dd_valid   = df[dd].astype(float)

# Scatter & correlation
plt.figure(figsize=(6,5))
plt.scatter(dd_valid, biom_valid, alpha=0.6)
plt.xlabel(f"{dd}")
plt.ylabel(f"{biom}")
plt.title("Biomarker vs DD: scatter")
plt.show()

# Pearson correlation on the rows where both measurements are present;
# `valid` is reused for the Bland–Altman analysis below.
valid = df[[biom, dd]].dropna()
r = valid[biom].corr(valid[dd])
print(f"Pearson r = {r:.3f} (n={len(valid)})")

# Bland–Altman (biomarker vs DD)
def bland_altman(a, b):
    """Compute the quantities behind a Bland–Altman agreement plot.

    Args:
        a, b: paired measurements of equal length (anything np.asarray accepts).

    Returns:
        (mean, diff, mdiff, loa) where
        mean  - element-wise average of the pair,
        diff  - element-wise a - b,
        mdiff - mean of diff,
        loa   - tuple (lower, upper) = mdiff -/+ 1.96 * sample SD of diff.
    """
    first = np.asarray(a)
    second = np.asarray(b)

    pair_mean = (first + second) / 2
    pair_diff = first - second

    bias = np.mean(pair_diff)
    # ddof=1: sample (n - 1) standard deviation of the differences.
    half_width = 1.96 * np.std(pair_diff, ddof=1)
    return pair_mean, pair_diff, bias, (bias - half_width, bias + half_width)

# Agreement analysis on the complete pairs in `valid`.
# NOTE(review): differences are taken on the raw values, so this presumes both
# columns are on the same scale/units — confirm before interpreting the limits.
mean_ab, diff_ab, mdiff, loa = bland_altman(valid[biom], valid[dd])
plt.figure(figsize=(6,5))
plt.scatter(mean_ab, diff_ab, alpha=0.6)
# Horizontal reference lines: mean difference, then lower and upper limits of agreement
plt.axhline(mdiff)
plt.axhline(loa[0])
plt.axhline(loa[1])
plt.xlabel("Mean of biomarker and DD")
plt.ylabel("Difference (biomarker − DD)")
plt.title("Bland–Altman plot")
plt.show()

print(f"Mean difference: {mdiff:.3f}; 95% LoA: [{loa[0]:.3f}, {loa[1]:.3f}]")


## Primary association & confounding

In [None]:

# Column names used on the left (outcome) and right (primary exposure) of every
# model formula built in the cells below.
outcome = MAPPING["outcome"]
exposure = MAPPING["exposure_biomarker"]

def make_formula(outcome, exposure, covars=None, binary=True):
    """Assemble a model formula string "outcome ~ exposure [+ covar ...]".

    Args:
        outcome: name of the dependent variable.
        exposure: name of the main predictor; always the first right-hand term.
        covars: optional sequence of additional term names; None or an empty
            sequence gives an exposure-only formula.
        binary: accepted for signature parity with the other helpers; it does
            not affect the returned string.

    Returns:
        The formula as a str.
    """
    if covars is None or len(covars) == 0:
        right_hand_side = exposure
    else:
        right_hand_side = " + ".join([exposure, *covars])
    return f"{outcome} ~ {right_hand_side}"

def fit_model(df, formula, binary=True):
    """Fit `formula` on the complete rows of `df` and return the fitted results.

    Args:
        df: analysis DataFrame.
        formula: model formula string, e.g. as produced by make_formula().
        binary: True fits a logistic regression (fit output silenced);
            False fits ordinary least squares.

    Note:
        Rows with a missing value in ANY column of `df` are dropped before
        fitting, not only in the columns named by `formula`, so every model
        fitted from the same frame uses the same rows.
    """
    complete_rows = df.dropna()
    if not binary:
        return smf.ols(formula, data=complete_rows).fit()
    return smf.logit(formula, data=complete_rows).fit(disp=False)

# Minimal (exposure-only) model
min_formula = make_formula(outcome, exposure, covars=[], binary=OUTCOME_IS_BINARY)
min_model = fit_model(df, min_formula, binary=OUTCOME_IS_BINARY)
print(min_model.summary())

# Confounder-adjusted model: every pre-specified candidate whose column exists in df
prespec_covars = []
for confounder_key in CANDIDATE_CONFOUNDERS:
    if MAPPING.get(confounder_key) in df.columns:
        prespec_covars.append(MAPPING[confounder_key])

adj_formula = make_formula(outcome, exposure, covars=prespec_covars, binary=OUTCOME_IS_BINARY)
adj_model = fit_model(df, adj_formula, binary=OUTCOME_IS_BINARY)
print(adj_model.summary())


### Change‑in‑estimate (≥10%) procedure

In [None]:

def get_effect(model, exposure, binary=True):
    """Extract the exposure effect and its 95% interval from a fitted model.

    Args:
        model: fitted results object exposing `params` and `bse`, both
            indexable by term name.
        exposure: name of the term to report.
        binary: True exponentiates the coefficient and its bounds (odds
            ratio scale); False reports them unchanged.

    Returns:
        dict with keys "effect", "lo", "hi" and "scale" ("OR" or "beta").
        The bounds are coefficient -/+ 1.96 standard errors.
    """
    beta = model.params[exposure]
    half_width = 1.96 * model.bse[exposure]
    lower, upper = beta - half_width, beta + half_width

    if not binary:
        return {"effect": beta, "lo": lower, "hi": upper, "scale": "beta"}
    # Logit coefficients are log-odds, so exponentiate the estimate and both bounds.
    return {"effect": np.exp(beta), "lo": np.exp(lower), "hi": np.exp(upper), "scale": "OR"}

# Reference effect: the exposure estimate from the minimal model
base_eff = get_effect(min_model, exposure, OUTCOME_IS_BINARY)["effect"]

# Add each candidate confounder on its own and record how far the estimate moves.
candidate_columns = [MAPPING.get(confounder_key) for confounder_key in CANDIDATE_CONFOUNDERS]
results = []
for cov in candidate_columns:
    if cov and cov in df.columns:
        one_cov_formula = make_formula(outcome, exposure, covars=[cov], binary=OUTCOME_IS_BINARY)
        one_cov_model = fit_model(df, one_cov_formula, OUTCOME_IS_BINARY)
        eff = get_effect(one_cov_model, exposure, OUTCOME_IS_BINARY)["effect"]
        if base_eff != 0:
            change = 100 * (eff - base_eff) / base_eff
        else:
            # A relative change is undefined against a zero reference estimate.
            change = np.nan
        results.append({"added": cov, "effect": eff, "% change vs minimal": change})

# Largest absolute change first
cei = (
    pd.DataFrame(results)
    .sort_values("% change vs minimal", key=lambda column: column.abs(), ascending=False)
)
cei


## Diagnostics (brief)

In [None]:

# Outlier check (standardised residuals)
def standardised_residuals(model, binary=None):
    """Return residuals of `model` as a Series named "std_resid".

    Args:
        model: fitted results object. It must expose `resid_pearson` when
            `binary` is true, or `get_influence().resid_studentized_internal`
            otherwise.
        binary: which residual type to use. None (the default) falls back to
            the notebook-level OUTCOME_IS_BINARY flag, which keeps existing
            one-argument calls working; pass True/False explicitly to match
            the `binary` argument taken by the other model helpers.

    Returns:
        pandas.Series of residuals, named "std_resid".
    """
    if binary is None:
        binary = OUTCOME_IS_BINARY
    if binary:
        # Use Pearson residuals as a simple check
        resid = model.resid_pearson
    else:
        resid = model.get_influence().resid_studentized_internal
    return pd.Series(resid, name="std_resid")

# Residuals of the adjusted model, plotted in row order
resid = standardised_residuals(adj_model)
plt.figure(figsize=(6,4))
plt.plot(resid.values, marker='o', linestyle='none', alpha=0.6)
# Reference lines at +/-3 as a visual threshold for unusually large residuals
plt.axhline(3); plt.axhline(-3)
plt.title("Standardised residuals (approx.)")
plt.xlabel("Observation (index in complete-case sample)")
plt.ylabel("Std residual")
plt.show()


## Model exploration helpers

In [None]:

def run_model_with_covars(covars):
    """Fit outcome ~ biomarker exposure + `covars` on `df` and print the result.

    Args:
        covars: list of covariate column names (may be empty).

    Prints the formula followed by the model summary; returns nothing.
    """
    formula = make_formula(outcome, exposure, covars=covars, binary=OUTCOME_IS_BINARY)
    fitted = fit_model(df, formula, OUTCOME_IS_BINARY)
    print(f"Formula: {formula}")
    print(fitted.summary())

# Example: try a different exposure (DD)
def run_with_dd_as_exposure(covars):
    """Same as run_model_with_covars, with the diet-diary column as exposure.

    Args:
        covars: list of covariate column names (may be empty).

    Prints the formula followed by the model summary; returns nothing.
    """
    dd_exposure = MAPPING["exposure_dd"]
    formula = make_formula(outcome, dd_exposure, covars=covars, binary=OUTCOME_IS_BINARY)
    fitted = fit_model(df, formula, OUTCOME_IS_BINARY)
    print(f"Formula: {formula}")
    print(fitted.summary())

# Usage hints shown to the learner when this cell is run
print("Use run_model_with_covars([...]) to experiment; try prespec_covars or subsets.")
print("Use run_with_dd_as_exposure([...]) to compare biomarker vs DD.")


## Save mapping snapshot (for reproducibility)

In [None]:

# Everything the learner configured, captured so a run can be reproduced later.
snapshot = dict(
    mapping=MAPPING,
    candidates=CANDIDATE_CONFOUNDERS,
    binary_outcome=OUTCOME_IS_BINARY,
)

artifacts_dir = Path("./artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

snapshot_file = artifacts_dir / "data_mapping_snapshot.json"
with snapshot_file.open("w") as handle:
    json.dump(snapshot, handle, indent=2)
print("Saved ./artifacts/data_mapping_snapshot.json")
