# FB2NEP — Practice Notebook

This notebook is for practising data analysis for the **FB2NEP assessment**. It automatically loads the dataset and libraries, so you can focus on exploring the data and answering questions like:
- Describing the study population (Table 1) and comparing groups (e.g., by sex, deprivation, or disease).
- Checking if data are missing at random.
- Analysing associations between nutrient intake and blood pressure (BP) or disease (logistic or Cox regression).
- Experimenting with models and data transformations.

**Instructions**: Run all cells in order. Edit the values named in each **Try it** prompt to practise. Use the accompanying cheat sheet for function explanations.


In [None]:
# Imports and setup
import os, sys, math, json, textwrap, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from lifelines import CoxPHFitter

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 200)

# Detect Google Colab
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Clone repository if in Colab
if IN_COLAB:
    !git clone https://github.com/ggkuhnle/fb2nep-epi.git
    %cd fb2nep-epi

from scripts.bootstrap import init
df, ctx = init()
print(df.shape, "— dataset ready")
df.head()


In [None]:
# Data mapping (pre-filled for practice)
# Keys are the short role names used throughout this notebook; values are the
# column names looked up in the dataset. Later cells resolve a role with
# MAPPING.get(<key>) / MAPPING[<key>].
MAPPING = {
    "outcome": "CVD_Incidence",       # Binary: 0 (no event), 1 (event)
    "time": "Time_to_Event",          # Time-to-event for Cox regression
    "exposure_biomarker": "Flavanol_Biomarker",  # Nutrient biomarker (continuous)
    "exposure_dd": "Flavanol_DD",     # Diet diary nutrient (continuous)
    "bp": "Systolic_BP",              # Blood pressure (continuous)
    "id": "Participant_ID",           # Participant identifier
    "age": "Age",                     # Age in years
    "sex": "Sex",                     # 0 (male), 1 (female)
    "bmi": "BMI",                     # Body mass index
    "smoking": "Smoking_Status",      # 'never', 'former', 'current'
    "ses": "SES_Index",               # Socioeconomic status
}

# MAPPING keys (not column names) used as adjustment covariates in Practice 4.
CANDIDATE_CONFOUNDERS = ["age", "sex", "bmi", "smoking", "ses"]
# Passed to coerce_types() and fit_model(); True selects numeric 0/1 coding and logistic regression.
OUTCOME_IS_BINARY = True
USE_COX = False  # Change to True for Cox regression
# Applied to the exposure and BP columns in the preprocessing cell; 'log' uses np.log1p, i.e. log(1 + x).
TRANSFORM = None  # Options: None, 'log', 'sqrt'


In [None]:
# Preprocess data
def load_fb2nep():
    """Read the FB2NEP dataset from the first candidate CSV path that exists.

    The candidates are tried in order, relative to the current working
    directory: ``./data/fb2nep.csv``, ``./fb2nep.csv``, ``../data/fb2nep.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the first existing path.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
    """
    candidates = ("./data/fb2nep.csv", "./fb2nep.csv", "../data/fb2nep.csv")
    found = next((Path(c) for c in candidates if Path(c).exists()), None)
    if found is None:
        raise FileNotFoundError("fb2nep.csv not found")
    return pd.read_csv(found)

# Load the dataset from CSV. NOTE: this rebinds `df`, replacing the frame
# returned by init() in the setup cell.
df = load_fb2nep()

# Apply transformation
def apply_transform(data, col, transform):
    """Return column ``col`` of ``data`` after applying the named transform.

    Args:
        data: DataFrame holding the column.
        col: Name of the column to transform.
        transform: ``'log'`` (computed as ``np.log1p``, i.e. log(1 + x)),
            ``'sqrt'``, or a falsy value (``None``/``''``) for no transform.

    Returns:
        The transformed Series; ``data`` itself is not modified. With a falsy
        ``transform`` the original column is returned unchanged.

    Raises:
        ValueError: If ``transform`` is a non-empty name other than
            ``'log'`` or ``'sqrt'``.
    """
    if not transform:
        return data[col]
    if transform == 'log':
        return np.log1p(data[col])
    if transform == 'sqrt':
        return np.sqrt(data[col])
    # Fail loudly: silently returning the raw column for a typo such as 'Log'
    # would let the caller report a transformation that never happened.
    raise ValueError(
        f"Unknown transform {transform!r}; expected None, 'log' or 'sqrt'"
    )

# When a transform is configured, overwrite the exposure and BP columns in
# place with their transformed values and report each column that was changed.
if TRANSFORM:
    transform_targets = (MAPPING.get(role) for role in ("exposure_biomarker", "exposure_dd", "bp"))
    for column in transform_targets:
        if column not in df.columns:
            continue  # role not mapped, or column absent from this dataset
        df[column] = apply_transform(df, column, TRANSFORM)
        print(f"Applied {TRANSFORM} transformation to {column}")

# Coerce types
def coerce_types(d, mapping, outcome_is_binary):
    """Return a copy of ``d`` with the sex and outcome columns coded numerically.

    Args:
        d: Input DataFrame; it is copied, never modified.
        mapping: Dict of role name -> column name. Only the ``"sex"`` and
            ``"outcome"`` roles are read; roles that are missing or whose
            column is absent from ``d`` are skipped.
        outcome_is_binary: When true, the outcome column is converted to
            numeric.

    Returns:
        A new DataFrame in which

        * a text-valued sex column is recoded by its first letter
          (case-insensitive): ``M`` -> 0, ``F`` -> 1, anything else -> NaN;
        * the outcome column is numeric: ``"no"``/``"yes"`` (any case,
          surrounding whitespace ignored) become 0/1, numeric strings such as
          ``"0"``/``"1"`` are parsed, and everything else becomes NaN.
    """
    def _is_text(series):
        # Covers classic object columns as well as pandas' dedicated string
        # dtypes, which do not compare equal to ``object``.
        return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)

    d = d.copy()

    sex_col = mapping.get("sex")
    if sex_col in d and _is_text(d[sex_col]):
        initials = d[sex_col].astype(str).str.strip().str[0].str.upper()
        d[sex_col] = initials.map({"M": 0, "F": 1})

    outcome_col = mapping.get("outcome")
    if outcome_is_binary and outcome_col in d:
        values = d[outcome_col]
        if _is_text(values):
            yes_no = {"no": "0", "yes": "1"}
            labels = values.astype(str).str.strip().str.lower()
            # Translate only yes/no and leave other labels untouched so that
            # to_numeric below can still parse numeric strings ("0", "1").
            values = labels.map(lambda v: yes_no.get(v, v))
        d[outcome_col] = pd.to_numeric(values, errors="coerce")
    return d

# Recode the sex/outcome columns on a copy and rebind `df`; the trailing
# expression previews the first rows as the cell output.
df = coerce_types(df, MAPPING, OUTCOME_IS_BINARY)
df.head()


## Practice 1: Describe study population (Table 1)

**Goal**: Summarise variables and compare groups (e.g., by sex, SES, or disease).
**Try it**: Change `COMPARE_BY` to 'ses', 'outcome', or another MAPPING key.


In [None]:
def summarise_series(s):
    """Summarise one column for a descriptive table.

    Numeric columns yield a Series with ``n`` (non-missing count), ``mean``,
    ``sd`` and ``median``. Any other column yields one entry per distinct
    value (missing values included), labelled ``"<value> (n,%)"`` and
    formatted as ``"<count> (<percent of all rows>%)"``.
    """
    if not pd.api.types.is_numeric_dtype(s):
        counts = s.value_counts(dropna=False)
        n_rows = len(s)
        labelled = {}
        for level, count in counts.items():
            labelled[f"{level} (n,%)"] = f"{count} ({count/n_rows*100:.1f}%)"
        return pd.Series(labelled)

    numeric_summary = {
        "n": s.notna().sum(),
        "mean": s.mean(),
        "sd": s.std(),
        "median": s.median(),
    }
    return pd.Series(numeric_summary)

def table1(df, cols, by=None):
    """Build a descriptive summary table, optionally split by a grouping column.

    Args:
        df: Source DataFrame.
        cols: Column names to summarise (each via ``summarise_series``).
        by: Optional grouping column. When it is ``None`` or not a column of
            ``df``, a single ungrouped summary is returned.

    Returns:
        A DataFrame with one column per summarised variable. When grouped,
        the columns carry a two-level header ``("<by>=<level>", <variable>)``
        with one block per group (missing group values form their own group).
    """
    def _summary(frame):
        return pd.concat({c: summarise_series(frame[c]) for c in cols}, axis=1)

    if by is None or by not in df.columns:
        return _summary(df)

    per_group = []
    for level, subset in df.groupby(by, dropna=False):
        header = pd.MultiIndex.from_product([[f"{by}={level}"], cols])
        per_group.append(_summary(subset).set_axis(header, axis=1))
    return pd.concat(per_group, axis=1)

# Columns to summarise: MAPPING roles resolved to dataset column names,
# keeping only those actually present in df.
cols = [MAPPING.get(k) for k in ["age", "sex", "bmi", "smoking", "ses", "exposure_biomarker", "exposure_dd", "bp"] if MAPPING.get(k) in df.columns]
# COMPARE_BY is a MAPPING key, not a column name. If it does not resolve to a
# column of df, table1 returns the ungrouped summary instead.
COMPARE_BY = "sex"  # Try 'ses', 'outcome', etc.
tbl1 = table1(df, cols, MAPPING.get(COMPARE_BY))
tbl1


## Practice 2: Missingness audit

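**Goal**: Check missing data patterns and test whether missingness in one variable is associated with another observed variable (i.e. whether the data are plausibly missing completely at random).
**Try it**: Change `var` or `group_by` to test different variables (e.g., `MAPPING.get("exposure_dd")`, `MAPPING.get("ses")`).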


In [None]:
def missingness_summary(d):
    """Tabulate missing values per column, most-missing column first.

    Args:
        d: DataFrame to audit.

    Returns:
        A DataFrame indexed by column name with ``missing_%`` (percentage of
        missing rows, rounded to one decimal) and ``n_missing`` (count of
        missing rows), sorted by missingness in descending order.
    """
    is_missing = d.isna()
    summary = pd.DataFrame({
        "missing_%": (is_missing.mean() * 100).round(1),
        "n_missing": is_missing.sum(),
    })
    # Sort the assembled table rather than one input Series: building a
    # DataFrame from Series whose indexes are ordered differently re-aligns
    # (and re-sorts) the index, which discards a sort applied beforehand.
    return summary.sort_values("n_missing", ascending=False, kind="stable")

def test_mar(df, var, group_by):
    """Test whether missingness of ``var`` is associated with ``group_by``.

    Args:
        df: DataFrame containing both columns.
        var: Column whose missingness indicator (``isna()``) is examined.
        group_by: Column to compare between rows where ``var`` is missing and
            rows where it is observed.

    Returns:
        * The string ``"Invalid column names"`` if either column is absent.
        * For a text or categorical ``group_by``: a dict with ``chi2``,
          ``p_value`` and the ``contingency`` table (chi-square test of
          missingness by group).
        * Otherwise: a dict with ``t_stat`` and ``p_value`` from a Welch
          t-test of ``group_by`` between the missing and observed rows.
          The statistics are NaN when one of the two sets is empty.
    """
    if var not in df.columns or group_by not in df.columns:
        return "Invalid column names"
    miss = df[var].isna()
    grouping = df[group_by]
    # isinstance on the dtype replaces the deprecated is_categorical_dtype;
    # is_string_dtype additionally routes pandas string dtypes (which are not
    # ``object``) to the chi-square branch instead of a t-test on text.
    is_discrete = (
        pd.api.types.is_object_dtype(grouping)
        or pd.api.types.is_string_dtype(grouping)
        or isinstance(grouping.dtype, pd.CategoricalDtype)
    )
    if is_discrete:
        contingency = pd.crosstab(miss, grouping)
        chi2, p = stats.chi2_contingency(contingency)[:2]
        return {"chi2": chi2, "p_value": p, "contingency": contingency}
    miss_val = df.loc[miss, group_by]
    not_miss_val = df.loc[~miss, group_by]
    t_stat, p = stats.ttest_ind(miss_val.dropna(), not_miss_val.dropna(), equal_var=False)
    return {"t_stat": t_stat, "p_value": p}


# The name starts with "test_", so pytest would try to collect this helper as
# a test case if it is ever imported into a test module; opt out explicitly.
test_mar.__test__ = False

miss = missingness_summary(df)
# Print explicitly: a bare expression in the middle of a cell is not rendered
# (only a cell's final expression is), so the table would never be shown.
print(miss.head(10))

# Test missingness
# `var` is the column whose missingness is examined; `group_by` is the column
# compared between rows where `var` is missing and rows where it is observed.
var = MAPPING.get("exposure_biomarker")
group_by = MAPPING.get("outcome")
mar_test = test_mar(df, var, group_by)
print(f"MAR test for {var} by {group_by}:")
print(mar_test)


## Practice 3: Nutrient intake vs. Blood Pressure

**Goal**: Explore association between nutrient intake and BP.
**Try it**: Change `exposure` to `MAPPING.get("exposure_dd")`, or add/remove keys in the `covars` list.


In [None]:
def fit_linear_model(df, outcome, exposure, covars=None):
    """Fit an ordinary least squares model of ``outcome`` on ``exposure``.

    Args:
        df: Source DataFrame.
        outcome: Name of the dependent-variable column.
        exposure: Name of the main predictor column.
        covars: Optional list of additional predictor columns.

    Returns:
        ``(model, formula)``: the fitted statsmodels results object and the
        formula string used. Rows with a missing value in any model column
        are dropped before fitting.
    """
    adjusters = covars if covars else []
    terms = " + ".join([exposure, *adjusters])
    formula = f"{outcome} ~ {terms}"
    complete_cases = df[[outcome, exposure] + adjusters].dropna()
    fitted = smf.ols(formula, data=complete_cases).fit()
    return fitted, formula

# Resolve MAPPING roles to column names for the blood-pressure model.
# NOTE: `exposure` and `covars` are notebook-level names that the Practice 4
# cell reassigns later.
bp = MAPPING.get("bp")
exposure = MAPPING.get("exposure_biomarker")
# Adjust for age, sex and BMI, skipping any that are absent from df.
covars = [MAPPING.get(k) for k in ["age", "sex", "bmi"] if MAPPING.get(k) in df.columns]

bp_model, bp_formula = fit_linear_model(df, bp, exposure, covars)
print(f"Model: {bp_formula}")
print(bp_model.summary())


## Practice 4: Nutrient intake vs. Disease

**Goal**: Test association between nutrient intake and disease (logistic or Cox).
**Try it**: Toggle `USE_COX`, change `exposure`, or adjust `covars`.


In [None]:
def make_formula(outcome, exposure, covars=None):
    """Build a model formula string ``"<outcome> ~ <exposure> [+ covar ...]"``.

    Args:
        outcome: Name placed on the left-hand side.
        exposure: First right-hand-side term.
        covars: Optional iterable of further terms, appended in order and
            joined with ``" + "``; ``None`` or empty adds nothing.
    """
    terms = [f"{exposure}"]
    if covars:
        terms.extend(covars)
    rhs = " + ".join(terms)
    return f"{outcome} ~ {rhs}"

def fit_model(df, formula, covars=None, binary=True, use_cox=False):
    """Fit a Cox, logistic or linear regression described by ``formula``.

    Args:
        df: Source DataFrame.
        formula: ``"<outcome> ~ <term> + <term> ..."`` as built by
            ``make_formula``.
        covars: Optional list of covariate column names (normally the same
            columns that appear in ``formula``).
        binary: When true (and ``use_cox`` is false) fit a logistic model;
            otherwise fit ordinary least squares.
        use_cox: When true fit a Cox proportional-hazards model, using
            ``MAPPING["time"]`` as the duration column and the formula's
            left-hand side as the event column.

    Returns:
        ``(model, data)``: the fitted model and the complete-case data it was
        fitted on. For Cox models ``data`` holds only the event, duration and
        predictor columns, with non-numeric predictors replaced by indicator
        columns (first level dropped as the reference).
    """
    covars = list(covars or [])
    lhs, _, rhs = formula.partition("~")
    outcome_col = lhs.strip()
    # Strip each term: splitting the raw right-hand side leaves a leading
    # space on the first term, which is then not a valid column name.
    terms = [t.strip() for t in rhs.split("+")]

    wanted = [outcome_col, *terms, *covars]
    if use_cox:
        wanted.append(MAPPING["time"])
    # Complete-case analysis over the model columns. Formula terms that are
    # not plain column names (e.g. "C(x)") are skipped here and left to the
    # formula engine.
    present = [c for c in dict.fromkeys(wanted) if c in df.columns]
    d = df.dropna(subset=present)

    if use_cox:
        time_col = MAPPING["time"]
        predictors = [
            c for c in dict.fromkeys([*terms, *covars])
            if c in d.columns and c not in (outcome_col, time_col)
        ]
        d = d[[outcome_col, time_col] + predictors]
        # The Cox fitter is handed the frame as-is and needs numeric inputs,
        # so text/categorical predictors are expanded to 0/1 indicators.
        text_cols = [c for c in predictors if not pd.api.types.is_numeric_dtype(d[c])]
        if text_cols:
            d = pd.get_dummies(d, columns=text_cols, drop_first=True, dtype=float)
        model = CoxPHFitter()
        model.fit(d, duration_col=time_col, event_col=outcome_col)
        return model, d
    if binary:
        model = smf.logit(formula, data=d).fit(disp=False)
        return model, d
    model = smf.ols(formula, data=d).fit()
    return model, d

# Resolve MAPPING roles to column names for the disease model.
# NOTE: this rebinds the notebook-level `exposure` and `covars` set in Practice 3.
outcome = MAPPING["outcome"]
exposure = MAPPING["exposure_biomarker"]
# Adjust for every candidate confounder whose column is present in df.
covars = [MAPPING.get(k) for k in CANDIDATE_CONFOUNDERS if MAPPING.get(k) in df.columns]

formula = make_formula(outcome, exposure, covars)
model, data = fit_model(df, formula, covars, OUTCOME_IS_BINARY, USE_COX)
print(f"Model: {formula}")
# The two model families expose different summary APIs.
if USE_COX:
    model.print_summary()
else:
    print(model.summary())


## Practice 5: Experiment with transformations

**Goal**: Try transforming variables (e.g., log, sqrt) and re-run models.
**Try it**: Change `TRANSFORM` in the mapping cell to 'log' or 'sqrt', then re-run every cell from the mapping cell onwards. Note that 'log' applies log(1 + x), so zero values are allowed.


In [None]:
# Example: Re-run BP model with transformed exposure
# (the preprocessing cell applies TRANSFORM to the BP column as well).
if TRANSFORM:
    # Rebuild the Practice 3 covariate set explicitly. The notebook-level
    # `covars` was reassigned in Practice 4 to the full confounder list, so
    # reusing it here would fit a different model than the one being re-run.
    bp_covars = [MAPPING.get(k) for k in ["age", "sex", "bmi"] if MAPPING.get(k) in df.columns]
    bp_model, bp_formula = fit_linear_model(df, MAPPING["bp"], MAPPING["exposure_biomarker"], bp_covars)
    print(f"Model with {TRANSFORM} transformation: {bp_formula}")
    print(bp_model.summary())
