# FB2NEP Workbook 4 – Data Exploration and “Table 1”

This workbook covers:

- Descriptive statistics and visual inspection.
- Construction of baseline characteristics tables (“Table 1”).
- Group comparisons (t-tests, χ², ANOVA) using the synthetic FB2NEP cohort.

Run the first three code cells (repository bootstrap, data loading, and library imports) before starting the analyses.

In [None]:
import os
import sys
import runpy
import pathlib
import subprocess

REPO_URL = "https://github.com/ggkuhnle/fb2nep-epi.git"
REPO_NAME = "fb2nep-epi"

# 1. In Colab, when scripts/bootstrap.py is not reachable from the current
#    directory, make sure a checkout exists under /content and switch into it.
if "google.colab" in sys.modules and not pathlib.Path("scripts/bootstrap.py").exists():
    root = pathlib.Path("/content")
    repo_dir = root / REPO_NAME

    if not repo_dir.exists():
        print(f"Cloning {REPO_URL} …")
        # Clone into an explicit destination. Without one, git clones into the
        # current working directory, and the chdir below fails whenever that
        # directory is not /content.
        subprocess.run(["git", "clone", REPO_URL, str(repo_dir)], check=True)

    os.chdir(repo_dir)
    print("Changed working directory to:", os.getcwd())

# 2. Look for scripts/bootstrap.py in the current directory and up to two
#    levels above it, and run the first one found.
for p in ["scripts/bootstrap.py", "../scripts/bootstrap.py", "../../scripts/bootstrap.py"]:
    if pathlib.Path(p).exists():
        print(f"Bootstrapping via: {p}")
        runpy.run_path(p)
        break
else:
    # The for-else branch runs only when no candidate path existed.
    print("⚠️ scripts/bootstrap.py not found – "
          "please check that the FB2NEP repository is available.")


In [None]:
import pandas as pd

# Read the synthetic cohort shared by all FB2NEP workbooks.
# The path is relative to the current working directory.
cohort_csv = "data/synthetic/fb2nep.csv"
df = pd.read_csv(cohort_csv)

# Sanity check: show the first few rows.
df.head()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

## 1. Basic descriptive statistics

We begin with central tendency and spread for key continuous variables.

In [None]:
# Continuous variables of interest; only those present in `df` are kept,
# so the cell still runs when a column is absent from the data set.
candidate_continuous = (
    "age",
    "BMI",
    "SBP",
    "energy_kcal",
    "fruit_veg_g_d",
    "red_meat_g_d",
)
continuous_vars = [name for name in candidate_continuous if name in df.columns]

# Summary statistics for every retained column.
df[continuous_vars].describe()

In [None]:
# Categorical variables of interest; only those present in `df` are kept.
candidate_categorical = (
    "sex",
    "SES_class",
    "IMD_quintile",
    "smoking_status",
    "physical_activity",
)
categorical_vars = [name for name in candidate_categorical if name in df.columns]

separator = "-" * 40
for col in categorical_vars:
    print(f"\nVariable: {col}")
    # dropna=False keeps missing values as their own row in the frequency table.
    level_counts = df[col].value_counts(dropna=False)
    print(level_counts)
    print(separator)

## 2. Defining an exposure group

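For illustration, we define a binary exposure, high vs low red meat intake, by splitting `red_meat_g_d` at the cohort median: intakes strictly above the median are coded 1 (high) and intakes at or below it are coded 0 (low).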

In [None]:
if "red_meat_g_d" in df.columns:
    intake = df["red_meat_g_d"]
    cut = intake.median()

    # 1 = strictly above the median, 0 = at or below it. Participants with a
    # missing intake stay missing instead of being coded as "low": a bare
    # `NaN > cut` evaluates to False and would silently place them in group 0.
    df["high_red_meat"] = (intake > cut).astype(int).where(intake.notna())

    # A bare expression inside an `if` block is not echoed by the notebook,
    # so the group sizes are printed explicitly. dropna=False also shows how
    # many participants have no exposure value.
    print(df["high_red_meat"].value_counts(dropna=False))

## 3. Constructing a simple “Table 1”

We summarise selected variables by `high_red_meat`.

In [None]:
def summarise_continuous_by_group(data: pd.DataFrame, var: str, group: str) -> pd.DataFrame:
    """Summarise a continuous variable within each level of a grouping column.

    Args:
        data: Table containing both `var` and `group`.
        var: Name of the column to summarise.
        group: Name of the column whose values define the groups.

    Returns:
        A DataFrame indexed by the values of `group`, with the columns
        ``count``, ``mean`` and ``std`` computed for `var`.
    """
    summary_stats = ["count", "mean", "std"]
    per_group = data.groupby(group)[var]
    return per_group.agg(summary_stats)

# One summary table per continuous variable, split by exposure group.
# The exposure column is checked once, since the loop does not alter `df`.
if "high_red_meat" in df.columns:
    for var in continuous_vars:
        print(f"\n=== {var} by high_red_meat ===")
        group_summary = summarise_continuous_by_group(df, var, "high_red_meat")
        display(group_summary)

In [None]:
def summarise_categorical_by_group(data: pd.DataFrame, var: str, group: str):
    """Cross-tabulate a categorical variable against a grouping column.

    Args:
        data: Table containing both `var` and `group`.
        var: Name of the categorical column; its levels form the rows.
        group: Name of the grouping column; its levels form the columns.

    Returns:
        A tuple ``(counts, row_percent)``. ``counts`` is the cross-tabulation
        of `var` by `group`. ``row_percent`` divides each row of ``counts`` by
        that row's total and multiplies by 100, so every row sums to 100.
    """
    counts = pd.crosstab(data[var], data[group])
    row_totals = counts.sum(axis=1)
    # Divide along the index so each row is scaled by its own total.
    shares = counts.div(row_totals, axis=0)
    return counts, shares * 100

# Counts and row percentages of every categorical variable by exposure group.
exposure_col = "high_red_meat"
if exposure_col in df.columns:
    for var in categorical_vars:
        print(f"\n=== {var} by high_red_meat (counts) ===")
        counts, perc = summarise_categorical_by_group(df, var, exposure_col)
        display(counts)

        print("Row percentages (%):")
        rounded_perc = perc.round(1)  # one decimal place, for display only
        display(rounded_perc)

## 4. Group comparisons

We now add simple hypothesis tests to the descriptive summaries.

In [None]:
# Example: compare mean age between the low and high red meat groups.
needed_cols = {"age", "high_red_meat"}
if needed_cols.issubset(df.columns):
    exposure = df["high_red_meat"]

    # Ages in each exposure group, with missing ages removed.
    g0 = df.loc[exposure == 0, "age"].dropna()
    g1 = df.loc[exposure == 1, "age"].dropna()

    # equal_var=False requests Welch's t-test, which does not assume that
    # the two groups have equal variances.
    welch_result = stats.ttest_ind(g0, g1, equal_var=False)
    t_stat, p_val = welch_result
    print(f"Age difference (Welch t-test): t = {t_stat:.2f}, p = {p_val:.3g}")

In [None]:
# Example: χ² test of independence between sex and high_red_meat.
needed_cols = {"sex", "high_red_meat"}
if needed_cols.issubset(df.columns):
    # Observed counts: one row per sex, one column per exposure group.
    contingency = pd.crosstab(df["sex"], df["high_red_meat"])

    # NOTE: for a 2×2 table SciPy applies Yates' continuity correction by
    # default (correction=True), so the statistic is the corrected one.
    test_result = stats.chi2_contingency(contingency)
    chi2, p, dof, expected = test_result

    print(
        "Chi-squared test: sex vs high_red_meat",
        f"χ² = {chi2:.2f}, df = {dof}, p = {p:.3g}",
        sep="\n",
    )

## 5. Visualisation

Plots provide an immediate sense of group differences.

In [None]:
# Box plot of BMI for the low and high red meat groups.
if {"BMI", "high_red_meat"}.issubset(df.columns):
    # Create the axes up front and hand them to pandas. Without `ax=`,
    # DataFrame.boxplot(by=...) opens its own figure, so a figure made with
    # plt.figure(figsize=...) stays empty and its size is ignored.
    fig, ax = plt.subplots(figsize=(6, 4))
    df.boxplot(column="BMI", by="high_red_meat", ax=ax)

    ax.set_xlabel("High red meat (0 = low, 1 = high)")
    ax.set_ylabel("BMI (kg/m²)")
    ax.set_title("BMI by red meat intake")
    # pandas adds an automatic "Boxplot grouped by ..." suptitle; clear it.
    fig.suptitle("")
    fig.tight_layout()
    plt.show()