# FB2NEP Workbook 6 – Data Exploration and “Table 1”

This workbook covers:

- Descriptive statistics and visual inspection.
- Baseline characteristics tables (“Table 1”).
- Group comparisons: *t*‑test, χ² test, ANOVA.
- Visualisation as first insight.
- Statistical versus practical significance.

In [None]:
from __future__ import annotations

import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

%matplotlib inline

# Location of the synthetic dataset CSV, relative to the current working directory.
DATA_PATH = pathlib.Path("data", "fb2nep_synthetic.csv")
df = pd.read_csv(DATA_PATH)
df.head()  # last expression in the cell, so the first rows are rendered

## 1. Basic descriptive statistics

In [None]:
# Continuous variables: keep only the candidate columns that exist in `df`,
# then show count, mean, SD, min, quartiles and max for each of them.

candidate_continuous = ("age", "bmi", "sbp", "dbp", "energy_kcal")
continuous_vars = [name for name in candidate_continuous if name in df.columns]
df[continuous_vars].describe()

In [None]:
# Categorical variables: keep only the candidate columns that exist in `df`,
# then print the frequency of every level, counting missing values as a level.

candidate_categorical = ("sex", "smoking_cat", "smoking")
categorical_vars = [name for name in candidate_categorical if name in df.columns]
separator = "-" * 40
for col in categorical_vars:
    print(f"\nVariable: {col}")
    level_counts = df[col].value_counts(dropna=False)
    print(level_counts)
    print(separator)

## 2. Creating a simple “Table 1” by exposure group

In [None]:
# Define a grouping variable, e.g. high vs low UPF (or high vs low energy).
# An existing `high_upf` column is left untouched. Otherwise it is derived from
# a median split of energy intake or, as a last resort, assigned at random with
# a fixed seed so that re-running the notebook gives the same groups.
# NOTE(review): in the median split, rows with missing `energy_kcal` compare as
# False and therefore fall into group 0 - confirm this is intended.

needs_group = "high_upf" not in df.columns
if needs_group and "energy_kcal" in df.columns:
    median_energy = df["energy_kcal"].median()
    above_median = df["energy_kcal"].gt(median_energy)
    df["high_upf"] = above_median.astype(int)
elif needs_group:
    np.random.seed(11088)
    df["high_upf"] = np.random.randint(0, 2, size=len(df))

df["high_upf"].value_counts()

In [None]:
def summarise_continuous_by_group(data: pd.DataFrame, var: str, group: str) -> pd.DataFrame:
    """Summarise a continuous column within each level of a grouping column.

    Parameters
    ----------
    data : table holding both columns.
    var : name of the continuous column to summarise.
    group : name of the column whose distinct values define the groups.

    Returns
    -------
    DataFrame indexed by the levels of `group`, with the columns ``count``
    (non-missing observations), ``mean`` and ``std`` (sample standard deviation).
    """
    statistics = ["count", "mean", "std"]
    values_by_group = data.groupby(group)[var]
    return values_by_group.agg(statistics)

# Show the per-group summary (count, mean, SD) for each continuous variable in turn.
for var in continuous_vars:
    heading = f"\n=== {var} by high_upf ==="
    print(heading)
    summary = summarise_continuous_by_group(df, var, "high_upf")
    display(summary)

In [None]:
def summarise_categorical_by_group(data: pd.DataFrame, var: str, group: str):
    """Cross-tabulate `var` against `group` and express each row as percentages.

    Parameters
    ----------
    data : table holding both columns.
    var : name of the categorical column; its levels form the rows.
    group : name of the grouping column; its levels form the columns.

    Returns
    -------
    (counts, row_percent) : two DataFrames of the same shape. ``counts`` holds
    the cell frequencies. ``row_percent`` divides every cell by its row total
    and multiplies by 100, so each row (one level of `var`) sums to 100 across
    the levels of `group`.
    """
    counts = pd.crosstab(data[var], data[group])
    row_totals = counts.sum(axis=1)
    row_percent = counts.div(row_totals, axis=0).mul(100)
    return counts, row_percent

# For each categorical variable, show the raw cross-tabulation against
# `high_upf`, followed by the same table as row percentages (one decimal place).
for var in categorical_vars:
    print(f"\n=== {var} by high_upf (counts) ===")
    counts, perc = summarise_categorical_by_group(df, var, "high_upf")
    display(counts)
    print("Row percentages (%):")
    perc_one_decimal = perc.round(1)
    display(perc_one_decimal)

## 3. Group comparisons: *t*‑test, χ² test, ANOVA

In [None]:
# Example: Welch t-test comparing mean age between the low (0) and high (1)
# UPF groups. `equal_var=False` means the two groups are not assumed to share
# a common variance. Missing ages are dropped before testing.

if all(col in df.columns for col in ("age", "high_upf")):
    is_low = df["high_upf"] == 0
    is_high = df["high_upf"] == 1
    g0 = df.loc[is_low, "age"].dropna()
    g1 = df.loc[is_high, "age"].dropna()
    welch_result = stats.ttest_ind(g0, g1, equal_var=False)
    t_stat, p_val = welch_result
    print(f"Age difference (Welch t‑test): t = {t_stat:.2f}, p = {p_val:.3g}")

In [None]:
# Example: chi-squared test of independence between sex and high_upf.
# The test runs on the table of observed counts. SciPy applies Yates'
# continuity correction by default when the table has one degree of freedom.

if all(col in df.columns for col in ("sex", "high_upf")):
    contingency = pd.crosstab(index=df["sex"], columns=df["high_upf"])
    chi2_result = stats.chi2_contingency(contingency)
    chi2, p, dof, expected = chi2_result
    print("Chi‑squared test: sex vs high_upf")
    print(f"χ² = {chi2:.2f}, df = {dof}, p = {p:.3g}")

In [None]:
# Example: one-way ANOVA - does mean BMI differ across smoking categories?

if {"bmi", "smoking_cat"}.issubset(df.columns):
    # One array of non-missing BMI values per smoking category.
    groups = [g["bmi"].dropna().values for _, g in df.groupby("smoking_cat")]
    # A category without any BMI data would make the F statistic undefined
    # (NaN), so only categories that contribute observations are compared.
    groups = [values for values in groups if len(values) > 0]
    if len(groups) < 2:
        # f_oneway needs at least two samples; report this instead of raising.
        print("ANOVA BMI ~ smoking_cat: skipped (fewer than two groups with BMI data)")
    else:
        f_stat, p_val = stats.f_oneway(*groups)
        print(f"ANOVA BMI ~ smoking_cat: F = {f_stat:.2f}, p = {p_val:.3g}")

## 4. Visualisation as first insight

In [None]:
# Boxplot of BMI by high_upf

if {"bmi", "high_upf"}.issubset(df.columns):
    # Create the axes first and hand them to pandas. Without `ax`, a grouped
    # boxplot (`by=...`) opens its own figure, which would leave a blank extra
    # figure behind and ignore the requested figure size.
    fig, ax = plt.subplots(figsize=(6, 4))
    df.boxplot(column="bmi", by="high_upf", ax=ax)
    ax.set_xlabel("High UPF (0 = low, 1 = high)")
    ax.set_ylabel("BMI (kg/m²)")
    ax.set_title("BMI by high UPF intake")
    # pandas adds an automatic "Boxplot grouped by ..." super-title; remove it.
    fig.suptitle("")
    fig.tight_layout()
    plt.show()

In [None]:
# Histogram of age: 20 equal-width bins on a 6 x 4 inch figure.

if "age" in df.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    df["age"].hist(bins=20, ax=ax)
    ax.set_xlabel("Age (years)")
    ax.set_ylabel("Number of participants")
    ax.set_title("Age distribution")
    fig.tight_layout()
    plt.show()

## 5. Statistical versus practical significance

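Large samples can make very small differences statistically significant, because a *p*‑value reflects the sample size as well as the size of the effect.
Always report the size of the difference (ideally with a confidence interval) and consider whether it is meaningful in public health terms.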