# FB2NEP Workbook 5 – Data Transformation and Preparation for Modelling

We consider:

- Why we transform data (skewness, comparability, interpretability).
- Log transformation and z-scoring.
- Pitfalls of categorisation.

Run the first two code cells to set up the repository and load the data.

In [None]:
import os
import sys
import runpy
import pathlib
import subprocess

# Location and directory name of the FB2NEP course repository.
REPO_URL = "https://github.com/ggkuhnle/fb2nep-epi.git"
REPO_NAME = "fb2nep-epi"

# 1. If we are in Colab and scripts/bootstrap.py is not present,
#    clone the repository and change into it.
if "google.colab" in sys.modules and not pathlib.Path("scripts/bootstrap.py").exists():
    root = pathlib.Path("/content")
    repo_dir = root / REPO_NAME

    if not repo_dir.exists():
        print(f"Cloning {REPO_URL} …")
        # Give git the destination explicitly. Without it the clone lands in
        # the *current* working directory, which is not guaranteed to be
        # /content, and the chdir below would then fail on a missing path.
        subprocess.run(["git", "clone", REPO_URL, str(repo_dir)], check=True)

    os.chdir(repo_dir)
    print("Changed working directory to:", os.getcwd())

# 2. Now try to locate and run scripts/bootstrap.py, looking in the current
#    directory first and then up to two levels above it.
for p in ["scripts/bootstrap.py", "../scripts/bootstrap.py", "../../scripts/bootstrap.py"]:
    if pathlib.Path(p).exists():
        print(f"Bootstrapping via: {p}")
        runpy.run_path(p)
        break
else:
    # The for-else branch runs only when no candidate path existed.
    print("⚠️ scripts/bootstrap.py not found – "
          "please check that the FB2NEP repository is available.")


In [None]:
import pandas as pd

# The main synthetic cohort used in all FB2NEP workbooks. The path is
# relative to the current working directory.
DATA_PATH = "data/synthetic/fb2nep.csv"
df = pd.read_csv(DATA_PATH)

# Quick sanity check of the load: show the first rows.
df.head()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

## 1. Skewness and transformation

We inspect the distribution of a skewed variable, for example `red_meat_g_d`.

In [None]:
# Choose the column to inspect: "red_meat_g_d" when the data set has it,
# otherwise fall back to "energy_kcal".
var = "energy_kcal"
if "red_meat_g_d" in df.columns:
    var = "red_meat_g_d"

values = df[var]

# Histogram of the raw (untransformed) values.
plt.figure(figsize=(6, 4))
values.hist(bins=30)
plt.title(f"Distribution of {var}")
plt.xlabel(var)
plt.ylabel("Number of participants")
plt.tight_layout()
plt.show()

# Skewness is computed on the non-missing values only.
print("Skewness:", stats.skew(values.dropna()))

## 2. Log transformation

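We apply a simple log transformation to reduce skewness. A small constant (here 0.1) is added before taking the logarithm so that zero values, for which the logarithm is undefined, can still be transformed.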

In [None]:
def log_transform(x: pd.Series, constant: float = 0.0) -> pd.Series:
    """Return the natural logarithm of ``x + constant``, element by element.

    Parameters
    ----------
    x
        Values to transform.
    constant
        Offset added to every value before the logarithm is taken
        (default 0.0, i.e. a plain log).

    Returns
    -------
    pd.Series
        The log-transformed values.
    """
    shifted = x + constant
    return np.log(shifted)

# Store the transformed values in a new column named "log_<var>".
# The offset of 0.1 is added to every value before the log is taken.
log_col = "log_" + var
df[log_col] = log_transform(df[var], constant=0.1)

# Histogram of the transformed values, for comparison with the raw plot.
plt.figure(figsize=(6, 4))
df[log_col].hist(bins=30)
plt.title(f"Distribution of log-transformed {var}")
plt.xlabel(f"log({var})")
plt.ylabel("Number of participants")
plt.tight_layout()
plt.show()

## 3. Standardisation (z-scores)

We standardise BMI and SBP to have mean 0 and standard deviation 1.

In [None]:
def z_score(x: pd.Series) -> pd.Series:
    """Standardise ``x`` by subtracting its mean and dividing by its SD.

    The standard deviation comes from ``x.std()`` with its default
    arguments.

    Parameters
    ----------
    x
        Values to standardise.

    Returns
    -------
    pd.Series
        The standardised values.
    """
    centred = x - x.mean()
    return centred / x.std()

# Add a "z_<name>" column for each variable that is present in the data
# and print its summary statistics.
for v in ["BMI", "SBP"]:
    if v not in df.columns:
        continue  # skip variables the data set does not contain

    z_col = "z_" + v
    df[z_col] = z_score(df[v])
    print(f"\nSummary of z-scored {v}:")
    display(df[z_col].describe())

## 4. Categorisation and information loss

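We compare how SBP relates to continuous BMI with how it relates to BMI tertiles, to show what is lost when a continuous variable is categorised.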

In [None]:
# Only run when both columns needed for the comparison are available.
if {"BMI", "SBP"}.issubset(df.columns):
    # Cut BMI at its tertiles into three labelled groups.
    df["BMI_tertile"] = pd.qcut(df["BMI"], 3, labels=["low", "medium", "high"])

    # Continuous view: every participant keeps their exact BMI value.
    plt.figure(figsize=(6, 4))
    plt.scatter(df["BMI"], df["SBP"], alpha=0.3)
    plt.xlabel("BMI (kg/m²)")
    plt.ylabel("SBP (mmHg)")
    plt.title("SBP vs continuous BMI")
    plt.tight_layout()
    plt.show()

    # Categorised view: BMI is reduced to three groups.
    plt.figure(figsize=(6, 4))
    # With `by=`, DataFrame.boxplot opens a figure of its own unless it is
    # handed an Axes. Passing the current Axes draws into the figure created
    # above, so no blank figure is shown and the figsize is honoured.
    df.boxplot(column="SBP", by="BMI_tertile", ax=plt.gca())
    plt.xlabel("BMI tertile")
    plt.ylabel("SBP (mmHg)")
    plt.title("SBP by BMI tertile")
    # Clear the automatic "Boxplot grouped by ..." super-title pandas adds.
    plt.suptitle("")
    plt.tight_layout()
    plt.show()