# FB2NEP Workbook 7 – Data Transformation and Preparation for Modelling

This workbook discusses:

- Why we transform data (skewness, comparability, interpretability).
- Log transformation, z‑scoring, Box–Cox transforms.
- Pitfalls of categorisation.
- Visual comparison of original versus transformed variables.

In [None]:
from __future__ import annotations

import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

%matplotlib inline

# Relative path to the CSV file read by this workbook (resolved against the
# current working directory).
DATA_PATH = pathlib.Path("data", "fb2nep_synthetic.csv")

# Load the whole file into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

## 1. Skewness and motivation for transformation

In [None]:
# Pick the example variable: the first candidate column present in the data.

var = None
for candidate in ("energy_kcal", "lde_flav", "lfe_flav"):
    if candidate not in df.columns:
        continue
    var = candidate
    break
else:
    # The loop ended without a break, so none of the candidates is a column.
    raise ValueError("No suitable skewed variable found – please adjust.")

# Histogram of the raw (untransformed) values.
fig, ax = plt.subplots(figsize=(6, 4))
df[var].hist(bins=30, ax=ax)
ax.set_xlabel(var)
ax.set_ylabel("Number of participants")
ax.set_title(f"Distribution of {var}")
fig.tight_layout()
plt.show()

# Sample skewness of the non-missing values.
skewness = stats.skew(df[var].dropna())
print("Skewness:", skewness)

## 2. Log transformation

In [None]:
def log_transform(x: pd.Series, constant: float = 0.0) -> pd.Series:
    """Return the element-wise natural logarithm of ``x + constant``.

    Parameters
    ----------
    x
        Values to transform.
    constant
        Offset added to every value before the logarithm is taken.
        The default of 0.0 applies no offset.

    Returns
    -------
    pd.Series
        ``log(x + constant)`` for each element of ``x``.
    """
    shifted = x + constant
    return np.log(shifted)

# Offset added before taking logs, so that any zero value would map to a
# finite number instead of -inf. It is defined once so that the axis label
# below always states the transform that was actually applied.
log_offset = 0.1
df["log_" + var] = log_transform(df[var], constant=log_offset)

# Histogram of the log-transformed values.
plt.figure(figsize=(6, 4))
df["log_" + var].hist(bins=30)
# The plotted quantity is log(var + offset), not log(var): say so on the axis.
plt.xlabel(f"log({var} + {log_offset})")
plt.ylabel("Number of participants")
plt.title(f"Distribution of log‑transformed {var}")
plt.tight_layout()
plt.show()

# Summary statistics of the original and the transformed variable side by side.
df[[var, "log_" + var]].describe()

## 3. Standardisation (z‑scoring)

In [None]:
def z_score(x: pd.Series) -> pd.Series:
    """Standardise a Series: subtract its mean, then divide by its SD.

    The mean and standard deviation are taken from ``x`` itself via
    ``Series.mean()`` and ``Series.std()``.

    Parameters
    ----------
    x
        Values to standardise.

    Returns
    -------
    pd.Series
        ``(x - mean) / SD`` for each element of ``x``.
    """
    centre = x.mean()
    spread = x.std()
    return (x - centre) / spread

# Add a z-scored copy of each listed column that exists in the data and show
# its summary statistics.
for v in ("bmi", "sbp"):
    if v not in df.columns:
        continue
    z_name = "z_" + v
    df[z_name] = z_score(df[v])
    print(f"\nSummary of z‑scored {v}:")
    display(df[z_name].describe())

## 4. Box–Cox transformation

In [None]:
# Keep only non-missing, strictly positive values before calling
# scipy.stats.boxcox.
positive = df[var].dropna().loc[lambda s: s > 0]

# With no lambda supplied, boxcox returns the transformed values together with
# the lambda it estimated.
bc_values, bc_lambda = stats.boxcox(positive)
print(f"Estimated Box–Cox lambda for {var}: {bc_lambda:.2f}")

# Histogram of the Box–Cox transformed values.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(bc_values, bins=30)
ax.set_xlabel(f"Box–Cox transformed {var}")
ax.set_ylabel("Number of participants")
ax.set_title(f"Distribution of Box–Cox transformed {var}")
fig.tight_layout()
plt.show()

## 5. Pitfalls of categorisation

In [None]:
# Example: categorise BMI into tertiles and compare with continuous treatment

if "bmi" in df.columns:
    # qcut with q=3 cuts at the sample quantiles, giving three labelled bins.
    df["bmi_tertile"] = pd.qcut(df["bmi"], q=3, labels=["low", "medium", "high"])
    display(df[["bmi", "bmi_tertile"]].head())

    if "sbp" in df.columns:
        print("\nSBP by BMI tertile (mean and SD):")
        # bmi_tertile is categorical. Grouping on a categorical without an
        # explicit `observed` argument triggers a pandas FutureWarning, and the
        # result will change when the default flips. observed=False keeps the
        # current behaviour (every category gets a row, even if empty).
        sbp_by_tertile = df.groupby("bmi_tertile", observed=False)["sbp"]
        display(sbp_by_tertile.agg(["mean", "std"]))

In [None]:
# Scatter plot using continuous BMI, followed by a box plot of SBP by BMI
# tertile (uses the bmi_tertile column added in the categorisation cell).

if {"bmi", "sbp"}.issubset(df.columns):
    # Continuous view: one point per participant.
    plt.figure(figsize=(6, 4))
    plt.scatter(df["bmi"], df["sbp"], alpha=0.5)
    plt.xlabel("BMI (kg/m²)")
    plt.ylabel("SBP (mmHg)")
    plt.title("SBP vs continuous BMI")
    plt.tight_layout()
    plt.show()

    # Categorised view. DataFrame.boxplot(by=...) opens a figure of its own
    # unless it is given an Axes, so a preceding plt.figure(...) would be left
    # empty and its figsize ignored. Create the Axes here and pass it in.
    fig, ax = plt.subplots(figsize=(6, 4))
    df.boxplot(column="sbp", by="bmi_tertile", ax=ax)
    ax.set_xlabel("BMI tertile")
    ax.set_ylabel("SBP (mmHg)")
    ax.set_title("SBP by BMI tertile")
    # Blank the figure-level title so only the axes title above is shown.
    fig.suptitle("")
    fig.tight_layout()
    plt.show()