# FB2NEP Workbook 6 – Regression and Modelling (Part 1)

We introduce:

- Linear regression.
- Logistic regression.
- Basic model diagnostics.

All analyses use the synthetic FB2NEP cohort.

Run the first three code cells to set up the repository, load the data, and import the analysis libraries used in the rest of the workbook.

In [None]:
import os
import sys
import runpy
import pathlib
import subprocess

# Location and directory name of the FB2NEP course repository.
REPO_URL = "https://github.com/ggkuhnle/fb2nep-epi.git"
REPO_NAME = "fb2nep-epi"

# 1. If we are in Colab and scripts/bootstrap.py is not present,
#    clone the repository and change into it.
if "google.colab" in sys.modules and not pathlib.Path("scripts/bootstrap.py").exists():
    root = pathlib.Path("/content")
    repo_dir = root / REPO_NAME

    if not repo_dir.exists():
        print(f"Cloning {REPO_URL} …")
        # Name the destination explicitly: without it, git clones into the
        # *current* working directory, and the chdir below would fail
        # whenever the kernel is not currently sitting in /content.
        subprocess.run(["git", "clone", REPO_URL, str(repo_dir)], check=True)

    os.chdir(repo_dir)
    print("Changed working directory to:", os.getcwd())

# 2. Now try to locate and run scripts/bootstrap.py
#    (checked relative to the working directory and up to two levels above).
for p in ["scripts/bootstrap.py", "../scripts/bootstrap.py", "../../scripts/bootstrap.py"]:
    if pathlib.Path(p).exists():
        print(f"Bootstrapping via: {p}")
        runpy.run_path(p)
        break
else:
    # for/else: this branch runs only when no candidate path matched
    # (i.e. the loop finished without hitting `break`).
    print("⚠️ scripts/bootstrap.py not found – "
          "please check that the FB2NEP repository is available.")


In [None]:
import pandas as pd

# Read the synthetic cohort shared by all FB2NEP workbooks into `df`.
df = pd.read_csv(filepath_or_buffer="data/synthetic/fb2nep.csv")

# Sanity check: show the first five rows.
df.head(n=5)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline

## 1. Linear regression: SBP on BMI, age, sex

In [None]:
# Store sex as a categorical column when it is present in the data.
if "sex" in df:
    df["sex"] = df["sex"].astype("category")

# Ordinary least squares: SBP explained by BMI, age and sex.
formula = "SBP ~ BMI + age + C(sex)"
model_lin = smf.ols(formula=formula, data=df).fit()

# Last expression of the cell, so the notebook renders the summary table.
model_lin.summary()

In [None]:
# Show the estimated coefficients followed by their confidence intervals.
# A single print with a newline separator emits the same text as three
# consecutive print calls would.
print(
    model_lin.params,
    "\n95 % confidence intervals:",
    model_lin.conf_int(),
    sep="\n",
)

## 2. Diagnostics for the linear model

In [None]:
# Pull the fitted values and residuals out of the fitted linear model.
fitted, residuals = model_lin.fittedvalues, model_lin.resid

# Plot 1: residuals against fitted values, with a dashed reference line at 0.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(fitted, residuals, alpha=0.4)
ax.axhline(0, color="black", linestyle="--")
ax.set_xlabel("Fitted values")
ax.set_ylabel("Residuals")
ax.set_title("Residuals vs fitted values")
fig.tight_layout()
plt.show()

# Plot 2: histogram of the residuals (30 bins).
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(residuals, bins=30)
ax.set_xlabel("Residual")
ax.set_ylabel("Number of observations")
ax.set_title("Distribution of residuals")
fig.tight_layout()
plt.show()

## 3. Logistic regression: incident CVD as outcome

We model incident CVD as a function of age, BMI, sex, and smoking status.

In [None]:
# Store smoking status as a categorical column when it is present.
if "smoking_status" in df.columns:
    df["smoking_status"] = df["smoking_status"].astype("category")

# Fit the logistic model only when the outcome column exists in the data.
if "CVD_incident" in df.columns:
    formula_logit = "CVD_incident ~ age + BMI + C(sex) + C(smoking_status)"
    # Keep only rows with no missing value in the outcome or any predictor.
    df_logit = df.dropna(subset=["CVD_incident", "age", "BMI", "sex", "smoking_status"])
    model_logit = smf.logit(formula_logit, data=df_logit).fit()
    # A bare expression nested inside an `if` block is not echoed by the
    # notebook (only the last top-level expression of a cell is), so the
    # summary has to be printed explicitly to be visible.
    print(model_logit.summary())

In [None]:
# Build and show the odds-ratio table for the logistic model.
if "CVD_incident" in df.columns:
    params = model_logit.params
    conf = model_logit.conf_int()
    # Exponentiate the coefficients and their interval bounds to obtain
    # odds ratios with confidence limits.
    or_ = np.exp(params)
    or_ci = np.exp(conf)
    # Columns 0 and 1 of the interval table hold the lower and upper bound.
    or_table = pd.DataFrame({"OR": or_, "CI_lower": or_ci[0], "CI_upper": or_ci[1]})
    # The table is created inside an `if` block, where the notebook does not
    # echo expression values; keep it in a variable and print it so it is
    # neither discarded nor hidden.
    print(or_table)