# FB2NEP Workbook 7 – Regression and Modelling (Part 2)

We build on the previous workbook to:

- Explore more complex modelling scenarios.
- Introduce missing data in the context of regression.
- Demonstrate complete-case analysis versus simple imputation.

Run the first two code cells to set up the repository and load the data.

In [None]:
import os
import sys
import runpy
import pathlib
import subprocess

# Location of the course repository and the directory name git creates for it.
REPO_URL = "https://github.com/ggkuhnle/fb2nep-epi.git"
REPO_NAME = "fb2nep-epi"

# 1. If we are in Colab and scripts/bootstrap.py is not present,
#    clone the repository and change into it.
if "google.colab" in sys.modules and not pathlib.Path("scripts/bootstrap.py").exists():
    root = pathlib.Path("/content")
    repo_dir = root / REPO_NAME

    if not repo_dir.exists():
        print(f"Cloning {REPO_URL} …")
        # Clone into an explicit destination. Without it, git clones into the
        # *current* working directory, so the chdir below would fail whenever
        # this cell is run from anywhere other than /content.
        subprocess.run(["git", "clone", REPO_URL, str(repo_dir)], check=True)

    os.chdir(repo_dir)
    print("Changed working directory to:", os.getcwd())

# 2. Now try to locate and run scripts/bootstrap.py, looking in the current
#    directory first and then up to two levels above it.
for p in ["scripts/bootstrap.py", "../scripts/bootstrap.py", "../../scripts/bootstrap.py"]:
    if pathlib.Path(p).exists():
        print(f"Bootstrapping via: {p}")
        runpy.run_path(p)
        break
else:
    # for/else: reached only when no candidate path existed (no `break`).
    print("⚠️ scripts/bootstrap.py not found – "
          "please check that the FB2NEP repository is available.")


In [None]:
import pandas as pd

# Main synthetic cohort used in all FB2NEP workbooks.
# The path is relative to the current working directory.
cohort_csv = "data/synthetic/fb2nep.csv"
df = pd.read_csv(cohort_csv)

# Show the first rows as the cell output for a quick sanity check
df.head()

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
%matplotlib inline

## 1. Linear regression with missing covariates

In [None]:
# Keep only the modelling variables that are actually present in the data.
wanted = ("SBP", "BMI", "age", "sex", "smoking_status")
cols = [name for name in wanted if name in df.columns]
df_model = df[cols].copy()

# Store the categorical covariates with the pandas "category" dtype (when present).
for cat_col in ("sex", "smoking_status"):
    if cat_col in df_model.columns:
        df_model[cat_col] = df_model[cat_col].astype("category")

# Proportion of missing values per column (displayed as the cell output).
df_model.isna().mean()

## 2. Complete-case analysis

In [None]:
# Complete-case data: drop every row with a missing value in any model column.
df_cc = df_model.dropna()
print(f"Number of complete cases: {len(df_cc)}")

# Build the model formula: BMI and age are always included; the categorical
# covariates are added (wrapped in C(...)) only when the column is present.
rhs_terms = ["BMI", "age"]
rhs_terms += [
    f"C({name})" for name in ("sex", "smoking_status") if name in df_cc.columns
]
formula = "SBP ~ " + " + ".join(rhs_terms)

# Fit ordinary least squares on the complete cases and show the coefficients.
model_cc = smf.ols(formula, data=df_cc).fit()
model_cc.params

## 3. Simple single imputation

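For illustration, we replace missing values with the mean (for numeric variables)
or the most frequent category (for categorical variables). This is not recommended
for real analyses, because every imputed value is treated as if it had been observed,
which understates variability and uncertainty, but it highlights the impact of a naïve approach.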

In [None]:
# Single imputation: work on a copy so df_model keeps its missing values.
df_si = df_model.copy()
for name in df_si.columns:
    series = df_si[name]
    # Numeric dtypes (bool, int, unsigned, float, complex) are filled with the
    # column mean; everything else with the most frequent value.
    is_numeric = series.dtype.kind in "biufc"
    fill_value = series.mean() if is_numeric else series.mode().iloc[0]
    df_si[name] = series.fillna(fill_value)

# Refit the same formula on the imputed data and put both sets of
# coefficients side by side for comparison.
model_si = smf.ols(formula, data=df_si).fit()
comparison = {"complete_case": model_cc.params, "single_impute": model_si.params}
pd.DataFrame(comparison)