# This notebook:
- creates a synthetic bioinformatics-style dataset (continuous outcome, ≥3 predictors, ≥1 categorical predictor)

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Seeded generator so every run reproduces the same dataset.
# NOTE: the order of the draws below (age, bmi, sex, batch, smoking, eps)
# determines the generated values; do not reorder them.
rng = np.random.default_rng(seed=42)

# ---- Synthetic bioinformatics-style dataset ----
n = 160  # number of simulated samples

# Continuous covariates: normal draws truncated to a fixed range.
age = np.clip(rng.normal(loc=45, scale=12, size=n), 18, 80)
bmi = np.clip(rng.normal(loc=27, scale=5, size=n), 16, 45)

# Categorical covariates, sampled with unequal class probabilities.
sex = rng.choice(["F", "M"], size=n, p=[0.55, 0.45])
batch = rng.choice(["A", "B", "C"], size=n, p=[0.45, 0.35, 0.20])

# Binary 0/1 smoking indicator.
smoking = rng.choice([0, 1], size=n, p=[0.78, 0.22])

# ---- True coefficients of the outcome model (log2 metabolite) ----
beta0 = 6.2          # intercept
beta_age = 0.015     # slope for age
beta_bmi = 0.040     # slope for BMI
beta_smoke = 0.35    # shift when smoking == 1
beta_sexM = -0.12    # shift for sex == "M" ("F" is the reference level)
batch_effect = {"A": 0.00, "B": 0.25, "C": -0.18}  # additive shift per batch

# ---- Noise: mildly heteroscedastic ----
# The standard deviation is 0.25 for samples at or below the sample-mean BMI
# and increases by 0.01 for each BMI unit above it.
bmi_above_mean = np.clip(bmi - bmi.mean(), 0, None)
sigma = 0.25 + 0.01 * bmi_above_mean
eps = rng.normal(loc=0, scale=sigma, size=n)

# ---- Outcome ----
# Encode the categorical predictors numerically, then accumulate the terms
# one at a time in a fixed order.
male_indicator = (sex == "M").astype(float)
batch_shift = np.array([batch_effect[level] for level in batch])

log2_metabolite = beta0 + beta_age * age
log2_metabolite = log2_metabolite + beta_bmi * bmi
log2_metabolite = log2_metabolite + beta_smoke * smoking
log2_metabolite = log2_metabolite + beta_sexM * male_indicator
log2_metabolite = log2_metabolite + batch_shift
log2_metabolite = log2_metabolite + eps

# ---- Assemble the table: outcome first, then the predictors ----
df = pd.DataFrame(
    dict(
        log2_metabolite=log2_metabolite,
        age=age,
        bmi=bmi,
        smoking=smoking,
        sex=sex,
        batch=batch,
    )
)

# Preview of the first rows.
df.head()

# Persist the simulated dataset to disk as CSV. The path is relative to the
# current working directory.
# NOTE: to_csv is called with its defaults, so the DataFrame index is written
# as a leading unnamed column in the file.
out_file_name = '../data/simulated_data_for_MLR_2.csv'
df.to_csv(path_or_buf=out_file_name)