In [None]:
# Author: Hayden Hedman
# Purpose: Simulate panel data for diabetes treatment impact study
# Date: 2025-02-10

In [9]:
# Imports and setup
import numpy as np
import pandas as pd
import os

In [15]:
# Define panel simulation function for creating simulated patient data
def simulate_diabetes_panel(n_patients=7528, n_months=144, seed=64):
    """Simulate a long-format patient-by-month panel for a diabetes treatment study.

    One row is emitted per patient per month, i.e. ``n_patients * n_months``
    rows, ordered by patient and then by month.

    Per-patient draws (repeated on every row of that patient):
      * ``age``: Normal(68, 10), truncated to an integer.
      * ``sex``: Bernoulli(0.5).
      * ``comorb_idx``: Poisson(1).
      * ``hypertension``, ``copd``, ``ckd``, ``depression``, ``obesity``:
        five independent Bernoulli(0.3) flags.
      * ``start_month``: one of the months 12..47 (probability 0.02 each) or
        the sentinel 999, meaning "never treated" (probability 0.28).
      * ``baseline_a1c``: Normal(8, 1).
      * a per-month treatment slope: -0.3 + Normal(0, 0.05) (not stored).

    Per-row values:
      * ``treat`` / ``post_treatment``: 1 once ``month >= start_month`` for a
        treated patient, else 0 (the two columns are identical).
      * ``a1c``: ``baseline_a1c + slope * max(0, month - start_month)
        + Normal(0, 0.5)``.
      * ``hosp``: Bernoulli(0.03) while untreated, Bernoulli(0.02) once treated.
      * ``time_since_tx``: ``month - start_month`` for treated patients
        (negative before treatment starts), 0 for never-treated patients.
      * ``cohort_month``: a copy of ``month``.

    Parameters
    ----------
    n_patients : int
        Number of simulated patients; ids run from 1 to ``n_patients``.
    n_months : int
        Number of monthly observations per patient; months run from 0.
    seed : int
        Seed for the random number generator.

    Returns
    -------
    pandas.DataFrame
        The simulated panel with the 19 columns listed in ``columns`` below.

    Notes
    -----
    Randomness comes from a private ``np.random.RandomState(seed)`` rather
    than ``np.random.seed(seed)``, so calling this function no longer reseeds
    NumPy's global generator. ``RandomState`` is the same legacy generator
    that backs the ``np.random.*`` functions, and the draws are made in the
    same order as before, so the simulated values for a given seed are
    unchanged.

    NOTE(review): the treatment effect is applied as an unbounded linear
    decline, so ``a1c`` becomes negative once a patient has been treated for
    long enough (roughly 27 months at the mean baseline and slope). Confirm
    whether a floor or a level shift was intended before relying on long
    panels.
    NOTE(review): ``time_since_tx`` is 0 both for never-treated patients and
    for treated patients in their start month; confirm downstream code
    separates them via ``treated_group``.
    """
    # Private generator: same legacy stream as np.random.seed(seed), but the
    # process-wide NumPy RNG state is left untouched.
    rng = np.random.RandomState(seed)

    ids = np.arange(1, n_patients + 1)

    # Patient-level covariates. The draw order matters for reproducibility.
    age = rng.normal(68, 10, n_patients).astype(int)
    sex = rng.binomial(1, 0.5, n_patients)
    comorb_idx = rng.poisson(1, n_patients)
    comorbidities = rng.binomial(1, 0.3, size=(n_patients, 5))

    # Treatment start distribution, built once instead of once per patient.
    never_treated = 999
    start_options = list(range(12, 48)) + [never_treated]
    start_probs = [0.02] * 36 + [0.28]

    rows = []
    for pid, a, s, c, comorbs in zip(ids, age, sex, comorb_idx, comorbidities):
        start = rng.choice(start_options, p=start_probs)
        baseline_a1c = rng.normal(8, 1)
        treated_group = int(start < never_treated)
        tx_effect = -0.3 + rng.normal(0, 0.05)

        for t in range(n_months):
            treat = int(treated_group and t >= start)
            months_since_tx = t - start if treated_group else 0
            # max(0, ...) keeps the slope switched off before treatment starts.
            a1c = baseline_a1c + (tx_effect * max(0, months_since_tx)) + rng.normal(0, 0.5)
            # Hospitalisation risk is one percentage point higher while untreated.
            hosp = rng.binomial(1, 0.02 + 0.01 * (1 - treat))

            rows.append([
                pid, t, a, s, c, *comorbs, treat, baseline_a1c,
                a1c, hosp, start, treated_group, treat,
                months_since_tx, t
            ])

    columns = [
        "patient_id", "month", "age", "sex", "comorb_idx",
        "hypertension", "copd", "ckd", "depression", "obesity",
        "treat", "baseline_a1c", "a1c", "hosp", "start_month",
        "treated_group", "post_treatment", "time_since_tx", "cohort_month"
    ]

    return pd.DataFrame(rows, columns=columns)

In [16]:
# Generate the simulated panel and write it to the shared data folder.
# The project root is taken to be the parent of the current working directory,
# so the CSV is written to "<parent of cwd>/data".
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = os.path.join(project_root, "data")
save_path = os.path.join(data_dir, "simulated_diabetes_patient_data.csv")

# Create the output folder if it does not exist yet.
os.makedirs(data_dir, exist_ok=True)

# Simulate with the default settings and write the result without the index.
df = simulate_diabetes_panel()
df.to_csv(save_path, index=False)

print(f"Confirmed: simulated diabetes patient dataset saved to: {save_path}")


Confirmed: simulated diabetes patient dataset saved to: C:\Users\hayde\Desktop\diabetes\data\simulated_diabetes_patient_data.csv
