## Generate synthetic data based on MIMIC-III

References: https://www.nature.com/articles/sdata201635 ; https://physionet.org/content/mimiciii/1.4/

In [None]:
import numpy as np
import pandas as pd

# Parameters
n_patients = 1000
n_timepoints = 48  # hourly measurements
time_index = [f"t{i}" for i in range(n_timepoints)]

# Baseline (mean) value for every simulated feature. Defined once here so the
# table is not rebuilt for each patient/feature pair inside the loop below.
feature_means = {
    "heart_rate": 75,
    "systolic_bp": 120,
    "diastolic_bp": 80,
    "resp_rate": 18,
    "temperature": 98.6,
    "spo2": 97,
    "glucose": 100,
    "creatinine": 1.0,
    "wbc": 7.0,
    "hematocrit": 42,
}

# Upper limits applied after sampling. SpO2 is a percentage, so a draw above
# 100 is not a possible reading; features not listed here have no ceiling.
feature_upper_bounds = {"spo2": 100}

# Features to simulate (derived from the table above so the two cannot drift;
# this order is also the column order of the resulting DataFrame)
features = list(feature_means)

# Demographic categories
gender_choices = ['Male', 'Female']
race_choices = ['White', 'Black', 'Hispanic', 'Asian', 'Other']

# Simulate data: one dict per patient, holding the demographics followed by
# one "<feature>_t<i>" entry per feature and timepoint.
data = []
for patient_id in range(n_patients):
    patient_data = {
        "patient_id": patient_id,
        "age": np.random.randint(18, 90),  # upper bound is exclusive: 18..89
        "gender": np.random.choice(gender_choices),
        "race": np.random.choice(race_choices),
    }

    # Simulate time-series features: independent Gaussian draws around the
    # feature's baseline with a standard deviation of 10% of that baseline.
    for feature in features:
        mean = feature_means[feature]
        std = mean * 0.1
        values = np.clip(
            np.random.normal(loc=mean, scale=std, size=n_timepoints),
            0,  # no negative measurements
            feature_upper_bounds.get(feature),  # None -> no upper clipping
        )
        for t, v in zip(time_index, values):
            patient_data[f"{feature}_{t}"] = v

    data.append(patient_data)

# Convert to DataFrame (one row per patient, wide format)
df = pd.DataFrame(data)

# Add readmission outcome (probability based on demographics)
def simulate_readmission(row):
    """Draw a synthetic 0/1 readmission flag for one patient record.

    The readmission probability starts at 0.15 and is raised by 0.05 when
    ``row['age']`` is above 70, by 0.02 when ``row['gender']`` is ``'Male'``
    and by 0.03 when ``row['race']`` is ``'Black'`` or ``'Hispanic'``. A
    single uniform draw from NumPy's global random state decides the outcome.

    Args:
        row: Record indexable by the keys ``'age'``, ``'gender'`` and
            ``'race'`` (for example a DataFrame row).

    Returns:
        ``1`` when the uniform draw is below the probability, otherwise ``0``.
    """
    # (condition, probability increment) pairs, applied in this order.
    risk_adjustments = (
        (row['age'] > 70, 0.05),
        (row['gender'] == 'Male', 0.02),
        (row['race'] in ('Black', 'Hispanic'), 0.03),
    )

    probability = 0.15
    for applies, bump in risk_adjustments:
        if applies:
            probability += bump

    readmitted = np.random.rand() < probability
    return 1 if readmitted else 0

# Label every patient by running the readmission simulator once per row
readmission_flags = df.apply(simulate_readmission, axis="columns")
df["readmitted"] = readmission_flags

# Preview the first rows of the finished dataset
print(df.head())

3