In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the whole table is reproducible.
# NOTE: every draw below consumes the same random stream, so the order of the
# np.random.* calls must not change if the generated values are to stay the same.
np.random.seed(42)
N = 500  # number of simulated records

# -----------------------------
# 1. DEMOGRAPHICS
# -----------------------------
# Age: normal(55, 12) bounded to [20, 85], then truncated to an integer.
age = np.random.normal(loc=55, scale=12, size=N).clip(20, 85).astype(int)
# Sex: single-trial binomial with p = 0.55 (1 = male, 0 = female).
sex = np.random.binomial(n=1, p=0.55, size=N)

# -----------------------------
# 2. BASELINE VITALS (CORRELATED)
# -----------------------------
# Resting heart rate: the mean shifts by 0.15 per unit of age away from 50.
resting_hr_mean = 70 + 0.15 * (age - 50)
resting_hr = np.random.normal(loc=resting_hr_mean, scale=8, size=N).clip(50, 105)

# Systolic pressure: the mean shifts by 0.4 per unit of age away from 50.
systolic_bp_mean = 120 + 0.4 * (age - 50)
systolic_bp = np.random.normal(loc=systolic_bp_mean, scale=12, size=N).clip(90, 170)

# Diastolic pressure: 65% of the systolic value plus independent noise.
diastolic_noise = np.random.normal(loc=0, scale=6, size=N)
diastolic_bp = (systolic_bp * 0.65 + diastolic_noise).clip(55, 105)

# -----------------------------
# 3. ECG FEATURES
# -----------------------------
# HRV (RMSSD): decreases with both resting heart rate and age.
hrv_trend = 60 - 0.35 * (resting_hr - 60) - 0.25 * (age - 40)
hrv_rmssd = (hrv_trend + np.random.normal(loc=0, scale=6, size=N)).clip(10, 65)

# QTc interval: increases with both age and resting heart rate.
qtc_trend = 410 + 0.6 * (age - 50) + 0.4 * (resting_hr - 70)
qtc_interval = (qtc_trend + np.random.normal(loc=0, scale=12, size=N)).clip(360, 520)

# -----------------------------
# 4. BASELINE CARDIAC STRUCTURE
# -----------------------------
# Baseline LVEF: decreases with age and with systolic pressure above 120.
lvef_trend = 65 - 0.15 * (age - 50) - 0.08 * (systolic_bp - 120)
baseline_lvef = (lvef_trend + np.random.normal(loc=0, scale=3, size=N)).clip(40, 70)

# -----------------------------
# 5. DOXORUBICIN EXPOSURE
# -----------------------------
# randint's upper bound is exclusive, so cycle counts range over 4..8.
num_cycles = np.random.randint(low=4, high=9, size=N)
dose_per_cycle = np.random.uniform(low=50, high=75, size=N)
cumulative_dose = num_cycles * dose_per_cycle

# -----------------------------
# 6. DOSE–RESPONSE FUNCTIONS
# -----------------------------
def dose_effect(dose, steepness=0.03, midpoint=380):
    """Logistic dose-response curve mapping a dose onto the interval [0, 1].

    Evaluates ``1 / (1 + exp(-steepness * (dose - midpoint)))`` element-wise.

    Parameters
    ----------
    dose : scalar or numpy.ndarray
        Dose value(s) at which to evaluate the curve.
    steepness : float, default 0.03
        Multiplier applied to the distance from ``midpoint``; larger values
        give a sharper transition.
    midpoint : float, default 380
        Dose at which the curve equals 0.5.

    Returns
    -------
    scalar or numpy.ndarray
        Value(s) between 0 and 1, matching the shape of ``dose``.
    """
    exponent = -steepness * (dose - midpoint)
    # For doses far below the midpoint, exp() overflows to +inf and the result
    # becomes 1 / inf == 0.0, which is the correct limiting value. Suppress only
    # the overflow signal here so extreme inputs do not emit a spurious
    # RuntimeWarning (or raise under a strict errstate) while the returned
    # values stay exactly the same.
    with np.errstate(over="ignore"):
        return 1 / (1 + np.exp(exponent))

# Map each cumulative dose onto the 0-1 dose-response curve.
dose_risk = dose_effect(cumulative_dose)

# -----------------------------
# 7. CARDIAC RESPONSE
# -----------------------------
# NOTE: the three noise draws below must stay in this order (LVEF, QTc, HRV)
# to reproduce the same random stream.

# LVEF change: falls with dose risk and with age, bounded to [-25, 2].
lvef_shift = -(2 + 10 * dose_risk) - 0.05 * (age - 50)
delta_lvef = (lvef_shift + np.random.normal(loc=0, scale=1.5, size=N)).clip(-25, 2)

# QTc change: rises with dose risk, bounded to [0, 90].
qtc_shift = 5 + 35 * dose_risk
qtc_change = (qtc_shift + np.random.normal(loc=0, scale=6, size=N)).clip(0, 90)

# HRV change: falls with dose risk, bounded to [-40, 2].
hrv_shift = -(3 + 18 * dose_risk)
hrv_change = (hrv_shift + np.random.normal(loc=0, scale=4, size=N)).clip(-40, 2)

# Post-treatment values are the baseline plus the simulated change.
post_lvef = baseline_lvef + delta_lvef
post_qtc = qtc_interval + qtc_change

# -----------------------------
# 8. RISK LABELS (CLINICAL LOGIC)
# -----------------------------
# Conditions are listed from most to least severe; np.select returns the
# choice of the first condition that holds, so "High" wins over "Moderate".
high_mask = (delta_lvef <= -10) | (post_qtc >= 500)
moderate_mask = (delta_lvef <= -5) | (post_qtc >= 480)
risk_label = np.select(
    [high_mask, moderate_mask],
    ["High", "Moderate"],
    default="Low",
)

# -----------------------------
# 9. FINAL DATAFRAME
# -----------------------------
# Assemble every generated feature into one table; keyword order fixes the
# column order. The baseline QTc array is exposed under the name "qtc_baseline".
df = pd.DataFrame(dict(
    age=age,
    sex=sex,
    resting_hr=resting_hr,
    systolic_bp=systolic_bp,
    diastolic_bp=diastolic_bp,
    hrv_rmssd=hrv_rmssd,
    qtc_baseline=qtc_interval,
    baseline_lvef=baseline_lvef,
    num_cycles=num_cycles,
    dose_per_cycle=dose_per_cycle,
    cumulative_dose=cumulative_dose,
    delta_lvef=delta_lvef,
    qtc_change=qtc_change,
    hrv_change=hrv_change,
    post_lvef=post_lvef,
    post_qtc=post_qtc,
    risk_label=risk_label,
))

# Quick sanity check: the first rows, then the class balance of the label.
label_counts = df["risk_label"].value_counts()
print(df.head(), label_counts, sep="\n")


   age  sex  resting_hr  systolic_bp  diastolic_bp  hrv_rmssd  qtc_baseline  \
0   60    0   70.163055   107.834485     68.539639  45.911772    391.055867   
1   53    0   71.623709   109.540632     80.793294  46.657957    433.145844   
2   62    1   81.452072   139.204967     93.848745  48.235379    418.331458   
3   73    0   66.914515   121.317269     77.083343  49.745986    426.013753   
4   52    0   73.249386   108.237068     74.535821  48.032289    411.953608   

   baseline_lvef  num_cycles  dose_per_cycle  cumulative_dose  delta_lvef  \
0      65.990009           7       50.154596       351.082169   -7.203461   
1      69.854089           7       70.267089       491.869625   -9.065603   
2      68.477952           8       73.709239       589.673911  -13.681388   
3      60.231426           7       52.283143       365.982004   -8.118581   
4      67.115323           8       54.036192       432.289537  -11.689548   

   qtc_change  hrv_change  post_lvef    post_qtc risk_label  


In [2]:
# Write the generated table to risk.csv; index=False leaves the row index out of the file.
df.to_csv("risk.csv", index=False)


In [3]:
# Preview the first 10 rows of the table.
df.head(10)

Unnamed: 0,age,sex,resting_hr,systolic_bp,diastolic_bp,hrv_rmssd,qtc_baseline,baseline_lvef,num_cycles,dose_per_cycle,cumulative_dose,delta_lvef,qtc_change,hrv_change,post_lvef,post_qtc,risk_label
0,60,0,70.163055,107.834485,68.539639,45.911772,391.055867,65.990009,7,50.154596,351.082169,-7.203461,10.939169,-3.682422,58.786547,401.995036,Moderate
1,53,0,71.623709,109.540632,80.793294,46.657957,433.145844,69.854089,7,70.267089,491.869625,-9.065603,38.660997,-19.114306,60.788486,471.806841,Moderate
2,62,1,81.452072,139.204967,93.848745,48.235379,418.331458,68.477952,8,73.709239,589.673911,-13.681388,47.162785,-26.740911,54.796563,465.494244,High
3,73,0,66.914515,121.317269,77.083343,49.745986,426.013753,60.231426,7,52.283143,365.982004,-8.118581,19.968793,-7.387475,52.112846,445.982546,Moderate
4,52,0,73.249386,108.237068,74.535821,48.032289,411.953608,67.115323,8,54.036192,432.289537,-11.689548,37.727596,-15.952058,55.425775,449.681204,High
5,52,1,67.15329,127.239833,80.702975,55.557274,404.97048,65.830094,4,74.018658,296.07463,-1.123555,5.949466,-5.644399,64.706539,410.919945,Low
6,73,1,73.679959,143.42845,100.26724,43.681934,418.433988,60.262169,4,63.438504,253.754014,-5.251717,8.48774,2.0,55.010453,426.921728,Moderate
7,64,1,82.327615,134.22744,89.465689,44.555404,427.285151,61.465269,4,50.354916,201.419663,-2.713234,0.0,-5.55713,58.752035,427.285151,Low
8,49,1,71.378793,131.552572,84.865359,63.808135,391.745433,65.53372,5,56.4408,282.204,-2.853472,3.769315,-4.585017,62.680248,395.514748,Low
9,61,1,72.021492,115.318459,77.6433,58.585244,426.415547,56.127785,5,61.248592,306.242959,-1.981375,7.835507,-1.700474,54.14641,434.251054,Low


In [4]:
# Count missing values in each column.
df.isnull().sum()

age                0
sex                0
resting_hr         0
systolic_bp        0
diastolic_bp       0
hrv_rmssd          0
qtc_baseline       0
baseline_lvef      0
num_cycles         0
dose_per_cycle     0
cumulative_dose    0
delta_lvef         0
qtc_change         0
hrv_change         0
post_lvef          0
post_qtc           0
risk_label         0
dtype: int64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,sex,resting_hr,systolic_bp,diastolic_bp,hrv_rmssd,qtc_baseline,baseline_lvef,num_cycles,dose_per_cycle,cumulative_dose,delta_lvef,qtc_change,hrv_change,post_lvef,post_qtc
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,54.552,0.538,71.223155,122.572496,79.655038,52.425697,413.002769,63.940504,6.086,62.551523,381.262828,-7.17144,21.845005,-11.937456,56.769063,434.847775
std,11.587573,0.499053,8.138694,12.427279,9.709388,7.087537,14.562178,3.527632,1.425014,7.256092,101.991558,4.206211,14.42612,8.307276,5.550128,19.568375
min,20.0,0.0,50.0,90.0,55.0,27.874604,369.642874,51.319374,4.0,50.154596,200.908703,-15.949526,0.0,-32.498625,41.650522,369.642874
25%,46.0,0.0,66.101958,114.140862,73.316921,47.372485,402.340434,61.719723,5.0,56.317718,292.499528,-10.966694,8.941273,-18.965084,52.863034,420.79357
50%,55.0,1.0,71.32795,122.090383,79.495952,52.628973,413.153217,64.038789,6.0,62.587605,369.785938,-7.083255,20.387358,-12.304944,56.903397,434.200172
75%,62.0,1.0,76.459948,130.547515,85.791882,57.496965,423.966378,66.360031,7.0,68.502673,460.111634,-3.495098,34.967155,-4.606862,60.754736,448.727787
max,85.0,1.0,94.027325,160.317291,104.168462,65.0,454.661229,70.0,8.0,74.983758,597.235359,2.0,52.36627,2.0,72.0,491.318201


In [10]:
# List the column labels of the table.
df.columns

Index(['age', 'sex', 'resting_hr', 'systolic_bp', 'diastolic_bp', 'hrv_rmssd',
       'qtc_baseline', 'baseline_lvef', 'num_cycles', 'dose_per_cycle',
       'cumulative_dose', 'delta_lvef', 'qtc_change', 'hrv_change',
       'post_lvef', 'post_qtc', 'risk_label'],
      dtype='object')