In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay

import lightgbm as lgb
import matplotlib.pyplot as plt


In [6]:
df = pd.read_csv("risk.csv")

In [3]:
pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------------------------------ --- 1.3/1.5 MB 10.6 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 7.0 MB/s  0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
import lightgbm as lgb

In [8]:
print(lgb.__version__)

4.6.0


In [9]:

print(df.shape)
df.head()


(500, 17)


Unnamed: 0,age,sex,resting_hr,systolic_bp,diastolic_bp,hrv_rmssd,qtc_baseline,baseline_lvef,num_cycles,dose_per_cycle,cumulative_dose,delta_lvef,qtc_change,hrv_change,post_lvef,post_qtc,risk_label
0,60,0,70.163055,107.834485,68.539639,45.911772,391.055867,65.990009,7,50.154596,351.082169,-7.203461,10.939169,-3.682422,58.786547,401.995036,Moderate
1,53,0,71.623709,109.540632,80.793294,46.657957,433.145844,69.854089,7,70.267089,491.869625,-9.065603,38.660997,-19.114306,60.788486,471.806841,Moderate
2,62,1,81.452072,139.204967,93.848745,48.235379,418.331458,68.477952,8,73.709239,589.673911,-13.681388,47.162785,-26.740911,54.796563,465.494244,High
3,73,0,66.914515,121.317269,77.083343,49.745986,426.013753,60.231426,7,52.283143,365.982004,-8.118581,19.968793,-7.387475,52.112846,445.982546,Moderate
4,52,0,73.249386,108.237068,74.535821,48.032289,411.953608,67.115323,8,54.036192,432.289537,-11.689548,37.727596,-15.952058,55.425775,449.681204,High


In [10]:
df["risk_label"].value_counts()


risk_label
Low         191
High        164
Moderate    145
Name: count, dtype: int64

In [11]:
le = LabelEncoder()
df["risk_encoded"] = le.fit_transform(df["risk_label"])

print(le.classes_)


['High' 'Low' 'Moderate']


In [12]:
drop_cols = [
    "risk_label",
    "delta_lvef",
    "qtc_change",
    "hrv_change",
    "post_lvef",
    "post_qtc"
]

X = df.drop(columns=drop_cols)
y = df["risk_encoded"]

print(X.shape)


(500, 12)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [14]:
model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)


In [15]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=cv,
    scoring="roc_auc_ovr"
)

print("CV AUC scores:", scores)
print("Mean CV AUC:", scores.mean())


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000849 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 915
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 12
[LightGBM] [Info] Start training from score -1.114361
[LightGBM] [Info] Start training from score -0.956137
[LightGBM] [Info] Start training from score -1.246532
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000676 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 911
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 12
[LightGBM] [Info] Start training from score -1.123930
[LightGBM] [Info] Start training from score -0.956137
[LightGBM] [Info] Start training from score -1.235722
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000118 seconds.


In [16]:
import pandas as pd

corr = df.corr(numeric_only=True)["risk_encoded"].sort_values(ascending=False)
print(corr)


risk_encoded       1.000000
delta_lvef         0.449761
hrv_change         0.384383
post_lvef          0.308389
sex                0.042006
qtc_baseline       0.040502
age                0.027611
systolic_bp       -0.017649
diastolic_bp      -0.021403
resting_hr        -0.044777
baseline_lvef     -0.051081
hrv_rmssd         -0.061254
dose_per_cycle    -0.224329
post_qtc          -0.266625
num_cycles        -0.363863
qtc_change        -0.402549
cumulative_dose   -0.437702
Name: risk_encoded, dtype: float64
