In [1]:
# data wrangling
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt

import time

# sklearn
from sklearn.model_selection import train_test_split
import xgboost as xgb
import graphviz

# local files
import prediction_metrics as pm

***Load Data***

In [2]:
DATA_FILEPATH = "../data/all_hourly_data.h5"

patients = pd.read_hdf(DATA_FILEPATH, "patients")
vitals_labs_mean = pd.read_hdf(DATA_FILEPATH, "vitals_labs_mean")
interventions = pd.read_hdf(DATA_FILEPATH, "interventions")

***prepare data***

In [3]:
"""
Task Formulation: predict whether a patient will die, given the first 24 hours of their stay
"""

# SETTINGS
window_size = 24  # the first WINDOW_SIZE hours of the patient's stay
gap_time = 6  # the number of hours the patient lived at least after the first WINDOW_SIZE hours (to avoid label leakage, see MIMIC-III Extract paper)
test_size = 0.2  # proportion of the data that wil lbe used for testing
val_size = 0.125  # proportion of the training data that will be used for validation
random_state = 42  # random state is used to set a seed for randomness, which is only relevant for reproducibility purposes
max_missing = 0.8  # maximum percentage of missing values after forward fill for a measurement to be dropped

In [4]:
patients_new = patients.reset_index(level=["hadm_id", "icustay_id"], drop=True)
interventions_new = interventions.reset_index(level=["hadm_id", "icustay_id"], drop=True)
vitals_labs_mean_new = vitals_labs_mean.reset_index(level=["hadm_id", "icustay_id"], drop=True)
vitals_labs_mean_new = vitals_labs_mean_new.replace(np.nan,0)

vitals_labs_mean_new = vitals_labs_mean_new.droplevel(
    level="Aggregation Function", axis=1
)

patients_new = interventions_new.join(patients_new, how='inner')
print(patients_new.shape)

(2200954, 42)


In [5]:
interventions_new = interventions_new.groupby(level=["subject_id"]).cumsum()
vitals_labs_mean_new = vitals_labs_mean_new.groupby(level=["subject_id"]).cumsum()

In [6]:
start_time = time.time()

"""PREPARE VITALS LABES AND INTERVENTIONS"""
# get target variable (died in ICU) of patients that stayed at least GAP_TIME + WINDOW_SIZE hours in the ICU
y = patients_new["hospital_expire_flag"]

"""ADD DEMOGRAPHICS"""
X = pd.get_dummies(patients_new[["gender"]], drop_first=True)
X["age"] = patients_new["age"]
X = X.join(pd.get_dummies(patients_new["ethnicity"], drop_first=True))

X = interventions_new.join(X, how="inner")
X = X.join(vitals_labs_mean_new)

"""SPLIT DATA"""
# define train/test split based on index
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_size, random_state=random_state
)

"""RESET INDEX"""
y_train = y_train.reindex(X_train.index)
y_test = y_test.reindex(X_test.index)
y_val = y_val.reindex(X_val.index)

"""PRINT STATS"""
print("Time: %.2fs" % (time.time() - start_time))
print("Original set: %s rows, %s columns" % interventions.shape)
print("Train set: %s rows, %s columns" % X_train.shape)
print("Validation set: %s rows, %s columns" % X_val.shape)
print("Test set: %s rows, %s columns" % X_test.shape)

Time: 26.07s
Original set: 2200954 rows, 14 columns
Train set: 1540667 rows, 160 columns
Validation set: 220096 rows, 160 columns
Test set: 440191 rows, 160 columns


***train linear regression model***

In [7]:
XGB_model = xgb.XGBRegressor(max_depth=100, n_estimators=10)
XGB_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=100, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=10, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

***test linear regression model***

In [8]:
y_true = y_test.to_list()
y_pred = np.round(XGB_model.predict(X_test))

total_df = patients_new.loc[np.intersect1d(patients_new.index, X_test.index)]
total_df["true"] = y_true
total_df["pred"] = y_pred

genders = total_df.groupby("gender").apply(pm.recall_df)
print(genders)

ethnicity = total_df.groupby("ethnicity").apply(pm.recall_df)
ethnicity = pd.DataFrame(ethnicity)
ethnicity["count"] = total_df[["true", "ethnicity"]].groupby("ethnicity").count()
print(ethnicity)

print("TP:", pm.TP(y_true, y_pred)/X_test.shape[0])
print("FP:", pm.FP(y_true, y_pred)/X_test.shape[0])
print("FN:", pm.FN(y_true, y_pred)/X_test.shape[0])
print("TN:", pm.TN(y_true, y_pred)/X_test.shape[0])
print("Recall:", pm.recall(y_true, y_pred))
print("FPR:", pm.FPR(y_true, y_pred))

gender
F    0.978822
M    0.980681
dtype: float64
                                                           0   count
ethnicity                                                           
AMERICAN INDIAN/ALASKA NATIVE                       0.875000     201
AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNI...  0.500000      12
ASIAN                                               0.986022    7056
ASIAN - ASIAN INDIAN                                0.975610     659
ASIAN - CAMBODIAN                                   0.950000     139
ASIAN - CHINESE                                     0.981132    1986
ASIAN - FILIPINO                                    1.000000     210
ASIAN - JAPANESE                                    1.000000      79
ASIAN - KOREAN                                      1.000000     116
ASIAN - OTHER                                       1.000000     113
ASIAN - THAI                                        1.000000      37
ASIAN - VIETNAMESE                                  1