In [1]:
# data wrangling
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt

import time

# sklearn
from sklearn.model_selection import train_test_split
import xgboost as xgb
import graphviz

# local files
import prediction_metrics as pm

***Load Data***

In [2]:
DATA_FILEPATH = "../data/all_hourly_data.h5"

patients = pd.read_hdf(DATA_FILEPATH, "patients")
vitals_labs_mean = pd.read_hdf(DATA_FILEPATH, "vitals_labs_mean")
interventions = pd.read_hdf(DATA_FILEPATH, "interventions")

***prepare data***

In [3]:
"""
Task Formulation: predict whether a patient will die, given the first 24 hours of their stay
"""

# SETTINGS
window_size = 24  # the first WINDOW_SIZE hours of the patient's stay
gap_time = 6  # the number of hours the patient lived at least after the first WINDOW_SIZE hours (to avoid label leakage, see MIMIC-III Extract paper)
test_size = 0.2  # proportion of the data that wil lbe used for testing
val_size = 0.125  # proportion of the training data that will be used for validation
random_state = 42  # random state is used to set a seed for randomness, which is only relevant for reproducibility purposes
max_missing = 0.8  # maximum percentage of missing values after forward fill for a measurement to be dropped

In [4]:
interventions_new = interventions.reset_index(level=["hadm_id", "icustay_id"], drop=True)
interventions_new = interventions_new.groupby(level="subject_id").max()

vitals_labs_mean_new = vitals_labs_mean.reset_index(level=["hadm_id", "icustay_id"], drop=True)
vitals_labs_mean_new = vitals_labs_mean_new.groupby(level="subject_id").sum()

vitals_labs_mean_new = vitals_labs_mean_new.droplevel(
    level="Aggregation Function", axis=1
)
vitals_labs_mean_new.head()
# new_columns = []
# for column in vitals_labs_mean_new.columns:
#     new_columns.append(column[0])
# vitals_labs_mean_new.columns = new_columns

LEVEL2,alanine aminotransferase,albumin,albumin ascites,albumin pleural,albumin urine,alkaline phosphate,anion gap,asparate aminotransferase,basophils,bicarbonate,...,total protein,total protein urine,troponin-i,troponin-t,venous pvo2,weight,white blood cell count,white blood cell count urine,ph,ph urine
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,45.0,1.8,0.0,0.0,0.0,162.0,131.666667,136.0,0.3,167.333333,...,0.0,0.0,45.0,0.0,35.0,214.0,117.142857,35.0,199.272,20.0
4,24.0,2.8,0.0,0.0,0.0,837.0,30.0,64.0,0.3,42.0,...,0.0,0.0,0.0,0.0,0.0,53.599998,17.0,0.0,7.47,7.0
6,50.0,6.1,0.0,0.0,0.0,209.0,99.0,198.0,0.1,77.0,...,5.2,0.0,0.0,0.0,0.0,0.0,47.7,0.0,0.0,0.0
9,25.0,5.9,0.0,0.0,0.0,133.0,88.0,34.0,1.2,169.0,...,0.0,0.0,0.0,0.01,0.0,202.600006,108.0,0.0,96.53,13.0
11,0.0,0.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,51.0,...,0.0,0.0,0.0,0.0,0.0,0.0,22.1,0.0,0.0,0.0


In [5]:
start_time = time.time()

# gender_cat = patients["gender"].unique()
# patients["gender"].replace(gender_cat, [i for i in range(len(gender_cat))], inplace=True)
# patients["gender"].astype("int")


patients_new = patients.reset_index(level=["hadm_id", "icustay_id"], drop=True)

"""PREPARE VITALS LABES AND INTERVENTIONS"""
# get target variable (died in ICU) of patients that stayed at least GAP_TIME + WINDOW_SIZE hours in the ICU
y = patients_new["mort_icu"]

"""ADD DEMOGRAPHICS"""
X = pd.DataFrame()
X = pd.get_dummies(patients_new[["gender"]], drop_first=True)
X["age"] = patients_new["age"]
X = X.join(pd.get_dummies(patients_new["ethnicity"], drop_first=True))
X = X.join(interventions_new)
X = X.join(vitals_labs_mean_new)

"""SPLIT DATA"""
# define train/test split based on index
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_size, random_state=random_state
)

"""RESET INDEX"""
y_train = y_train.reindex(X_train.index)
y_test = y_test.reindex(X_test.index)
y_val = y_val.reindex(X_val.index)

"""Scale data to 0 mean and 1 standard deviation"""
for column in X_train.columns:
    X_test[column] = (X_test[column] - X_train[column].mean())/X_train[column].std()
    X_val[column] = (X_val[column] - X_train[column].mean())/X_train[column].std()
    X_train[column] = (X_train[column] - X_train[column].mean())/X_train[column].std()


"""PRINT STATS"""
print("Time: %.2fs" % (time.time() - start_time))
print("Original set: %s rows, %s columns" % patients.shape)
print("Train set: %s rows, %s columns" % X_train.shape)
print("Validation set: %s rows, %s columns" % X_val.shape)
print("Test set: %s rows, %s columns" % X_test.shape)

Time: 1.80s
Original set: 34472 rows, 28 columns
Train set: 24129 rows, 160 columns
Validation set: 3448 rows, 160 columns
Test set: 6895 rows, 160 columns


***train linear regression model***

In [10]:
XGB_model = xgb.XGBRegressor()
XGB_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

***test linear regression model***

In [11]:
y_true = y_test.to_list()
y_pred = np.round(XGB_model.predict(X_test))

print("TP:", pm.TP(y_true, y_pred))
print("FP:", pm.FP(y_true, y_pred))
print("TN:", pm.TN(y_true, y_pred))
print("FN:", pm.FN(y_true, y_pred))
print("Recall:", pm.recall(y_true, y_pred))

TP: 251
FP: 71
TN: 6386
FN: 185
Recall: 0.5756880733944955
