In [1]:
# data wrangling
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt

import time

from functools import reduce

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

# local files
import prediction_metrics as pm
from prediction_metrics import recall_df

***Load Data***

In [2]:
DATA_FILEPATH = "../data/all_hourly_data.h5"

patients = pd.read_hdf(DATA_FILEPATH, "patients")
vitals_labs_mean = pd.read_hdf(DATA_FILEPATH, "vitals_labs_mean")
interventions = pd.read_hdf(DATA_FILEPATH, "interventions")

***prepare data***

In [3]:
"""
Task Formulation: predict whether a patient will die, given the first 24 hours of their stay
"""

# SETTINGS
window_size = 24  # the first WINDOW_SIZE hours of the patient's stay
gap_time = 6  # the number of hours the patient lived at least after the first WINDOW_SIZE hours (to avoid label leakage, see MIMIC-III Extract paper)
test_size = 0.2  # proportion of the data that wil lbe used for testing
val_size = 0.125  # proportion of the training data that will be used for validation
random_state = 42  # random state is used to set a seed for randomness, which is only relevant for reproducibility purposes
max_missing = 0.8  # maximum percentage of missing values after forward fill for a measurement to be dropped

In [4]:
patients_new = patients.reset_index(level=["hadm_id", "icustay_id"], drop=True)
interventions_new = interventions.reset_index(level=["hadm_id", "icustay_id"], drop=True)
vitals_labs_mean_new = vitals_labs_mean.reset_index(level=["hadm_id", "icustay_id"], drop=True)
vitals_labs_mean_new = vitals_labs_mean_new.replace(np.nan,0)

vitals_labs_mean_new = vitals_labs_mean_new.droplevel(
    level="Aggregation Function", axis=1
)

patients_new["ethnicity"] = patients_new["ethnicity"].astype(str)
patients_new["ethnicity"].mask(patients_new['ethnicity'].str.contains("WHITE"), "WHITE", inplace=True)
patients_new["ethnicity"].mask(patients_new['ethnicity'].str.contains("BLACK"), "BLACK", inplace=True)
patients_new["ethnicity"].mask(patients_new['ethnicity'].str.contains("ASIAN"), "ASIAN", inplace=True)
patients_new["ethnicity"].mask(patients_new['ethnicity'].str.contains("HISPANIC"), "HISPANIC", inplace=True)
patients_new["ethnicity"].mask(~(
    patients_new['ethnicity'].str.contains("WHITE") | 
    patients_new['ethnicity'].str.contains("BLACK") | 
    patients_new['ethnicity'].str.contains("ASIAN") | 
    patients_new['ethnicity'].str.contains("HISPANIC")
), "OTHER/UNKOWN", inplace=True)
patients_new["ethnicity"] = patients_new["ethnicity"].astype('category')
patients_new = interventions_new.join(patients_new, how='inner')
print(patients_new["ethnicity"].unique())
print(patients_new.shape)

['WHITE', 'OTHER/UNKOWN', 'BLACK', 'ASIAN', 'HISPANIC']
Categories (5, object): ['ASIAN', 'BLACK', 'HISPANIC', 'OTHER/UNKOWN', 'WHITE']
(2200954, 42)


In [5]:
interventions_new = interventions_new.groupby(level=["subject_id"]).cumsum()
vitals_labs_mean_new = vitals_labs_mean_new.groupby(level=["subject_id"]).cumsum()

In [6]:
start_time = time.time()

"""PREPARE VITALS LABES AND INTERVENTIONS"""
# get target variable (died in ICU) of patients that stayed at least GAP_TIME + WINDOW_SIZE hours in the ICU
y = patients_new["hospital_expire_flag"]

"""ADD DEMOGRAPHICS"""
X = pd.get_dummies(patients_new[["gender"]], drop_first=True)
X["age"] = patients_new["age"]
X = X.join(pd.get_dummies(patients_new["ethnicity"], drop_first=True))

X = interventions_new.join(X, how="inner")
X = X.join(vitals_labs_mean_new)

"""SPLIT DATA"""
# define train/test split based on index
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_size, random_state=random_state
)

"""RESET INDEX"""
y_train = y_train.reindex(X_train.index)
y_test = y_test.reindex(X_test.index)
y_val = y_val.reindex(X_val.index)

"""PRINT STATS"""
print("Time: %.2fs" % (time.time() - start_time))
print("Original set: %s rows, %s columns" % interventions.shape)
print("Train set: %s rows, %s columns" % X_train.shape)
print("Validation set: %s rows, %s columns" % X_val.shape)
print("Test set: %s rows, %s columns" % X_test.shape)

Time: 17.53s
Original set: 2200954 rows, 14 columns
Train set: 1540667 rows, 124 columns
Validation set: 220096 rows, 124 columns
Test set: 440191 rows, 124 columns


***train linear regression model***

In [7]:
random_forest_classifier_model = RandomForestClassifier(n_estimators=100)
random_forest_classifier_model.fit(X_train, y_train)

RandomForestClassifier()

***test linear regression model***

In [8]:
y_true = y_val.to_list()
y_pred = np.round(random_forest_classifier_model.predict(X_val))

total_df = patients_new.loc[np.intersect1d(patients_new.index, X_val.index)]
total_df["true"] = y_true
total_df["pred"] = y_pred

print("TP:", pm.TP(y_true, y_pred), pm.TP(y_true, y_pred)/X_val.shape[0])
print("FP:", pm.FP(y_true, y_pred), pm.FP(y_true, y_pred)/X_val.shape[0])
print("FN:", pm.FN(y_true, y_pred), pm.FN(y_true, y_pred)/X_val.shape[0])
print("TN:", pm.TN(y_true, y_pred), pm.TN(y_true, y_pred)/X_val.shape[0])
print("TPR:", pm.recall(y_true, y_pred))
print("FPR:", pm.FPR(y_true, y_pred))

genders = pd.DataFrame()
genders["TPR"] = total_df.groupby("gender").apply(pm.recall_df)
genders["FPR"] = total_df.groupby("gender").apply(pm.FPR_df)
genders["count"] = total_df[["true", "gender"]].groupby("gender").count()
print(genders)

ethnicity = pd.DataFrame()
ethnicity["TPR"] = total_df.groupby("ethnicity").apply(pm.recall_df)
ethnicity["FPR"] = total_df.groupby("ethnicity").apply(pm.FPR_df)
ethnicity["count"] = total_df[["true", "ethnicity"]].groupby("ethnicity").count()
print(ethnicity)

TP: 26856 0.12201948240767665
FP: 9 4.089124745565572e-05
FN: 631 0.0028669307938354173
TN: 192600 0.8750726955510323
TPR: 0.9770436933823262
FPR: 4.6726788467828604e-05
             TPR       FPR   count
gender                            
F       0.977177  0.000059   96880
M       0.976939  0.000037  123216
                   TPR       FPR   count
ethnicity                               
ASIAN         0.974460  0.000000    5469
BLACK         0.974009  0.000209   16434
HISPANIC      0.973552  0.000000    6961
OTHER/UNKOWN  0.974550  0.000032   35329
WHITE         0.978195  0.000037  155903
