In [1]:
# data wrangling
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt

import time

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# local files
from sofascores import compute_sofa
import prediction_metrics as pm

***Load Data***

In [2]:
DATA_FILEPATH = "../data/all_hourly_data.h5"

patients = pd.read_hdf(DATA_FILEPATH, "patients")
# vitals_labs_mean = pd.read_hdf(DATA_FILEPATH, "vitals_labs_mean")
# interventions = pd.read_hdf(DATA_FILEPATH, "interventions")

***prepare data***

In [3]:
"""
Task Formulation: predict whether a patient will die, given the first 24 hours of their stay
"""

# SETTINGS
window_size = 24  # the first WINDOW_SIZE hours of the patient's stay
gap_time = 6  # the number of hours the patient lived at least after the first WINDOW_SIZE hours (to avoid label leakage, see MIMIC-III Extract paper)
test_size = 0.2  # proportion of the data that wil lbe used for testing
val_size = 0.125  # proportion of the training data that will be used for validation
random_state = 42  # random state is used to set a seed for randomness, which is only relevant for reproducibility purposes
max_missing = 0.8  # maximum percentage of missing values after forward fill for a measurement to be dropped

In [4]:
start_time = time.time()

patients_new = patients.reset_index(level=["hadm_id", "icustay_id"], drop=True)

"""PREPARE VITALS LABES AND INTERVENTIONS"""
# get target variable (died in ICU) of patients that stayed at least GAP_TIME + WINDOW_SIZE hours in the ICU
y = patients_new["mort_icu"]

"""ADD DEMOGRAPHICS"""
X = pd.get_dummies(patients_new[["gender"]], drop_first=True)
X["age"] = patients_new["age"]
X = X.join(pd.get_dummies(patients_new["ethnicity"], drop_first=True))

"""SPLIT DATA"""
# define train/test split based on index
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_size, random_state=random_state
)

"""RESET INDEX"""
y_train = y_train.reindex(X_train.index)
y_test = y_test.reindex(X_test.index)
y_val = y_val.reindex(X_val.index)

"""PRINT STATS"""
print("Time: %.2fs" % (time.time() - start_time))
print("Original set: %s rows, %s columns" % patients.shape)
print("Train set: %s rows, %s columns" % X_train.shape)
print("Validation set: %s rows, %s columns" % X_val.shape)
print("Test set: %s rows, %s columns" % X_test.shape)

Time: 0.06s
Original set: 34472 rows, 28 columns
Train set: 24129 rows, 42 columns
Validation set: 3448 rows, 42 columns
Test set: 6895 rows, 42 columns


In [5]:
print(X_train.columns)

Index(['gender_M', 'age',
       'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE', 'ASIAN',
       'ASIAN - ASIAN INDIAN', 'ASIAN - CAMBODIAN', 'ASIAN - CHINESE',
       'ASIAN - FILIPINO', 'ASIAN - JAPANESE', 'ASIAN - KOREAN',
       'ASIAN - OTHER', 'ASIAN - THAI', 'ASIAN - VIETNAMESE', 'BLACK/AFRICAN',
       'BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN', 'BLACK/HAITIAN',
       'CARIBBEAN ISLAND', 'HISPANIC OR LATINO',
       'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)',
       'HISPANIC/LATINO - COLOMBIAN', 'HISPANIC/LATINO - CUBAN',
       'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - GUATEMALAN',
       'HISPANIC/LATINO - HONDURAN', 'HISPANIC/LATINO - MEXICAN',
       'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - SALVADORAN',
       'MIDDLE EASTERN', 'MULTI RACE ETHNICITY',
       'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER', 'OTHER',
       'PATIENT DECLINED TO ANSWER', 'PORTUGUESE', 'SOUTH AMERICAN',
       'UNABLE TO OBTAIN', 'UNKNOWN/NOT SPECIFIED', 'WHI

***train linear regression model***

In [6]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

LinearRegression()

***test linear regression model***

In [7]:
y_true = y_test.to_list()
y_pred = np.round(linear_regression_model.predict(X_test))

print("TP:", pm.TP(y_true, y_pred))
print("FP:", pm.FP(y_true, y_pred))
print("TN:", pm.TN(y_true, y_pred))
print("FN:", pm.FN(y_true, y_pred))
print("Recall:", pm.recall(y_true, y_pred))

TP: 0
FP: 0
TN: 6457
FN: 438
Recall: 0.0
