# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

# allow up to 500 rows to be rendered when a dataframe/series is displayed
pd.set_option('display.max_rows', 500)

# HDF5 file read with `pd.read_hdf` further down (tables `patients` and `vitals_labs_mean`)
mimic_extract_filename = '../data/all_hourly_data.h5'
# SQLAlchemy URI of the MIMIC PostgreSQL database (source of the `code_status` table)
mimic_sqlalchemy_db_uri = 'postgresql:///mimic'
cutoff_hours = 24  # number of hours in the period of stay to be used for mortality prediction
gap_hours = 12     # gap between end of observation period and prediction period to minimize label leakage

In [2]:
def transform_into_zscores(x, mean_dict, stdev_dict):
    """
    Discretizes one feature column into integer z-score buckets.

    Every value is standardized with the mean and standard deviation
    registered under the column's name, rounded to the closest integer
    and clipped to [-4, 4]. Missing values are assigned the sentinel 9.

    Parameters
    ----------
    x : pandas.Series
        Column of measurements for a single vital/lab. `x.name` is the
        key used to look up the statistics in both dictionaries.
    mean_dict : dict of float
        Dictionary of mean values by vital/lab.
    stdev_dict : dict of float
        Dictionary of standard deviation values by vital/lab.

    Returns
    -------
    pandas.Series of int
        Same index and name as `x`; each entry is the z-score clipped to
        [-4, 4], or 9 if the original value was missing.
    """

    zscore = (x - mean_dict[x.name]) / stdev_dict[x.name]
    # round first, then clip: NaN survives both steps and is only
    # replaced afterwards, so 9 can never be confused with a real bucket
    zscore = zscore.round().clip(-4, 4)
    # values are already whole numbers here, so the cast is lossless
    return zscore.fillna(9).astype(int)

In [3]:
def hours_to_pad(df, max_hours):
    """
    Finds which hours in `range(max_hours)` have no record in `df`.

    The hours already present are read from the second level of the
    index of `df` (position 1). The result is the set of missing hours,
    or the sentinel -1 when no hour is missing.
    """

    observed_hours = set(df.index.get_level_values(1))
    missing_hours = set(range(max_hours)).difference(observed_hours)
    # callers tell "nothing to pad" apart by comparing against -1
    return missing_hours if missing_hours else -1

In [4]:
# two tables of the MIMIC-Extract pipeline output are needed to reproduce
# the paper: `patients` and `vitals_labs_mean`
patients_df = pd.read_hdf(mimic_extract_filename, key='patients')
vitals_labs_mean_df = pd.read_hdf(mimic_extract_filename, key='vitals_labs_mean')

# the feature `timecmo_chart` is not provided by MIMIC-Extract; it lives in the
# concept table `code_status` of the MIMIC PostgreSQL database, so we read that
# table and index it by (`subject_id`, `hadm_id`, `icustay_id`)
code_status_df = (
    pd.read_sql_table('code_status', mimic_sqlalchemy_db_uri)
    .set_index(['subject_id', 'hadm_id', 'icustay_id'])
)

# index-on-index merge (pandas default `how='inner'`): only rows of `patients_df`
# whose index is also present in `code_status_df` are kept
# NOTE(review): assumes `patients_df` is indexed by the same three keys - confirm
patients_df = patients_df.merge(
    code_status_df['timecmo_chart'],
    left_index=True,
    right_index=True,
)

In [5]:
#--------------------------------------------------------------------------
# paper considers in-hospital mortality as any of these three events:
#  1) Death = `deathtime` feature not null
#  2) A note of "Do Not Resuscitate" (DNR) = `dnr_first_charttime` not null
#  3) A note of "Comfort Measures Only" (CMO) = `timecmo_chart` not null
# earliest time of the three events is considered the mortality time
# `morttime` for all experiments described in the paper
patients_df['morttime'] = patients_df[['deathtime', 'dnr_first_charttime', 'timecmo_chart']].min(axis=1)
# `mort_flag` is True if patient dies in hospital or False if not
# this flag will be used as our prediction label (`Y`)
patients_df['mort_flag'] = patients_df['morttime'].notnull()

# calculate hours elapsed between patient admitted into the ICU
# and same patient being discharged from the hospital
# (this is called period of stay in the paper)
patients_df['hours_in_icu'] = patients_df['dischtime'] - patients_df['intime']
patients_df['hours_in_icu'] = patients_df['hours_in_icu'].apply(lambda x: x.total_seconds() / 3600)

# calculate hours elapsed between patient admitted into the ICU
# and same patient dying (or reaching either DNR or CMO condition);
# the value is NaN for patients without any of the three events
patients_df['hours_until_mort'] = patients_df['morttime'] - patients_df['intime']
patients_df['hours_until_mort'] = patients_df['hours_until_mort'].apply(lambda x: x.total_seconds() / 3600)

# exclusion criteria 1: remove patients with a period of stay lower than `cutoff_hours` (e.g. 24 hours)
patients_df = patients_df[patients_df['hours_in_icu'] >= cutoff_hours]

# exclusion criteria 2: remove patients that died in the period of stay or the gap period (e.g. first 24+12 hours)
# the filter is on time-to-mortality, not on length of stay; comparisons with NaN
# are False, so patients without a mortality event are kept
died_before_prediction = patients_df['hours_until_mort'] < cutoff_hours + gap_hours
patients_df = patients_df[~died_before_prediction]

In [6]:
#--------------------------------------
# Time to switch to physiological data!

# the 29 vitals and labs considered by the paper
# NOTE(review): labels (including the spelling 'glascow') presumably have to match
# the column labels of `vitals_labs_mean_df` exactly - do not "fix" them here
vitals_labs_to_keep_list = [
    'anion gap', 'bicarbonate', 'blood urea nitrogen', 'chloride',
    'creatinine', 'diastolic blood pressure', 'fraction inspired oxygen',
    'glascow coma scale total', 'glucose', 'heart rate', 'hematocrit',
    'hemoglobin', 'lactate', 'magnesium', 'mean blood pressure',
    'oxygen saturation', 'partial thromboplastin time', 'phosphate',
    'platelets', 'potassium', 'prothrombin time inr', 'prothrombin time pt',
    'respiratory rate', 'sodium', 'systolic blood pressure', 'temperature',
    'weight', 'white blood cell count', 'ph',
]

# keep only the columns of the MIMIC-Extract data that the paper uses,
# in the order given by the list above
vitals_labs_df = vitals_labs_mean_df.loc[:, vitals_labs_to_keep_list]

In [7]:
# let's discretize the physiological features by:
#  1) Converting them into z-scores
#  2) Rounding the resulting z-scores to integers and clipping them to [-4, 4]
#  3) Replacing z-scores with value 9 if they are NaN
#  4) Dummifying the resulting columns and removing the NaN columns

# create two dictionaries of mean and standard deviation values by vital/lab
# since these dictionaries will be used to calculate the z-scores next
# NOTE(review): the mean is an average of per-patient means while the standard
# deviation is pooled over all rows - confirm this asymmetry is intended
mean_dict = vitals_labs_df.groupby(['subject_id']).mean().mean().to_dict()
stdev_dict = vitals_labs_df.std().to_dict()

# convert values for every vital/lab into z-scores rounded to the nearest integer,
# clipped between [-4, 4] and replaced with 9 if NaN
vitals_labs_df = vitals_labs_df.apply(lambda x: transform_into_zscores(x, mean_dict, stdev_dict), axis=0)

# dummify all columns; resulting columns are named '<original column>_<bucket>'
vitals_labs_df = pd.get_dummies(vitals_labs_df, columns=vitals_labs_df.columns)

# remove NaN columns (those ending in '_9'); a suffix test is used so that a
# '_9' appearing elsewhere in a column name can never trigger a drop
nan_columns = [column for column in vitals_labs_df.columns if str(column).endswith('_9')]
vitals_labs_df = vitals_labs_df.drop(columns=nan_columns)

# just keep `cutoff_hours` hours of data (e.g. 24 hours)
vitals_labs_df = vitals_labs_df.query(f'hours_in < {cutoff_hours}')

# remove `hadm_id` and `icustay_id` from the index (reassigned rather than
# mutated in place, since the frame comes out of a `query` selection)
vitals_labs_df = vitals_labs_df.reset_index(['hadm_id', 'icustay_id'], drop=True)

In [17]:
# pad patients whose records stopped earlier than `cutoff_hours` with zeroes

# one entry per patient (first index level): the set of missing hours, or -1 if none
pad_hours_by_subject = vitals_labs_df.groupby(level=0).apply(hours_to_pad, cutoff_hours)
pad_hours_by_subject = pad_hours_by_subject[pad_hours_by_subject != -1]

# one (subject_id, hour) tuple per row that has to be added; built in a single
# pass over the series, with hours sorted so the row order is deterministic
padding_list_of_tuples = [
    (subject_id, hour)
    for subject_id, hours in pad_hours_by_subject.items()
    for hour in sorted(hours)
]

# nothing to do when every patient already has all hours
# (`MultiIndex.from_tuples` cannot build an index from an empty list)
if padding_list_of_tuples:
    pad_hours_df_idx = pd.MultiIndex.from_tuples(padding_list_of_tuples, names=('subject_id', 'hours_in'))
    # all-zero rows: no dummy column is active for a padded hour
    pad_hours_df = pd.DataFrame(0, pad_hours_df_idx, columns=vitals_labs_df.columns)
    # padded rows are appended at the end; the result is not re-sorted by index
    vitals_labs_df = pd.concat([vitals_labs_df, pad_hours_df], axis=0)

# after padding, now we have a dataframe with number of patients times `cutoff_hours` records

In [11]:
# select, categorize, and dummify the three static variables
# selected by the paper: gender, age, and ethnicity
static = patients_df[['gender', 'age', 'ethnicity']]


subject_id
24       {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
34                           {18, 19, 20, 21, 22, 23}
46                               {19, 20, 21, 22, 23}
63       {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
84           {14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
                             ...                     
99814                                        {22, 23}
99832                    {17, 18, 19, 20, 21, 22, 23}
99923                        {18, 19, 20, 21, 22, 23}
99935                {16, 17, 18, 19, 20, 21, 22, 23}
99946                                {20, 21, 22, 23}
Length: 4397, dtype: object

In [25]:
# sanity check: distinct `subject_id` values present in the index of `vitals_labs_df`
patients = set(vitals_labs_df.index.get_level_values('subject_id'))
len(patients)

34472

In [27]:
# sanity check: total number of rows in `vitals_labs_df`
len(vitals_labs_df)

827328

In [28]:
# expected number of rows: one per patient and hour of the observation window
# (uses `cutoff_hours` instead of a hard-coded 24 so the check follows the config)
print(len(patients) * cutoff_hours)

827328


In [19]:
# display the dummified dataframe (the rendered output is truncated by pandas)
vitals_labs_df

Unnamed: 0_level_0,Unnamed: 1_level_0,"('anion gap', 'mean')_-2","('anion gap', 'mean')_-1","('anion gap', 'mean')_0","('anion gap', 'mean')_1","('anion gap', 'mean')_2","('anion gap', 'mean')_3","('anion gap', 'mean')_4","('bicarbonate', 'mean')_-4","('bicarbonate', 'mean')_-3","('bicarbonate', 'mean')_-2",...,"('white blood cell count', 'mean')_4","('ph', 'mean')_-4","('ph', 'mean')_-3","('ph', 'mean')_-2","('ph', 'mean')_-1","('ph', 'mean')_0","('ph', 'mean')_1","('ph', 'mean')_2","('ph', 'mean')_3","('ph', 'mean')_4"
subject_id,hours_in,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3,0,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99935,23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99946,20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99946,21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99946,22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
