## MIMICIII Mechanical Ventilation MAgECs

In [1]:
import numpy as np
import pandas as pd
import psycopg2
import os 
import random
import datetime
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

%matplotlib inline

random.seed(22891)

In [2]:
pd.set_option('display.max_columns', None)

### Get data

In [3]:
# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

engine = create_engine("postgresql+psycopg2://{}:{}@/{}".format(sqluser, sqluser, dbname))

schema_name = 'mimiciii'
conn = engine.connect()
conn.execute('SET search_path to ' + schema_name)

df = pd.read_sql("SELECT * FROM mimic_users_study;", conn)
conn.close()

### Featurize

In [4]:
vitals = ['heartrate_mean', 'sysbp_mean', 'diasbp_mean', 'meanbp_mean',
          'resprate_mean', 'tempc_mean', 'spo2_mean', 'glucose_mean']
labs = ['aniongap', 'albumin', 'bicarbonate', 'bilirubin', 'creatinine', 
        'chloride', 'glucose', 'hemoglobin', 'lactate', 
        'magnesium', 'phosphate', 'platelet', 'potassium', 'ptt', 'inr', 
        'pt', 'sodium', 'bun', 'wbc']  # -hematocrit
comobs = ['congestive_heart_failure', 'chronic_pulmonary', 'pulmonary_circulation']
others = ['age', 'gender']

In [5]:
def last_val(x):
    vals = x[~np.isnan(x)]
    if len(vals):
        return vals[-1]
    else:
        return None
    
def featurize_time(df):
    out = dict()
    for i in range(len(df)):
        for lab in labs:
            val = last_val(df[lab].values[:i+1])
            if lab not in out:
                out[lab] = [val]
            else:
                out[lab].append(val)
        for vital in vitals:    
            val = last_val(df[vital].values[:i+1])
            if vital not in out:
                out[vital] = [val]
            else:
                out[vital].append(val)
        for comob in comobs:    
            val = last_val(df[comob].values[:i+1])
            if comob not in out:
                out[comob] = [val]
            else:
                out[comob].append(val)
        for other in others:
            val = last_val(df[other].values[:i+1])
            if other not in out:
                out[other] = [val]
            else:
                out[other].append(val)
        out['timepoint'] = df.timepoint.values
        out['label'] = [int(x) for x in df.ventilated.values]
    return pd.Series(out)

def featurize(df):
    out = dict()
    for lab in labs:
        out[lab] = last_val(df[lab])
    for vital in vitals:
        out[vital] = last_val(df[vital])
    for comob in comobs:
        out[comob] = last_val(df[comob])
    for other in others:
        out[other] = last_val(df[other])
    out['label'] = int(df.ventilated.iloc[-1])
    return pd.Series(out)

### Example from 'original' dataframe

In [6]:
df[df['subject_id']==4].head()

Unnamed: 0,subject_id,hadm_id,icustay_id,timepoint,event_time,ventilated,mv_start,aniongap,albumin,bicarbonate,bilirubin,creatinine,chloride,glucose,hematocrit,hemoglobin,lactate,magnesium,phosphate,platelet,potassium,ptt,inr,pt,sodium,bun,wbc,heartrate_mean,sysbp_mean,diasbp_mean,meanbp_mean,resprate_mean,tempc_mean,spo2_mean,glucose_mean,age,first_icu_stay,adult_icu,first_careunit,diagnosis,curr_service,dischtime,admission_type,mort_icu,gender,admittime,los_icu,mv_hours,los_icu_hr,mv_end,los_hospital,first_hosp_stay,outtime,intime,ventnum,prev_service,transfertime,ethnicity,congestive_heart_failure,cardiac_arrhythmias,valvular_disease,pulmonary_circulation,peripheral_vascular,hypertension,paralysis,other_neurological,chronic_pulmonary,diabetes_uncomplicated,diabetes_complicated,hypothyroidism,renal_failure,liver_disease,peptic_ulcer,aids,lymphoma,metastatic_cancer,solid_tumor,rheumatoid_arthritis,coagulopathy,obesity,weight_loss,fluid_electrolyte,blood_loss_anemia,deficiency_anemias,alcohol_abuse,drug_abuse,psychoses,depression
0,4,185777,294638,3,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,97.0,119.0,69.0,85.666702,28.0,,98.0,,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,185777,294638,4,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,94.0,,,,,,97.0,153.0,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,185777,294638,5,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,99.0,133.0,79.0,97.0,26.0,,98.0,,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,185777,294638,6,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,92.0,,,,24.0,36.666667,97.0,,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,185777,294638,7,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,89.0,139.0,81.0,100.333,25.0,,97.0,,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df[df['subject_id']==4].tail()

Unnamed: 0,subject_id,hadm_id,icustay_id,timepoint,event_time,ventilated,mv_start,aniongap,albumin,bicarbonate,bilirubin,creatinine,chloride,glucose,hematocrit,hemoglobin,lactate,magnesium,phosphate,platelet,potassium,ptt,inr,pt,sodium,bun,wbc,heartrate_mean,sysbp_mean,diasbp_mean,meanbp_mean,resprate_mean,tempc_mean,spo2_mean,glucose_mean,age,first_icu_stay,adult_icu,first_careunit,diagnosis,curr_service,dischtime,admission_type,mort_icu,gender,admittime,los_icu,mv_hours,los_icu_hr,mv_end,los_hospital,first_hosp_stay,outtime,intime,ventnum,prev_service,transfertime,ethnicity,congestive_heart_failure,cardiac_arrhythmias,valvular_disease,pulmonary_circulation,peripheral_vascular,hypertension,paralysis,other_neurological,chronic_pulmonary,diabetes_uncomplicated,diabetes_complicated,hypothyroidism,renal_failure,liver_disease,peptic_ulcer,aids,lymphoma,metastatic_cancer,solid_tumor,rheumatoid_arthritis,coagulopathy,obesity,weight_loss,fluid_electrolyte,blood_loss_anemia,deficiency_anemias,alcohol_abuse,drug_abuse,psychoses,depression
19,4,185777,294638,22,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,74.0,101.0,61.0,74.333298,,,100.0,179.333333,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
20,4,185777,294638,23,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,80.0,101.0,57.0,71.666702,,,100.0,,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
21,4,185777,294638,24,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,88.0,,,,,,99.0,,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
22,4,185777,294638,25,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,100.0,116.0,63.0,80.666702,,,98.0,,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
23,4,185777,294638,26,2191-03-17 03:29:31,0,NaT,17.0,2.8,24.0,2.2,0.5,97.0,140.0,34.2,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,111.0,,,,,37.444445,98.0,,47.0,1,1,MICU,"FEVER,DEHYDRATION,FAILURE TO THRIVE",MED,2191-03-23 18:41:00,EMERGENCY,0,0,2191-03-16 00:28:00,1.0,,40.0,NaT,7.0,1,2191-03-17 16:46:31,2191-03-16 00:29:31,,,2191-03-16 00:29:31,white,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### Dataframe w/o time (for 'static' models)

In [8]:
df_ml = df.set_index(['subject_id', 'timepoint']).groupby(level=0, group_keys=False).\
                                                  apply(featurize).reset_index()

In [9]:
df_ml[df_ml['subject_id']==4].head()

Unnamed: 0,subject_id,aniongap,albumin,bicarbonate,bilirubin,creatinine,chloride,glucose,hemoglobin,lactate,magnesium,phosphate,platelet,potassium,ptt,inr,pt,sodium,bun,wbc,heartrate_mean,sysbp_mean,diasbp_mean,meanbp_mean,resprate_mean,tempc_mean,spo2_mean,glucose_mean,congestive_heart_failure,chronic_pulmonary,pulmonary_circulation,age,gender,label
0,4,17.0,2.8,24.0,2.2,0.5,97.0,140.0,11.5,2.1,1.9,3.2,207.0,3.1,31.3,1.0,12.3,135.0,9.0,9.7,111.0,116.0,63.0,80.666702,18.0,37.444445,98.0,179.333333,0.0,0.0,0.0,47.0,0.0,0.0


### Dataframe w/ time (for 'timepoint' MAgECs)

In [10]:
df_time = df.set_index(['subject_id']).groupby(level=0, group_keys=False).\
                                       apply(featurize_time).apply(pd.Series.explode).reset_index()

In [11]:
df_time[df_time['subject_id']==4].head()

Unnamed: 0,subject_id,aniongap,albumin,bicarbonate,bilirubin,creatinine,chloride,glucose,hemoglobin,lactate,magnesium,phosphate,platelet,potassium,ptt,inr,pt,sodium,bun,wbc,heartrate_mean,sysbp_mean,diasbp_mean,meanbp_mean,resprate_mean,tempc_mean,spo2_mean,glucose_mean,congestive_heart_failure,chronic_pulmonary,pulmonary_circulation,age,gender,timepoint,label
0,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,97,119,69,85.6667,28,,98,,0,0,0,47,0,3,0
1,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,94,119,69,85.6667,28,,97,153.0,0,0,0,47,0,4,0
2,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,99,133,79,97.0,26,,98,153.0,0,0,0,47,0,5,0
3,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,92,133,79,97.0,24,36.6667,97,153.0,0,0,0,47,0,6,0
4,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,89,139,81,100.333,25,36.6667,97,153.0,0,0,0,47,0,7,0


In [12]:
df_time[df_time['subject_id']==4].tail()

Unnamed: 0,subject_id,aniongap,albumin,bicarbonate,bilirubin,creatinine,chloride,glucose,hemoglobin,lactate,magnesium,phosphate,platelet,potassium,ptt,inr,pt,sodium,bun,wbc,heartrate_mean,sysbp_mean,diasbp_mean,meanbp_mean,resprate_mean,tempc_mean,spo2_mean,glucose_mean,congestive_heart_failure,chronic_pulmonary,pulmonary_circulation,age,gender,timepoint,label
19,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,74,101,61,74.3333,18,36.6667,100,179.333,0,0,0,47,0,22,0
20,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,80,101,57,71.6667,18,36.6667,100,179.333,0,0,0,47,0,23,0
21,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,88,101,57,71.6667,18,36.6667,99,179.333,0,0,0,47,0,24,0
22,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,100,116,63,80.6667,18,36.6667,98,179.333,0,0,0,47,0,25,0
23,4,17,2.8,24,2.2,0.5,97,140,11.5,2.1,1.9,3.2,207,3.1,31.3,1,12.3,135,9,9.7,111,116,63,80.6667,18,37.4444,98,179.333,0,0,0,47,0,26,0


### Train/Valid Split

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [14]:
seed = 7
np.random.seed(seed)

x = df_ml[list(set(df_ml.columns) - {'subject_id', 'label'})]
Y = df_ml[['subject_id', 'label']]

x_train, x_validation, Y_train, Y_validation = train_test_split(x.copy(), Y, test_size=0.2, random_state=seed)

### Impute vitals+labs with mean and co-morbidities with 0

In [15]:
def impute(df):
    df[vitals+labs] = df[vitals+labs].fillna(df[vitals+labs].mean())
    df[comobs] = df[comobs].fillna(0)
    return df

In [16]:
x_train = impute(x_train)
x_validation = impute(x_validation)

### Scale data

In [17]:
from sklearn.preprocessing import StandardScaler

stsc = StandardScaler()
xst_train = stsc.fit_transform(x_train)
xst_train = pd.DataFrame(xst_train, index=x_train.index, columns=x_train.columns)

xst_validation = stsc.transform(x_validation)
xst_validation = pd.DataFrame(xst_validation, index=x_validation.index, columns=x_validation.columns)

### Train 'static' models
These are single timepoint (single row) models. The training data is grouped by patient and all timepoints are condenced to a single 'last' timepoint. 

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


def predict(model, data):
    """
    Model output (predicted) probabilities.
    Wrapper for predict_proba function in scikit-learn models.
    When a model does not have a predict_proba use predict interface.
    """
    if hasattr(model, 'predict_proba'):
        probs = model.predict_proba(data)
        if probs.shape[1] == 2:
            probs = probs[:, 1].ravel()
        else:
            probs = probs.ravel()
    else:
        probs = np.array(model.predict(data))
    return probs


def predict_classes(model, data):
    """
    Model output (predicted) classes.
    """
    if hasattr(model, 'predict_classes'):
        return model.predict_classes(data).ravel()
    else:
         return model.predict(data).ravel()

    
def evaluate(model, x_test, y_test):
    # predict probabilities for test set
    yhat_probs = predict(model, x_test)

    # predict classes for test set
    yhat_classes = predict_classes(model, x_test)
    
    # reduce to 1d array
    if len(yhat_probs[0].shape):
        yhat_probs = yhat_probs[:, 0]
        yhat_classes = yhat_classes[:, 0]
 
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_test, yhat_classes)
    print('Accuracy: %f' % accuracy)

    # precision tp / (tp + fp)
    precision = precision_score(y_test, yhat_classes)
    print('Precision: %f' % precision)

    # recall: tp / (tp + fn)
    recall = recall_score(y_test, yhat_classes)
    print('Recall: %f' % recall)

    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(y_test, yhat_classes)
    print('F1 score: %f' % f1)

    # ROC AUC
    auc = roc_auc_score(y_test, yhat_probs)
    print('ROC AUC: %f' % auc)

    # confusion matrix
    matrix = confusion_matrix(y_test, yhat_classes)
    print(matrix)

In [19]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(Y_train['label']), Y_train['label'])
class_weights

array([0.61911131, 2.59887711])

#### LR

In [20]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1., class_weight='balanced', solver='lbfgs')
lr.fit(xst_train, Y_train['label'])

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [21]:
evaluate(lr, xst_validation, Y_validation['label'])

Accuracy: 0.650504
Precision: 0.323094
Recall: 0.681055
F1 score: 0.438272
ROC AUC: 0.693912
[[1071  595]
 [ 133  284]]


#### RG

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
rf = CalibratedClassifierCV(RandomForestClassifier(n_estimators=800, 
                                                   min_samples_split=2, 
                                                   min_samples_leaf=4, 
                                                   max_features='sqrt', 
                                                   max_depth=90, 
                                                   bootstrap=True, 
                                                   n_jobs=-1),
                            method='sigmoid', cv=5)
rf.fit(xst_train, Y_train['label'])

CalibratedClassifierCV(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=90, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
            cv=5, method='sigmoid')

In [23]:
evaluate(rf, xst_validation, Y_validation['label'])

Accuracy: 0.856937
Precision: 0.910345
Recall: 0.316547
F1 score: 0.469751
ROC AUC: 0.823043
[[1653   13]
 [ 285  132]]


#### MLP

In [24]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier

mlp = Sequential()
mlp.add(Dense(60, input_dim=len(xst_train.columns), activation='relu'))
mlp.add(Dropout(0.2))
mlp.add(Dense(30, input_dim=60, activation='relu'))
mlp.add(Dropout(0.2))
mlp.add(Dense(1, activation='sigmoid'))
mlp.compile(loss='binary_crossentropy', 
            loss_weights=[class_weights[1]], optimizer='adam', metrics=['accuracy'])
mlp.fit(xst_train, Y_train['label'], epochs=100, batch_size=64, verbose=0)

Using TensorFlow backend.


<keras.callbacks.callbacks.History at 0x12f0e3b00>

In [25]:
evaluate(mlp, xst_validation, Y_validation['label'])

Accuracy: 0.854057
Precision: 0.772947
Recall: 0.383693
F1 score: 0.512821
ROC AUC: 0.804984
[[1619   47]
 [ 257  160]]


### Time-aware (LSTM) model

#### Data pre-processing

In [28]:
# Get train/valid
train_ind = df_time[~np.isin(df_time['subject_id'], Y_validation.subject_id.unique())].index
valid_ind = df_time[np.isin(df_time['subject_id'], Y_validation.subject_id.unique())].index

# Impute
df_series_train = impute(df_time.iloc[train_ind].copy())
df_series_valid = impute(df_time.iloc[valid_ind].copy())

# Get X, Y as numpy arrays
df_series_train_X = df_series_train[list(set(df_series_train.columns) - 
                                         {'subject_id', 'label', 'timepoint'})].astype(float)

df_series_train_Y = df_series_train[['subject_id', 'label', 'timepoint']]

df_series_valid_X = df_series_valid[list(set(df_series_valid.columns) - 
                                         {'subject_id', 'label', 'timepoint'})].astype(float)

df_series_valid_Y = df_series_valid[['subject_id', 'label', 'timepoint']]

# scale
stsc2 = StandardScaler()
tmp = stsc2.fit_transform(df_series_train_X)
df_series_train_X = pd.DataFrame(tmp, index=df_series_train_X.index, columns=df_series_train_X.columns)
tmp = stsc2.transform(df_series_valid_X)
df_series_valid_X = pd.DataFrame(tmp, index=df_series_valid_X.index, columns=df_series_valid_X.columns)

In [29]:
# concat X/Y for train/valid
df_series_train = pd.concat([df_series_train_X, df_series_train_Y], axis=1)
df_series_valid = pd.concat([df_series_valid_X, df_series_valid_Y], axis=1)

In [30]:
df_series_valid.head()

Unnamed: 0,bicarbonate,glucose_mean,pulmonary_circulation,meanbp_mean,ptt,pt,sysbp_mean,wbc,magnesium,age,tempc_mean,phosphate,inr,chloride,sodium,heartrate_mean,albumin,creatinine,diasbp_mean,bun,platelet,gender,chronic_pulmonary,spo2_mean,congestive_heart_failure,lactate,bilirubin,hemoglobin,resprate_mean,potassium,aniongap,glucose,subject_id,label,timepoint
0,0.082431,-0.022705,-0.261491,0.426152,-0.187468,-0.418429,-0.082093,-0.184449,-0.089849,-0.879606,-0.045702,-0.274201,-0.398609,-0.73269,-0.45567,0.636508,-1.179369,-0.57922,0.42589,-0.823223,-0.33529,-1.134352,-0.499044,0.465272,-0.61177,-0.242366,0.185142,-0.152695,1.524211,-1.308177,0.139907,-0.174261,4,0,3
1,0.082431,0.173433,-0.261491,0.426152,-0.187468,-0.418429,-0.082093,-0.184449,-0.089849,-0.879606,-0.045702,-0.274201,-0.398609,-0.73269,-0.45567,0.47526,-1.179369,-0.57922,0.42589,-0.823223,-0.33529,-1.134352,-0.499044,0.113185,-0.61177,-0.242366,0.185142,-0.152695,1.524211,-1.308177,0.139907,-0.174261,4,0,4
2,0.082431,0.173433,-0.261491,1.162645,-0.187468,-0.418429,0.563827,-0.184449,-0.089849,-0.879606,-0.045702,-0.274201,-0.398609,-0.73269,-0.45567,0.744006,-1.179369,-0.57922,1.088605,-0.823223,-0.33529,-1.134352,-0.499044,0.465272,-0.61177,-0.242366,0.185142,-0.152695,1.15837,-1.308177,0.139907,-0.174261,4,0,5
3,0.082431,0.173433,-0.261491,1.162645,-0.187468,-0.418429,0.563827,-0.184449,-0.089849,-0.879606,-0.164977,-0.274201,-0.398609,-0.73269,-0.45567,0.367762,-1.179369,-0.57922,1.088605,-0.823223,-0.33529,-1.134352,-0.499044,0.113185,-0.61177,-0.242366,0.185142,-0.152695,0.792528,-1.308177,0.139907,-0.174261,4,0,6
4,0.082431,0.173433,-0.261491,1.379239,-0.187468,-0.418429,0.84065,-0.184449,-0.089849,-0.879606,-0.164977,-0.274201,-0.398609,-0.73269,-0.45567,0.206514,-1.179369,-0.57922,1.221148,-0.823223,-0.33529,-1.134352,-0.499044,0.113185,-0.61177,-0.242366,0.185142,-0.152695,0.975449,-1.308177,0.139907,-0.174261,4,0,7


In [31]:
def zero_pad(df):
    x = list()
    y = list()    
    series_cols = set(df.columns) - {'subject_id', 'timepoint'}    
    for _, fname in df.set_index(['subject_id']).groupby(level=0, group_keys=False):       
        y_data = np.array(fname['label'].values[0])
        tmp = fname[series_cols].astype(float).values
        x_data = np.zeros([25, tmp.shape[1]])
        x_data[:tmp.shape[0],:] = tmp
        x.append(x_data)
        y.append(y_data)    
    return np.array(x), np.array(y)

In [32]:
xt_train, Yt_train = zero_pad(df_series_train)
xt_valid, Yt_valid = zero_pad(df_series_valid)

In [33]:
len(xt_train), len(xt_valid)

(8332, 2083)

In [34]:
from keras.layers import LSTM
lstm = Sequential()
lstm.add(LSTM(128, dropout=0.5, recurrent_dropout=0.2, input_shape=xt_train.shape[1:]))
lstm.add(Dense(1, activation='sigmoid'))
lstm.compile(loss='binary_crossentropy',
             loss_weights=[class_weights[1]],
             optimizer='adam', 
             metrics=['accuracy'])

In [35]:
lstm.fit(xt_train, Yt_train, epochs=100, batch_size=64, verbose=0)

<keras.callbacks.callbacks.History at 0x137107d68>

In [36]:
evaluate(lstm, xt_valid, Yt_valid)

Accuracy: 0.985118
Precision: 0.934685
Recall: 0.995204
F1 score: 0.963995
ROC AUC: 0.999250
[[1637   29]
 [   2  415]]
