In [1]:
import numpy as np
import pandas as pd
import psycopg2
import os 
import random
import datetime
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

%matplotlib inline

random.seed(22891)

### Get data

In [2]:
# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

engine = create_engine("postgresql+psycopg2://{}:{}@/{}".format(sqluser, sqluser, dbname))

schema_name = 'mimiciii'
conn = engine.connect()
conn.execute('SET search_path to ' + schema_name)

df = pd.read_sql("SELECT * FROM mimic_users_study;", conn)
conn.close()

### Create features

In [3]:
vitals = ['heartrate_mean', 'sysbp_mean', 'diasbp_mean', 'meanbp_mean',
          'resprate_mean', 'tempc_mean', 'spo2_mean', 'glucose_mean']
labs = ['aniongap', 'albumin', 'bicarbonate', 'bilirubin', 'creatinine', 
        'chloride', 'glucose', 'hemoglobin', 'lactate', 
        'magnesium', 'phosphate', 'platelet', 'potassium', 'ptt', 'inr', 
        'pt', 'sodium', 'bun', 'wbc']  # -hematocrit
comobs = ['congestive_heart_failure', 'chronic_pulmonary', 'pulmonary_circulation']
others = ['age', 'gender']

In [4]:
def last_val(x):
    vals = x[~np.isnan(x)]
    if len(vals):
        return vals[-1]
    else:
        return None
    
def featurize(df):
    out = dict()
    for lab in labs:
        out[lab] = last_val(df[lab])
    for vital in vitals:
        out[vital] = last_val(df[vital])
    for comob in comobs:
        out[comob] = last_val(df[comob])
    for other in others:
        out[other] = last_val(df[other])
    out['label'] = int(df.ventilated.iloc[-1])
    return pd.Series(out)

In [5]:
df_ml = df.set_index(['subject_id', 'timepoint']).groupby(level=0, group_keys=False).\
                                                  apply(featurize).reset_index()

### Impute vitals+labs with mean and co-morbidities with 0

In [6]:
df_ml[vitals+labs] = df_ml[vitals+labs].fillna(df_ml[vitals+labs].mean())
df_ml[comobs] = df_ml[comobs].fillna(0)

### Scale data

In [7]:
from sklearn import preprocessing

for feat in vitals+labs+comobs+others:
    df_ml[feat] = preprocessing.StandardScaler().fit_transform(df_ml[feat].values.reshape(-1, 1))

In [8]:
df_ml.head()

Unnamed: 0,subject_id,aniongap,albumin,bicarbonate,bilirubin,creatinine,chloride,glucose,hemoglobin,lactate,...,resprate_mean,tempc_mean,spo2_mean,glucose_mean,congestive_heart_failure,chronic_pulmonary,pulmonary_circulation,age,gender,label
0,4,0.193194,-1.220623,0.037682,0.2031997,-0.589794,-0.756506,-0.157733,-0.1702553,-0.2000681,...,-0.353315,0.920057,0.350774,0.375776,-0.623134,-0.4988,-0.26158,-0.913831,-1.142911,0.0
1,18,0.401561,3.064912,-0.169231,-0.1371907,-0.363041,-0.448226,1.492195,1.204078,1.019958,...,0.214905,-0.128543,0.656755,1.079414,-0.623134,-0.4988,-0.26158,-0.73873,0.874959,0.0
2,21,1.235029,-7.250133e-15,-1.617621,-0.3538028,0.940787,0.476616,0.283424,-0.9198916,1.263963,...,0.214905,0.500618,0.962736,0.111404,1.60479,-0.4988,-0.26158,1.420848,0.874959,0.0
3,25,0.609928,-0.4043308,-0.169231,-0.3538028,0.033776,-1.064787,1.942176,0.496088,-0.6067435,...,0.214905,0.011269,-0.56717,2.629045,-0.623134,-0.4988,-0.26158,-0.271795,0.874959,0.0
4,26,-0.848641,-7.250133e-15,0.865333,-1.374215e-15,-0.192976,0.939037,-0.45772,-3.698949e-15,7.224005e-16,...,0.025499,0.64043,0.350774,0.432718,1.60479,-0.4988,-0.26158,0.545343,0.874959,0.0


In [9]:
X = df_ml[list(set(df_ml.columns) - {'subject_id', 'label'})]
Y = df_ml['label'].astype(int)

### Model k-fold CV

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

def kfold(estimator, X, Y):
    kfold = StratifiedKFold(n_splits=10, shuffle=True)
    results = cross_val_score(estimator, X, Y, cv=kfold)
    print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [13]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(Y), Y)
class_weights

array([0.62030971, 2.5779703 ])

### LR

In [14]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1., class_weight='balanced', solver='lbfgs')
kfold(lr, X, Y)

Baseline: 64.85% (0.85%)


### RF

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
rf = CalibratedClassifierCV(RandomForestClassifier(n_estimators=800, 
                                                   min_samples_split=2, 
                                                   min_samples_leaf=4, 
                                                   max_features='sqrt', 
                                                   max_depth=90, 
                                                   bootstrap=True, 
                                                   n_jobs=-1),
                            method='sigmoid', cv=5)
kfold(rf, X, Y)

Baseline: 87.02% (0.63%)


### MLP

In [17]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier

def create_baseline():
    model = Sequential()
    model.add(Dense(60, input_dim=len(X.columns), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(30, input_dim=60, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', 
                  loss_weights=[class_weights[1]], optimizer='adam', metrics=['accuracy'])
    return model

mlp = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=64, verbose=0)
kfold(mlp, X, Y)

Using TensorFlow backend.


Baseline: 86.30% (0.76%)
