## MIMICIII Mechanical Ventilation MAgECs

In [None]:
import numpy as np
import pandas as pd
import psycopg2
import os 
import random
import datetime
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

%matplotlib inline

random.seed(22891)

In [None]:
pd.set_option('display.max_columns', None)

### Get data

In [None]:
# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

# NOTE: the user name is also used as the password in the connection URL
engine = create_engine("postgresql+psycopg2://{}:{}@/{}".format(sqluser, sqluser, dbname))

# the context manager closes the connection even when a statement raises
with engine.connect() as conn:
    conn.execute('SET search_path to ' + schema_name)
    # load the whole study table into memory
    df = pd.read_sql("SELECT * FROM mimic_users_study;", conn)

### Featurize

In [None]:
# Column-name groups consumed by featurize(), featurize_time() and impute().
# vitals and labs are mean-imputed together by impute(); comobs are filled with 0.
vitals = ['heartrate_mean', 'sysbp_mean', 'diasbp_mean', 'meanbp_mean',
          'resprate_mean', 'tempc_mean', 'spo2_mean', 'glucose_mean']
labs = ['aniongap', 'albumin', 'bicarbonate', 'bilirubin', 'creatinine', 
        'chloride', 'glucose', 'hemoglobin', 'lactate', 
        'magnesium', 'phosphate', 'platelet', 'potassium', 'ptt', 'inr', 
        'pt', 'sodium', 'bun', 'wbc']  # -hematocrit
# co-morbidity columns
comobs = ['congestive_heart_failure', 'chronic_pulmonary', 'pulmonary_circulation']
# remaining columns; these are featurized but not touched by impute()
others = ['age', 'gender']

In [None]:
def last_val(x):
    """
    Return the last non-missing entry of ``x``, or None when there is none.

    ``x`` may be a numpy array, a pandas Series or any other sequence.
    Entries are always selected by position, so a Series with an integer
    index cannot trigger a label lookup for ``-1``. Missing entries are
    detected with ``pd.isnull`` (NaN / None / NaT), which, unlike
    ``np.isnan``, also accepts object-dtype data such as strings.
    """
    arr = np.asarray(x)
    vals = arr[~pd.isnull(arr)]
    if len(vals):
        return vals[-1]
    return None
    
def featurize_time(df):
    """
    Build the running "last observed value" of every feature for one group.

    For each row position i and each column listed in labs, vitals, comobs
    and others, the result holds the most recent non-missing value among
    rows 0..i of ``df`` (None while nothing has been observed yet).
    'timepoint' is copied from ``df.timepoint`` and 'label' holds
    ``df.ventilated`` converted to int, one entry per row.

    Returns a pandas Series mapping column name -> per-row sequence.
    """
    out = dict()
    for col in labs + vitals + comobs + others:
        values = df[col].values
        observed = pd.notnull(values)
        running = []
        last = None
        # single pass: carry the latest observed value forward
        for val, seen in zip(values, observed):
            if seen:
                last = val
            running.append(last)
        out[col] = running
    # these do not depend on the row position, so they are set once
    out['timepoint'] = df.timepoint.values
    out['label'] = [int(x) for x in df.ventilated.values]
    return pd.Series(out)

def featurize(df):
    """
    Collapse one group of rows into a single feature row.

    Every column listed in labs, vitals, comobs and others is reduced to its
    last non-missing value (None when the column has no observation), and
    'label' is the int value of the last ``ventilated`` entry.

    Returns a pandas Series mapping column name -> value.
    """
    out = dict()
    for col in labs + vitals + comobs + others:
        # hand last_val a plain array (as featurize_time does) so that its
        # ``[-1]`` lookup is positional and never resolved against the index
        out[col] = last_val(df[col].values)
    out['label'] = int(df.ventilated.iloc[-1])
    return pd.Series(out)

### Example from 'original' dataframe

In [None]:
df[df['subject_id']==4].head()

In [None]:
df[df['subject_id']==4].tail()

### Dataframe w/o time (for 'static' models)

In [None]:
# one row per subject: group on the first index level (subject_id) and featurize each group
per_subject = df.set_index(['subject_id', 'timepoint']).groupby(level=0, group_keys=False)
df_ml = per_subject.apply(featurize).reset_index()

In [None]:
df_ml[df_ml['subject_id']==4].head()

### Dataframe w/ time (for 'timepoint' MAgECs)

In [None]:
# featurize_time returns one sequence per column for each subject;
# explode turns those sequences back into one row per timepoint
subject_groups = df.set_index(['subject_id']).groupby(level=0, group_keys=False)
df_time = subject_groups.apply(featurize_time).apply(pd.Series.explode).reset_index()

In [None]:
df_time[df_time['subject_id']==4].head()

In [None]:
df_time[df_time['subject_id']==4].tail()

### Train/Valid Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
seed = 7
np.random.seed(seed)

# keep the feature columns in their DataFrame order; a set difference would
# produce an arbitrary column order that can differ between interpreter runs
feature_cols = [c for c in df_ml.columns if c not in ('subject_id', 'label')]
x = df_ml[feature_cols]
Y = df_ml[['subject_id', 'label']]

# 80/20 split; Y keeps subject_id so the same subjects can be selected again later
x_train, x_validation, Y_train, Y_validation = train_test_split(x.copy(), Y, test_size=0.2, random_state=seed)

### Impute vitals+labs with mean and co-morbidities with 0

In [None]:
def impute(df, means=None):
    """
    Fill missing vitals/labs with column means and missing co-morbidities with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns listed in vitals, labs and comobs.
        It is modified in place and also returned.
    means : optional
        Fill values for the vitals+labs columns, in any form accepted by
        ``DataFrame.fillna`` (e.g. the column means of the training split,
        to avoid imputing validation data with its own statistics).
        Defaults to the column means of ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.
    """
    measured = vitals + labs
    if means is None:
        means = df[measured].mean()
    df[measured] = df[measured].fillna(means)
    df[comobs] = df[comobs].fillna(0)
    return df

In [None]:
# each split is imputed separately, so the fill values are the column means of that split
x_train = impute(x_train)
x_validation = impute(x_validation)

### Scale data

In [None]:
from sklearn.preprocessing import StandardScaler

# fit the scaler on the training split only, then reuse it for validation;
# results are wrapped back into DataFrames with the original index/columns
stsc = StandardScaler()
xst_train = pd.DataFrame(stsc.fit_transform(x_train),
                         index=x_train.index,
                         columns=x_train.columns)
xst_validation = pd.DataFrame(stsc.transform(x_validation),
                              index=x_validation.index,
                              columns=x_validation.columns)

### Train 'static' models
These are single timepoint (single row) models. The training data is grouped by patient and all timepoints are condensed to a single 'last' timepoint.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


def predict(model, data):
    """
    Model output (predicted) probabilities.
    Wrapper for predict_proba function in scikit-learn models.
    When a model does not have a predict_proba use predict interface.

    For a two-column predict_proba output the second column is returned;
    any other predict_proba output (one column, 1-D, more than two columns)
    is flattened. Without predict_proba the output of ``model.predict`` is
    returned as a numpy array, unflattened.
    """
    if hasattr(model, 'predict_proba'):
        probs = np.asarray(model.predict_proba(data))
        # check ndim first: a 1-D output has no shape[1]
        if probs.ndim == 2 and probs.shape[1] == 2:
            probs = probs[:, 1].ravel()
        else:
            probs = probs.ravel()
    else:
        probs = np.array(model.predict(data))
    return probs


def predict_classes(model, data):
    """
    Predicted classes as a flattened array.

    Calls the model's ``predict_classes`` method when it has one and
    ``predict`` otherwise.
    """
    method_name = 'predict_classes' if hasattr(model, 'predict_classes') else 'predict'
    return getattr(model, method_name)(data).ravel()

    
def evaluate(model, x_test, y_test):
    """
    Print classification metrics of ``model`` on a test set.

    Prints accuracy, precision, recall, F1, ROC AUC and the confusion
    matrix, computed from the outputs of ``predict`` (probabilities) and
    ``predict_classes`` (class labels). Returns None.
    """
    # predict probabilities for test set
    yhat_probs = np.asarray(predict(model, x_test))

    # predict classes for test set
    yhat_classes = np.asarray(predict_classes(model, x_test))

    # reduce to 1d array; each array is checked on its own because the
    # classes are already flattened even when the probabilities are 2-D
    if yhat_probs.ndim > 1:
        yhat_probs = yhat_probs[:, 0]
    if yhat_classes.ndim > 1:
        yhat_classes = yhat_classes[:, 0]

    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_test, yhat_classes)
    print('Accuracy: %f' % accuracy)

    # precision tp / (tp + fp)
    precision = precision_score(y_test, yhat_classes)
    print('Precision: %f' % precision)

    # recall: tp / (tp + fn)
    recall = recall_score(y_test, yhat_classes)
    print('Recall: %f' % recall)

    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(y_test, yhat_classes)
    print('F1 score: %f' % f1)

    # ROC AUC
    auc = roc_auc_score(y_test, yhat_probs)
    print('ROC AUC: %f' % auc)

    # confusion matrix
    matrix = confusion_matrix(y_test, yhat_classes)
    print(matrix)

In [None]:
from sklearn.utils import class_weight
# 'balanced' weights, one per class in np.unique order; classes and y are
# passed by keyword because newer scikit-learn releases reject them positionally
class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=np.unique(Y_train['label']),
                                                  y=Y_train['label'])
class_weights

#### LR

In [None]:
from sklearn.linear_model import LogisticRegression
# class-balanced logistic regression fitted on the scaled training features
lr = LogisticRegression(
    C=1.,
    class_weight='balanced',
    solver='lbfgs',
)
lr.fit(xst_train, Y_train['label'])

In [None]:
evaluate(lr, xst_validation, Y_validation['label'])

#### RF

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
# random forest whose scores are calibrated with a sigmoid fit over 5 CV folds
base_forest = RandomForestClassifier(
    n_estimators=800,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=90,
    bootstrap=True,
    n_jobs=-1,
)
rf = CalibratedClassifierCV(base_forest, method='sigmoid', cv=5)
rf.fit(xst_train, Y_train['label'])

In [None]:
evaluate(rf, xst_validation, Y_validation['label'])

#### MLP

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier

# two hidden ReLU layers with dropout, sigmoid output for the binary label
mlp = Sequential()
mlp.add(Dense(60, input_dim=len(xst_train.columns), activation='relu'))
mlp.add(Dropout(0.2))
mlp.add(Dense(30, input_dim=60, activation='relu'))
mlp.add(Dropout(0.2))
mlp.add(Dense(1, activation='sigmoid'))
mlp.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# class imbalance is handled through fit(class_weight=...): compile's
# loss_weights weights model outputs, so with a single output it would only
# rescale the loss instead of weighting the classes
mlp.fit(xst_train, Y_train['label'], epochs=100, batch_size=64, verbose=0,
        class_weight={0: class_weights[0], 1: class_weights[1]})

In [None]:
evaluate(mlp, xst_validation, Y_validation['label'])

### Time-aware (LSTM) model

#### Data pre-processing

In [None]:
# Get train/valid
train_ind = df_time[~np.isin(df_time['subject_id'], Y_validation.subject_id.unique())].index
valid_ind = df_time[np.isin(df_time['subject_id'], Y_validation.subject_id.unique())].index

# Impute
df_series_train = impute(df_time.iloc[train_ind].copy())
df_series_valid = impute(df_time.iloc[valid_ind].copy())

# Get X, Y as numpy arrays
df_series_train_X = df_series_train[list(set(df_series_train.columns) - 
                                         {'subject_id', 'label', 'time_point'})].astype(float)

df_series_train_Y = df_series_train[['subject_id', 'label', 'time_point']]

df_series_valid_X = df_series_valid[list(set(df_series_valid.columns) - 
                                         {'subject_id', 'label', 'time_point'})].astype(float)

df_series_valid_Y = df_series_valid[['subject_id', 'label', 'timepoint']]

# scale
stsc2 = StandardScaler()
tmp = stsc2.fit_transform(df_series_train_X)
df_series_train_X = pd.DataFrame(tmp, index=df_series_train_X.index, columns=df_series_train_X.columns)
tmp = stsc2.transform(df_series_valid_X)
df_series_valid_X = pd.DataFrame(tmp, index=df_series_valid_X.index, columns=df_series_valid_X.columns)

In [None]:
# concat X/Y for train/valid
# put the scaled features and the bookkeeping columns back side by side
# (column-wise concat, rows aligned on the shared index)
df_series_train = pd.concat([df_series_train_X, df_series_train_Y], axis=1)
df_series_valid = pd.concat([df_series_valid_X, df_series_valid_Y], axis=1)

In [None]:
df_series_valid.head()

In [None]:
def zero_pad(df, length=25):
    """
    Convert a long-format frame into zero-padded per-subject arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'subject_id' and 'label'. Every column other than
        'subject_id', 'timepoint' and 'label' is treated as a feature and
        cast to float.
    length : int, default 25
        Number of time steps per subject in the output. Shorter series are
        padded with zero rows at the end.

    Returns
    -------
    (x, y) : tuple of numpy arrays
        x has shape (n_subjects, length, n_features), with features in
        DataFrame column order; y holds the first 'label' value of each
        subject.

    Raises
    ------
    ValueError
        If a subject has more than ``length`` rows.
    """
    # 'label' is the target and must not be part of the inputs; a list (not a
    # set) keeps the feature order stable and is a valid pandas indexer
    series_cols = [c for c in df.columns
                   if c not in ('subject_id', 'timepoint', 'label')]
    x = list()
    y = list()
    for subject, group in df.set_index(['subject_id']).groupby(level=0, group_keys=False):
        tmp = group[series_cols].astype(float).values
        if tmp.shape[0] > length:
            raise ValueError('subject %s has %d rows, more than length=%d'
                             % (subject, tmp.shape[0], length))
        x_data = np.zeros([length, tmp.shape[1]])
        x_data[:tmp.shape[0], :] = tmp
        x.append(x_data)
        y.append(np.array(group['label'].values[0]))
    return np.array(x), np.array(y)

In [None]:
# build the fixed-length tensors used by the LSTM cells below
# (previously: time_series_data(df_time, valid_ind), which is not defined in this notebook)
xt_train, Yt_train = zero_pad(df_series_train)
xt_valid, Yt_valid = zero_pad(df_series_valid)

In [None]:
# len(xt_train), len(xt_valid)

In [None]:
from keras.layers import LSTM
# one LSTM layer (128 units) followed by a single sigmoid output unit;
# the input shape is taken from xt_train without its first (sample) axis
lstm = Sequential()
lstm.add(LSTM(128, dropout=0.5, recurrent_dropout=0.2, input_shape=xt_train.shape[1:]))
lstm.add(Dense(1, activation='sigmoid'))
# NOTE(review): loss_weights weights the loss of each model *output*; with a
# single output this presumably only rescales the loss rather than weighting
# the classes - confirm against the Keras documentation
lstm.compile(loss='binary_crossentropy',
             loss_weights=[class_weights[1]],
             optimizer='adam', 
             metrics=['accuracy'])

In [None]:
# apply the 'balanced' class weights per sample during training; Keras takes
# them through fit(class_weight=...) as a {class index: weight} mapping
lstm.fit(xt_train, Yt_train, epochs=100, batch_size=64, verbose=0,
         class_weight={0: class_weights[0], 1: class_weights[1]})

In [None]:
evaluate(lstm, xt_valid, Yt_valid)