# One row per patient model

To keep it simple, to start off with we simply model the patients as a single row, i.e. each feature is a scalar or a categorical variable.

See `03_mortality_red_dataset` notebook for more info.

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

establish connection to DB and define helper function for running queries

In [1]:
import pandas as pd
from proto.etl.config import SSHInfoEicu, DBInfoEicu
from proto.etl.utils import connect_to_db_via_ssh, run_eicu_query, get_column_completeness, load_schema_for_modelling

conn = connect_to_db_via_ssh(SSHInfoEicu, DBInfoEicu)
cursor = conn.cursor()
query_schema = 'set search_path to eicu_crd;'

#### Load data, get target variable

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, MinMaxScaler

from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Dropout, concatenate, BatchNormalization, SpatialDropout1D
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [7]:
# load vars
df = pd.read_csv('orpp_all.csv').set_index('patientunitstayid')

# load targets
query = """
select p.patientunitstayid, i.hosp_mort, icu_los_hours
from patient_top5hospitals_mort_dataset p
inner join icustay_detail i
on p.patientunitstayid=i.patientunitstayid
"""
df_y = run_eicu_query(query, conn).set_index('patientunitstayid')

# there are 100 missing mortality labels, we impute them with zero
#re-order y vars to match the order of X
df_y.fillna(0, inplace=True)
df_y = df_y.loc[df.index]

#### Setup encoders for the categorical input vars

In [9]:
cat_encoders = {
    'ethnicity': LabelEncoder(),
    'hospital_region': LabelEncoder(),
    'unittype': LabelEncoder(),
    'apachedxgroup': LabelEncoder()
}
for col, label_encoder in cat_encoders.items():
    df[col] = label_encoder.fit_transform(df[col])

#### Normalise numeric input features

In [10]:
cat_vars = [
    'ethnicity',
    'hospital_region',
    'unittype',
    'apachedxgroup'
]
num_cols = list(df.columns[4:].values)

# we don't want to scale the embed dims
embed_dims = 100
num_cols_to_scale = num_cols[:-embed_dims]
scaler = RobustScaler(quantile_range=(10.0, 90.0))
# scaler = MinMaxScaler()
df[num_cols_to_scale] = scaler.fit_transform(df[num_cols_to_scale].values)

#### Setup training/test data

In [11]:
# do train-test split without scikit to keep the data with its col names - see later
np.random.seed(42)
test_ratio = 0.1
train_ix = np.random.rand(len(df)) < 1 - test_ratio

# define X and y, then split it into train (90%) and test (10%)
X = df.values
y = df_y['hosp_mort'].values
df_X_train = df[train_ix]
y_train = y[train_ix]
df_X_test = df[~train_ix]
y_test = y[~train_ix]

In [12]:
def get_data_dict(df, num_cols):
    return {
        'ethnicity': np.array(df.ethnicity),
        'hospital_region': np.array(df.hospital_region),
        'unittype': np.array(df.unittype),
        'apachedxgroup': np.array(df.apachedxgroup),
        'num_cols': df[num_cols].values
    }
X_train = get_data_dict(df_X_train, num_cols)
X_test = get_data_dict(df_X_test, num_cols)  

#### Define basic FFN model and its hyperparams

In [22]:
EMBED_SIZE = 5
DENSE_SIZE = 512
BATCH_SIZE = 2048
EPOCHS = 100
DROPOUT = 0.25
STEPS = int(len(X_train) / BATCH_SIZE) * EPOCHS

# set up decaying learning rate for Adam
LR_INIT, LR_FIN = 0.001, 0.0001
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
lr_decay = exp_decay(LR_INIT, LR_FIN, STEPS)
optimizer_adam = Adam(lr=0.001, decay=lr_decay, amsgrad=True)

In [14]:
# cat features embedded
in_et = Input(shape=(1,), name='ethnicity')
emb_et = Embedding(cat_encoders['ethnicity'].classes_.shape[0], 2)(in_et)
in_hr = Input(shape=(1,), name='hospital_region')
emb_hr = Embedding(cat_encoders['hospital_region'].classes_.shape[0], 2)(in_hr)
in_ut = Input(shape=(1,), name='unittype')
emb_ut = Embedding(cat_encoders['unittype'].classes_.shape[0], 2)(in_ut)
in_ag = Input(shape=(1,), name='apachedxgroup')
emb_ag = Embedding(cat_encoders['apachedxgroup'].classes_.shape[0], 5)(in_ag)

cat_feats = concatenate([emb_et, emb_hr, emb_ut, emb_ag])
cat_feats = Flatten()(cat_feats)

# num features with 2 layers and dropout
in_num = Input(shape=(len(num_cols),), name='num_cols')
num_feats = Dense(DENSE_SIZE, activation='relu')(in_num)
num_feats = Dropout(DROPOUT)(num_feats)
num_feats = Dense(DENSE_SIZE/2, activation='relu')(num_feats)
num_feats = Dropout(DROPOUT)(num_feats)

# concat cat and num features add final dense layer, output layer and compile model
all_feats = concatenate([cat_feats, num_feats])
all_feats = Dense(DENSE_SIZE/4, activation='relu')(all_feats)
all_feats = Dropout(DROPOUT)(all_feats)

out = Dense(1, activation=None)(all_feats)
model = Model(inputs=[in_et, in_hr, in_ut, in_ag, in_num], outputs=out)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer_adam, 
    metrics=['accuracy']
)
model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
num_cols (InputLayer)           (None, 207)          0                                            
__________________________________________________________________________________________________
ethnicity (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
hospital_region (InputLayer)    (None, 1)            0                                            
__________________________________________________________________________________________________
unittype (InputLayer)           (None, 1)            0                                       

#### Train model

In [25]:
tf.test.is_gpu_available()

True

In [23]:
model.fit(
    X_train, y_train, 
    batch_size=BATCH_SIZE, 
    class_weight={0: 0.1, 1: 0.9},
    epochs=EPOCHS, 
    shuffle=True, 
    verbose=2
)

Epoch 1/100
 - 0s - loss: 7.8781 - acc: 0.9039
Epoch 2/100
 - 0s - loss: 7.8736 - acc: 0.9039
Epoch 3/100
 - 0s - loss: 7.8778 - acc: 0.9039
Epoch 4/100
 - 0s - loss: 7.8767 - acc: 0.9039
Epoch 5/100
 - 0s - loss: 7.8557 - acc: 0.9039
Epoch 6/100
 - 0s - loss: 7.8763 - acc: 0.9039
Epoch 7/100
 - 0s - loss: 7.8728 - acc: 0.9039
Epoch 8/100
 - 0s - loss: 7.8754 - acc: 0.9039
Epoch 9/100
 - 0s - loss: 7.8706 - acc: 0.9039
Epoch 10/100
 - 0s - loss: 7.8765 - acc: 0.9039
Epoch 11/100
 - 0s - loss: 7.8799 - acc: 0.9039
Epoch 12/100
 - 0s - loss: 7.8739 - acc: 0.9039
Epoch 13/100
 - 0s - loss: 7.8763 - acc: 0.9039
Epoch 14/100
 - 0s - loss: 7.8728 - acc: 0.9039
Epoch 15/100
 - 0s - loss: 7.8792 - acc: 0.9039
Epoch 16/100
 - 0s - loss: 7.8723 - acc: 0.9039
Epoch 17/100
 - 0s - loss: 7.8619 - acc: 0.9039
Epoch 18/100
 - 0s - loss: 7.8671 - acc: 0.9039
Epoch 19/100
 - 0s - loss: 7.8717 - acc: 0.9039
Epoch 20/100
 - 0s - loss: 7.8655 - acc: 0.9039
Epoch 21/100
 - 0s - loss: 7.8802 - acc: 0.9039
E

KeyboardInterrupt: 