## Neural Network Training - Keras Embedding Model

### Balanced Dataset

In [None]:
import numpy as np
import pandas as pd

import scipy

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, concatenate, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import BinaryCrossentropy

from scipy.sparse import csc_matrix, save_npz

Load the data

In [None]:
# CSV files holding the balanced training split: one file for the labels and
# one for the feature columns.
training_labels_filepath = "yale_new_haven_balanced_training_labels.csv"
training_feats_filepath = "yale_new_haven_balanced_training_features.csv"

In [None]:
# Read the feature matrix and the labels into DataFrames.
# NOTE(review): the rows of the two files are presumably aligned one-to-one,
# since both are passed together to model.fit further down — confirm.
# NOTE(review): y_train is used as the training target exactly as loaded; if
# the labels CSV carries anything besides the label column (e.g. an 'ID'
# column) it would need to be dropped first — confirm the file layout.
X_train = pd.read_csv(training_feats_filepath)
y_train = pd.read_csv(training_labels_filepath)

In [None]:
# Set the 'ID' column aside (as int32), then rebuild X_train from every
# remaining column so the identifier is not used as a model feature.
train_ids = X_train['ID'].astype('int32')
X_train = X_train[[name for name in X_train.columns if name != 'ID']]

In [None]:
# Partition the column names into named groups. Groups written as literals
# list fixed column names; the others are derived from the column names that
# are actually present in X_train.
disposition_var = {'disposition'}
demographic_vars = {
    'age', 'gender', 'ethnicity', 'race', 'lang',
    'religion', 'maritalstatus', 'employstatus', 'insurance_status',
}
triage_evaluation_vars = {
    'dep_name', 'esi', 'arrivalmode', 'arrivalmonth', 'arrivalday', 'arrivalhour_bin',
} | {name for name in X_train.columns if 'triage_vital' in name}
# These two groups use a substring test (not a prefix test) on the column name.
chief_complaint_vars = {name for name in X_train.columns if "cc_" in name}
medication_vars = {name for name in X_train.columns if 'meds_' in name}
hospital_usage_stats_vars = {'previousdispo', 'n_edvisits', 'n_admissions', 'n_surgeries'}
imaging_ekg_vars = {
    'cxr_count', 'echo_count', 'ekg_count', 'otherxr_count', 'otherus_count',
    'headct_count', 'otherct_count', 'mri_count', 'otherimg_count',
}
# Every historical vital has four summary columns: last, max, median and min.
historical_vital_vars = {
    vital + '_' + stat
    for vital in ('dbp', 'o2_device', 'pulse', 'resp', 'sbp', 'spo2', 'temp')
    for stat in ('last', 'max', 'median', 'min')
}

# Union of all groups defined above.
curr = set().union(
    disposition_var,
    demographic_vars,
    triage_evaluation_vars,
    chief_complaint_vars,
    medication_vars,
    hospital_usage_stats_vars,
    imaging_ekg_vars,
    historical_vital_vars,
)
# Past-medical-history columns are identified by elimination: any column that
# belongs to no group above and whose name contains no underscore.
past_medical_hist_vars = {
    name for name in X_train.columns
    if name not in curr and "_" not in name and name not in ('ID', 'previousdispo')
}

# Sorted lists give a deterministic column order for the two embedded groups.
cc_cols = sorted(chief_complaint_vars)
pmh_cols = sorted(past_medical_hist_vars)

Load the sparse embedding inputs

In [None]:
# Load the chief-complaint embedding input, stored as a SciPy sparse matrix,
# and expand it into a dense DataFrame.
# NOTE(review): presumably each row is a sequence of integer token indices
# aligned with the rows of X_train — confirm against the script that wrote
# the .npz file.
cc_sparse_features = scipy.sparse.load_npz('cc_embedding_training_input.npz')
# toarray() returns a plain ndarray; todense() returns the legacy np.matrix type.
cc_features = pd.DataFrame(cc_sparse_features.toarray())

In [None]:
# Load the past-medical-history embedding input, stored as a SciPy sparse
# matrix, and expand it into a dense DataFrame.
# NOTE(review): presumably each row is a sequence of integer token indices
# aligned with the rows of X_train — confirm against the script that wrote
# the .npz file.
pmh_sparse_features = scipy.sparse.load_npz('pmh_embedding_training_input.npz')
# toarray() returns a plain ndarray; todense() returns the legacy np.matrix type.
pmh_features = pd.DataFrame(pmh_sparse_features.toarray())

In [None]:
# Every column that is neither a chief-complaint nor a past-medical-history
# column; these are passed to the network directly, without an embedding.
# NOTE(review): 'disposition' is not filtered out here, so if that column is
# present in X_train it becomes an input feature — confirm the features CSV
# does not contain the label.
other_features = X_train[[
    name for name in X_train.columns
    if name not in chief_complaint_vars and name not in past_medical_hist_vars
]]

Set up the network

In [None]:
# Sequence length of each embedded input = number of columns in its frame.
seq_len_cc = cc_features.shape[1]
seq_len_pmh = pmh_features.shape[1]

# Embedding's input_dim must be an integer larger than the highest index that
# occurs. DataFrame.max().max() returns a NumPy scalar (a float if the matrix
# was stored as float), so cast to a plain int before adding one.
# NOTE(review): the vocabulary size comes from the training data only; an index
# above this maximum at inference time would be out of range — confirm the
# encoder used for validation/test data cannot produce larger indices.
vocab_len_cc = int(cc_features.max().max()) + 1
vocab_len_pmh = int(pmh_features.max().max()) + 1

num_other_features = other_features.shape[1]

# NOTE(review): if index 0 is a padding value, GlobalAveragePooling1D averages
# the padded positions as well unless the Embedding uses mask_zero=True —
# confirm how the sequences were encoded.

# Chief complaint embedding. The sequence length is already fixed by the Input
# shape, so the (deprecated in Keras 3) input_length argument is not passed.
cc_input = Input(shape=(seq_len_cc, ), name='cc')
embedding_cc = Embedding(vocab_len_cc, 50)(cc_input)
cc_embedded_features = GlobalAveragePooling1D()(embedding_cc)

# Past Medical History embedding
pmh_input = Input(shape=(seq_len_pmh, ), name='pmh')
embedding_pmh = Embedding(vocab_len_pmh, 50)(pmh_input)
pmh_embedded_features = GlobalAveragePooling1D()(embedding_pmh)

# Other features
other_input = Input(shape=(num_other_features, ), name='other')

# Merge all available features into a single large vector via concatenation
x = concatenate([other_input, cc_embedded_features, pmh_embedded_features])

# Two fully connected layers with dropout, then a single sigmoid unit.
dense_1 = Dense(512, activation='relu')(x)
dropout_1 = Dropout(0.3)(dense_1)
dense_2 = Dense(256, activation='relu')(dropout_1)
dropout_2 = Dropout(0.2)(dense_2)
output = Dense(1, activation='sigmoid', name='output')(dropout_2)

# Instantiate the end-to-end model: three named inputs ('other', 'cc', 'pmh')
# mapped to one sigmoid output named 'output'.
model = Model(
    inputs=[other_input, cc_input, pmh_input],
    outputs=output
)

# Binary cross-entropy matches the single sigmoid output.
binary_crossentropy = BinaryCrossentropy()
model.compile(optimizer="adam", loss=binary_crossentropy, metrics=['accuracy'])

In [None]:
# Print a summary of the assembled model as a sanity check before training.
model.summary()

In [None]:
# Stop training once the validation loss has failed to improve for 3
# consecutive epochs. restore_best_weights=True rolls the model back to the
# epoch with the lowest validation loss; without it the model keeps the
# weights from the last (up to 3 epochs worse) epoch, and those are what the
# later evaluate/save cells would use.
earlyStopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
# Train for up to 50 epochs with early stopping. Inputs and target are passed
# as dicts keyed by the Input / output layer names used when the model was built.
# NOTE(review): validation_split=0.1 holds out the *last* 10% of the rows
# (taken before shuffling). If the CSV rows are ordered (e.g. by class), the
# validation slice will not be representative — confirm the rows are shuffled.
history = model.fit(
    x={'other': other_features, 'cc': cc_features, 'pmh': pmh_features},
    y={'output': y_train},
    batch_size=64,
    epochs=50,
    validation_split=0.1,
    callbacks=[earlyStopping],
)

In [None]:
# Loss and accuracy on the same data the model was fitted on (including the
# rows held out by validation_split); this is a training-set score, not an
# estimate of generalization.
model.evaluate(
    x={'other': other_features, 'cc': cc_features, 'pmh': pmh_features},
    y=y_train,
)

In [None]:
# NOTE(review): this call is identical to the one in the previous cell and
# again scores the training data. Presumably a held-out validation/test set
# was intended here — confirm, or remove the duplicate cell.
model.evaluate({'other': other_features, 'cc': cc_features, 'pmh': pmh_features}, y_train)

Save the model

In [None]:
# Destination passed to model.save below (a directory-style path, no file extension).
nn_filepath = "keras_embedding/"

In [None]:
# Persist the trained model to nn_filepath.
# NOTE(review): Keras 3 rejects save paths that lack a `.keras` or `.h5`
# extension; a bare directory path relies on the TF 2.x SavedModel behaviour —
# confirm the installed Keras version before changing the path.
model.save(nn_filepath)