## Neural Network Training - LSA
### Balanced

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import BinaryCrossentropy

Load the data

In [5]:
training_feats_filepath = "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset/features/normalized_preprocessing/regression_nn/yale_new_haven_balanced_training_features.csv"
training_labels_filepath = "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset/labels/yale_new_haven_balanced_training_labels.csv"

In [6]:
X_train = pd.read_csv(training_feats_filepath)
y_train = pd.read_csv(training_labels_filepath)

In [7]:
train_ids = X_train['ID'].astype('int32')
X_train = X_train[[col for col in X_train if col != 'ID']]

In [8]:
disposition_var = {'disposition'}
demographic_vars = {'age', 'gender', 'ethnicity', 'race', 'lang',
       'religion', 'maritalstatus', 'employstatus', 'insurance_status'}
triage_evaluation_vars = {'dep_name', 'esi', 'arrivalmode', 'arrivalmonth', 'arrivalday', 'arrivalhour_bin'}.union({col for col in X_train.columns if 'triage_vital' in col})
chief_complaint_vars = {col for col in X_train.columns if "cc_" in col}
medication_vars = {col for col in X_train.columns if 'meds_' in col}
hospital_usage_stats_vars = {'previousdispo', 'n_edvisits', 'n_admissions', 'n_surgeries'}
imaging_ekg_vars = {'cxr_count','echo_count','ekg_count','otherxr_count', 'otherus_count', 'headct_count', 'otherct_count', 'mri_count','otherimg_count'}
historical_vital_vars = {'dbp_last',
 'dbp_max',
 'dbp_median',
 'dbp_min',
 'o2_device_last',
 'o2_device_max',
 'o2_device_median',
 'o2_device_min',
 'pulse_last',
 'pulse_max',
 'pulse_median',
 'pulse_min',
 'resp_last',
 'resp_max',
 'resp_median',
 'resp_min',
 'sbp_last',
 'sbp_max',
 'sbp_median',
 'sbp_min',
 'spo2_last',
 'spo2_max',
 'spo2_median',
 'spo2_min',
 'temp_last',
 'temp_max',
 'temp_median',
 'temp_min'}
curr = disposition_var.union(demographic_vars.union(triage_evaluation_vars.union(chief_complaint_vars.union(medication_vars.union(hospital_usage_stats_vars.union(imaging_ekg_vars.union(historical_vital_vars)))))))
past_medical_hist_vars = {col for col in X_train.columns if col not in curr and "_" not in col and col not in ['ID', 'previousdispo']}

cc_cols = list(chief_complaint_vars)
pmh_cols = list(past_medical_hist_vars)

cc_cols.sort()
pmh_cols.sort()

LSA on the Chief Complaint and the Past Medical History columns

In [9]:
tfidf_cc = TfidfTransformer()
tfidf_pmh = TfidfTransformer()

In [12]:
cc_long_features = tfidf_cc.fit_transform(X_train[cc_cols])
pmh_long_features = tfidf_pmh.fit_transform(X_train[pmh_cols])

In [26]:
k = 50
t_svd_cc = TruncatedSVD(k)
t_svd_pmh = TruncatedSVD(k)

cc_features = t_svd_cc.fit_transform(cc_long_features)
pmh_features = t_svd_pmh.fit_transform(pmh_long_features)

In [27]:
df_cc = pd.DataFrame(cc_features, columns=[f"cc_{i}" for i in range(k)])
df_pmh = pd.DataFrame(cc_features, columns=[f"pmh_{i}" for i in range(k)])

In [28]:
other_features = X_train[[col for col in X_train.columns if col not in cc_cols and col not in pmh_cols]]

In [34]:
X_train = pd.concat([other_features, df_cc, df_pmh], axis=1)

Setup the network

In [35]:
# use binary_crossentropy = BinaryCrossentropy(from_logits=True) if model returns values in range [-inf, inf]
# otherwise, model returns probabilities, then use from_logits=False (the default)
binary_crossentropy = BinaryCrossentropy()

In [36]:
model_1 = Sequential([
    Dense(512, activation="relu"),
    Dropout(0.3),
    Dense(256, activation="relu"),
    Dropout(0.2),
    Dense(1, activation="sigmoid")
])
model_1.compile(optimizer="adam", loss=binary_crossentropy, metrics=['accuracy'])

In [37]:
earlyStopping = EarlyStopping(monitor='val_loss', patience=3)

Train the model

In [38]:
history = model_1.fit(
    X_train, 
    y_train, 
    epochs=10, 
    batch_size=64, 
    callbacks=[earlyStopping], 
    validation_split=0.1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [39]:
model_1.evaluate(X_train, y_train)



[0.32342734932899475, 0.8605650663375854]

Save the model

In [40]:
nn_filepath = "/home/mila/d/david.hobson/scratch/models/balanced/experiments/nn_LSA"

In [41]:
model_1.save(nn_filepath)

INFO:tensorflow:Assets written to: /home/mila/d/david.hobson/scratch/models/balanced/experiments/nn_LSA/assets
