## Neural Network Training - Pretrained Embedding Model with Dimension Reduction

### Balanced Dataset

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, concatenate, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import BinaryCrossentropy

Load the data

In [None]:
# Both training files live under the same balanced-dataset directory, so the
# shared root is spelled out once and the two paths are derived from it.
balanced_dataset_dir = "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset"

# Feature matrix (CSV) for the balanced training set.
training_feats_filepath = (
    f"{balanced_dataset_dir}/features/pretrained_embeddings/PubMedBERT/regression_nn/"
    "balanced_training_set.csv"
)
# Labels (CSV) for the balanced training set.
training_labels_filepath = (
    f"{balanced_dataset_dir}/labels/yale_new_haven_balanced_training_labels.csv"
)

In [None]:
# Read the feature matrix and the labels from their CSV files in one pass;
# the order of the paths matches the order of the unpacked names.
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (training_feats_filepath, training_labels_filepath)
)

In [None]:
# Keep the record identifiers aside (as 32-bit integers) and remove the 'ID'
# column from the feature matrix so it is not fed to the network as a feature.
id_column = 'ID'
train_ids = X_train[id_column].astype('int32')
X_train = X_train.drop(columns=[id_column])

Separate the features

In [None]:
def _columns_containing(tag):
    """Return the X_train column names that contain *tag*, in frame order."""
    return [name for name in X_train.columns if tag in name]


# Column groups are selected by substring match on the column name.
# NOTE(review): this is a substring test, not a prefix test, so a name such as
# "acc_x" would also land in cc_cols - confirm that no such columns exist.
cc_cols = _columns_containing("cc_")
pmh_cols = _columns_containing("pmh_")

# Everything that belongs to neither group, still in the original column order.
embedding_cols = set(cc_cols) | set(pmh_cols)
other_cols = [name for name in X_train.columns if name not in embedding_cols]

Set up the network

In [None]:
# Input widths are derived from the column groups, so the network always
# matches whatever feature file was loaded.
cc_original_embedding_size = len(cc_cols)
pmh_original_embedding_size = len(pmh_cols)
num_other_features = len(other_cols)

# Chief complaint branch: a dense layer reduces the input embedding to 50 units.
cc_input = Input(shape=(cc_original_embedding_size,), name='cc')
cc_embedded_features = Dense(50, activation='relu')(cc_input)

# Past medical history branch: same reduction to 50 units.
pmh_input = Input(shape=(pmh_original_embedding_size,), name='pmh')
pmh_embedded_features = Dense(50, activation='relu')(pmh_input)

# Remaining features are passed through without any reduction.
other_input = Input(shape=(num_other_features,), name='other')

# Merge all available features into a single large vector via concatenation
x = concatenate([other_input, cc_embedded_features, pmh_embedded_features])

# Classification head: two hidden layers with dropout, then a single sigmoid
# unit that yields the probability of the positive class.
dense_1 = Dense(512, activation='relu')(x)
dropout_1 = Dropout(0.3)(dense_1)
dense_2 = Dense(256, activation='relu')(dropout_1)
dropout_2 = Dropout(0.2)(dense_2)
output = Dense(1, activation='sigmoid', name='output')(dropout_2)

# Instantiate the end-to-end model: three named inputs, one binary output.
# The input/output names are the keys used when feeding data to fit/evaluate.
model = Model(
    inputs=[other_input, cc_input, pmh_input],
    outputs=output
)

# BinaryCrossentropy is already imported from tensorflow.keras.losses at the
# top of the notebook; the default from_logits=False matches the sigmoid output.
binary_crossentropy = BinaryCrossentropy()
model.compile(optimizer="adam", loss=binary_crossentropy, metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Stop training once the validation loss has not improved for 3 consecutive
# epochs. restore_best_weights=True rolls the model back to the epoch with the
# lowest val_loss; without it the model that is later evaluated and saved keeps
# the weights of the last epoch run, i.e. `patience` epochs past the best one.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Features are keyed by the name of the Input layer they feed, and the labels
# by the name of the output layer.
train_inputs = {
    'other': X_train[other_cols],
    'cc': X_train[cc_cols],
    'pmh': X_train[pmh_cols],
}
train_targets = {'output': y_train}

# Train for at most 10 epochs, holding out 10% of the rows for validation so
# the early-stopping callback has a val_loss to monitor.
# NOTE(review): Keras takes the validation split from the end of the data
# before shuffling - confirm the CSV rows are not ordered by label.
history = model.fit(
    train_inputs,
    train_targets,
    epochs=10,
    batch_size=64,
    callbacks=[earlyStopping],
    validation_split=0.1,
)

In [None]:
# Score the trained model on the full training set (including the rows that
# were held out for validation during fit); this is not a held-out test score.
eval_inputs = {
    'other': X_train[other_cols],
    'cc': X_train[cc_cols],
    'pmh': X_train[pmh_cols],
}
model.evaluate(eval_inputs, y_train)

Save the model

In [None]:
# Destination for the trained model. Unlike the '~'-relative dataset paths
# above, this is an absolute path tied to one user's home directory.
nn_filepath = "/home/mila/d/david.hobson/scratch/models/balanced/experiments/pretrained/PubMedBERT/pretrained_embedding_reduced/"

In [None]:
# Persist the trained model at nn_filepath.
# NOTE(review): the path has no file extension; whether this is accepted (and
# which on-disk format results) depends on the installed Keras version - confirm.
model.save(nn_filepath)