# Keras Embedding Model - Preprocessing

In [None]:
import os

import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix, save_npz

import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

Load the training and test sets

In [None]:
# Read the training and test feature tables in one pass; the first path
# becomes X_train and the second becomes X_test.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset/features/normalized_preprocessing/regression_nn/yale_new_haven_balanced_training_features.csv",
        "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset/features/normalized_preprocessing/regression_nn/yale_new_haven_balanced_test_features.csv",
    )
)

Get the chief complaint and past medical history columns

In [None]:
# Named groups of feature columns. Any column that belongs to none of these
# groups and has no underscore in its name is treated as a past-medical-history
# indicator further down.
disposition_var = {'disposition'}
demographic_vars = {
    'age', 'gender', 'ethnicity', 'race', 'lang', 'religion',
    'maritalstatus', 'employstatus', 'insurance_status',
}
triage_evaluation_vars = (
    {'dep_name', 'esi', 'arrivalmode', 'arrivalmonth', 'arrivalday', 'arrivalhour_bin'}
    | {name for name in X_train.columns if 'triage_vital' in name}
)
chief_complaint_vars = {name for name in X_train.columns if "cc_" in name}
medication_vars = {name for name in X_train.columns if 'meds_' in name}
hospital_usage_stats_vars = {'previousdispo', 'n_edvisits', 'n_admissions', 'n_surgeries'}
imaging_ekg_vars = {
    'cxr_count', 'echo_count', 'ekg_count', 'otherxr_count', 'otherus_count',
    'headct_count', 'otherct_count', 'mri_count', 'otherimg_count',
}
historical_vital_vars = {
    'dbp_last', 'dbp_max', 'dbp_median', 'dbp_min',
    'o2_device_last', 'o2_device_max', 'o2_device_median', 'o2_device_min',
    'pulse_last', 'pulse_max', 'pulse_median', 'pulse_min',
    'resp_last', 'resp_max', 'resp_median', 'resp_min',
    'sbp_last', 'sbp_max', 'sbp_median', 'sbp_min',
    'spo2_last', 'spo2_max', 'spo2_median', 'spo2_min',
    'temp_last', 'temp_max', 'temp_median', 'temp_min',
}

# Every column that is already accounted for by one of the groups above.
curr = (
    disposition_var
    | demographic_vars
    | triage_evaluation_vars
    | chief_complaint_vars
    | medication_vars
    | hospital_usage_stats_vars
    | imaging_ekg_vars
    | historical_vital_vars
)
past_medical_hist_vars = {
    name
    for name in X_train.columns
    if not (name in curr or "_" in name or name in ['ID', 'previousdispo'])
}

# Sorted lists give a stable column order for the downstream cells.
cc_cols = sorted(chief_complaint_vars)
pmh_cols = sorted(past_medical_hist_vars)

Load the variable descriptions, which map each column name to the human-readable name of the condition

In [None]:
# Lookup table of variable descriptions; get_text_tensor reads its
# 'Variable Name' and 'Description' columns to translate column names.
df_vars = pd.read_csv("~/scratch/datasets/yale_new_haven/supplementary_info/variable_descriptions.csv")

### Train and apply the `TextVectorization` layer

In [None]:
# Columns to turn into text tokens. Only the past-medical-history columns are
# selected here; cc_cols is not used in the cells shown.
cols = pmh_cols

In [None]:
def get_text_tensor(df, df_vars, cols):
    """Turn binary indicator columns into per-row lists of description strings.

    For every row of ``df``, collects the descriptions of the columns in
    ``cols`` whose value equals 1 and packs the per-row lists into a ragged
    string tensor.

    Args:
        df: DataFrame containing the indicator columns named in ``cols``.
        df_vars: DataFrame with a 'Variable Name' column (matching the names in
            ``cols``) and a 'Description' column holding the text to emit.
        cols: Names of the indicator columns to convert.

    Returns:
        A tuple ``(tensor, max_seq_length)``. ``tensor`` is a ragged string
        tensor with one row per row of ``df``; within a row the descriptions
        follow the order of ``cols``. ``max_seq_length`` is the largest number
        of active columns found in any row, as a plain ``int`` (0 when there
        are no rows or no active columns).

    Raises:
        KeyError: If a name in ``cols`` is missing from ``df`` or from
            ``df_vars['Variable Name']``.
    """
    cols = list(cols)

    # map each variable name to its human-readable description
    name_to_description = df_vars.set_index('Variable Name')['Description'].loc[cols].to_dict()
    descriptions = np.array([name_to_description[col] for col in cols], dtype=object)

    # Pick the active descriptions row by row with a boolean mask. The previous
    # `df.T.apply(lambda x: list(...))` returned a DataFrame instead of a Series
    # of lists whenever every row had the same number of active columns, which
    # silently produced wrong tokens; it also transposed the whole frame.
    is_active = df[cols].to_numpy() == 1
    tokens_per_row = [descriptions[row_mask].tolist() for row_mask in is_active]

    max_seq_length = max(map(len, tokens_per_row), default=0)

    # dtype and ragged_rank are explicit so that empty rows (or an empty frame)
    # still yield a ragged string tensor rather than an inferred float one.
    return tf.ragged.constant(tokens_per_row, dtype=tf.string, ragged_rank=1), max_seq_length

In [None]:
# Build the ragged text tensors for both splits. Only train_max_seq_length is
# used below (as the vectorizer's output_sequence_length); test_max_seq_length
# is not referenced again in the cells shown.
train_tensor, train_max_seq_length = get_text_tensor(X_train, df_vars, cols)
test_tensor, test_max_seq_length = get_text_tensor(X_test, df_vars, cols)

Train the vectorizer

In [None]:
# Each element of the input is already one complete token, so the layer is
# configured to neither standardize nor split the text. Every output row is
# padded or truncated to the longest sequence seen in the training set.
vectorize_layer = TextVectorization(
    output_mode='int',
    output_sequence_length=train_max_seq_length,
    split=None,
    standardize=None,
)

In [None]:
# Build the vocabulary from the training split only.
vectorize_layer.adapt(train_tensor)

In [None]:
# Number of entries in the learned vocabulary.
# NOTE(review): by default get_vocabulary() also returns the padding and OOV
# tokens, so this count is presumably larger than the number of conditions.
len(vectorize_layer.get_vocabulary())

Save the results as sparse matrices

In [None]:
# Map each split to its padded matrix of integer token ids ...
train_token_ids = vectorize_layer(train_tensor)
test_token_ids = vectorize_layer(test_tensor)

# ... and store it in compressed sparse column form; zero entries are kept
# implicit by the sparse format.
train_features = csc_matrix(train_token_ids)
test_features = csc_matrix(test_token_ids)

In [None]:
# Write the vectorized inputs under the same ~/scratch tree the features were
# read from, instead of hard-coding one user's absolute home directory.
# save_npz does not expand '~' itself, so the directory is resolved explicitly.
embedding_dir = os.path.expanduser(
    '~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset/features/keras_embeddings'
)
os.makedirs(embedding_dir, exist_ok=True)  # save_npz will not create missing directories

save_npz(os.path.join(embedding_dir, 'pmh_embedding_training_input.npz'), train_features)
save_npz(os.path.join(embedding_dir, 'pmh_embedding_test_input.npz'), test_features)