## Ped-BERT embeddings

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@wisc.edu <br>
Date created: June 8, 2022 <br>

Citations for MLM: https://keras.io/examples/nlp/masked_language_modeling/

Citations for ICD10 to ICD9: https://github.com/AtlasCUMC/ICD10-ICD9-codes-conversion

### Step 1: Import packages

In [None]:
# standard
import pandas as pd
import numpy as np
import os
import glob
import re
from pprint import pprint
import importlib
import time

# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from dataclasses import dataclass
tf.keras.backend.set_floatx('float64')

#!pip install icd9cms
from icd9cms.icd9 import search

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# user defined
import utils_dt_prep
import utils_MLM
import utils_MLM_eval
import utils_embeddings

# matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

# suppress TensorFlow log messages below ERROR level
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

---
### Step 2: Set-up config

In [None]:
@dataclass
class Config:
    """Notebook-wide settings: data limits, feature-key sets and run switches.

    All values are plain class attributes (no annotations), so the dataclass
    decorator does not turn them into constructor fields.
    """

    # --- sequence / batching ---
    MAX_LEN = 40         # of diagnosis history
    BATCH_SIZE = 32

    # --- cohort and diagnosis selection ---
    PAT_MIN_LENGTH = 3   # minimum number of visits
    DIAG_PER_VISIT = 3   # diagnoses per visit to consider
    DIAG_LENGTH = 2      # how many digits from diagnosis code to consider

    # --- train/test split ---
    train_pct = 0.8
    seed = 1789

    # --- feature-key combinations that can feed the model ---
    KEYS_diag = ["diag"]
    KEYS_diag_age = ["diag", "age"]
    KEYS_diag_cnty = ["diag", "cnty"]
    KEYS_diag_age_cnty = ["diag", "age", "cnty"]

    # --- run switches: True recomputes the artefact; for create_Xy,
    # create_vect_layer and draw_rlnIs, False loads it from disk instead ---
    create_Xy = False
    create_vect_layer = False
    draw_rlnIs = False
    create_model = True
    run_model = True
    plot_model = True
    load_model = False

    # --- evaluation in chunks ---
    chunk_splits = True
    chunk_split_size = 2


config = Config()

---
### Step 3: Read data

``read medical records for all patients with SSN and birth records``

In [None]:
# Load the encounter table for patients that have an SSN and a birth record.
df_init_bpe = utils_dt_prep.read_data_bpe()

# Sanity check: distinct patient ids, table shape, and a two-row preview.
print('Unique patients ', df_init_bpe['rlnI_updated'].nunique())
print('Number of encounters (shape of data) ', df_init_bpe.shape)
df_init_bpe.head(2)

``read medical records for all patients with SSN (includes those with birth records)``

In [None]:
# Pick up any edits made to the helper module since it was first imported.
importlib.reload(utils_dt_prep)

# Load the encounter table for every patient with an SSN.
df_init_pe = utils_dt_prep.read_data_pe()

# Sanity check: distinct patient ids, table shape, and a two-row preview.
print('Unique patients ', df_init_pe['rlnI_updated'].nunique())
print('Number of encounters (shape of data) ', df_init_pe.shape)
df_init_pe.head(2)

In [None]:
# Number of non-null bthyearI entries per data source.
df_init_pe.groupby(['data_source'], as_index=False)['bthyearI'].count()

``keep only non-birth records data``

In [None]:
# Remove every encounter whose patient id also appears in the birth-records table.
rlnI_in_bpe = df_init_bpe['rlnI_updated'].unique()
has_birth_record = df_init_pe['rlnI_updated'].isin(rlnI_in_bpe)
df_init_pe = df_init_pe[~has_birth_record]

# (kept for reference) the two tables could instead be stacked:
#df_init = pd.concat([df_init_bpe, df_init_pe], axis=0)

# Seeded shuffle of the row order (rows are encounters), then renumber rows.
np.random.seed(config.seed)
shuffle = np.random.permutation(np.arange(len(df_init_pe)))
df_init = df_init_pe.iloc[shuffle].reset_index(drop=True)

# The source tables are no longer needed; release them.
del df_init_bpe, df_init_pe

# Sanity check on the resulting table.
print('Unique patients ', df_init['rlnI_updated'].nunique())
print('Number of encounters (shape of data) ', df_init.shape)

df_init.head(2)

---

how many unique births if I keep only patients with at least 3 hospital/ER visits

In [None]:
# Report how many patients have at least config.PAT_MIN_LENGTH hospital/ER
# visits. This cell only prints a number; df_init itself is not modified.
#
# The count is taken directly on df_init: groupby does not mutate its input,
# so no full copy of the encounter table is needed, and the filtered frame the
# earlier version built afterwards was deleted without ever being used.
visits_per_patient = df_init.groupby('rlnI_updated', as_index=False).admtdate.count()
eligible = visits_per_patient[visits_per_patient.admtdate.ge(config.PAT_MIN_LENGTH)]
print('Unique patients ', eligible.rlnI_updated.nunique())

# release the temporary frames
del visits_per_patient, eligible

---
### Step 4: Preprocess data

``drop observations and add features``

In [None]:
# Re-import the helper module so edits to utils_dt_prep take effect without
# restarting the kernel.
importlib.reload(utils_dt_prep)
print('Unique patients before preprocessing', df_init.rlnI_updated.nunique())

# drop observations (config.PAT_MIN_LENGTH is the minimum number of visits)
%time df = utils_dt_prep.drop_observations(df_init, config.PAT_MIN_LENGTH)

# add features, includes visit summary (for diag, age, cnty)
%time df = utils_dt_prep.add_features(df, config.DIAG_PER_VISIT, config.DIAG_LENGTH)

# print stats
print('Unique patients after preprocessing', df.rlnI_updated.nunique())
print('Number of encounters after preprocessing (shape of data) ', df.shape) 

In [None]:
# add patient history summary (diag, age, cnty)
importlib.reload(utils_dt_prep)

# add patient history summary (for diag, age, cnty); returns dictionary
# (indexed below by feature key, e.g. hist_dict['diag'])
%time hist_dict = utils_dt_prep.add_history(df)
# keep track of rlnIs: patient ids taken from the 'diag' history table
rlnIs = np.array(hist_dict['diag'].rlnI_updated)

# print shape of data
print('Unique patients after history preprocessing ', hist_dict['diag'].rlnI_updated.nunique())

In [None]:
# save to df, numpy or csv
# NOTE(review): all three writes target ./data, which must already exist.
df.to_csv("./data/df.csv")
# hist_dict is a dictionary, so np.save stores it as a pickled object;
# reading it back requires np.load(..., allow_pickle=True).
np.save("./data/hist_dict.npy", hist_dict)
# patient ids from the 'diag' history table (same source as `rlnIs`)
np.save("./data/rlnIs.npy", hist_dict['diag'].rlnI_updated)

Print the diagnosis (diag), age, and cnty history for the first patient in my data, to make sure the data preprocessing step worked as planned. 

Note that the beginning of a patient medical history is marked with ('[CLS]') and hospital visits are separated by ('[SEP]'). For each patient visit, I only include the top `config.DIAG_PER_VISIT` diagnosis codes (currently 3).

In [None]:
# Show the first patient's history string for each feature.
for key in config.KEYS_diag_age_cnty:
    hist_col = 'pat_' + key + '_hist'
    print(key, ': ', hist_dict[key][hist_col][0])

``define features and outcome for MLM``

Features represent the diagnosis (diag), age, and cnty history of each patient. 

Outcome is a masked diagnosis code (this is what I am trying to predict with the BERT MLM model; I set everything to -1 at first)

In [None]:
# Build (or reload) the MLM features X and the placeholder outcome y.
#   X: dict mapping each feature key to the values of that feature's
#      'pat_<key>_hist' column.
#   y: integer array shaped like X['diag'], filled with -1.
if config.create_Xy:
    # one entry per feature key
    X = {
        key: hist_dict[key]['pat_' + key + '_hist'].values
        for key in config.KEYS_diag_age_cnty
    }

    # create outcome for each patient; everything starts at -1
    y = -1 * np.ones(X['diag'].shape, dtype=int)

    print('Shape of X_diag ', X['diag'].shape)
    print('Shape of y ', y.shape)

    # save
    np.save('./data/X.npy', X)
    np.save('./data/y.npy', y)

else:
    # load data
    # allow_pickle must be the boolean True: the previous string "TRUE" only
    # worked because any non-empty string is truthy ("FALSE" would also have
    # enabled pickling).
    # NOTE: unpickling executes arbitrary code; only load files this notebook
    # wrote itself.
    X = np.load("./data/X.npy", allow_pickle=True).item()
    y = np.load("./data/y.npy", allow_pickle=True)

<span style="color:chocolate">Get vectorize layers</span>

This step extracts the unique diagnosis, age, and cnty codes in X

In [None]:
# Build the per-feature vectorize layers from X, or reload saved ones.
if config.create_vect_layer:
    importlib.reload(utils_dt_prep)

    start_time = time.time()
    vect_layer = {}
    for key in config.KEYS_diag_age_cnty:
        print('Key: ', key)
        # only the diagnosis layer is given the extra "[MASK]" special token
        extra_kwargs = {'special_tokens': ["[MASK]"]} if key == 'diag' else {}
        vect_layer[key] = utils_dt_prep.get_vectorize_layer(
            X[key], config.MAX_LEN, **extra_kwargs
        )

    # report how long the construction took
    elapsed_minutes = (time.time() - start_time) / 60
    print('Execution time:', np.round(elapsed_minutes, 2), 'minutes')

    # persist every layer
    for key in config.KEYS_diag_age_cnty:
        utils_dt_prep.save_vectorize_layer(vect_layer, key)

else:
    # each layer was saved wrapped in a model; take that model's first layer
    vect_layer = {}
    for key in config.KEYS_diag_age_cnty:
        saved_model = tf.keras.models.load_model('./vectorizers/vect_layer_' + key)
        vect_layer[key] = saved_model.layers[0]

In [None]:
# Look up the integer id that the diagnosis vectorizer assigns to "[MASK]".
mask_token_id = vect_layer['diag'](["[MASK]"]).numpy()[0, 0]
print('ID of masked token', mask_token_id)

My vocabulary contains the **padding** token ('') and OOV token ('[UNK]') as well as the passed tokens ('[CLS]', '[SEP]', and '[MASK]' if key=='diag').

Below I inspect the first 10 tokens in my vocabulary.

In [None]:
# First ten vocabulary entries of every feature's vectorize layer.
for key in config.KEYS_diag_age_cnty:
    vocabulary_head = vect_layer[key].get_vocabulary()[:10]
    print(key, ': ', vocabulary_head)

<span style="color:chocolate">Encode</span>

The idea here is to map each unique diagnosis, age, and cnty token in my vocabulary to a unique integer. The TextVectorization class provides an Encoder, which I will use to create a mapping between tokens and corresponding integers.

Max sequence length is set to config.MAX_LEN. If a diagnosis, age, or cnty history is shorter than MAX_LEN, it is padded with 0s.

In [None]:
# Re-import the helper module so edits to utils_dt_prep take effect.
importlib.reload(utils_dt_prep)

# create empty dict; will hold one encoded array per feature key
X_tokenized = {}

# encode each feature's histories with its own vectorize layer (timed)
for key in config.KEYS_diag_age_cnty:
    %time X_tokenized[key] = utils_dt_prep.encode(vect_layer[key], X[key])

# print shape
print('Shape of X_diag_tokenized ', X_tokenized['diag'].shape)
print('Shape of X_age_tokenized ', X_tokenized['age'].shape)

Inspect non-encoded and encoded histories for the first patient in my data

In [None]:
# Compare the raw and the encoded history of the first patient, per feature.
for key in config.KEYS_diag_age_cnty:
    raw_history = X[key][0]
    encoded_history = X_tokenized[key][0]
    print(key, 'nenc: ', raw_history)
    print(key, 'enc: ', encoded_history)
    print('-----------------------------------')
    

<span style="color:chocolate">Create id2token and token2id mappings</span>

In [None]:
# Two lookup tables per feature, both derived from the layer vocabulary:
#   id2token[key][i]     -> token at position i of the vocabulary
#   token2id[key][token] -> position of that token in the vocabulary
id2token = {}
token2id = {}

for key in config.KEYS_diag_age_cnty:
    vocabulary = vect_layer[key].get_vocabulary()
    id2token[key] = {idx: token for idx, token in enumerate(vocabulary)}
    token2id[key] = {token: idx for idx, token in enumerate(vocabulary)}

<span style="color:chocolate">Get masked input and labels. Transform to Batched Tensors</span>

In [None]:
# Pick up any edits made to the helper module.
importlib.reload(utils_dt_prep)

# integer id of the '[CLS]' token in the diagnosis vocabulary
loc_CLS = token2id['diag']['[CLS]']

# Build the MLM data: masked inputs, labels, and per-position sample weights.
masked_outputs = utils_dt_prep.get_masked_input_and_labels(
    X_tokenized['diag'], mask_token_id, loc_CLS
)
X_diag_masked, y_masked, sample_weights = masked_outputs

# From here on the diagnosis input is the masked version.
X_tokenized['diag'] = X_diag_masked

In [None]:
# inspect the shape of the sample-weight array returned by the masking step
sample_weights.shape

In [None]:
# inspect the labels of the second patient (index 1)
y_masked[1]

In [None]:
# inspect the masked diagnosis input of the second patient (index 1)
X_diag_masked[1]

In [None]:
# inspect the sample weights of the second patient (index 1)
sample_weights[1]

---
<span style="color:orange">!! Decide what embeddings you will use</span>
---

In [None]:
# Choose which feature streams feed the model. The options defined on Config
# are KEYS_diag, KEYS_diag_age, KEYS_diag_cnty and KEYS_diag_age_cnty.
keys = config.KEYS_diag_cnty

# Restrict the encoded inputs to the chosen features.
X_tokenized_subset = {key: X_tokenized[key] for key in keys}

<span style="color:chocolate">Split data into training and test</span>

In [None]:
# Split patients into training and test sets and slice every array accordingly.
#
# Patient ids are read from disk in BOTH branches. Previously only the "draw"
# branch loaded them, so the "load" branch raised NameError on `rlnIs` when the
# preprocessing cells had not been run in the current session.
# NOTE(review): the ids are assumed to be in the same row order as the arrays
# in X_tokenized_subset (both come from hist_dict) — confirm when X.npy and
# rlnIs.npy were written by different runs.
# NOTE: allow_pickle=True unpickles data; only load files this notebook wrote.
rlnIs = np.load("./data/rlnIs.npy", allow_pickle=True)
df_rlnIs = pd.DataFrame({'rlnI_updated': rlnIs})

if config.draw_rlnIs:
    # draw a seeded random sample of ids for training; the rest become test
    np.random.seed(config.seed)
    n_train = int(rlnIs.shape[0] * config.train_pct)
    train_rlnI = np.random.choice(rlnIs, n_train, replace=False)
    test_rlnI = list(set(rlnIs) - set(train_rlnI))

    # save train_rlnIs and test_rlnIs so later runs reuse the same split
    np.save("./data/train_rlnI.npy", train_rlnI)
    np.save("./data/test_rlnI.npy", test_rlnI)

else:
    # reuse the split drawn by an earlier run
    train_rlnI = np.load("./data/train_rlnI.npy", allow_pickle=True)
    test_rlnI = np.load("./data/test_rlnI.npy", allow_pickle=True)

# row positions of the training / test patients
train_idx = list(df_rlnIs[df_rlnIs.rlnI_updated.isin(train_rlnI)].index)
test_idx = list(df_rlnIs[df_rlnIs.rlnI_updated.isin(test_rlnI)].index)

# grab train data
X_tokenized_subset_train = {
    key: tokens[train_idx, :] for key, tokens in X_tokenized_subset.items()
}
y_masked_train = y_masked[train_idx, :]
sample_weights_train = sample_weights[train_idx, :]

# grab test data
X_tokenized_subset_test = {
    key: tokens[test_idx, :] for key, tokens in X_tokenized_subset.items()
}
y_masked_test = y_masked[test_idx, :]
sample_weights_test = sample_weights[test_idx, :]

print('Shape of train data ', X_tokenized_subset_train['diag'].shape)
print('Shape of test data ', X_tokenized_subset_test['diag'].shape)

<span style="color:chocolate">Transform to tensor</span>

In [None]:
# Wrap (inputs, labels, sample weights) into shuffled, batched tf datasets.

# training data
train_slices = (X_tokenized_subset_train, y_masked_train, sample_weights_train)
mlm_tensor_train = (
    tf.data.Dataset.from_tensor_slices(train_slices)
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

# test data
test_slices = (X_tokenized_subset_test, y_masked_test, sample_weights_test)
mlm_tensor_test = (
    tf.data.Dataset.from_tensor_slices(test_slices)
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

----
### Step 5: Pre-training BERT for MLM

I will create a BERT-like pretraining model architecture
using the `MultiHeadAttention` layer.

It will take token ids as inputs (including masked tokens)
and it will predict the correct ids for the masked input tokens.

<span style="color:chocolate">Create a medical history example to monitor MLM predictions</span>

In [None]:
# Pick up any edits made to the MLM helper module.
importlib.reload(utils_MLM)

# Example input: the third training row (index 2) of every selected feature,
# kept two-dimensional by slicing with 2:3.
sample_tokens = {
    key: tokens[2:3, :] for key, tokens in X_tokenized_subset_train.items()
}
display('Sample tokens ', sample_tokens)

# Callback object that monitors MLM predictions on that example.
generator_callback = utils_MLM.MaskedTextGenerator(
    vect_layer, sample_tokens, id2token, token2id, mask_token_id
)

``Build model``

In [None]:
# Build the masked-language model for the selected feature keys and print its
# layer summary. Skipped when config.create_model is False.
if config.create_model:
    bert_masked_model = utils_MLM.create_masked_language_bert_model(vect_layer, sample_weights_train, keys)
    bert_masked_model.summary()

``Fit and save model``

In [None]:
# Train the MLM model and save it as "bert_mlm<model_name>.h5".
if config.run_model:
    # Resolve the file-name suffix BEFORE training. The earlier chain of
    # independent `if`s left model_name undefined for an unsupported `keys`
    # selection, which only surfaced as a NameError after all epochs had run.
    if len(keys) == 1:
        model_name = '_base'
    elif len(keys) == 2 and keys[1] in ('age', 'cnty'):
        model_name = '_' + keys[1]
    elif len(keys) == 3:
        model_name = '_age_cnty'
    else:
        raise ValueError('No model name defined for keys: %r' % (keys,))

    # fit model
    history = bert_masked_model.fit(
        mlm_tensor_train,
        epochs=15,
        verbose=1,
        #callbacks=[generator_callback]
    )

    # save model (weights and architecture only, no optimizer state)
    bert_masked_model.save("bert_mlm" + model_name + ".h5", include_optimizer=False)

Plot loss

In [None]:
# Plot the training loss per epoch from the Keras History object.
if config.plot_model:
    hist = history.history
    x_arr = np.arange(1, len(hist['loss']) + 1)  # epochs numbered from 1

    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(x_arr, hist['loss'], '-o', label='Train loss')
    ax.set_xlabel('Epoch', size=15)
    ax.set_ylabel('Loss', size=15)
    ax.legend(fontsize=15)

    #ax.set_ylim(0, 0.00001)

----
### Step 6: Evaluation

If you start running the notebook from here, load the pretrained model first (set `config.load_model = True`).

In [None]:
# Set the model-name suffix for the selected `keys`. This is done whether or
# not the model is loaded, because the evaluation cells use `model_name` to
# name their CSV export.
if len(keys) == 1:
    model_name = '_base'
elif len(keys) == 2 and keys[1] in ('age', 'cnty'):
    model_name = '_' + keys[1]
elif len(keys) == 3:
    model_name = '_age_cnty'
else:
    raise ValueError('No model name defined for keys: %r' % (keys,))

# Load pretrained bert model
if config.load_model:
    # Load the file that matches `keys` (the same name the fit cell saves
    # under) instead of a hard-coded "bert_mlm_cnty.h5", so the loaded
    # weights always correspond to the selected feature set.
    bert_masked_model = keras.models.load_model(
        "bert_mlm" + model_name + ".h5",
        custom_objects={"MaskedLanguageModel": utils_MLM.MaskedLanguageModel})

<span style="color:chocolate">Create top K diagnosis predictions for an example patient in X_tokenized_subset</span>

In [None]:
# Predict for one example patient: the third row (position 2) of the test set.
#
# Slice the index list first and then index the arrays once. The earlier form
# `arr[test_idx, :][2:3, :]` copied the entire test subset of every array just
# to keep a single row; the selected row is the same.
sample_rows = test_idx[2:3]

# define example (X)
sample_tokens = {
    key: tokens[sample_rows, :] for key, tokens in X_tokenized_subset.items()
}
#display('Sample tokens ', sample_tokens)

# retrieve ground truth
y_sample = y_masked[sample_rows, :]

# predict
mlm_sample_prediction = bert_masked_model.predict(sample_tokens)

# print shape
print('Shape of mlm_tensor_predictions ', mlm_sample_prediction.shape)

The first dimension represents the number of examples (unique patients), the second dimension the length of the medical history for each patient (MAX_LEN), and the third dimension represents the diagnosis vocabulary size (probabilities for each word in the vocab).

In [None]:
# print predictions
#importlib.reload(utils_MLM_eval)
#utils_MLM_eval.one_patient_K_predictions(
#    sample_tokens, y_sample[train_idx, :],
#    mlm_sample_prediction, id2token['diag'],
#    mask_token_id
#)


<span style="color:chocolate">Evaluation (if small data)</span>

In [None]:
# Evaluate on the full train and test sets in one pass each (small data only).
importlib.reload(utils_MLM_eval)
if not config.chunk_splits:
    ## Create top K diagnosis predictions for all patients in X_tokenized_subset. Report Accuracy ##
    ################################################################################################
    # per-split containers, keyed by 'train' / 'test'
    mlm_sample_prediction = {}
    sample_sample_weights = {}
    y_sample = {}
    # Inputs are kept per split as well. Previously a single `sample_tokens`
    # dict was overwritten in the loop, so the metric loops below scored the
    # 'train' labels/predictions against the 'test' inputs.
    sample_tokens_by_split = {}

    # row positions of each split
    dt_idx = {'train': train_idx, 'test': test_idx}

    for dt in ['train', 'test']:
        # define example (X)
        sample_tokens_by_split[dt] = {
            key: tokens[dt_idx[dt], :] for key, tokens in X_tokenized_subset.items()
        }

        # retrieve ground truth
        y_sample[dt] = y_masked[dt_idx[dt], :]

        # pull masked token indexes
        sample_sample_weights[dt] = sample_weights[dt_idx[dt], :]

        # predict
        mlm_sample_prediction[dt] = bert_masked_model.predict(sample_tokens_by_split[dt])

        # print shape
        print('Shape of mlm_tensor_predictions ', mlm_sample_prediction[dt].shape)

    # top-K accuracy per split
    for dt in ['train', 'test']:
        accuracy = utils_MLM_eval.all_patients_K_predictions_and_accuracy(
            sample_tokens_by_split[dt], y_sample[dt],
            mlm_sample_prediction[dt], id2token['diag'],
            mask_token_id
        )

        print(dt + ' accuracy: %.3f'% (accuracy))

    ## Create diagnosis predictions for all patients in X_tokenized_subset. Report APS and AUC ##
    ##############################################################################################
    metrics = {}
    for dt in ['train', 'test']:
        print(dt)
        print('------')
        metrics[dt] = utils_MLM_eval.all_patients_predictions_and_apc(
            vect_layer,
            sample_tokens_by_split[dt], y_sample[dt],
            mlm_sample_prediction[dt], id2token['diag'],
            sample_sample_weights[dt])

        # add sample name
        metrics[dt]['sample'] = dt

    # export metrics to csv
    # (model_name must already have been set by the fit or load cell)
    metrics = pd.concat([metrics['train'], metrics['test']], axis=0)
    metrics.to_csv('../embeddings/results/ApsAuc_' + model_name + '.csv')

# print number of examples in train and test (runs regardless of the flag)
print('Train examples:', len(train_idx))
print('Test examples:', len(test_idx))

<span style="color:chocolate">Evaluation (if large data, chunk data)</span>

Needed if the data is too large: I split the training and test data into `config.chunk_split_size` chunks and take the average value of ACC, APS, and AUC across the chunks.

In [None]:
# Chunked evaluation: score the train and test sets chunk by chunk, then
# report and export the unweighted mean of each metric over the chunks.
if config.chunk_splits:
    # define global dictionaries: one list of per-chunk values per split
    acc_master={'train':[], 'test':[]}
    aps_master={'train':[], 'test':[]}
    auc_master={'train':[], 'test':[]}

    # define empty dictionaries (each entry is overwritten on every chunk,
    # so after the loop they hold only the last chunk of each split)
    mlm_sample_prediction = {}
    sample_sample_weights = {}
    y_sample = {}

    # add indices to dictionary and split them into config.chunk_split_size chunks
    dt_idx = {}
    dt_idx['train'] =  np.array_split(train_idx, config.chunk_split_size)
    dt_idx['test'] =  np.array_split(test_idx, config.chunk_split_size)

    for dt in ['train', 'test']:
        print('-----')
        print(dt)
        print('-----')
        for idx, part in enumerate(dt_idx[dt]):
            # progress message on every 20th chunk
            if idx%20==0:
                print('Partitions executed: ', idx, '/', len(dt_idx[dt]))
            # define example (X): rows of this chunk for every selected feature
            sample_tokens = {}
            for key in X_tokenized_subset.keys():
                sample_tokens[key] = X_tokenized_subset[key][part, :]
            #display('Sample tokens ', sample_tokens)

            # retrieve ground truth
            y_sample[dt] = y_masked[part, :]

            # pull masked token indexes
            sample_sample_weights[dt] = sample_weights[part, :]

            # predict
            mlm_sample_prediction[dt] = bert_masked_model.predict(sample_tokens)

            # print shape
            #print('Shape of mlm_tensor_predictions ', mlm_sample_prediction[dt].shape)

            # compute accuracy
            accuracy = utils_MLM_eval.all_patients_K_predictions_and_accuracy(
            sample_tokens, y_sample[dt],
            mlm_sample_prediction[dt], id2token['diag'],
            mask_token_id
            )

            #print(dt + ' accuracy: %.3f'% (accuracy))
            # append to global list
            acc_master[dt].append(accuracy)

            # compute APS and AUC
            temp_metrics = {}
            temp_metrics[dt] = utils_MLM_eval.all_patients_predictions_and_apc(
                vect_layer,
                sample_tokens, y_sample[dt],
                mlm_sample_prediction[dt], id2token['diag'],
                sample_sample_weights[dt],
                #silence_print=True
            )

            # append to global list
            # NOTE(review): `.value` on the filtered frame is a pandas object,
            # not a scalar; presumably one row per metric — confirm against
            # all_patients_predictions_and_apc.
            aps_master[dt].append(temp_metrics[dt][temp_metrics[dt].metric.eq('APS')].value)
            auc_master[dt].append(temp_metrics[dt][temp_metrics[dt].metric.eq('AUC')].value)


    # print average metrics
    print('----')
    for dt in ['train', 'test']:
        print(dt + ' ACC: %.3f'% (np.mean(acc_master[dt])))
    print('----')

    for dt in ['train', 'test']:
        print(dt + ' APS: %.3f'% (np.mean(aps_master[dt])))
    print('----')

    for dt in ['train', 'test']:
        print(dt + ' AUC: %.3f'% (np.mean(auc_master[dt])))
    print('----')


    # export average metrics to csv
    metrics = pd.DataFrame()
    for dt in ['train', 'test']:
        temp= pd.DataFrame({
            'metric':['ACC', 'APS', 'AUC'],
            'value':[np.mean(acc_master[dt]), np.mean(aps_master[dt]),np.mean(auc_master[dt])],
            'sample':[dt, dt, dt]
            })
        metrics = pd.concat([metrics, temp], axis=0)

    metrics.reset_index(drop=True, inplace=True)
    # model_name must already have been set by the fit or load cell
    metrics.to_csv('../embeddings/results/ApsAuc_' + model_name + '.csv')

# print number of examples in train and test
print('Train examples:', len(train_idx))
print('Test examples:', len(test_idx))