## Ped-BERT embeddings

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@wisc.edu <br>
Date created: June 8, 2022 <br>

Citations for MLM: https://keras.io/examples/nlp/masked_language_modeling/

Citations for ICD10 to ICD9: https://github.com/AtlasCUMC/ICD10-ICD9-codes-conversion

### Step 1: Import packages

In [None]:
# standard
import pandas as pd
import numpy as np
import os
import glob
import re
from pprint import pprint
import importlib
import time

# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from dataclasses import dataclass
tf.keras.backend.set_floatx('float64')

#!pip install icd9cms
from icd9cms.icd9 import search

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# user defined
import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')
import utils_dt_prep
import utils_MLM
import utils_MLM_eval
import utils_embeddings
import utils_TDecoder
from predictions import utils_dt_prep_pred_all

from sklearn.metrics import average_precision_score as APS
from sklearn.metrics import roc_auc_score as ROC_AUC
from sklearn.metrics import roc_curve, auc

# matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

# suppress TensorFlow log warnings
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

---
### Step 2: Set-up config

In [None]:
@dataclass
class Config:
    """Notebook-wide settings: data-prep constants, feature-key sets, and run switches.

    None of the attributes below carry type annotations, so ``@dataclass`` does not
    turn them into fields; they are plain class attributes shared by every instance.
    """
    # --- data preparation ---
    MAX_LEN = 40 # length of diagnosis history (not referenced in this notebook's visible cells)
    BATCH_SIZE = 32 # batch size applied to the tf.data train/test datasets
    PAT_MIN_LENGTH = 3 #minimum number of visits
    DIAG_PER_VISIT = 3 #diagnosis per visit to consider
    DIAG_LENGTH = 2 # how many digits from diagnosis code to consider
    train_pct = 0.8 # share of patient ids drawn into the training split
    seed = 1789 # seed passed to np.random.seed and tf.random.set_seed
    # --- feature-key combinations; one of these is selected as `keys` further down ---
    KEYS_diag = ['diag']
    KEYS_diag_age = ['diag', 'age']
    KEYS_diag_cnty = ['diag', 'cnty']
    KEYS_diag_age_cnty = ['diag', 'age', 'cnty']
    ############################################
    # --- run switches ---
    create_Xy = False # not referenced in this notebook's visible cells
    create_vect_layer = False # not referenced in this notebook's visible cells
    draw_rlnIs = False # True: draw and save a new train/test patient split; False: load the saved one
    create_model = True # build the TDecoder model
    run_model = True # fit the model and save it to disk
    plot_model = True # plot the training loss
    load_model = False # load a previously saved model before evaluation
    chunk_splits = True # evaluate in chunks instead of predicting on all examples at once
    chunk_split_size = 200 # number of chunks passed to np.array_split
    
config = Config()

---
### Step 3: Read data

``read medical records for all patients with SSN and birth records``

In [None]:
# load the records of patients that have both an SSN and a birth record
df_init_bpe = utils_dt_prep.read_data_bpe()

# summary: distinct patient ids and frame dimensions
print('Unique patients ', df_init_bpe['rlnI_updated'].nunique())
print('Number of encounters (shape of data) ', df_init_bpe.shape)

# preview the first two rows
df_init_bpe.head(n=2)

``read medical records for all patients with SSN (includes those with birth records)``

In [None]:
# re-import the helper module so recent edits to it are picked up
importlib.reload(utils_dt_prep)

# load the records of all patients with an SSN (birth-record patients included)
df_init_pe = utils_dt_prep.read_data_pe()

# summary: distinct patient ids and frame dimensions
print('Unique patients ', df_init_pe['rlnI_updated'].nunique())
print('Number of encounters (shape of data) ', df_init_pe.shape)

# preview the first two rows
df_init_pe.head(n=2)

``keep only non-birth records data``

In [None]:
# remove from df_init_pe every patient id that also appears in df_init_bpe
rlnI_in_bpe = df_init_bpe['rlnI_updated'].unique()
has_birth_record = df_init_pe['rlnI_updated'].isin(rlnI_in_bpe)
df_init_pe = df_init_pe[~has_birth_record]
del has_birth_record

# (disabled) alternative: stack both sources instead of keeping only non-birth records
#df_init = pd.concat([df_init_bpe, df_init_pe], axis=0)

# reorder the rows with a seeded random permutation, then renumber the index
np.random.seed(config.seed)
shuffle = np.random.permutation(np.arange(len(df_init_pe)))
df_init = df_init_pe.iloc[shuffle].reset_index(drop=True)

# the two source frames are no longer needed
del df_init_bpe
del df_init_pe

# summary: distinct patient ids and frame dimensions
print('Unique patients ', df_init['rlnI_updated'].nunique())
print('Number of encounters (shape of data) ', df_init.shape)

df_init.head(n=2)

---

How many unique patients remain if I keep only patients with at least 3 hospital/ER visits?

In [None]:
# Count patients that have at least config.PAT_MIN_LENGTH hospital/ER visits.
# This cell only reports a number: df_init is read, never modified, so no copy of
# the full frame is needed and no filtered frame is materialised.

# number of recorded admission dates per patient
visits_per_patient = df_init.groupby('rlnI_updated', as_index=False).admtdate.count()

# keep only patients meeting the minimum-visit threshold
eligible = visits_per_patient[visits_per_patient.admtdate.ge(config.PAT_MIN_LENGTH)]
print('Unique patients ', eligible.rlnI_updated.nunique())

# free the intermediates
del visits_per_patient, eligible

---
### Step 4: Preprocess data

``drop observations and add features``

In [None]:
importlib.reload(utils_dt_prep)
print('Unique patients before preprocessing', df_init.rlnI_updated.nunique())

# drop obervations
%time df = utils_dt_prep.drop_observations(df_init, config.PAT_MIN_LENGTH)

# add features, includes visit summary (for diag, age, cnty)
%time df = utils_dt_prep.add_features(df, config.DIAG_PER_VISIT, config.DIAG_LENGTH)

# print stats
print('Unique patients after preprocessing', df.rlnI_updated.nunique())
print('Number of encounters after preprocessing (shape of data) ', df.shape) 

``create train and test datasets``

In [None]:
# Split patients (not encounters) into train and test sets.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only load
# .npy files that were produced by this project.

# the full list of patient ids is needed by both branches
rlnIs = np.load("./data/rlnIs.npy", allow_pickle=True)
df_rlnIs = pd.DataFrame({'rlnI_updated': rlnIs})

if config.draw_rlnIs:
    # draw a fresh, seeded split: train_pct of the ids go to train, the rest to test
    np.random.seed(config.seed)
    n_train = int(rlnIs.shape[0] * config.train_pct)
    train_rlnI = np.random.choice(rlnIs, n_train, replace=False)
    test_rlnI = list(set(rlnIs) - set(train_rlnI))

    # persist the split so later runs can reuse exactly the same patients
    np.save("./data/train_rlnI.npy", train_rlnI)
    np.save("./data/test_rlnI.npy", test_rlnI)

else:
    # reuse the split saved by an earlier run
    train_rlnI = np.load("./data/train_rlnI.npy", allow_pickle=True)
    test_rlnI = np.load("./data/test_rlnI.npy", allow_pickle=True)

# pull train and test rows from df; reset_index returns a new frame, which avoids
# mutating a filtered slice of df in place
df_train = df[df.rlnI_updated.isin(train_rlnI)].reset_index(drop=True)
df_test = df[df.rlnI_updated.isin(test_rlnI)].reset_index(drop=True)

print('Shape of df_train ', df_train.shape)
print('Shape of df_test', df_test.shape)

print('Unique patients in df_train ', df_train.rlnI_updated.nunique())
print('Unique patients in df_test ', df_test.rlnI_updated.nunique())

<span style="color:chocolate">create input-output pairs</span>

In [None]:
# re-import the helper module so recent edits to it are picked up
importlib.reload(utils_dt_prep_pred_all)

# build the (input, output) frames for the training patients ...
df_train_in, df_train_out = utils_dt_prep_pred_all.input_output_pairs(
    df_train,
    config.PAT_MIN_LENGTH,
)

# ... and for the test patients
df_test_in, df_test_out = utils_dt_prep_pred_all.input_output_pairs(
    df_test,
    config.PAT_MIN_LENGTH,
)

In [None]:
importlib.reload(utils_dt_prep)

print('df_train_in')
print('-----------')
%time hist_dict_train_in = utils_dt_prep.add_history(df_train_in)

print('\ndf_test_in')
print('-----------')
%time hist_dict_test_in = utils_dt_prep.add_history(df_test_in)

print example patient in data

In [None]:
# input rows of one example patient
df_train_in.loc[df_train_in['rlnI_updated'] == '00002OYZO']

In [None]:
# output rows of the same example patient
df_train_out.loc[df_train_out['rlnI_updated'] == '00002OYZO']

---
### Step 5: Create vocab used in MLM

<span style="color:chocolate">import data used in MLM</span>

In [None]:
# Load the dictionary that was saved for MLM training; .item() unwraps the
# 0-d object array produced by np.save on a dict.
# allow_pickle is a boolean flag: pass True rather than a (merely truthy) string.
# NOTE(review): unpickling executes arbitrary code; only load files produced by this project.
X_mlm = np.load("./data/X.npy", allow_pickle=True).item()

print('X', X_mlm.keys())

<span style="color:chocolate">load vocab used in MLM</span>

In [None]:
# for every feature key, load the saved wrapper model and keep only its first layer
vect_layer = {}
for key in config.KEYS_diag_age_cnty:
    wrapper_model = tf.keras.models.load_model('./vectorizers/vect_layer_'+key)
    vect_layer[key] = wrapper_model.layers[0]
del wrapper_model

<span style="color:chocolate">Create id2token and token2id mappings</span>

In [None]:
# per feature key: position in the layer's vocabulary -> token
id2token = {
    key: dict(enumerate(vect_layer[key].get_vocabulary()))
    for key in config.KEYS_diag_age_cnty
}

# per feature key: the inverse lookup, token -> position
token2id = {
    key: {token: idx for idx, token in id2token[key].items()}
    for key in config.KEYS_diag_age_cnty
}

---
### Step 6: Encode data based on MLM vocab

<span style="color:chocolate">Create features and labels</span>

In [None]:
# features: one array of patient histories per feature key
X_train = {
    key: hist_dict_train_in[key]['pat_'+key+'_hist'].values
    for key in config.KEYS_diag_age_cnty
}
X_test = {
    key: hist_dict_test_in[key]['pat_'+key+'_hist'].values
    for key in config.KEYS_diag_age_cnty
}

# labels: the main diagnosis code of the next visit, cast to string
df_train_out['diag00_2d'] = df_train_out['diag00_2d'].astype(str)
df_test_out['diag00_2d'] = df_test_out['diag00_2d'].astype(str)
y_train = df_train_out['diag00_2d']
y_test = df_test_out['diag00_2d']

print('Shape of X_train_diag ', X_train['diag'].shape)
print('Shape of X_train_age ', X_train['age'].shape)
print('Shape of y_train ', y_train.shape)

<span style="color:chocolate">Encode features</span>

In [None]:
# encode every feature with the vectorization layer that belongs to its key
X_train_tokenized = {
    key: utils_dt_prep.encode(vect_layer[key], X_train[key])
    for key in config.KEYS_diag_age_cnty
}
X_test_tokenized = {
    key: utils_dt_prep.encode(vect_layer[key], X_test[key])
    for key in config.KEYS_diag_age_cnty
}

# print shape
print('Shape of X_train_tokenized ', X_train_tokenized['diag'].shape)
print('Shape of X_test_tokenized ', X_test_tokenized['age'].shape)

<span style="color:chocolate">Encode outcomes</span>

In [None]:
def _one_hot_outcomes(y_raw, columns):
    """One-hot encode outcome codes against a fixed set of token-id columns.

    Parameters
    ----------
    y_raw : outcome codes accepted by ``utils_dt_prep.encode``.
    columns : token ids that define the column set and order of the result.

    Returns
    -------
    (one_hot, cols) : the indicator matrix as a numpy array and its column labels.
    """
    # keep the first encoded token of every outcome
    token_ids = utils_dt_prep.encode(vect_layer['diag'], y_raw)[:, 0]
    # An explicit integer dtype keeps the matrix numeric on every pandas version:
    # with the bool default of pandas >= 2.0, the 0-filled columns added by reindex
    # are integer typed and to_numpy() would fall back to an object array.
    dummies = pd.get_dummies(token_ids, drop_first=False, dtype=np.uint8)
    dummies = dummies.reindex(columns=columns, fill_value=0)
    return dummies.to_numpy(), np.array(dummies.columns)


# union of train and test outcome codes defines the shared column set
y_union = np.union1d(y_train, y_test).tolist()
y_union_tokenized = utils_dt_prep.encode(vect_layer['diag'], y_union)[:,0]
y_union_tokenized = np.unique(y_union_tokenized, axis=0) #token 1 shows up two times

# train
y_train_tokenized, y_train_tokenized_cols = _one_hot_outcomes(y_train, y_union_tokenized)

# test
y_test_tokenized, y_test_tokenized_cols = _one_hot_outcomes(y_test, y_union_tokenized)

print('Shape y_train_tokenized ', y_train_tokenized.shape)
print('Shape y_test_tokenized ', y_test_tokenized.shape)

Inspect non-encoded and encoded histories and outcome in next visit for the first patient in my training data

In [None]:
# first training patient: raw history next to its encoded form, one block per key
for key in config.KEYS_diag_age_cnty:
    raw_history = X_train[key][0]
    encoded_history = X_train_tokenized[key][0]
    print(key, 'nenc: ', raw_history)
    print(key, 'enc: ', encoded_history)
    print('-----------------------------------')

In [None]:
# first 20 entries of the one-hot outcome of the first training patient
y_train_tokenized[0, :20]

In [None]:
# dimensions of the one-hot training outcomes
np.shape(y_train_tokenized)

---
<span style="color:orange">!! Decide what embeddings you will use</span>
---

``for train``

In [None]:
# feature keys used as model input
keys = config.KEYS_diag

# restrict the encoded training features to the selected keys
X_train_tokenized_subset = {key: X_train_tokenized[key] for key in keys}

``for test``

In [None]:
# feature keys used as model input
keys = config.KEYS_diag

# restrict the encoded test features to the selected keys
X_test_tokenized_subset = {key: X_test_tokenized[key] for key in keys}

In [None]:

# dimensions of the encoded diagnosis inputs for both splits
train_diag_tokens = X_train_tokenized_subset['diag']
test_diag_tokens = X_test_tokenized_subset['diag']
print('Shape of train data ', train_diag_tokens.shape)
print('Shape of test data ', test_diag_tokens.shape)
del train_diag_tokens, test_diag_tokens

In [None]:
# dimensions of the one-hot training outcomes
np.shape(y_train_tokenized)

In [None]:
# dimensions of the one-hot test outcomes
np.shape(y_test_tokenized)

<span style="color:chocolate">Convert train and test subsets to tensors</span>

In [None]:
tf.random.set_seed(config.seed)

# every training example gets weight 1 (this is how the MLM was trained)
sample_weights_train = np.ones(len(y_train_tokenized))

# (features, one-hot outcome, weight) triples, shuffled with a 1000-element buffer, then batched
train_slices = (X_train_tokenized_subset, y_train_tokenized, sample_weights_train)
train_tensor = tf.data.Dataset.from_tensor_slices(train_slices)
train_tensor = train_tensor.shuffle(1000)
train_tensor = train_tensor.batch(config.BATCH_SIZE)
del train_slices

In [None]:
tf.random.set_seed(config.seed)

# every test example gets weight 1 (this is how the MLM was trained)
sample_weights_test = np.ones(len(y_test_tokenized))

# (features, one-hot outcome, weight) triples, shuffled with a 1000-element buffer, then batched
test_slices = (X_test_tokenized_subset, y_test_tokenized, sample_weights_test)
test_tensor = tf.data.Dataset.from_tensor_slices(test_slices)
test_tensor = test_tensor.shuffle(1000)
test_tensor = test_tensor.batch(config.BATCH_SIZE)
del test_slices

----
### Step 7: Pre-training TDecoder

---

``Build model``

In [None]:
# re-import the helper module so recent edits to it are picked up
importlib.reload(utils_TDecoder)

if config.create_model:
    # the last argument is the number of outcome columns in the one-hot labels
    TDecoder_model = utils_TDecoder.create_TDecoder(
        vect_layer,
        sample_weights_train,
        keys,
        y_train_tokenized.shape[1],
    )
    TDecoder_model.summary()

``Fit and save model``

In [None]:
if config.run_model:
    # Resolve the file-name suffix from the selected feature keys BEFORE training.
    # The branches are mutually exclusive and an unsupported key combination fails
    # here, instead of after training with an unbound (or stale) model_name.
    if len(keys)==1:
        model_name='_base'
    elif len(keys)==2 and keys[1]=='age':
        model_name='_age'
    elif len(keys)==2 and keys[1]=='cnty':
        model_name='_cnty'
    elif len(keys)==3:
        model_name='_age_cnty'
    else:
        raise ValueError('Unsupported feature keys for model naming: ' + str(keys))

    # fit model
    history = TDecoder_model.fit(
        train_tensor,
        epochs=15,
        verbose=1,
    )

    # save model (weights and architecture only; the optimizer state is not stored)
    TDecoder_model.save("TDecoder" + model_name +".h5", include_optimizer=False)

Plot loss

In [None]:
if config.plot_model:
    # `history` only exists when the model was fitted in this session; when the
    # model was loaded from disk instead there is nothing to plot.
    if 'history' not in globals():
        print('No training history available (model was not fitted in this session); skipping loss plot.')
    else:
        hist = history.history
        # epochs are numbered from 1 on the x axis
        x_arr = np.arange(len(hist['loss'])) + 1

        fig = plt.figure(figsize=(10, 4))
        ax = fig.add_subplot(1, 2, 1)
        ax.plot(x_arr, hist['loss'], '-o', label='Train loss')
        ax.legend(fontsize=15)
        ax.set_title('TDecoder training loss', size=15)
        ax.set_xlabel('Epoch', size=15)
        ax.set_ylabel('Loss', size=15)

        #ax.set_ylim(0, 0.00001)

----
### Step 8: Evaluation

If you start running the notebook from here, load the saved model first (set `load_model = True` in the config).

In [None]:
# Load a previously trained TDecoder model
if config.load_model:
    # Resolve the file-name suffix from the selected feature keys first, so the file
    # that is loaded always matches the name used later for the exported metrics.
    if len(keys)==1:
        model_name='_base'
    elif len(keys)==2 and keys[1]=='age':
        model_name='_age'
    elif len(keys)==2 and keys[1]=='cnty':
        model_name='_cnty'
    elif len(keys)==3:
        model_name='_age_cnty'
    else:
        raise ValueError('Unsupported feature keys for model naming: ' + str(keys))

    # same naming scheme as the one used when the model is saved
    # ("TDecoder" + model_name + ".h5"); for keys == ['diag'] this is ./TDecoder_base.h5
    TDecoder_model = keras.models.load_model(
        "./TDecoder" + model_name + ".h5")

<span style="color:chocolate">Evaluation (on training and test data)</span>

In [None]:
#y_train_tokenized_pred = classifier_model.predict(X_train_tokenized_subset)
#y_train_tokenized_pred.shape

In [None]:
#y_test_tokenized_pred = classifier_model.predict(X_test_tokenized_subset)
#y_test_tokenized_pred.shape

<span style="color:chocolate">Evaluation (if large data, chunk data)</span>

Needed if the data is too large: I split the training and test data into XXX chunks and take the average value of APS and AUC across chunks.

In [None]:
# Chunked evaluation: predict on slices of the train and test data, compute the
# sample-averaged average-precision score (APS) and the micro-averaged ROC AUC per
# chunk, then report and export the mean over chunks.
if config.chunk_splits:
    # per-split lists of per-chunk metric values
    aps_master={'train':[], 'test':[]}
    auc_master={'train':[], 'test':[]}

    # ground truth and predictions of the most recent chunk, per split
    y_sample = {}
    y_sample_pred = {}

    # row indices of each split
    train_idx = np.arange(y_train_tokenized.shape[0])
    test_idx = np.arange(y_test_tokenized.shape[0])

    # features and labels per split; the test split must use the TEST features
    # (pairing train features with test labels produces meaningless test metrics)
    X_tokenized_subset = {
        'train': X_train_tokenized_subset,
        'test': X_test_tokenized_subset,
    }

    y = {
        'train': y_train_tokenized,
        'test': y_test_tokenized
    }

    # split the indices into config.chunk_split_size chunks
    # NOTE(review): the [0:2] slice evaluates only the first two chunks of each split;
    # this looks like a debugging shortcut — confirm before reporting final metrics.
    dt_idx = {}
    dt_idx['train'] =  np.array_split(train_idx, config.chunk_split_size)[0:2]
    dt_idx['test'] =  np.array_split(test_idx, config.chunk_split_size)[0:2]

    for dt in ['train', 'test']:
        print('-----')
        print(dt)
        print('-----')
        for idx, part in enumerate(dt_idx[dt]):
            if idx%20==0:
                print('Partitions executed: ', idx, '/', len(dt_idx[dt]))

            # features of this chunk, one array per feature key
            X_sample = {
                key: values[part, :]
                for key, values in X_tokenized_subset[dt].items()
            }

            # ground truth and model prediction for this chunk
            y_sample[dt] = y[dt][part, :]
            y_sample_pred[dt] = TDecoder_model.predict(X_sample)

            # average precision score, averaged over samples
            aps_samples = APS(
                y_sample[dt],
                y_sample_pred[dt],
                average='samples'
            )
            print('APS:', aps_samples)

            # micro-averaged ROC: flatten labels and scores, then integrate the curve
            fpr_micro, tpr_micro, _ = roc_curve(
                y_sample[dt].ravel(),
                y_sample_pred[dt].ravel()
            )
            area_micro = auc(fpr_micro, tpr_micro)
            print('AUC:', area_micro)

            # collect the per-chunk values
            aps_master[dt].append(aps_samples)
            auc_master[dt].append(area_micro)

    # print average metrics
    for dt in ['train', 'test']:
        print(dt + ' APS: %.3f'% (np.mean(aps_master[dt])))
    print('----')

    for dt in ['train', 'test']:
        print(dt + ' AUC: %.3f'% (np.mean(auc_master[dt])))
    print('----')

    # export average metrics to csv (one APS row and one AUC row per split)
    metrics = pd.concat(
        [
            pd.DataFrame({
                'metric':['APS', 'AUC'],
                'value':[np.mean(aps_master[dt]),np.mean(auc_master[dt])],
                'sample':[dt, dt]
            })
            for dt in ['train', 'test']
        ],
        axis=0,
        ignore_index=True,
    )
    metrics.to_csv('../embeddings/results/TDecoder_ApsAuc_' + model_name + '.csv')

# print number of examples in train and test; taken from the label arrays so this
# also works when config.chunk_splits is False
print('Train examples:', y_train_tokenized.shape[0])
print('Test examples:', y_test_tokenized.shape[0])