# Main script for Prediction Task

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@wisc.edu <br>
Date created: June 8, 2022 <br>

### Step 1: Import packages

In [None]:
# standard
import pandas as pd
import numpy as np
import os
import time
import importlib
import glob
import re
from pprint import pprint
import time


# plots
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from PIL import Image
%matplotlib inline

# sklearn and others
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score as APS
from sklearn.metrics import roc_auc_score as ROC_AUC
from sklearn.metrics import precision_recall_curve as PRC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import RocCurveDisplay
from itertools import cycle
import patsy
import statsmodels.api as sm

#tensorflow
import tensorflow as tf
from tensorflow import keras
from dataclasses import dataclass

# user defined
import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')
from embeddings import utils_dt_prep
from  embeddings import utils_MLM
from embeddings import utils_embeddings
import utils_dt_prep_pred_all
import utils_classifier
import utils_classifier_plus
import utils_eval_downstream

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

---
### Step 2: Set-up config

In [None]:
@dataclass
class Config:
    """Notebook-wide settings, stored as plain (un-annotated) class attributes."""

    # --- sequence / batching ---
    MAX_LEN = 40
    BATCH_SIZE = 32

    # --- cohort and diagnosis filters ---
    PAT_MIN_LENGTH = 3  # minimum number of visits
    DIAG_PER_VISIT = 3  # diagnoses per visit to consider
    DIAG_LENGTH = 2     # how many digits from the diagnosis code to consider

    # --- data split ---
    train_pct = 0.8
    val_pct = 0.1
    seed = [1235, 1789, 2134, 1455, 1112]  # candidate seeds; one is picked below

    # --- feature-key groups ---
    KEYS_diag = ['diag']
    KEYS_diag_age = ['diag', 'age']
    KEYS_diag_cnty = ['diag', 'cnty']
    KEYS_diag_age_cnty = ['diag', 'age', 'cnty']

    # --- evaluation ---
    top_diag = 10  # top diagnoses based on ROC AUC or APS

    # --- workflow switches ---
    draw_train_val_test = False
    create_Xy = False
    save_model = True
    load_model = False


config = Config()
# replace the list of candidate seeds by the single seed used for this run
config.seed = Config.seed[4]

---
### Step 3: Read data

In [None]:
# load the encounter-level medical records (patients with SSN and birth records)
df_init = utils_dt_prep.read_data_bpe()

# summary: distinct patients, table shape, and a two-row preview
print('Unique patients ', df_init['rlnI_updated'].nunique())
print('Number of encounters (shape of data) ', df_init.shape)
df_init.head(n=2)

### Step 4: Preprocess data

``drop observations and add features``

In [None]:
# re-import utils_dt_prep so that edits to the module are picked up without a kernel restart
importlib.reload(utils_dt_prep)
print('Unique patients before preprocessing', df_init.rlnI_updated.nunique())

# drop observations (config.PAT_MIN_LENGTH is the minimum number of visits)
%time df = utils_dt_prep.drop_observations(df_init, config.PAT_MIN_LENGTH)
# add features, includes visit summary (for diag, age, cnty)
%time df = utils_dt_prep.add_features(df, config.DIAG_PER_VISIT, config.DIAG_LENGTH)

# print stats
print('Unique patients after preprocessing', df.rlnI_updated.nunique())
print('Number of encounters after preprocessing (shape of data) ', df.shape) 

``create train, val, and test datasets``

In [None]:
if config.draw_train_val_test:
    # unique patient identifiers (rlnI) present in df
    rlnIs = df.rlnI_updated.unique()

    # draw the training pool, then sample the validation patients from that pool
    np.random.seed(config.seed)
    train_rlnI = np.random.choice(rlnIs, int(rlnIs.shape[0]*config.train_pct), replace=False)
    val_rlnI = np.random.choice(train_rlnI, int(train_rlnI.shape[0]*config.val_pct), replace=False)
    # every patient outside the training pool goes to the test set
    test_rlnI = list(set(rlnIs) - set(train_rlnI))
    # remove the validation patients from training so the three sets are disjoint;
    # otherwise val_loss (used for early stopping) is computed on patients seen in training
    train_rlnI = train_rlnI[~np.isin(train_rlnI, val_rlnI)]

    # save train_rlnIs, val_rlnIs, and test_rlnIs
    np.save("./data/train_rlnI.npy", train_rlnI)
    np.save("./data/val_rlnI.npy", val_rlnI)
    np.save("./data/test_rlnI.npy", test_rlnI)

else:
    # load previously drawn splits
    # NOTE: allow_pickle unpickles arbitrary objects; only load files from a trusted source
    train_rlnI = np.load("./data/train_rlnI.npy", allow_pickle=True)
    val_rlnI = np.load("./data/val_rlnI.npy", allow_pickle=True)
    test_rlnI = np.load("./data/test_rlnI.npy", allow_pickle=True)

    # cached files may come from an earlier draw in which validation patients were
    # still part of training; report the overlap rather than hide it
    n_shared = len(set(train_rlnI) & set(val_rlnI))
    if n_shared:
        print('WARNING:', n_shared, 'validation patients are also in the cached training split;',
              'set config.draw_train_val_test=True to re-draw disjoint splits')


# pull train, val and test encounters from df (each with a fresh 0..n-1 index)
df_train = df[df.rlnI_updated.isin(train_rlnI)].reset_index(drop=True)
df_val = df[df.rlnI_updated.isin(val_rlnI)].reset_index(drop=True)
df_test = df[df.rlnI_updated.isin(test_rlnI)].reset_index(drop=True)

print('Shape of df_train ', df_train.shape)
print('Shape of df_val ', df_val.shape)
print('Shape of df_test', df_test.shape)

print('Unique patients in df_train ', df_train.rlnI_updated.nunique())
print('Unique patients in df_val ', df_val.rlnI_updated.nunique())
print('Unique patients in df_test ', df_test.rlnI_updated.nunique())

<span style="color:chocolate">create input-output pairs</span>

In [None]:
importlib.reload(utils_dt_prep_pred_all)

# build the input / output frames of every split
df_train_in, df_train_out = utils_dt_prep_pred_all.input_output_pairs(df_train, config.PAT_MIN_LENGTH)
df_val_in, df_val_out = utils_dt_prep_pred_all.input_output_pairs(df_val, config.PAT_MIN_LENGTH)
df_test_in, df_test_out = utils_dt_prep_pred_all.input_output_pairs(df_test, config.PAT_MIN_LENGTH)

# columns of interest (used for Fairness tasks)
cols = [
    'rlnI_updated', 'age', 'bthyearI', 'cntyresI', 'cntyresI_name', 'pm25I', 'wfeI', 'sexI', 'raceI',
    'patcnty', 'raceM', 'meduc', 'precare', 'visitsM_9mpp', 'visitsM_1ypp', 'visitsI_1yol',
    'bthresmb_name', 'prevsts',
]

splits = {
    'train_in': df_train_in, 'val_in': df_val_in, 'test_in': df_test_in,
    'train_out': df_train_out, 'val_out': df_val_out, 'test_out': df_test_out,
}

# one row per patient, restricted to the columns of interest
df_dict = {
    key: frame[cols].drop_duplicates(subset=['rlnI_updated']).reset_index(drop=True)
    for key, frame in splits.items()
}

# print shapes
for key, frame in df_dict.items():
    print('Shape of df ' + key, frame.shape)

# report patient counts for the *_out frames, then drop the identifier everywhere
for key in df_dict:
    if 'out' in key:
        print('Unique patients in df ' + key.split('_')[0], df_dict[key].rlnI_updated.nunique())
    df_dict[key] = df_dict[key].drop(columns='rlnI_updated')

# save data dict
np.save('./data/df_dict.npy', df_dict)

<span style="color:chocolate">create patient history for df_in data</span>

In [None]:
importlib.reload(utils_dt_prep)

print('df_train_in')
print('-----------')
%time hist_dict_train_in = utils_dt_prep.add_history(df_train_in)

print('df_train_in')
print('-----------')
%time hist_dict_val_in = utils_dt_prep.add_history(df_val_in)

print('\ndf_test_in')
print('-----------')
%time hist_dict_test_in = utils_dt_prep.add_history(df_test_in)


Print an example patient in the data

In [None]:
# show the rows of df_train_in for one example patient, selected by rlnI_updated
# NOTE(review): the output contains patient-level records; clear it before sharing the notebook
df_train_in[df_train_in.rlnI_updated.eq('00003PWWP')]

In [None]:
# show the rows of df_train_out for the same example patient
# NOTE(review): the output contains patient-level records; clear it before sharing the notebook
df_train_out[df_train_out.rlnI_updated.eq('00003PWWP')]

---
### Step 5: Create vocab used in MLM

<span style="color:chocolate">import data used in MLM</span>

In [None]:
# load the pickled dictionary of features used when training the MLM;
# allow_pickle is a boolean flag (the former string "TRUE" only worked because it is truthy)
# NOTE: unpickling executes arbitrary code; only load files from a trusted source
X_mlm = np.load("../../../embeddings/data/X.npy", allow_pickle=True).item()

print('X', X_mlm.keys())

<span style="color:chocolate">load vocab used in MLM</span>

In [None]:
# one saved vectorizer model per feature key; only its first layer is kept
vect_layer = {}
for key in config.KEYS_diag_age_cnty:
    vectorizer_model = tf.keras.models.load_model('../../../embeddings/vectorizers/vect_layer_'+key)
    vect_layer[key] = vectorizer_model.layers[0]

In [None]:
# id that the 'diag' vectorizer assigns to the [MASK] token of the masked language model
mask_token_encoded = vect_layer['diag'](["[MASK]"]).numpy()
mask_token_id = mask_token_encoded[0][0]
print('ID of masked token', mask_token_id)

<span style="color:chocolate">Create id2token and token2id mappings</span>

In [None]:
# per feature key: vocabulary position -> token, and the inverse token -> position
id2token = {}
token2id = {}

for key in config.KEYS_diag_age_cnty:
    vocabulary = vect_layer[key].get_vocabulary()
    id2token[key] = {position: token for position, token in enumerate(vocabulary)}
    token2id[key] = {token: position for position, token in id2token[key].items()}

---
### Step 6: Encode downstream data based on MLM vocab

<span style="color:chocolate">Create features and labels</span>

In [None]:
if config.create_Xy:
    # create empty dict
    X_train = {}
    X_val = {}
    X_test = {}

    # add key and value for each feature (patient history per feature key)
    for key in config.KEYS_diag_age_cnty:
        X_train[key] = hist_dict_train_in[key]['pat_'+key+'_hist'].values
        X_val[key] = hist_dict_val_in[key]['pat_'+key+'_hist'].values
        X_test[key] = hist_dict_test_in[key]['pat_'+key+'_hist'].values

    # create outcome for each patient: main diagnosis code in next visit, as string
    df_train_out['diag00_2d'] = df_train_out['diag00_2d'].astype(str)
    df_val_out['diag00_2d'] = df_val_out['diag00_2d'].astype(str)
    df_test_out['diag00_2d'] = df_test_out['diag00_2d'].astype(str)
    # keep plain arrays so this branch yields the same type as the load branch below
    # (np.save stores the values only, so a reloaded y_* is an ndarray, not a Series)
    y_train = df_train_out.diag00_2d.to_numpy()
    y_val = df_val_out.diag00_2d.to_numpy()
    y_test = df_test_out.diag00_2d.to_numpy()

    # save
    np.save('./data/X_train.npy', X_train)
    np.save('./data/X_val.npy', X_val)
    np.save('./data/X_test.npy', X_test)
    np.save('./data/y_train.npy', y_train)
    np.save('./data/y_val.npy', y_val)
    np.save('./data/y_test.npy', y_test)

else:
    # load data
    # NOTE: allow_pickle unpickles arbitrary objects; only load files from a trusted source
    X_train = np.load("./data/X_train.npy", allow_pickle=True).item()
    X_val = np.load("./data/X_val.npy", allow_pickle=True).item()
    X_test = np.load("./data/X_test.npy", allow_pickle=True).item()
    y_train = np.load("./data/y_train.npy", allow_pickle=True)
    y_val = np.load("./data/y_val.npy", allow_pickle=True)
    y_test = np.load("./data/y_test.npy", allow_pickle=True)
    df_dict = np.load('./data/df_dict.npy', allow_pickle=True).item()

# report shapes whichever branch produced the data
print('Shape of X_train_diag ', X_train['diag'].shape)
print('Shape of X_train_age ', X_train['age'].shape)
print('Shape of y_train ', y_train.shape)

<span style="color:chocolate">Encode features</span>

In [None]:
# encode every feature of every split with the vectorizer that belongs to that feature
X_train_tokenized, X_val_tokenized, X_test_tokenized = {}, {}, {}

for key in config.KEYS_diag_age_cnty:
    vectorizer = vect_layer[key]
    X_train_tokenized[key] = utils_dt_prep.encode(vectorizer, X_train[key])
    X_val_tokenized[key] = utils_dt_prep.encode(vectorizer, X_val[key])
    X_test_tokenized[key] = utils_dt_prep.encode(vectorizer, X_test[key])

# print shape (note: the test line reports the 'age' feature, the other two 'diag')
print('Shape of X_train_tokenized ', X_train_tokenized['diag'].shape)
print('Shape of X_val_tokenized ', X_val_tokenized['diag'].shape)
print('Shape of X_test_tokenized ', X_test_tokenized['age'].shape)

<span style="color:chocolate">Encode outcomes</span>

In [None]:
# all distinct outcome codes across train, val and test, encoded with the 'diag' vocabulary
y_union = np.union1d(np.union1d(y_train, y_val).tolist(), y_test).tolist()
y_union_tokenized = utils_dt_prep.encode(vect_layer['diag'], y_union)[:,0]
y_union_tokenized = np.unique(y_union_tokenized, axis=0) #token 1 shows up two times


def _one_hot_outcomes(y_raw):
    """Encode outcomes and one-hot them over the y_union_tokenized columns.

    Returns the indicator matrix and the array of column labels (token ids).
    """
    first_tokens = utils_dt_prep.encode(vect_layer['diag'], y_raw)[:,0]
    indicators = pd.get_dummies(first_tokens, drop_first=False).reindex(columns = y_union_tokenized, fill_value=0)
    return indicators.to_numpy(), np.array(indicators.columns)


y_train_tokenized, y_train_tokenized_cols = _one_hot_outcomes(y_train)
y_val_tokenized, y_val_tokenized_cols = _one_hot_outcomes(y_val)
y_test_tokenized, y_test_tokenized_cols = _one_hot_outcomes(y_test)

print('Shape y_train_tokenized ', y_train_tokenized.shape)
print('Shape y_val_tokenized ', y_val_tokenized.shape)
print('Shape y_test_tokenized ', y_test_tokenized.shape)

Inspect non-encoded and encoded histories and outcome in next visit for the first patient in my training data

In [None]:
# patient history of the first training patient, per feature key:
# 'nenc' is the non-encoded history, 'enc' the encoded (tokenized) one
for key in config.KEYS_diag_age_cnty:
    print(key, 'nenc: ', X_train[key][0])
    print(key, 'enc: ', X_train_tokenized[key][0])
    print('-----------------------------------')

In [None]:
# one-hot outcome in the next visit for the first training patient (first 20 columns)
y_train_tokenized[0][:20]

In [None]:
# (number of training patients, number of outcome columns)
y_train_tokenized.shape

---
<span style="color:orange">!! Decide what embeddings you will use</span>
---

for train

In [None]:
# feature keys fed to the model
keys = config.KEYS_diag_age_cnty

# restrict the tokenized training features to those keys
X_train_tokenized_subset = {key: X_train_tokenized[key] for key in keys}

for val

In [None]:
# feature keys fed to the model
keys = config.KEYS_diag_age_cnty

# restrict the tokenized validation features to those keys
X_val_tokenized_subset = {key: X_val_tokenized[key] for key in keys}

for test

In [None]:
# feature keys fed to the model
# (Config also defines KEYS_diag, KEYS_diag_age and KEYS_diag_cnty)
keys = config.KEYS_diag_age_cnty

# restrict the tokenized test features to those keys
X_test_tokenized_subset = {key: X_test_tokenized[key] for key in keys}

<span style="color:chocolate">Convert train, val, and test subsets to tensors</span>

In [None]:
tf.random.set_seed(config.seed)

# uniform weight of 1 per training example (per the original note, this mirrors how the MLM was trained)
sample_weights_train = np.ones(len(y_train_tokenized))

# (features, labels, weights) -> shuffled mini-batches
train_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_train_tokenized_subset, y_train_tokenized, sample_weights_train))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

In [None]:
tf.random.set_seed(config.seed)

# uniform weight of 1 per validation example (per the original note, this mirrors how the MLM was trained)
sample_weights_val = np.ones(len(y_val_tokenized))

# (features, labels, weights) -> shuffled mini-batches
val_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_val_tokenized_subset, y_val_tokenized, sample_weights_val))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

In [None]:
tf.random.set_seed(config.seed)

# uniform weight of 1 per test example (per the original note, this mirrors how the MLM was trained)
sample_weights_test = np.ones(len(y_test_tokenized))

# (features, labels, weights) -> shuffled mini-batches
test_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_test_tokenized_subset, y_test_tokenized, sample_weights_test))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

---
### Step 7: Next visit diagnosis (downstream task)

I will use my self-supervised MLM model on a downstream task of next visit diagnosis classification.
To do this, I will import the trained MLM model, and I will create a multilabel classifier by adding a pooling layer and a Dense layer on top of the pretrained MLM features.

``import TDecoder model``

In [None]:
# load the pretrained TDecoder model from disk
TDecoder_model = keras.models.load_model("../../../embeddings/TDecoder_age_cnty.h5")

# cut the network at the encoder output, which removes the classification layer
encoder_output = TDecoder_model.get_layer("encoder_0/ffn_layernormalization").output
pretrained_TDecoder_model = tf.keras.Model(inputs=TDecoder_model.input, outputs=encoder_output)

# fine-tune the pretrained weights; assign False here instead to freeze them
pretrained_TDecoder_model.trainable = True

In [None]:
# disabled scratch code kept as a bare string literal: nothing below is executed,
# the cell only evaluates (and displays) the string itself
'''
temp =  train_tensor.take(10)
temp_embeddings = pretrained_TDecoder_model.predict(temp)
temp_embeddings.shape


token_emb = mlm_model.get_layer("encoder_0/ffn_layernormalization").output
token_emb_average = tf.keras.layers.GlobalAveragePooling1D()(token_emb)
token_emb_averageg
'''

<span style="color:chocolate">extract token (patient visit) embeddings</span>

In [None]:
# this only prints the output of the encoder_0/ffn_layernormalization of the mlm model
#train_token_embeddings = pretrained_TDecoder_model.predict(X_train_tokenized)
#train_token_embeddings.shape

In [None]:
# this only prints the output of the encoder_0/ffn_layernormalization of the mlm model
#test_token_embeddings = pretrained_TDecoder_model.predict(X_test_tokenized)
#test_token_embeddings.shape

<span style="color:chocolate">extract sentence (patient history) embeddings</span>

In [None]:
#train_pool_embeddings = tf.keras.layers.GlobalAveragePooling1D()(train_token_embeddings).numpy()
#train_pool_embeddings.shape

In [None]:
#test_pool_embeddings = tf.keras.layers.GlobalAveragePooling1D()(test_token_embeddings).numpy()
#test_pool_embeddings.shape

``Downstream task``

<span style="color:chocolate">calculate initial bias</span>

In [None]:
# optional step, off by default: compute initial biases and class weights
# (bias_init and class_weight are only defined when calc_init_bias is True)
calc_init_bias = False
if calc_init_bias:
    importlib.reload(utils_classifier) 
    bias_init, class_weight = utils_classifier.initial_weights(vect_layer, df, y_union, y_train_tokenized_cols)
    print('Length of initial bias', len(bias_init))

<span style="color:chocolate">add early stopping</span>

In [None]:
# `early_stopping` starts as an on/off flag and is then rebound to the callback object.
# NOTE(review): the fit cells pass callbacks=[early_stopping]; with the flag set to
# False that list would contain False rather than a callback.
early_stopping = True
if early_stopping:
    # stop when val_loss has not improved for 2 epochs and restore the best weights
    early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=1,
    patience=2,
    mode='min',
    restore_best_weights=True)

<span style="color:chocolate">define model</span>

In [None]:
# re-import utils_classifier so that edits to the module are picked up
importlib.reload(utils_classifier) 
# build the downstream classifier from the pretrained model, the number of
# outcome columns and the selected feature keys
classifier_model = utils_classifier.classifier_model(
    pretrained_TDecoder_model,
    y_train_tokenized.shape[1],
    keys,
    # initial bias is not passed (bias_init only exists when calc_init_bias is True)
    #bias_init
)
classifier_model.summary()

<span style="color:chocolate">fit model</span>

In [None]:
# Train the classifier; the pretrained layers are trainable (set in the model-loading cell).
# Up to 100 epochs, validated on val_tensor, with the early-stopping callback attached.
hist = classifier_model.fit(
    train_tensor,
    validation_data=val_tensor,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
)

<span style="color:chocolate">plot loss and accuracy</span>

In [None]:
# per-epoch metrics recorded by fit()
history = hist.history

# one panel per metric: (training key, validation key, panel title)
panels = [
    ('loss', 'val_loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Accuracy'),
]

fig = plt.figure(figsize=(12, 2))
for position, (train_key, val_key, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 3, position)
    ax.plot(history[train_key], lw=2, color='darkgoldenrod')
    ax.plot(history[val_key], lw=2, color='indianred')
    ax.legend(['Train', 'Validation'], fontsize=10)
    ax.set_xlabel('Epochs', size=10)
    ax.set_title(title)

<span style="color:chocolate">save or load model</span>

In [None]:
# persist the trained classifier (without optimizer state) and/or replace it
# with the copy stored on disk, depending on the config switches
if config.save_model:
    classifier_model.save("./cls_model_base+age+cnty(TDecoder).h5", include_optimizer=False)
if config.load_model:
    classifier_model = keras.models.load_model("./cls_model_base+age+cnty(TDecoder).h5")

<span style="color:chocolate">predictions  (on test data)</span>

In [None]:
#y_train_tokenized_pred = classifier_model.predict(X_train_tokenized_subset)
#y_train_tokenized_pred.shape

In [None]:
# predicted scores for every test patient, computed from the un-shuffled feature dict
# so that rows stay aligned with y_test_tokenized
y_test_tokenized_pred = classifier_model.predict(X_test_tokenized_subset)
y_test_tokenized_pred.shape

<span style="color:chocolate">example predictions for next visit diagnosis</span>

In [None]:
# row position of the test patient to inspect
pat_id = 145
true_row = y_test_tokenized[pat_id]
pred_row = y_test_tokenized_pred[pat_id]

# ground truth diagnosis
print('Next visit diagnosis for patient ID', pat_id)
print('---------------------------------------')
print('Ground truth diagnosis: ', y_test[pat_id])

# predicted probability at the ground-truth column
diag_index = np.flatnonzero(true_row == 1)
diag_prob = pred_row[diag_index]
print('Ground truth predict prob: ', diag_prob)

# five highest predicted probabilities, in descending order
top5_diag_index = np.argsort(pred_row)[::-1][:5]
top5_diag_prob = pred_row[top5_diag_index]
print('Top 5 diag predict prob: ', top5_diag_prob)

# diagnosis codes behind those five columns (token id 0 is skipped)
top5_tokens = y_test_tokenized_cols[top5_diag_index]
top5_codes = []
for token in top5_tokens:
    if token != 0:
        top5_codes.append(id2token['diag'][token])
top5_diag_code = " ".join(top5_codes)
print('Top 5 diag codes: ', top5_diag_code)

# previous diagnosis history of the same patient
print(X_test['diag'][pat_id])

<span style="color:chocolate">Metrics (model evaluation)</span>

at the diagnostic level

In [None]:
# re-import utils_eval_downstream so that edits to the module are picked up
importlib.reload(utils_eval_downstream)
start_time= time.time()
# per-diagnosis curves and scores for the test predictions
recall_diag, precision_diag, fpr_diag, tpr_diag, df_aps_auc_diag, df_y_cols = utils_eval_downstream.metrics_each_diagnosis(
    y_union_tokenized,
    id2token,
    y_test_tokenized_pred,
    y_test_tokenized,
    y_test
)

# print execution time (in minutes)
print('Execution time:', np.round((time.time()-start_time)/60, 2), 'minutes')

at the micro or samples level

In [None]:
start_time= time.time()
# aggregate (micro / samples) curves and scores for the test predictions
precision_micro, recall_micro, fpr_micro, tpr_micro, aps_samples, area_micro = utils_eval_downstream.metrics_averages(
    y_union_tokenized,
    id2token,
    y_test_tokenized_pred,
    y_test_tokenized
)

# print execution time (in minutes)
print('Execution time:', np.round((time.time()-start_time)/60, 2), 'minutes')

Print TDecoder (micro/samples) metrics

In [None]:
print('Average precission score:',
      np.round(aps_samples,3)
)
print('ROC AUC:',
      np.round(area_micro,3)
)

# add APS and ROC area to a df, tagged with the seed of this run
temp_df = pd.DataFrame(
        {'metric': ['APS', 'AUC'],
         'value': [aps_samples, area_micro]
        }
    )

temp_df['seed'] = config.seed

# append the metrics to the results csv; the header row is written only when the
# file is new or empty, so repeated runs do not interleave header lines with data
metrics_csv = './results/ApsAucDownstream__base+age+cnty(TDecoder).csv'
write_header = (not os.path.exists(metrics_csv)) or os.path.getsize(metrics_csv) == 0
temp_df.to_csv(metrics_csv, mode='a', header=write_header)

Print TDecoder (for each disease) plots

In [None]:
# re-import utils_eval_downstream so that edits to the module are picked up
importlib.reload(utils_eval_downstream)
start_time= time.time()
# draw the PR / ROC plots from the per-diagnosis and aggregate metrics
utils_eval_downstream.plot_pr_roc(
    recall_diag, precision_diag, fpr_diag,
    tpr_diag, df_aps_auc_diag, precision_micro,
    recall_micro, fpr_micro, tpr_micro, aps_samples,
    area_micro, config.top_diag)


# collect everything that was plotted in one dictionary
# (the last key is the literal string 'config.top_diag')
dict_metrics = {
    'recall_diag': recall_diag,
    'precision_diag': precision_diag,
    'fpr_diag': fpr_diag,
    'tpr_diag': tpr_diag,
    'df_aps_auc_diag': df_aps_auc_diag,
    'precision_micro': precision_micro,
    'recall_micro': recall_micro,
    'fpr_micro': fpr_micro,
    'tpr_micro': tpr_micro,
    'aps_samples': aps_samples,
    'area_micro': area_micro,
    'config.top_diag': config.top_diag
}

# print execution time (in minutes)
print('Execution time:', np.round((time.time()-start_time)/60, 2), 'minutes')

# save the dictionary; the file name carries the seed of this run
np.save('./results/EachApsAucDownstream__base+age+cnty(TDecoder)_'+ str(config.seed)+ '_.npy', dict_metrics) 

---
`FairAware tasks`

In [None]:
start_time= time.time()
# work on a copy so the frame stored in df_dict is not rebound by the cleaning step
temp_df_test_out = df_dict['test_out'].copy()
# re-import utils_eval_downstream so that edits to the module are picked up
importlib.reload(utils_eval_downstream)

# clean the test_out attributes, then draw the fairness plots for the test predictions
temp_df_test_out = utils_eval_downstream.fairaware_cleaning(temp_df_test_out)
utils_eval_downstream.fairaware_plots(
    temp_df_test_out, y_test_tokenized_cols,
    id2token, y_test_tokenized_pred, y_test_tokenized
)

# print execution time (in minutes)
print('Execution time:', np.round((time.time()-start_time)/60, 2), 'minutes')

---
Extrinsic Evaluation of Embeddings based on prediction distribution by sex

In [None]:
# one frame per disease, concatenated once after the loop
# (instead of growing a DataFrame with concat on every iteration)
frames_by_disease = []
for disease in ['Congenital Anomalies', 'Tuberculosis']:
    # index of the first diagnosis whose name starts with `disease`
    # NOTE(review): the index label is used as a column position below, which
    # assumes df_y_cols has a default 0..n-1 index aligned with the outcome columns
    idx  = df_y_cols[df_y_cols.diag_name.str.startswith(disease)].index[0]
    display(idx)

    # ground truth for the disease (binary 0 or 1)
    temp_tokenized = y_test_tokenized[:, idx]

    # predictions for the disease (probabilities)
    temp_tokenized_pred = y_test_tokenized_pred[:, idx]

    # sexI values of the test patients
    # NOTE(review): assumes the rows of temp_df_test_out are still aligned with the
    # rows of y_test_tokenized_pred after fairaware_cleaning; confirm
    sexI = temp_df_test_out.sexI.values

    # keep sexI of the patients whose predicted probability exceeds the mean
    # probability, i.e. those treated as predicted to have the disease
    temp_pred = list(sexI[temp_tokenized_pred>temp_tokenized_pred.mean()])
    temp_pred = pd.DataFrame({'disease': disease, 'sexI_distribution': temp_pred})

    frames_by_disease.append(temp_pred)

embeddings_extrinsic_eval = pd.concat(frames_by_disease, axis=0)

embeddings_extrinsic_eval[embeddings_extrinsic_eval.disease.eq('Tuberculosis')].sexI_distribution.hist()
embeddings_extrinsic_eval[embeddings_extrinsic_eval.disease.eq('Congenital Anomalies')].sexI_distribution.hist()

# save to csv; the file name carries the seed of this run
embeddings_extrinsic_eval.to_csv('./results/embeddings_extrinsic_eval_base+age+cnty(TDecoder)_'+str(config.seed)+'_.csv')

---
``downstream task + extra features``

<span style="color:chocolate">extract patient (sentence) embeddings from classifier_model</span>

In [None]:
# define embeddings model 
classifier_model_embed = tf.keras.Model(
    classifier_model.input, classifier_model.get_layer("global_max_pooling1d").output
)

# extract embeddings from classification model
train_embeddings_pool = classifier_model_embed.predict(X_train_tokenized_subset)
val_embeddings_pool = classifier_model_embed.predict(X_val_tokenized_subset)
test_embeddings_pool = classifier_model_embed.predict(X_test_tokenized_subset)

print('Shape of train:', train_embeddings_pool.shape)
print('Shape of train:', val_embeddings_pool.shape)
print('Shape of test:', test_embeddings_pool.shape)

<span style="color:chocolate">create extra features</span> 

In [None]:
# combine the patient attributes in df_dict['*_in'] with the pooled embeddings of the same split
X_train_tokenized_subset_plus = utils_classifier_plus.create_extra_features(df_dict['train_in'], train_embeddings_pool)
X_val_tokenized_subset_plus = utils_classifier_plus.create_extra_features(df_dict['val_in'], val_embeddings_pool)
X_test_tokenized_subset_plus = utils_classifier_plus.create_extra_features(df_dict['test_in'], test_embeddings_pool)

In [None]:
## columns that training has but validation lacks
col_diff_train_val = list(np.setdiff1d(
    list(X_train_tokenized_subset_plus.columns), list(X_val_tokenized_subset_plus.columns)))
print('Columns difference train_val', col_diff_train_val)
# drop them from X_train_tokenized_subset_plus
X_train_tokenized_subset_plus = X_train_tokenized_subset_plus.drop(columns=col_diff_train_val)

## columns that training (after the drop above) has but test lacks
col_diff_train_test = list(np.setdiff1d(
    list(X_train_tokenized_subset_plus.columns), list(X_test_tokenized_subset_plus.columns)))
print('Columns difference train_test', col_diff_train_test)
# drop them from X_train_tokenized_subset_plus
X_train_tokenized_subset_plus = X_train_tokenized_subset_plus.drop(columns=col_diff_train_test)

## every remaining training column now exists in validation and test; select exactly
## those columns, in training order, so that extra or reordered columns in val/test
## cannot change the input width or the feature order seen by the model
shared_plus_cols = list(X_train_tokenized_subset_plus.columns)
X_val_tokenized_subset_plus = X_val_tokenized_subset_plus[shared_plus_cols]
X_test_tokenized_subset_plus = X_test_tokenized_subset_plus[shared_plus_cols]

<span style="color:chocolate">convert train, val, and test sets to tensors</span>

In [None]:
tf.random.set_seed(config.seed)

# (features, labels) pairs of each split -> shuffled mini-batches
train_plus_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_train_tokenized_subset_plus, y_train_tokenized))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

val_plus_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_val_tokenized_subset_plus, y_val_tokenized))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

test_plus_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_test_tokenized_subset_plus, y_test_tokenized))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)


<span style="color:chocolate">define model</span>

In [None]:
# re-import utils_classifier_plus so that edits to the module are picked up
importlib.reload(utils_classifier_plus)
# build the "plus" classifier from the number of input columns and the number of outcome columns
classifier_plus_model = utils_classifier_plus.classifier_plus_model(
    X_train_tokenized_subset_plus.shape[1],
    y_train_tokenized.shape[1]
)
classifier_plus_model.summary()

<span style="color:chocolate">fit model</span>

In [None]:
# Train the "plus" model: up to 100 epochs, validated on val_plus_tensor, reusing the
# early-stopping callback defined earlier. `hist` is rebound to this run's history.
hist = classifier_plus_model.fit(
    train_plus_tensor,
    validation_data=val_plus_tensor,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
)

<span style="color:chocolate">plot loss and accuracy</span>

In [None]:
# per-epoch metrics recorded by fit()
history = hist.history

# one panel per metric: (training key, validation key, panel title)
panels = [
    ('loss', 'val_loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Accuracy'),
]

fig = plt.figure(figsize=(12, 2))
for position, (train_key, val_key, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 3, position)
    ax.plot(history[train_key], lw=2, color='darkgoldenrod')
    ax.plot(history[val_key], lw=2, color='indianred')
    ax.legend(['Train', 'Validation'], fontsize=10)
    ax.set_xlabel('Epochs', size=10)
    ax.set_title(title)

<span style="color:chocolate">save or load</span>

In [None]:
# persist / restore the "plus" model. The original cell saved and reloaded
# `classifier_model` here, which wrote the base classifier under the plus file name
# and, on load, rebound the base classifier instead of the plus model.
if config.save_model:
    classifier_plus_model.save("./cls_model_plus_base+age+cnty(TDecoder).h5", include_optimizer=False)
if config.load_model:
    classifier_plus_model = keras.models.load_model("./cls_model_plus_base+age+cnty(TDecoder).h5")

<span style="color:chocolate">predictions (on test data)</span>

In [None]:
#y_train_tokenized_plus_pred = classifier_plus_model.predict(X_train_tokenized_subset_plus)
#y_train_tokenized_plus_pred.shape

In [None]:
# predicted scores of the "plus" model for every test patient, computed from the
# un-shuffled feature frame so that rows stay aligned with y_test_tokenized
y_test_tokenized_plus_pred = classifier_plus_model.predict(X_test_tokenized_subset_plus)
y_test_tokenized_plus_pred.shape

<span style="color:chocolate">Metrics (model evaluation)</span>

at the micro or samples level

In [None]:
start_time= time.time()
# aggregate (micro / samples) curves and scores for the "plus" model.
# y_test_tokenized_cols holds the column labels of y_test_tokenized, which were
# reindexed to y_union_tokenized when the outcomes were encoded.
precision_micro_plus, recall_micro_plus,\
fpr_micro_plus, tpr_micro_plus,\
aps_samples_plus, area_micro_plus = utils_eval_downstream.metrics_averages(
    y_test_tokenized_cols,
    id2token,
    y_test_tokenized_plus_pred,
    y_test_tokenized
)

# print execution time (in minutes)
print('Execution time:', np.round((time.time()-start_time)/60, 2), 'minutes')

Print TDecoder + extra features (micro/samples) metrics

In [None]:
print('Average precission score:',
      np.round(aps_samples_plus,3)
)
print('ROC AUC:',
      np.round(area_micro_plus,3)
)

# add APS and ROC area to a df, tagged with the seed of this run
metrics_plus = pd.DataFrame(
        {'metric': ['APS', 'AUC'],
         'value': [aps_samples_plus, area_micro_plus]
        }
    )

metrics_plus['seed'] = config.seed

# append the metrics to the results csv; the header row is written only when the
# file is new or empty, so repeated runs do not interleave header lines with data
metrics_plus_csv = './results/ApsAucDownstreamExtra__base+age+cnty(TDecoder).csv'
write_header_plus = (not os.path.exists(metrics_plus_csv)) or os.path.getsize(metrics_plus_csv) == 0
metrics_plus.to_csv(metrics_plus_csv, mode='a', header=write_header_plus)