# Main script for Prediction Task

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@wisc.edu <br>
Date created: June 8, 2022 <br>

### Step 1: Import packages

In [None]:
# standard
import pandas as pd
import numpy as np
import os
import time
import importlib
import glob
import re
from pprint import pprint
import time


# plots
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from PIL import Image
%matplotlib inline

# sklearn and others
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score as APS
from sklearn.metrics import roc_auc_score as ROC_AUC
from sklearn.metrics import precision_recall_curve as PRC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import RocCurveDisplay
from itertools import cycle
import patsy
import statsmodels.api as sm

#tensorflow
import tensorflow as tf
from tensorflow import keras
from dataclasses import dataclass

# user defined
import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')
from embeddings import utils_dt_prep
from  embeddings import utils_MLM
import utils_dt_prep_pred_all
import utils_classifier_random_embed
import utils_classifier_logistic
import utils_eval_downstream
from embeddings import utils_embeddings

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

---
### Step 2.1: Set-up config

In [None]:
# Run-wide settings for the notebook.
# NOTE: none of the attributes below has a type annotation, so @dataclass
# registers no fields; they are plain class attributes that can also be read
# as Config.<name>.
@dataclass
class Config:
    MAX_LEN = 40
    BATCH_SIZE = 32  # batch size used when building the tf.data pipelines
    PAT_MIN_LENGTH = 3 # minimum number of visits
    DIAG_PER_VISIT = 3 # diagnoses per visit to consider
    DIAG_LENGTH = 2 # how many digits from diagnosis code to consider
    PRIMARY_DIAG_ONLY = True
    DIAG3 = True  # build one-hot features from diag00, diag01 and diag02
    train_pct = 0.8
    val_pct = 0.1
    seed = [1235, 1789, 2134, 1455, 1112] # candidate seeds; one is selected below
    top_diag = 10 # top diagnoses based on ROC AUC or APS
    draw_train_val_test = False  # False: load the saved patient split from disk
    create_Xy=False
    save_model=True
    load_model=False
    process_LOS = False  # False: load the already processed LOS file

config = Config()
# shadow the class-level list with a single seed on this instance only
# (Config.seed still holds the full list)
config.seed = config.seed[0]

---
### Step 2.2: Define functions

In [None]:
def add_dates(df):
    ''' Parse the birth and admission dates and split each into parts.

        'bthdate' and 'admtdate' are cast to string and parsed with
        pd.to_datetime (unparseable values become NaT). Six columns are then
        added: bthyear/bthmonth/bthday and admtyear/admtmonth/admtday.
        The frame is modified in place and also returned.
    '''
    # date column -> prefix of the derived year/month/day columns
    prefixes = {'bthdate': 'bth', 'admtdate': 'admt'}

    # parse both date columns first (string round-trip, then coerce to NaT)
    for date_col in prefixes:
        df[date_col] = pd.to_datetime(df[date_col].astype(str), errors='coerce')

    # derive the calendar parts, birth columns first, then admission columns
    for date_col, prefix in prefixes.items():
        stamps = pd.DatetimeIndex(df[date_col])
        for part in ('year', 'month', 'day'):
            df[prefix + part] = getattr(stamps, part)

    return df

---
### Step 2.3: Define working directories

In [None]:
# directory holding the LOS csv files read/written below
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere
raw_pdd_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/final_data/health/'

---
### Step 3: Read data

``los``

In [None]:
# Build (optionally) and load the length-of-stay (LOS) extract.
# NOTE(review): 'PDD_los.csv' (input) and 'pdd_los.csv' (output) differ only in
# letter case; on a case-insensitive filesystem they are the same file, so the
# processed extract would overwrite the raw one. Confirm this is intended.
if config.process_LOS:
    # read uncleaned PDD data
    df_pdd = pd.read_csv(raw_pdd_dir + 'PDD_los.csv')
    df_pdd.rename(columns={'rln': 'rlnI_updated'}, inplace=True)

    # add dates
    df_pdd = add_dates(df_pdd)
    # keep only bthyear >= 1991
    df_pdd = df_pdd[df_pdd.bthyear.ge(1991.)].reset_index(drop=True)

    # save data (without the index, so the reload gets no 'Unnamed: 0' column)
    df_pdd['data_source'] = 'PDD'
    cols = ['rlnI_updated', 'admtdate', 'bthdate', 'data_source', 'los', 'los_adj']
    df_pdd[cols].to_csv(raw_pdd_dir + 'pdd_los.csv', index=False)

# always load the processed file, so both settings of process_LOS hand the
# same columns and dtypes to the merge further down
df_pdd = pd.read_csv(raw_pdd_dir + 'pdd_los.csv')

df_pdd.head()

``all health data``

In [None]:
# reload so that edits made to utils_dt_prep are picked up without a kernel restart
importlib.reload(utils_dt_prep)
# read medical records for all patients with SSN and birth records
df_init = utils_dt_prep.read_data_bpe(keep_all_cols=True)

# print number of distinct patient ids, frame shape, and the first two rows
print('Unique patients ', df_init.rlnI_updated.nunique())
print('Number of encounters (shape of data) ', df_init.shape)
df_init.head(2)

### Step 4: Preprocess data

``merge LOS``

In [None]:
# merge LOS onto the encounter table
df_init2 = df_init.merge(
    df_pdd,
    on=['rlnI_updated', 'admtdate', 'bthdate', 'data_source'],
    how='left'
)

# report the merged frame (a left merge adds rows when df_pdd holds several
# records for the same key, which would explain a larger row count)
print('Shape of df_init after LOS merge', df_init2.shape)

# LOS per data source. The rules are applied in one pass so that a later rule
# cannot overwrite an earlier one:
#   - EDD (ER) encounters          -> 0
#   - Birth, admtdate == bthdate   -> 0
#   - other Birth records          -> lenstayI
#   - everything else (PDD)        -> LOS from the merge; a value of 0 means the
#                                     patient was discharged the same day
is_edd = df_init2.data_source.eq('EDD')
is_birth = df_init2.data_source.eq('Birth')
is_same_day_birth = is_birth & df_init2.admtdate.eq(df_init2.bthdate)
df_init2['los'] = np.select(
    [is_edd, is_same_day_birth, is_birth],
    [0, 0, df_init2.lenstayI],
    default=df_init2.los,
)

# los2: LOS with missing values set to 0
df_init2['los2'] = df_init2.los.fillna(0)

# keep only cols of interest
columns = [
    'rlnI_updated', 'bthdateI', 'bthyearI', 'cntyresI', 'cntyresI_name',
    'pm25I', 'wfeI', 'sexI', 'raceI',
    'patcnty', 'admtdate', 'admtyear', 'admtmonth',
    'raceM', 'meduc', 'precare', 'visitsM_9mpp', 'visitsM_1ypp', 'visitsI_1yol',
    'bthresmb_name', 'prevsts',
    'diag00', 'diag01', 'diag02', 'diag03', 'diag04',
    'data_source', 'data_source2', 'los', 'lenstayI', 'los2',
]

df_init2 = df_init2[columns]

``drop observations and add features``

In [None]:
# reload so that edits made to utils_dt_prep are picked up without a kernel restart
importlib.reload(utils_dt_prep)
print('Unique patients before preprocessing', df_init2.rlnI_updated.nunique())

# drop observations (PAT_MIN_LENGTH is the minimum number of visits per patient)
%time df = utils_dt_prep.drop_observations(df_init2, config.PAT_MIN_LENGTH)
# add features, includes visit summary (for diag, age, cnty)
%time df = utils_dt_prep.add_features(df, config.DIAG_PER_VISIT, config.DIAG_LENGTH)

# print stats
print('Unique patients after preprocessing', df.rlnI_updated.nunique())
print('Number of encounters after preprocessing (shape of data) ', df.shape) 

``define multiclass task``

In [None]:
# three LOS classes: 0 -> los2 <= 1, 1 -> 1 < los2 <= 3, 2 -> everything else
# (conditions are checked in order, the first match wins)
df['los_bins'] = np.select(
    [df.los2.le(1), df.los2.le(3)],
    [0, 1],
    default=2,
)
df.los_bins.hist()

``create train, val, and test datasets``

In [None]:
# Split patients (not encounters) into train / val / test.
if config.draw_train_val_test:
    # find unique rlnIs in df
    rlnIs = df.rlnI_updated.unique()

    np.random.seed(config.seed)
    # draw the training pool, then carve the validation patients out of it
    train_pool = np.random.choice(
        rlnIs, int(rlnIs.shape[0] * config.train_pct), replace=False
    )
    val_rlnI = np.random.choice(
        train_pool, int(train_pool.shape[0] * config.val_pct), replace=False
    )
    # test = every patient outside the training pool
    test_rlnI = rlnIs[~np.isin(rlnIs, train_pool)]
    # validation patients must not stay in train, otherwise val_loss (and the
    # early stopping that monitors it) is computed on patients the model was
    # trained on
    train_rlnI = train_pool[~np.isin(train_pool, val_rlnI)]

    # save train_rlnIs, val_rlnIs, and test_rlnIs
    np.save("../diagnosis_prediction/data/train_rlnI.npy", train_rlnI)
    np.save("../diagnosis_prediction/data/val_rlnI.npy", val_rlnI)
    np.save("../diagnosis_prediction/data/test_rlnI.npy", test_rlnI)

else:
    # load
    train_rlnI = np.load("../diagnosis_prediction/data/train_rlnI.npy", allow_pickle=True)
    val_rlnI = np.load("../diagnosis_prediction/data/val_rlnI.npy", allow_pickle=True)
    test_rlnI = np.load("../diagnosis_prediction/data/test_rlnI.npy", allow_pickle=True)
    # a split saved with the validation ids still inside train is repaired
    # here; this is a no-op when the two sets are already disjoint
    train_rlnI = train_rlnI[~np.isin(train_rlnI, val_rlnI)]


# pull train, val and test from df (fresh 0..n-1 index on each subset)
df_train = df[df.rlnI_updated.isin(train_rlnI)].reset_index(drop=True)
df_val = df[df.rlnI_updated.isin(val_rlnI)].reset_index(drop=True)
df_test = df[df.rlnI_updated.isin(test_rlnI)].reset_index(drop=True)

print('Shape of df_train ', df_train.shape)
print('Shape of df_val ', df_val.shape)
print('Shape of df_test', df_test.shape)

print('Unique patients in df_train ', df_train.rlnI_updated.nunique())
print('Unique patients in df_val ', df_val.rlnI_updated.nunique())
print('Unique patients in df_test ', df_test.rlnI_updated.nunique())

<span style="color:chocolate">create input-output pairs</span>

In [None]:
# reload so that edits made to utils_dt_prep_pred_all are picked up
importlib.reload(utils_dt_prep_pred_all)
# split each subset into an input frame (*_in) and an output frame (*_out);
# the outcome (los_bins) is later read from the *_out frames
df_train_in, df_train_out = utils_dt_prep_pred_all.input_output_pairs(df_train, config.PAT_MIN_LENGTH)
df_val_in, df_val_out = utils_dt_prep_pred_all.input_output_pairs(df_val, config.PAT_MIN_LENGTH)
df_test_in, df_test_out = utils_dt_prep_pred_all.input_output_pairs(df_test, config.PAT_MIN_LENGTH)

print example patient in data

In [None]:
# input rows of one example patient (hard-coded id; shows an empty frame if
# that patient is not in the training subset)
df_train_in[df_train_in.rlnI_updated.eq('00003PWWP')]

In [None]:
# output rows of the same example patient
df_train_out[df_train_out.rlnI_updated.eq('00003PWWP')]

<span style="color:chocolate">create one-hot diagnosis features from input data</span>

In [None]:
def _one_hot_by_patient(df_in, diag_col, categories):
    ''' One-hot encode diag_col against a fixed category list.

        Encoding every subset against the same ordered categories gives
        train, val and test identical one-hot columns in identical order, and
        drop_first removes the same reference category everywhere.

        Returns (row-level one-hot frame, one-hot column names,
        per-patient frame holding the max of each one-hot column).
    '''
    encoded = df_in.copy()
    encoded[diag_col] = pd.Categorical(encoded[diag_col], categories=categories)
    encoded = pd.get_dummies(encoded, columns=[diag_col], drop_first=True)
    # pull one-hot columns
    oh_cols = [col for col in encoded if col.startswith(diag_col + '_')]
    # a code counts for a patient if it appears in any of their rows
    per_patient = encoded.groupby('rlnI_updated', as_index=False)[oh_cols].max()
    return encoded, oh_cols, per_patient


if config.PRIMARY_DIAG_ONLY:
    # find union of diag codes across train, val, and test sets
    diag_union = np.union1d(df_train_in.diag00_2d.unique(), df_val_in.diag00_2d.unique())
    diag_union = np.union1d(diag_union, df_test_in.diag00_2d.unique())
    # codes of the union that are missing from each subset (main diagnosis code)
    setdif_train_union = np.setdiff1d(diag_union, df_train_in.diag00_2d.unique())
    setdif_val_union = np.setdiff1d(diag_union, df_val_in.diag00_2d.unique())
    setdif_test_union = np.setdiff1d(diag_union, df_test_in.diag00_2d.unique())
    print('In union but not in train', setdif_train_union)
    print('In union but not in val', setdif_val_union)
    print('In union but not in test', setdif_test_union)

    ## one-hot features per subset, all encoded against the same union ##
    oh_train_in, oh_cols_train_in, oh_train_in_final = _one_hot_by_patient(
        df_train_in, 'diag00_2d', diag_union
    )
    oh_val_in, oh_cols_val_in, oh_val_in_final = _one_hot_by_patient(
        df_val_in, 'diag00_2d', diag_union
    )
    oh_test_in, oh_cols_test_in, oh_test_in_final = _one_hot_by_patient(
        df_test_in, 'diag00_2d', diag_union
    )

    print('Training set length', len(oh_train_in_final.columns))
    print('Val set length', len(oh_val_in_final.columns))
    print('Test set length', len(oh_test_in_final.columns))

else:
    print('Write code to include more than the primary diag code')

print one-hot diag00 features example (patient) from train data

In [None]:
# first, print the example patient's diag codes in non one-hot form
df_train_in[df_train_in.rlnI_updated.eq("00003PWWP")][["diag00_2d"]]#, "diag01_2d", "diag02_2d"]]

In [None]:
# second, print two of the row-level one-hot diag00 columns for the example
# patient (column names are hard-coded and must exist in oh_train_in)
oh_train_in[oh_train_in.rlnI_updated.eq('00003PWWP')][['diag00_2d_V3', 'diag00_2d_46']]

In [None]:
# third, print the per-patient max of the same one-hot diag00 columns (final features)
oh_train_in_final[oh_train_in_final.rlnI_updated.eq('00003PWWP')][['diag00_2d_V3', 'diag00_2d_46']]

In [None]:
# print every final feature column for the example patient
oh_train_in_final[oh_train_in_final.rlnI_updated.eq('00003PWWP')]

One-hot features for diag00, diag01, and diag02 (the first three diagnosis codes)

In [None]:
def _one_hot_by_patient(df_in, diag_col, categories):
    ''' One-hot encode diag_col against a fixed category list.

        Encoding every subset against the same ordered categories gives
        train, val and test identical one-hot columns in identical order, and
        drop_first removes the same reference category everywhere.

        Returns (row-level one-hot frame, one-hot column names,
        per-patient frame holding the max of each one-hot column).
    '''
    encoded = df_in.copy()
    encoded[diag_col] = pd.Categorical(encoded[diag_col], categories=categories)
    encoded = pd.get_dummies(encoded, columns=[diag_col], drop_first=True)
    # pull one-hot columns
    oh_cols = [col for col in encoded if col.startswith(diag_col + '_')]
    # a code counts for a patient if it appears in any of their rows
    per_patient = encoded.groupby('rlnI_updated', as_index=False)[oh_cols].max()
    return encoded, oh_cols, per_patient


if config.DIAG3:
    # start from empty frames; one block of columns is appended per diag column
    oh_train_in_final = pd.DataFrame()
    oh_val_in_final = pd.DataFrame()
    oh_test_in_final = pd.DataFrame()

    for idx, diag in enumerate(['diag00_2d', 'diag01_2d', 'diag02_2d']):
        print(diag)
        # find union of diag codes across train, val, and test sets
        diag_union = np.union1d(df_train_in[diag].unique(), df_val_in[diag].unique())
        diag_union = np.union1d(diag_union, df_test_in[diag].unique())
        # codes of the union that are missing from each subset
        setdif_train_union = np.setdiff1d(diag_union, df_train_in[diag].unique())
        setdif_val_union = np.setdiff1d(diag_union, df_val_in[diag].unique())
        setdif_test_union = np.setdiff1d(diag_union, df_test_in[diag].unique())
        print('In union but not in train', setdif_train_union)
        print('In union but not in val', setdif_val_union)
        print('In union but not in test', setdif_test_union)

        ## one-hot features per subset, all encoded against the same union ##
        oh_train_in, oh_cols_train_in, temp_oh_train_in_final = _one_hot_by_patient(
            df_train_in, diag, diag_union
        )
        oh_val_in, oh_cols_val_in, temp_oh_val_in_final = _one_hot_by_patient(
            df_val_in, diag, diag_union
        )
        oh_test_in, oh_cols_test_in, temp_oh_test_in_final = _one_hot_by_patient(
            df_test_in, diag, diag_union
        )

        # the patient id (first column) is kept only from the first block
        if idx > 0:
            temp_oh_train_in_final = temp_oh_train_in_final.iloc[:, 1:]
            temp_oh_val_in_final = temp_oh_val_in_final.iloc[:, 1:]
            temp_oh_test_in_final = temp_oh_test_in_final.iloc[:, 1:]

        # every block has one row per patient in the same (groupby-sorted)
        # order, so a column-wise concat lines the rows up
        oh_train_in_final = pd.concat([oh_train_in_final, temp_oh_train_in_final], axis=1)
        oh_val_in_final = pd.concat([oh_val_in_final, temp_oh_val_in_final], axis=1)
        oh_test_in_final = pd.concat([oh_test_in_final, temp_oh_test_in_final], axis=1)

        print('Training set length', len(oh_train_in_final.columns))
        print('Validation set length', len(oh_val_in_final.columns))
        print('Test set length', len(oh_test_in_final.columns))

# print all data for patient example
oh_train_in_final[oh_train_in_final.rlnI_updated.eq('00003PWWP')]

<span style="color:chocolate">Create features and labels</span>

In [None]:
def _downsample_class0(y, rng):
    ''' Return shuffled row positions that keep every class-1 and class-2 row
        plus a random subset of the class-0 rows.

        The class-0 subset has the average size of the other two classes
        (capped at the number of class-0 rows) and is drawn without
        replacement, so no row is duplicated.
    '''
    pos = [np.flatnonzero(y.eq(label).to_numpy()) for label in range(3)]
    n_keep = min((len(pos[1]) + len(pos[2])) // 2, len(pos[0]))
    kept_class0 = rng.choice(pos[0], size=n_keep, replace=False)
    keep = np.concatenate([kept_class0, pos[1], pos[2]])
    rng.shuffle(keep)
    return keep


# features (X): drop the first column (patient id)
X_train = oh_train_in_final.iloc[:, 1:]
X_val = oh_val_in_final.iloc[:, 1:]
X_test = oh_test_in_final.iloc[:, 1:]

# labels (y)
# NOTE(review): X has one row per patient in groupby (sorted id) order; this
# assumes the *_out frames hold one row per patient in the same order. Confirm.
if config.PRIMARY_DIAG_ONLY:
    y_train = df_train_out.los_bins # predict LOS in next visit
    y_val = df_val_out.los_bins
    y_test = df_test_out.los_bins
else:
    print('Write code to include more than the primary diag code')

## number of examples by class (the values printed are counts)
for split_name, y_split in (('train', y_train), ('val', y_val), ('test', y_test)):
    for idx in range(3):
        print('Class ' + str(idx) + ' pct in ' + split_name,
              y_split[y_split.eq(idx)].shape[0])

## downsample - randomly exclude examples from the majority class (class 0)
# seeded local generator so the subsample is reproducible
rng = np.random.RandomState(config.seed)
idx_train_all = _downsample_class0(y_train, rng)
idx_val_all = _downsample_class0(y_val, rng)
idx_test_all = _downsample_class0(y_test, rng)

# positional selection on both y and X so they stay aligned whatever the index
y_train = y_train.iloc[idx_train_all]
y_val = y_val.iloc[idx_val_all]
y_test = y_test.iloc[idx_test_all]

X_train = X_train.iloc[idx_train_all, :]
X_val = X_val.iloc[idx_val_all, :]
X_test = X_test.iloc[idx_test_all, :]

print('Shape of X_train ', X_train.shape)
print('Shape of y_train ', y_train.shape)

print('Shape of X_val ', X_val.shape)
print('Shape of y_val ', y_val.shape)

print('Shape of X_test ', X_test.shape)
print('Shape of y_test ', y_test.shape)

<span style="color:chocolate">load vocab used in MLM</span>

In [None]:
# load the saved vectorizer model and keep its first layer
key = 'diag'
vect_layer = {
    key: tf.keras.models.load_model('../../../embeddings/vectorizers/vect_layer_' + key).layers[0]
}

# two-way mapping between integer ids and vocabulary tokens
id2token = {key: dict(enumerate(vect_layer[key].get_vocabulary()))}
token2id = {key: {token: idx for idx, token in id2token[key].items()}}

<span style="color:chocolate">Encode outcomes (y)</span>

In [None]:
# sorted union of the class labels seen in train, val and test
y_union = np.union1d(np.union1d(y_train, y_val), y_test).tolist()
y_union_tokenized = y_union

# one-hot encode each subset against the same label list, so that every
# matrix has one column per class in the same order

# train
y_train_tokenized = (
    pd.get_dummies(y_train, drop_first=False)
    .reindex(columns=y_union_tokenized, fill_value=0)
)
y_train_tokenized_cols = np.array(y_train_tokenized.columns)
y_train_tokenized = y_train_tokenized.to_numpy()

# val
y_val_tokenized = (
    pd.get_dummies(y_val, drop_first=False)
    .reindex(columns=y_union_tokenized, fill_value=0)
)
y_val_tokenized_cols = np.array(y_val_tokenized.columns)
y_val_tokenized = y_val_tokenized.to_numpy()

# test
y_test_tokenized = (
    pd.get_dummies(y_test, drop_first=False)
    .reindex(columns=y_union_tokenized, fill_value=0)
)
y_test_tokenized_cols = np.array(y_test_tokenized.columns)
y_test_tokenized = y_test_tokenized.to_numpy()

for label, encoded in (
    ('Shape y_train_tokenized ', y_train_tokenized),
    ('Shape y_val_tokenized ', y_val_tokenized),
    ('Shape y_test_tokenized ', y_test_tokenized),
):
    print(label, encoded.shape)

<span style="color:chocolate">Convert train, val, and test subsets to tensors</span>

In [None]:
# seeded, shuffled, batched (features, one-hot labels) pipeline for training
tf.random.set_seed(config.seed)
train_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train_tokenized))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

In [None]:
# seeded, shuffled, batched (features, one-hot labels) pipeline for validation
tf.random.set_seed(config.seed)
val_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_val, y_val_tokenized))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

In [None]:
# seeded, shuffled, batched (features, one-hot labels) pipeline for testing
tf.random.set_seed(config.seed)
test_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_test, y_test_tokenized))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

---
### Step 7: Next visit LOS (downstream task)

Run a logistic regression model to predict the length-of-stay (LOS) class of the next visit

<span style="color:chocolate">define model</span>

In [None]:
# The flag is replaced by the callback object itself, so `early_stopping` ends
# up holding the EarlyStopping instance that the fit cells pass in `callbacks`.
# NOTE: if the flag is set to False it stays a bool, and the fit cells would
# then pass `[False]` as callbacks.
early_stopping = True
if early_stopping:
    # stop after 2 epochs without a lower val_loss and restore the best weights
    early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=1,
    patience=2,
    mode='min',
    restore_best_weights=True)

In [None]:
# reload so that edits made to utils_classifier_logistic are picked up
importlib.reload(utils_classifier_logistic)
# build the model from the number of feature columns and the number of classes
classifier_logistic_model = utils_classifier_logistic.classifier_logistic_model(
    X_train.shape[1],
    y_train_tokenized.shape[1]
)
classifier_logistic_model.summary()

<span style="color:chocolate">fit model</span>

In [None]:
# Train model: up to 100 epochs, validated on val_tensor after every epoch;
# the early-stopping callback can end training sooner
hist = classifier_logistic_model.fit(
    train_tensor,
    validation_data=val_tensor,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
)

<span style="color:chocolate">plot loss and accuracy</span>

In [None]:
# per-epoch metrics recorded by fit()
history = hist.history

fig = plt.figure(figsize=(12, 2))

# left panel: loss for train and validation
ax = fig.add_subplot(1, 3, 1)
ax.plot(history['loss'], lw=2, color='darkgoldenrod')
ax.plot(history['val_loss'], lw=2, color='indianred')
ax.legend(['Train', 'Validation'], fontsize=10)
ax.set_xlabel('Epochs', size=10)
ax.set_title('Loss');

# middle panel: accuracy for train and validation
ax = fig.add_subplot(1, 3, 2)
ax.plot(history['accuracy'], lw=2, color='darkgoldenrod')
ax.plot(history['val_accuracy'], lw=2, color='indianred')
ax.legend(['Train', 'Validation'], fontsize=10)
ax.set_xlabel('Epochs', size=10)
ax.set_title('Accuracy');

<span style="color:chocolate">save or load</span>

In [None]:
# With the Config defaults (save_model=True, load_model=False) only the save runs.
if config.save_model:
    # the optimizer state is not written to the file
    classifier_logistic_model.save("./cls_model_base(Logistic)_los.h5", include_optimizer=False)
if config.load_model:
    classifier_logistic_model = keras.models.load_model("./cls_model_base(Logistic)_los.h5")

<span style="color:chocolate">predictions  (on train and test data)</span>

In [None]:
#y_train_tokenized_logistic_pred = classifier_logistic_model.predict(X_train)
#y_train_tokenized_logistic_pred.shape

In [None]:
# model output for every (downsampled) test row; the shape is displayed below
y_test_tokenized_logistic_pred = classifier_logistic_model.predict(X_test)
y_test_tokenized_logistic_pred.shape

<span style="color:chocolate">Metrics (model evaluation)</span>

``Print micro/samples-averaged metrics (logistic baseline)``

In [None]:
## a "micro(sample)-average": quantifying score on all classes jointly
# NOTE: the scores are computed from the logistic-model predictions made in
# this notebook (y_test_tokenized_logistic_pred). The `_rf` suffix on the
# result names is kept unchanged.

# precision and recall
precision_micro, recall_micro, _ = PRC(
    y_test_tokenized.ravel(),
    y_test_tokenized_logistic_pred.ravel()
)


# average precision score
aps_samples_rf = APS(
    y_test_tokenized,
    y_test_tokenized_logistic_pred,
    average="samples"
)

# ROC curve and ROC area (Micro-averaged One-vs-Rest ROC AUC score)
fpr_micro, tpr_micro, _ = roc_curve(
    y_test_tokenized.ravel(),
    y_test_tokenized_logistic_pred.ravel()
)
area_micro_rf = auc(fpr_micro, tpr_micro)

print('Average precission score:',
      np.round(aps_samples_rf,3)
)
print('ROC AUC:',
      np.round(area_micro_rf,3)
)

# add APS and ROC area to a df
temp_df = pd.DataFrame(
        {'metric': ['APS', 'AUC'],
         'value': [aps_samples_rf, area_micro_rf]
        }
    )

temp_df['seed'] = config.seed

# export metrics to csv
# NOTE(review): mode='a' appends a header row and the index on every run, and
# the ./results directory must already exist
temp_df.to_csv('./results/ApsAucDownstream___base(Logistic)_los.csv', mode='a')

``Print ROC AUC curve by class``

In [None]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_tokenized.ravel(), y_test_tokenized_logistic_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

n_classes = len(set(y_union))
target_names = ['class 0 (los <= 1 day)', 'class 1 (los <=3 days)', 'class 2 (los > 3 days)']
fig, ax = plt.subplots(figsize=(6, 6))

# micro-average curve
ax.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

# one-vs-rest curve per class
colors = cycle(["aqua", "darkorange", "cornflowerblue"])
for class_id, color in zip(range(n_classes), colors):
    RocCurveDisplay.from_predictions(
        y_test_tokenized[:, class_id],
        y_test_tokenized_logistic_pred[:, class_id],
        name=f"ROC curve for {target_names[class_id]}",
        color=color,
        ax=ax,
    )

# random classifier (diagonal)
ax.plot(
    [0, 1],
    [0, 1],
    linestyle="--",
    color="black",
    linewidth=1,
    label='random classifier'
)

# rebuild the legend last: the diagonal is drawn after the per-class curves,
# so its label would otherwise be missing from the legend
ax.legend(loc="lower right")

_ = ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass",
)

---
``downstream task + extra features``

<span style="color:chocolate">create extra features</span> 

In [None]:
# utils_classifier_plus is used by this cell but is missing from the import cell
import utils_classifier_plus
importlib.reload(utils_classifier_plus)

# reset_index(..., inplace=True) returns None, so the reset frames are built
# first and then passed on
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# NOTE(review): this notebook defines df_train_in / df_val_in / df_test_in and
# no `df_dict`, so the input frames are passed directly. Confirm that
# create_extra_features can line its first argument up with the downsampled,
# shuffled rows of the second.
X_train_subset_plus = utils_classifier_plus.create_extra_features(df_train_in, X_train)
X_val_subset_plus = utils_classifier_plus.create_extra_features(df_val_in, X_val)
X_test_subset_plus = utils_classifier_plus.create_extra_features(df_test_in, X_test)

In [None]:
## columns present in training but missing from validation
col_diff_train_val = list(
    np.setdiff1d(list(X_train_subset_plus.columns), list(X_val_subset_plus.columns))
)
print('Columns difference train_val', col_diff_train_val)
# drop them from the training features
X_train_subset_plus = X_train_subset_plus.drop(columns=col_diff_train_val)

## columns present in training but missing from test
col_diff_train_test = list(
    np.setdiff1d(list(X_train_subset_plus.columns), list(X_test_subset_plus.columns))
)
print('Columns difference train_test', col_diff_train_test)
# drop them from the training features
X_train_subset_plus = X_train_subset_plus.drop(columns=col_diff_train_test)

## give validation and test exactly the training columns, in training order
# (every remaining training column exists in both frames at this point), so
# that any columns found only in val/test are removed and all three feature
# matrices have the same width and column order
X_val_subset_plus = X_val_subset_plus[X_train_subset_plus.columns]
X_test_subset_plus = X_test_subset_plus[X_train_subset_plus.columns]

<span style="color:chocolate">convert train and test sets to tensors</span>

In [None]:
# seeded, shuffled, batched pipelines over the extended feature sets
tf.random.set_seed(config.seed)

train_plus_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_train_subset_plus, y_train_tokenized))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

val_plus_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_val_subset_plus, y_val_tokenized))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

test_plus_tensor = (
    tf.data.Dataset
    .from_tensor_slices((X_test_subset_plus, y_test_tokenized))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

<span style="color:chocolate">define model</span>

In [None]:
# reload the module that actually builds the model below
# (the model comes from utils_classifier_logistic, not utils_classifier_plus)
importlib.reload(utils_classifier_logistic)
# same logistic architecture, sized for the extended feature set
classifier_plus_model = utils_classifier_logistic.classifier_logistic_model(
    X_train_subset_plus.shape[1],
    y_train_tokenized.shape[1]
)
classifier_plus_model.summary()

<span style="color:chocolate">fit model</span>

In [None]:
# Train model on the extended features: up to 100 epochs, validated on
# val_plus_tensor after every epoch; the early-stopping callback can end
# training sooner. `hist` is overwritten with this run's history.
hist = classifier_plus_model.fit(
    train_plus_tensor,
    validation_data=val_plus_tensor,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
)

<span style="color:chocolate">plot loss and accuracy</span>

In [None]:
# per-epoch metrics recorded by fit()
history = hist.history

fig = plt.figure(figsize=(12, 2))

# left panel: loss for train and validation
ax = fig.add_subplot(1, 3, 1)
ax.plot(history['loss'], lw=2, color='darkgoldenrod')
ax.plot(history['val_loss'], lw=2, color='indianred')
ax.legend(['Train', 'Validation'], fontsize=10)
ax.set_xlabel('Epochs', size=10)
ax.set_title('Loss');

# middle panel: accuracy for train and validation
ax = fig.add_subplot(1, 3, 2)
ax.plot(history['accuracy'], lw=2, color='darkgoldenrod')
ax.plot(history['val_accuracy'], lw=2, color='indianred')
ax.legend(['Train', 'Validation'], fontsize=10)
ax.set_xlabel('Epochs', size=10)
ax.set_title('Accuracy');

In [None]:
# With the Config defaults (save_model=True, load_model=False) only the save runs.
if config.save_model:
    # the optimizer state is not written to the file
    classifier_plus_model.save("./cls_model_plus_base(Logistic).h5", include_optimizer=False)
if config.load_model:
    # NOTE(review): MaskedLanguageModel is registered as a custom object here;
    # presumably not needed for a logistic model. Confirm.
    classifier_plus_model = keras.models.load_model("./cls_model_plus_base(Logistic).h5", custom_objects={"MaskedLanguageModel": utils_MLM.MaskedLanguageModel})

<span style="color:chocolate">predictions  (on train and test data)</span>

In [None]:
#y_train_tokenized_plus_pred = classifier_plus_model.predict(X_train_subset_plus)
#y_train_tokenized_plus_pred.shape

In [None]:
# model output for every test row of the extended feature set; shape displayed below
y_test_tokenized_plus_pred = classifier_plus_model.predict(X_test_subset_plus)
y_test_tokenized_plus_pred.shape

<span style="color:chocolate">Metrics (model evaluation)</span>

``Print micro/samples-averaged metrics (logistic baseline + extra features)``

In [None]:
## a "micro(sample)-average": quantifying score on all classes jointly
# precision and recall over the flattened (row, class) pairs
# (computed here but not used further in this cell)
precision_micro, recall_micro, _ = PRC(
    y_test_tokenized.ravel(),
    y_test_tokenized_plus_pred.ravel()
)


# average precision score, averaged over samples (rows)
aps_samples = APS(
    y_test_tokenized,
    y_test_tokenized_plus_pred,
    average="samples"
)

# ROC curve and ROC area (Micro-averaged One-vs-Rest ROC AUC score)
fpr_micro, tpr_micro, _ = roc_curve(
    y_test_tokenized.ravel(),
    y_test_tokenized_plus_pred.ravel()
)
area_micro = auc(fpr_micro, tpr_micro)

print('Average precission score:',
      np.round(aps_samples,3)
)
print('ROC AUC:',
      np.round(area_micro,3)
)

# add APS and ROC area to a df
metrics_plus = pd.DataFrame(
        {'metric': ['APS', 'AUC'],
         'value': [aps_samples, area_micro]
        }
    )

# tag the rows with the seed used for this run
metrics_plus['seed'] = config.seed

# export metrics to csv
# NOTE(review): mode='a' appends a header row and the index on every run, and
# the ./results directory must already exist
metrics_plus.to_csv('./results/ApsAucDownstreamExtra__base(Logistic)_los.csv', mode='a')

``Print ROC AUC curve by class``

In [None]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_tokenized.ravel(), y_test_tokenized_plus_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

n_classes = len(set(y_union))
target_names = ['class 0 (los <= 1 day)', 'class 1 (los <=3 days)', 'class 2 (los > 3 days)']
fig, ax = plt.subplots(figsize=(6, 6))

# micro-average curve
ax.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    linestyle=":",
    linewidth=4,
)

# one-vs-rest curve per class
colors = cycle(["aqua", "darkorange", "cornflowerblue"])
for class_id, color in zip(range(n_classes), colors):
    RocCurveDisplay.from_predictions(
        y_test_tokenized[:, class_id],
        y_test_tokenized_plus_pred[:, class_id],
        name=f"ROC curve for {target_names[class_id]}",
        color=color,
        ax=ax,
    )

# random classifier (diagonal)
ax.plot(
    [0, 1],
    [0, 1],
    linestyle="--",
    color="black",
    linewidth=1,
    label='random classifier'
)

# rebuild the legend last: the diagonal is drawn after the per-class curves,
# so its label would otherwise be missing from the legend
ax.legend(loc="lower right")

_ = ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass",
)