# Main script for Prediction Task

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@wisc.edu <br>
Date created: June 8, 2022 <br>

### Step 1: Import packages

In [None]:
# standard
import pandas as pd
import numpy as np
import os
import time
import importlib
import glob
import re
from pprint import pprint
import time


# plots
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from PIL import Image
%matplotlib inline

# sklearn and others
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score as APS
from sklearn.metrics import roc_auc_score as ROC_AUC
from sklearn.metrics import precision_recall_curve as PRC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier
from itertools import cycle
import patsy
import statsmodels.api as sm

#tensorflow
import tensorflow as tf
from tensorflow import keras
from dataclasses import dataclass

# user defined
import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')
from embeddings import utils_dt_prep
from  embeddings import utils_MLM
import utils_dt_prep_pred_all
import utils_classifier_random_embed
import utils_classifier_logistic
import utils_classifier_plus
import utils_eval_downstream
from embeddings import utils_embeddings

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

---
### Step 2: Set-up config

In [None]:
@dataclass
class Config:
    # NOTE: none of the attributes below is annotated, so @dataclass registers no
    # fields; they are plain class attributes read through the class or an instance.

    # --- sequence / batching ---
    MAX_LEN = 40
    BATCH_SIZE = 32

    # --- cohort and diagnosis-code selection ---
    PAT_MIN_LENGTH = 3  # minimum number of visits
    DIAG_PER_VISIT = 3  # diagnoses per visit to consider
    DIAG_LENGTH = 2  # how many digits from the diagnosis code to consider
    PRIMARY_DIAG_ONLY = True
    DIAG3 = True

    # --- train / val / test split ---
    train_pct = 0.8
    val_pct = 0.1
    seed = [1235, 1789, 2134, 1455, 1112]  # candidate seeds
    draw_train_val_test = False

    # --- evaluation ---
    top_diag = 10  # top diagnoses based on ROC AUC or APS

    # --- persistence ---
    save_model = True
    load_model = False


config = Config()
# use the second candidate seed; this instance attribute shadows the class-level list
config.seed = Config.seed[1]

---
### Step 3: Read data

In [None]:
# pick up any edits made to the helper module since it was imported
importlib.reload(utils_dt_prep)

# medical records for all patients with SSN and birth records
df_init = utils_dt_prep.read_data_bpe()

# summary: distinct patient ids, frame shape, and the first two rows
print('Unique patients ', df_init['rlnI_updated'].nunique())
print('Number of encounters (shape of data) ', df_init.shape)
df_init.iloc[:2]

### Step 4: Preprocess data

``drop observations and add features``

In [None]:
# Preprocessing cell: filter df_init with utils_dt_prep.drop_observations, then derive
# extra columns with utils_dt_prep.add_features. Both steps are timed with %time.
importlib.reload(utils_dt_prep)
print('Unique patients before preprocessing', df_init.rlnI_updated.nunique())

# drop observations (config.PAT_MIN_LENGTH is the minimum number of visits)
%time df = utils_dt_prep.drop_observations(df_init, config.PAT_MIN_LENGTH)
# add features, includes visit summary (for diag, age, cnty)
%time df = utils_dt_prep.add_features(df, config.DIAG_PER_VISIT, config.DIAG_LENGTH)

# print stats
print('Unique patients after preprocessing', df.rlnI_updated.nunique())
print('Number of encounters after preprocessing (shape of data) ', df.shape) 

``create train, val, and test datasets``

In [None]:
# Split patients (not encounters) into train / val / test, either by drawing a new
# split or by loading the ids saved by a previous draw.
if config.draw_train_val_test:
    # unique patient ids in df
    rlnIs = df.rlnI_updated.unique()

    np.random.seed(config.seed)
    # draw the train+val pool first, then draw the validation ids out of that pool
    train_rlnI = np.random.choice(rlnIs, int(rlnIs.shape[0]*config.train_pct), replace=False)
    val_rlnI = np.random.choice(train_rlnI, int(train_rlnI.shape[0]*config.val_pct), replace=False)
    # test = every patient that was not drawn into the pool
    test_rlnI = np.setdiff1d(rlnIs, train_rlnI)
    # FIX: the validation ids were sampled from train_rlnI but never removed from it,
    # so every validation patient was also a training patient. Make the splits disjoint.
    train_rlnI = np.setdiff1d(train_rlnI, val_rlnI)

    # save train_rlnIs, val_rlnIs, and test_rlnIs
    np.save("./data/train_rlnI.npy", train_rlnI)
    np.save("./data/val_rlnI.npy", val_rlnI)
    np.save("./data/test_rlnI.npy", test_rlnI)

else:
    # load previously saved ids (allow_pickle expects a bool, not the string "TRUE")
    train_rlnI = np.load("./data/train_rlnI.npy", allow_pickle=True)
    val_rlnI = np.load("./data/val_rlnI.npy", allow_pickle=True)
    test_rlnI = np.load("./data/test_rlnI.npy", allow_pickle=True)

# splits saved by an earlier version of this cell may still overlap; report it
overlap_train_val = np.intersect1d(train_rlnI, val_rlnI).size
if overlap_train_val > 0:
    print('WARNING:', overlap_train_val, 'patients are in both train and val; '
          'set config.draw_train_val_test = True to redraw disjoint splits')

# pull train, val, and test encounters from df; reset the index on the new frames
# instead of mutating filtered frames in place
df_train = df[df.rlnI_updated.isin(train_rlnI)].reset_index(drop=True)
df_val = df[df.rlnI_updated.isin(val_rlnI)].reset_index(drop=True)
df_test = df[df.rlnI_updated.isin(test_rlnI)].reset_index(drop=True)

print('Shape of df_train ', df_train.shape)
print('Shape of df_val ', df_val.shape)
print('Shape of df_test', df_test.shape)

print('Unique patients in df_train ', df_train.rlnI_updated.nunique())
print('Unique patients in df_val ', df_val.rlnI_updated.nunique())
print('Unique patients in df_test ', df_test.rlnI_updated.nunique())

<span style="color:chocolate">create input-output pairs</span>

In [None]:
importlib.reload(utils_dt_prep_pred_all)

# build the (input, output) frames for each split
df_train_in, df_train_out = utils_dt_prep_pred_all.input_output_pairs(df_train, config.PAT_MIN_LENGTH)
df_val_in, df_val_out = utils_dt_prep_pred_all.input_output_pairs(df_val, config.PAT_MIN_LENGTH)
df_test_in, df_test_out = utils_dt_prep_pred_all.input_output_pairs(df_test, config.PAT_MIN_LENGTH)

# columns of interest (used for Fairness tasks)
cols = [
    'rlnI_updated', 'age', 'bthyearI', 'cntyresI', 'cntyresI_name', 'pm25I', 'wfeI', 'sexI', 'raceI',
    'patcnty', 'raceM', 'meduc', 'precare', 'visitsM_9mpp', 'visitsM_1ypp', 'visitsI_1yol',
    'bthresmb_name', 'prevsts',
]

# one frame per split/side: keep cols, one row per patient id, fresh index
df_dict = {
    name: frame[cols].drop_duplicates(subset=['rlnI_updated']).reset_index(drop=True)
    for name, frame in [
        ('train_in', df_train_in), ('val_in', df_val_in), ('test_in', df_test_in),
        ('train_out', df_train_out), ('val_out', df_val_out), ('test_out', df_test_out),
    ]
}

# print shapes (the patient id column is still present at this point)
for key in df_dict:
    print('Shape of df ' + key, df_dict[key].shape)

# report patient counts for the output frames, then drop the id column everywhere
for key in df_dict:
    if 'out' in key:
        print('Unique patients in df ' + key.split('_')[0], df_dict[key].rlnI_updated.nunique())
    df_dict[key] = df_dict[key].drop(columns='rlnI_updated')

print example patient in data

In [None]:
# input-side rows of one example patient
df_train_in.loc[df_train_in.rlnI_updated.eq('00003PWWP')]

In [None]:
# output-side rows of the same example patient
df_train_out.loc[df_train_out.rlnI_updated.eq('00003PWWP')]

<span style="color:chocolate">create one-hot diagnosis features from input data</span>

In [None]:
# One-hot encode the primary diagnosis (diag00_2d) of the input visits and collapse
# to one row per patient, using the SAME feature columns in the same order for
# train, val, and test.
if config.PRIMARY_DIAG_ONLY:
    # union of diag codes across train, val, and test sets (np.union1d returns it sorted)
    diag_union = np.union1d(df_train_in.diag00_2d.unique(), df_val_in.diag00_2d.unique())
    diag_union = np.union1d(diag_union, df_test_in.diag00_2d.unique())
    # codes of the union that are absent from each split (informational)
    setdif_train_union = np.setdiff1d(diag_union, df_train_in.diag00_2d.unique())
    setdif_val_union = np.setdiff1d(diag_union, df_val_in.diag00_2d.unique())
    setdif_test_union = np.setdiff1d(diag_union, df_test_in.diag00_2d.unique())
    print('In union but not in train', setdif_train_union)
    print('In union but not in val', setdif_val_union)
    print('In union but not in test', setdif_test_union)

    # FIX: shared feature layout. The first code of the union is the dropped reference
    # category for every split. Previously drop_first=True dropped each split's own
    # first code and missing codes were appended at the end, so the column set and
    # order could differ between train, val, and test.
    oh_feature_cols = ['diag00_2d_' + code for code in diag_union[1:]]
    oh_layout = ['rlnI_updated'] + oh_feature_cols

    ## train set: create one-hot features ##
    oh_train_in = pd.get_dummies(df_train_in.copy(), columns=["diag00_2d"])
    # one-hot diag00 columns present in this split
    oh_cols_train_in = [col for col in oh_train_in if col.startswith('diag00_2d_')]
    # per-patient max (1 if the code occurred in any input visit), then align the
    # layout; codes absent from this split become all-zero columns
    oh_train_in_final = (
        oh_train_in.groupby('rlnI_updated', as_index=False)[oh_cols_train_in].max()
        .reindex(columns=oh_layout, fill_value=0)
    )

    ## val set: create one-hot features ##
    oh_val_in = pd.get_dummies(df_val_in.copy(), columns=["diag00_2d"])
    oh_cols_val_in = [col for col in oh_val_in if col.startswith('diag00_2d_')]
    oh_val_in_final = (
        oh_val_in.groupby('rlnI_updated', as_index=False)[oh_cols_val_in].max()
        .reindex(columns=oh_layout, fill_value=0)
    )

    ## test set: create one-hot features ##
    oh_test_in = pd.get_dummies(df_test_in.copy(), columns=["diag00_2d"])
    oh_cols_test_in = [col for col in oh_test_in if col.startswith('diag00_2d_')]
    oh_test_in_final = (
        oh_test_in.groupby('rlnI_updated', as_index=False)[oh_cols_test_in].max()
        .reindex(columns=oh_layout, fill_value=0)
    )

    print('Training set length', len(oh_train_in_final.columns))
    # FIX: this line used to report the training frame
    print('Val set length', len(oh_val_in_final.columns))
    print('Test set length', len(oh_test_in_final.columns))

else:
    print('Write code to include more than the primary diag code')

print one-hot diag00 features example (patient) from train data

In [None]:
# first, print diag codes in non one-hot form
# raw (non one-hot) primary diagnosis codes of the example patient
df_train_in.loc[df_train_in.rlnI_updated.eq("00003PWWP"), ["diag00_2d"]]

In [None]:
# second, print one-hot diag00 features
# per-visit one-hot columns for two codes of the example patient
oh_train_in.loc[oh_train_in.rlnI_updated.eq('00003PWWP'), ['diag00_2d_V3', 'diag00_2d_46']]

In [None]:
# third, print max of one-hot diag00 (final features)
# per-patient max of the same two one-hot columns (final features)
oh_train_in_final.loc[oh_train_in_final.rlnI_updated.eq('00003PWWP'), ['diag00_2d_V3', 'diag00_2d_46']]

In [None]:
# print all data for patient example
# every final feature of the example patient
oh_train_in_final.loc[oh_train_in_final.rlnI_updated.eq('00003PWWP')]

for diag00, diag01, diag02 (first 3)

In [None]:
# One-hot encode the first three diagnosis positions of the input visits, one block
# of columns per position, collapsed to one row per patient. Replaces the
# primary-diagnosis-only frames when config.DIAG3 is set.
if config.DIAG3:
    oh_train_in_final = pd.DataFrame()
    oh_val_in_final = pd.DataFrame()
    oh_test_in_final = pd.DataFrame()

    for idx, diag in enumerate(['diag00_2d', 'diag01_2d', 'diag02_2d']):
        print(diag)
        # union of this position's codes across train, val, and test (sorted by np.union1d)
        diag_union = np.union1d(df_train_in[diag].unique(), df_val_in[diag].unique())
        diag_union = np.union1d(diag_union, df_test_in[diag].unique())
        # codes of the union that are absent from each split (informational)
        setdif_train_union = np.setdiff1d(diag_union, df_train_in[diag].unique())
        setdif_val_union = np.setdiff1d(diag_union, df_val_in[diag].unique())
        setdif_test_union = np.setdiff1d(diag_union, df_test_in[diag].unique())
        print('In union but not in train', setdif_train_union)
        print('In union but not in val', setdif_val_union)
        print('In union but not in test', setdif_test_union)

        # FIX: shared feature layout for this position. The first code of the union is
        # the dropped reference category for every split. Previously drop_first=True
        # dropped each split's own first code and missing codes were appended at the
        # end, so the column set and order could differ between splits.
        oh_feature_cols = [diag + '_' + code for code in diag_union[1:]]
        # the patient id column is carried only by the first position's block
        oh_layout = (['rlnI_updated'] if idx == 0 else []) + oh_feature_cols

        ## train set: create one-hot features ##
        oh_train_in = pd.get_dummies(df_train_in.copy(), columns=[diag])
        # one-hot columns of this position present in this split
        oh_cols_train_in = [col for col in oh_train_in if col.startswith(diag+'_')]
        # per-patient max, then align to the shared layout (absent codes -> zeros)
        temp_oh_train_in_final = (
            oh_train_in.groupby('rlnI_updated', as_index=False)[oh_cols_train_in].max()
            .reindex(columns=oh_layout, fill_value=0)
        )
        oh_train_in_final = pd.concat([oh_train_in_final, temp_oh_train_in_final], axis=1)

        ## val set: create one-hot features ##
        oh_val_in = pd.get_dummies(df_val_in.copy(), columns=[diag])
        oh_cols_val_in = [col for col in oh_val_in if col.startswith(diag+'_')]
        temp_oh_val_in_final = (
            oh_val_in.groupby('rlnI_updated', as_index=False)[oh_cols_val_in].max()
            .reindex(columns=oh_layout, fill_value=0)
        )
        oh_val_in_final = pd.concat([oh_val_in_final, temp_oh_val_in_final], axis=1)

        ## test set: create one-hot features ##
        oh_test_in = pd.get_dummies(df_test_in.copy(), columns=[diag])
        oh_cols_test_in = [col for col in oh_test_in if col.startswith(diag+'_')]
        temp_oh_test_in_final = (
            oh_test_in.groupby('rlnI_updated', as_index=False)[oh_cols_test_in].max()
            .reindex(columns=oh_layout, fill_value=0)
        )
        oh_test_in_final = pd.concat([oh_test_in_final, temp_oh_test_in_final], axis=1)

        # cumulative number of columns after adding this position
        print('Training set length', len(oh_train_in_final.columns))
        # FIX: this line used to report the training frame
        print('Validation set length', len(oh_val_in_final.columns))
        print('Test set length', len(oh_test_in_final.columns))

# print all data for patient example
oh_train_in_final[oh_train_in_final.rlnI_updated.eq('00003PWWP')]

<span style="color:chocolate">Create features and labels</span>

In [None]:
# features (X)
# features (X): every column after the first one of the per-patient one-hot frames
X_train, X_val, X_test = (
    frame.iloc[:, 1:]
    for frame in (oh_train_in_final, oh_val_in_final, oh_test_in_final)
)

# labels (y): main diagnosis code of the next visit
# NOTE(review): X rows come from a groupby on rlnI_updated while y rows keep the order
# of the *_out frames; confirm both are one row per patient in the same order.
if not config.PRIMARY_DIAG_ONLY:
    print('Write code to include more than the primary diag code')
else:
    y_train = df_train_out.diag00_2d
    y_val = df_val_out.diag00_2d
    y_test = df_test_out.diag00_2d

print('Shape of X_train ', X_train.shape)
print('Shape of y_train ', y_train.shape)

print('Shape of X_val ', X_val.shape)
print('Shape of y_val ', y_val.shape)

print('Shape of X_test ', X_test.shape)
print('Shape of y_test ', y_test.shape)

<span style="color:chocolate">load vocab used in MLM</span>

In [None]:
# load vect layer
key = 'diag'

# load the saved vectorizer model and keep only its first layer
vect_layer = {
    key: tf.keras.models.load_model('../../../embeddings/vectorizers/vect_layer_'+key).layers[0]
}

# id -> token mapping from the layer's vocabulary, and its inverse
id2token = {key: dict(enumerate(vect_layer[key].get_vocabulary()))}
token2id = {key: {token: idx for idx, token in id2token[key].items()}}

<span style="color:chocolate">Encode outcomes (y)</span>

In [None]:
# union of train and test
# all label codes across train, val, and test, and their (de-duplicated) token ids
y_union = np.union1d(np.union1d(y_train, y_val), y_test).tolist()
y_union_tokenized = np.unique(
    utils_dt_prep.encode(vect_layer['diag'], y_union)[:,0],
    axis=0,
)  # token 1 shows up two times before de-duplication

# train: one indicator column per token id of the union
y_train_tokenized = pd.get_dummies(
    utils_dt_prep.encode(vect_layer['diag'], y_train)[:,0],
    drop_first=False,
).reindex(columns = y_union_tokenized, fill_value=0)
y_train_tokenized_cols = np.array(y_train_tokenized.columns)
y_train_tokenized = y_train_tokenized.to_numpy()

# val
y_val_tokenized = pd.get_dummies(
    utils_dt_prep.encode(vect_layer['diag'], y_val)[:,0],
    drop_first=False,
).reindex(columns = y_union_tokenized, fill_value=0)
y_val_tokenized_cols = np.array(y_val_tokenized.columns)
y_val_tokenized = y_val_tokenized.to_numpy()

# test
y_test_tokenized = pd.get_dummies(
    utils_dt_prep.encode(vect_layer['diag'], y_test)[:,0],
    drop_first=False,
).reindex(columns = y_union_tokenized, fill_value=0)
y_test_tokenized_cols = np.array(y_test_tokenized.columns)
y_test_tokenized = y_test_tokenized.to_numpy()

print('Shape y_train_tokenized ', y_train_tokenized.shape)
print('Shape y_val_tokenized ', y_val_tokenized.shape)
print('Shape y_test_tokenized ', y_test_tokenized.shape)

---
### Step 7: Next visit diagnosis (downstream task)

Run a random forest classifier to predict diag00 in the next visit

<span style="color:chocolate">define model</span>

In [None]:
# baseline random forest on the one-hot diagnosis features; seeded for repeatability
classifier_rf_model = RandomForestClassifier(
    n_estimators=10,
    max_depth=5,
    max_features='sqrt',
    bootstrap=True,
    random_state=config.seed,
    # class_weight='balanced_subsample',
)

<span style="color:chocolate">fit model</span>

In [None]:
# fit on the raw next-visit diagnosis codes (not the tokenized labels)
classifier_rf_model.fit(X=X_train, y=y_train)

<span style="color:chocolate">tune model</span>

In [None]:
# score() of the fitted classifier on each split
print('Train score:', classifier_rf_model.score(X=X_train, y=y_train))
print('Val score:', classifier_rf_model.score(X=X_val, y=y_val))
print('Test score:', classifier_rf_model.score(X=X_test, y=y_test))

<span style="color:chocolate">predictions  (on train and test data)</span>

In [None]:
#y_train_tokenized_rf_pred = classifier_rf_model.predict_proba(X_train)
#y_train_tokenized_rf_pred.shape

In [None]:
# FIX: predict_proba returns one column per entry of classifier_rf_model.classes_
# (the sorted labels seen in y_train), whereas y_test_tokenized has one column per
# token id in y_union_tokenized. The metrics cells compare the two column by column,
# so the probabilities are re-mapped onto the y_union_tokenized layout here.
rf_proba = classifier_rf_model.predict_proba(X_test)
# token id of every class label, encoded the same way as the labels themselves
rf_class_tokens = np.asarray(
    utils_dt_prep.encode(vect_layer['diag'], classifier_rf_model.classes_.tolist())[:, 0]
)
rf_token_pos = {int(token): pos for pos, token in enumerate(y_union_tokenized)}

# tokens never seen in y_train keep probability 0
y_test_tokenized_rf_pred = np.zeros((rf_proba.shape[0], len(y_union_tokenized)))
for class_pos, token in enumerate(rf_class_tokens):
    # accumulate: the label-encoding cell notes that a token id can occur more than once
    y_test_tokenized_rf_pred[:, rf_token_pos[int(token)]] += rf_proba[:, class_pos]

y_test_tokenized_rf_pred.shape

<span style="color:chocolate">Metrics (model evaluation)</span>

In [None]:
## micro-averaged curves: all (sample, class) pairs are pooled by flattening
# precision-recall curve
precision_micro, recall_micro, _ = PRC(y_test_tokenized.ravel(), y_test_tokenized_rf_pred.ravel())

# average precision score, averaged over samples
aps_samples_rf = APS(y_test_tokenized, y_test_tokenized_rf_pred, average="samples")

# ROC curve and the area under it (micro-averaged one-vs-rest)
fpr_micro, tpr_micro, _ = roc_curve(y_test_tokenized.ravel(), y_test_tokenized_rf_pred.ravel())
area_micro_rf = auc(fpr_micro, tpr_micro)

print('Average precission score:', np.round(aps_samples_rf,3))
print('ROC AUC:', np.round(area_micro_rf,3))

# collect APS and ROC area, tagged with the seed of this run
temp_df = pd.DataFrame(
    {'metric': ['APS', 'AUC'], 'value': [aps_samples_rf, area_micro_rf]}
).assign(seed=config.seed)

# export metrics to csv
#temp_df.to_csv('./results/ApsAucDownstream__base(RF).csv', mode='a')

at the diagnosis level

In [None]:
importlib.reload(utils_eval_downstream)

# per-diagnosis metrics for the random forest predictions, timed
start_time= time.time()
(
    recall_diag, precision_diag,
    fpr_diag, tpr_diag,
    df_aps_auc_diag, df_y_cols,
) = utils_eval_downstream.metrics_each_diagnosis(
    y_union_tokenized, id2token, y_test_tokenized_rf_pred, y_test_tokenized, y_test
)

# print execution time
print('Execution time:', np.round((time.time()-start_time)/60, 2), 'minutes')

metrics at the micro or samples level

In [None]:
# averaged metrics for the random forest predictions, timed
# (the *_logistic names are historical; the predictions come from classifier_rf_model)
start_time= time.time()
(
    precision_micro_logistic, recall_micro_logistic,
    fpr_micro_logistic, tpr_micro_logistic,
    aps_samples_logistic, area_micro_logistic,
) = utils_eval_downstream.metrics_averages(
    y_test_tokenized_cols, id2token, y_test_tokenized_rf_pred, y_test_tokenized
)

# print execution time
print('Execution time:', np.round((time.time()-start_time)/60, 2), 'minutes')

In [None]:
print('Average precission score:', np.round(aps_samples_logistic,3))
print('ROC AUC:', np.round(area_micro_logistic,3))

# collect APS and ROC area, tagged with the seed of this run
temp_df = pd.DataFrame(
    {'metric': ['APS', 'AUC'], 'value': [aps_samples_logistic, area_micro_logistic]}
).assign(seed=config.seed)

# export metrics to csv
# NOTE(review): mode='a' appends a header row and the index on every run
temp_df.to_csv('./results/ApsAucDownstream__base(RF).csv', mode='a')

Plot APS and AUC for each diag00-diag03

In [None]:
importlib.reload(utils_eval_downstream)
start_time= time.time()

# precision-recall and ROC plots from the per-diagnosis and averaged metrics
utils_eval_downstream.plot_pr_roc(
    recall_diag,
    precision_diag,
    fpr_diag,
    tpr_diag,
    df_aps_auc_diag,
    precision_micro_logistic,
    recall_micro_logistic,
    fpr_micro_logistic,
    tpr_micro_logistic,
    aps_samples_logistic,
    area_micro_logistic,
    config.top_diag,
)

# bundle everything that was plotted so it can be reloaded later
dict_metrics = dict(
    recall_diag=recall_diag,
    precision_diag=precision_diag,
    fpr_diag=fpr_diag,
    tpr_diag=tpr_diag,
    df_aps_auc_diag=df_aps_auc_diag,
    precision_micro=precision_micro_logistic,
    recall_micro=recall_micro_logistic,
    fpr_micro=fpr_micro_logistic,
    tpr_micro=tpr_micro_logistic,
    aps_samples=aps_samples_logistic,
    area_micro=area_micro_logistic,
)
dict_metrics['config.top_diag'] = config.top_diag

# print execution time
print('Execution time:', np.round((time.time()-start_time)/60, 2), 'minutes')

# one file per seed
np.save('./results/EachApsAucDownstream__base(RF)_'+ str(config.seed)+ '_.npy', dict_metrics)

---
``downstream task + extra features``

<span style="color:chocolate">create extra features</span>

In [None]:
importlib.reload(utils_classifier_plus)
# FIX: reset_index(..., inplace=True) returns None, so the helper used to receive None
# instead of the feature frame. Pass the re-indexed frame itself.
X_train_subset_plus = utils_classifier_plus.create_extra_features(df_dict['train_in'], X_train.reset_index(drop=True))
X_val_subset_plus = utils_classifier_plus.create_extra_features(df_dict['val_in'], X_val.reset_index(drop=True))
X_test_subset_plus = utils_classifier_plus.create_extra_features(df_dict['test_in'], X_test.reset_index(drop=True))

In [None]:
## check for columns that are extra between training and validation
col_diff_train_val = np.setdiff1d(list(X_train_subset_plus.columns), list(X_val_subset_plus.columns))
col_diff_train_val = list(col_diff_train_val)
print('Columns difference train_val', col_diff_train_val)
# drop column from X_train_tokenized_subset_plus
X_train_subset_plus.drop(columns=col_diff_train_val, inplace=True)

## check for columns that are extra between training and test
col_diff_train_test = np.setdiff1d(list(X_train_subset_plus.columns), list(X_test_subset_plus.columns))
col_diff_train_test = list(col_diff_train_test)
print('Columns difference train_test', col_diff_train_test)
# drop column from X_train_tokenized_subset_plus
X_train_subset_plus.drop(columns=col_diff_train_test, inplace=True)

<span style="color:chocolate">define model</span>

In [None]:
# random forest for the one-hot + extra features; same hyper-parameters as the baseline
classifier_rf_plus_model = RandomForestClassifier(
    n_estimators=10,
    max_depth=5,
    max_features='sqrt',
    bootstrap=True,
    random_state=config.seed,
)

<span style="color:chocolate">fit model</span>

In [None]:
# Train model
# fit on the extended features and the raw next-visit diagnosis codes
classifier_rf_plus_model.fit(X=X_train_subset_plus, y=y_train)

<span style="color:chocolate">tune model</span>

In [None]:
# score() of the fitted classifier on train and validation
print('Train score:', classifier_rf_plus_model.score(X=X_train_subset_plus, y=y_train))
print('Val score:', classifier_rf_plus_model.score(X=X_val_subset_plus, y=y_val))

<span style="color:chocolate">predictions  (on train and test data)</span>

In [None]:
#y_train_tokenized_plus_pred = classifier_rf_plus_model.predict_proba(X_train_subset_plus)
#y_train_tokenized_plus_pred.shape

In [None]:
# FIX: predict_proba returns one column per entry of classifier_rf_plus_model.classes_
# (the sorted labels seen in y_train), whereas y_test_tokenized has one column per
# token id in y_union_tokenized. The metrics cell compares the two column by column,
# so the probabilities are re-mapped onto the y_union_tokenized layout here.
plus_proba = classifier_rf_plus_model.predict_proba(X_test_subset_plus)
# token id of every class label, encoded the same way as the labels themselves
plus_class_tokens = np.asarray(
    utils_dt_prep.encode(vect_layer['diag'], classifier_rf_plus_model.classes_.tolist())[:, 0]
)
plus_token_pos = {int(token): pos for pos, token in enumerate(y_union_tokenized)}

# tokens never seen in y_train keep probability 0
y_test_tokenized_plus_pred = np.zeros((plus_proba.shape[0], len(y_union_tokenized)))
for class_pos, token in enumerate(plus_class_tokens):
    # accumulate: the label-encoding cell notes that a token id can occur more than once
    y_test_tokenized_plus_pred[:, plus_token_pos[int(token)]] += plus_proba[:, class_pos]

y_test_tokenized_plus_pred.shape

<span style="color:chocolate">Metrics (model evaluation)</span>

at the micro or samples level

In [None]:
# averaged metrics for the extended-feature random forest, timed
start_time= time.time()
(
    precision_micro_plus, recall_micro_plus,
    fpr_micro_plus, tpr_micro_plus,
    aps_samples_plus, area_micro_plus,
) = utils_eval_downstream.metrics_averages(
    y_test_tokenized_cols, id2token, y_test_tokenized_plus_pred, y_test_tokenized
)

# print execution time
print('Execution time:', np.round((time.time()-start_time)/60, 2), 'minutes')

Print and export APS and AUC (micro / samples level) for the model with extra features

In [None]:
print('Average precission score:', np.round(aps_samples_plus,3))
print('ROC AUC:', np.round(area_micro_plus,3))

# collect APS and ROC area, tagged with the seed of this run
metrics_plus = pd.DataFrame(
    {'metric': ['APS', 'AUC'], 'value': [aps_samples_plus, area_micro_plus]}
).assign(seed=config.seed)

# export metrics to csv
# NOTE(review): mode='a' appends a header row and the index on every run
metrics_plus.to_csv('./results/ApsAucDownstreamExtra__base(RF).csv', mode='a')