In [None]:
import pandas as pd

all_d = []
hint_path = './clinical-trial-outcome-prediction/data/'

# Only phase III is loaded here; switch the list to ['I', 'II', 'III'] to
# load every phase.
for phase in ['III']:
    for split in ['train', 'valid', 'test']:
        d = pd.read_csv(hint_path + 'phase_{}_{}.csv'.format(phase, split))
        d['split'] = split
        d['phase'] = phase

        # Assign the filled column back instead of calling
        # fillna(inplace=True) on a column selection: that form is chained
        # assignment, which warns on pandas 2.x and does not update the frame
        # once Copy-on-Write is enabled.
        d['icdcodes'] = d['icdcodes'].fillna('')
        d['criteria'] = d['criteria'].fillna('')

        # regex is spelled out on every replace because its default differs
        # between pandas versions.
        d['criteria'] = d['criteria'].str.replace('\n', '', regex=False)
        # Strip the list punctuation ([ ] ' ,) from the ICD code strings in
        # a single pass; the codes stay separated by the remaining spaces.
        d['icdcodes'] = d['icdcodes'].str.replace(r"[\[\]',]", '', regex=True)

        # One text field per trial: "<icd codes> . <criteria>".
        d['sentences'] = d[['icdcodes', 'criteria']].agg(' . '.join, axis=1)
        all_d.append(d)

# Every loaded split stacked into one frame; row indices from the individual
# files are kept as-is (they are not reset).
d = pd.concat(all_d)
# d

In [None]:
import sklearn
import sklearn.metrics
import sklearn.linear_model
import sklearn.tree
import sklearn.ensemble
import numpy as np

def bootstrap_eval_(labels, preds):
    """Score one set of predictions with four binary-classification metrics.

    Args:
        labels: Ground-truth labels, forwarded as ``y_true`` to sklearn.
        preds: 2-D array of per-class scores. Column 1 is used as the ranking
            score and the row-wise argmax as the hard prediction.
            (Presumably ``predict_proba`` output of shape (n_samples, 2);
            that is what the caller in this file passes.)

    Returns:
        Tuple ``(prauc, rocauc, f1, acc)``. ``rocauc`` is ``np.nan`` when
        ``labels`` holds fewer than two distinct values.
    """
    metrics = sklearn.metrics
    positive_scores = preds[:, 1]
    hard_preds = preds.argmax(axis=1)

    prauc = metrics.average_precision_score(y_true=labels, y_score=positive_scores)

    # ROC AUC is only computed when more than one class is present.
    has_both_classes = len(np.unique(labels)) > 1
    rocauc = (
        metrics.roc_auc_score(y_true=labels, y_score=positive_scores)
        if has_both_classes
        else np.nan
    )

    f1 = metrics.f1_score(y_true=labels, y_pred=hard_preds)
    acc = metrics.accuracy_score(y_true=labels, y_pred=hard_preds)
    return prauc, rocauc, f1, acc
    
def bootstrap_eval(labels, preds, n_bootstrap=20, random_state=0):
    """Estimate the mean and spread of the test metrics by bootstrapping.

    Rows of ``labels``/``preds`` are resampled with replacement
    ``n_bootstrap`` times (each resample has the original size) and every
    resample is scored with ``bootstrap_eval_``.

    Args:
        labels: 1-D array-like of ground-truth labels.
        preds: 2-D array-like of per-class scores, row-aligned with
            ``labels``.
        n_bootstrap: Number of resamples to draw.
        random_state: Seed for the resampling indices.

    Returns:
        An 8-tuple ``(prauc_mean, prauc_std, rocauc_mean, rocauc_std,
        f1_mean, f1_std, acc_mean, acc_std)``. NaN scores (e.g. ROC AUC on a
        single-class resample) are ignored in the statistics, and a statistic
        that is still NaN is reported as 0.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    # Positional fancy indexing below needs real arrays: a list would raise
    # and a pandas Series would select by index label instead of position.
    labels = np.asarray(labels)
    preds = np.asarray(preds)

    n_samples = len(labels)
    if n_samples == 0:
        raise ValueError("bootstrap_eval needs at least one sample to resample")

    # A private RandomState seeded with random_state yields the same index
    # draws as seeding the global generator, without clobbering the global
    # NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(random_state)

    # One bucket per metric, in bootstrap_eval_'s order: prauc, rocauc, f1, acc.
    per_metric = ([], [], [], [])
    for _ in range(n_bootstrap):
        bootstrap_inds = rng.randint(0, n_samples, size=n_samples)
        scores = bootstrap_eval_(labels=labels[bootstrap_inds], preds=preds[bootstrap_inds])
        for bucket, value in zip(per_metric, scores):
            bucket.append(value)

    # Interleave (mean, std) per metric. nanmean/nanstd skip NaN entries and
    # nan_to_num maps an all-NaN result to 0.
    summary = []
    for values in per_metric:
        summary.append(np.nan_to_num(np.nanmean(values)))
        summary.append(np.nan_to_num(np.nanstd(values)))
    return tuple(summary)

def run_baselines(train_x, train_y, test_x, test_y, random_state=1, prepend=''):
    """Fit the baseline classifiers and format their bootstrapped test scores.

    Baselines: logistic regression, decision tree, AdaBoost and a
    10-tree random forest. (A linear SVC baseline was listed previously but
    is disabled.)

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
        test_x: Test feature matrix.
        test_y: Test labels.
        random_state: Seed handed to every classifier.
        prepend: Text placed in front of each result row.

    Returns:
        List of strings, one per model, of the form
        ``"<prepend><name>, <PR AUC> $\\pm$ <std>, <ROC AUC> $\\pm$ <std>,
        <F1> $\\pm$ <std>, <Acc> $\\pm$ <std>"`` with three decimals each.
    """
    # Name and estimator are paired so the two can never drift out of step.
    baselines = [
        ('Logistic Regression',
         sklearn.linear_model.LogisticRegression(random_state=random_state)),
        ('Decision Tree',
         sklearn.tree.DecisionTreeClassifier(random_state=random_state)),
        ('AdaBoost',
         sklearn.ensemble.AdaBoostClassifier(random_state=random_state)),
        ('Random Forest',
         sklearn.ensemble.RandomForestClassifier(random_state=random_state, n_estimators=10)),
    ]

    # Raw string: "\p" is not a valid escape sequence, so a plain literal
    # triggers an invalid-escape warning. The produced text is unchanged.
    row_template = r"{}, {:.3f} $\pm$ {:.3f}, {:.3f} $\pm$ {:.3f}, {:.3f} $\pm$ {:.3f}, {:.3f} $\pm$ {:.3f}"

    baselines_csv = []
    for name, model in baselines:
        preds = model.fit(train_x, train_y).predict_proba(test_x)
        # stats = (prauc, prauc_std, rocauc, rocauc_std, f1, f1_std, acc, acc_std)
        stats = bootstrap_eval(labels=test_y, preds=preds)
        baselines_csv.append(prepend + row_template.format(name, *stats))

    return baselines_csv

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Train and validation rows are pooled for fitting; test rows are held out
# for scoring. The masks are computed once and reused for text and labels.
is_train = d['split'].isin(['train', 'valid'])
is_test = d['split'].isin(['test'])

train_x = list(d.loc[is_train, 'sentences'])
test_x = list(d.loc[is_test, 'sentences'])
train_y = d.loc[is_train, 'label'].values
test_y = d.loc[is_test, 'label'].values

len_train = len(train_x)

# NOTE(review): the vocabulary and the IDF weights are fit on train + test
# text together, so test documents influence the features. Left unchanged so
# the recorded results below stay reproducible; fit on train_x only and
# transform test_x if a strictly held-out evaluation is needed.
countv = CountVectorizer()
counts = countv.fit_transform(train_x + test_x)
x = TfidfTransformer(use_idf=True).fit_transform(counts)

# Rows were stacked train-first, so slicing at len_train restores the split.
train_x, test_x = x[:len_train], x[len_train:]

# Reverse lookup: feature column index -> vocabulary term.
ind_to_word = {v: k for k, v in countv.vocabulary_.items()}


print(train_x.shape, test_x.shape)
results = run_baselines(train_x, np.array(train_y), test_x, np.array(test_y), random_state=3)
print('Model, Test PR AUC, Test ROC AUC, Test F1, Test Acc.')
print('\n'.join(results))

# output
# (raw string: "\p" is not a valid escape sequence in a regular literal)
r"""
Model, Test PR AUC, Test ROC AUC, Test F1, Test Acc.
Logistic Regression, 0.876 $\pm$ 0.009, 0.721 $\pm$ 0.018, 0.844 $\pm$ 0.007, 0.744 $\pm$ 0.011
Decision Tree, 0.781 $\pm$ 0.013, 0.579 $\pm$ 0.019, 0.753 $\pm$ 0.013, 0.647 $\pm$ 0.015
AdaBoost, 0.823 $\pm$ 0.010, 0.626 $\pm$ 0.012, 0.796 $\pm$ 0.010, 0.690 $\pm$ 0.013
Random Forest, 0.815 $\pm$ 0.013, 0.631 $\pm$ 0.022, 0.811 $\pm$ 0.012, 0.709 $\pm$ 0.016
"""
