**Competency-Based GME Assessments: Leveraging Artificial Intelligence to Predict Sub-Competency Content**

Development Code

Gregory J Booth, MD

23 Jan 2022

This digital supplement outlines the code that was used to train an artificial intelligence algorithm to interpret feedback narratives on anesthesiology residents.




---



**Part 1: Install and import packages and load data**

In [None]:
#install the package for the AI algorithm
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 5.6 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.1-py3-none-any.whl (216 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp38-cp38-linux_x86_64.whl size=3134417 sha256=8e3ea7dad4d893c3ea8077523e6a9a4dd3f10f419921c33cfd6d20d3433fd3e7
  Stored in directory: /root/.cache/pip/wheels/93/61/2a/c54711a91c418ba06ba195b1d78ff24fcaad8592f2a694ac94
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.1


In [None]:
#import all other necessary packages
import fasttext
import pandas as pd
import numpy as np
from collections import Counter
import regex as re
from sklearn.preprocessing import label_binarize

In [None]:
#load files
#one table of rated narrative text; later cells read its 'Index', 'Eval' and 'Milestone' columns
text = pd.read_csv('/content/filename.csv')
#text = pd.read_csv('/content/filename_sensitivity.csv') #for sensitivity analysis

In [None]:
#Combine sub-competencies which had <1% representation. These were identified in phase 1 of the study
#where experts labeled each narrative comment.
#Each key is a label that is folded away; its value is the label that absorbs it.
merged_labels = {
    'PC9': 'PC7',
    'PC6': 'PC2',
    'ICS1': 'ICS',
    'ICS2': 'ICS',
    'ICS3': 'ICS',
    'P1': 'P',
    'P2': 'P',
    'P3': 'P',
    'PBLI1': 'PBLI',
    'PBLI2': 'PBLI',
    'SBP1': 'SBP',
    'SBP2': 'SBP',
    'SBP3': 'SBP',
}
text = text.replace(merged_labels)

In [None]:
#how many sentences are there?
#(number of rows in the loaded table)
len(text)

51428



---



**Part 2: Separate training and validation cohorts**




In [None]:
#separate validation cohort
#Site 3 occupies the half-open 'Index' range [3566, 4170)
site3_first, site3_end = 3566, 4170
in_site3 = (text['Index'] >= site3_first) & (text['Index'] < site3_end)
text_test = text[in_site3].copy() #pull out Site 3
print(text_test.shape)
print(text_test['Eval'].head())

(4510, 4)
33922      did a great job on two spine cases  especial...
33923      did a great job on two spine cases  especial...
33924    Prep  patient knowledge and handling  anesthet...
33925    Prep  patient knowledge and handling  anesthet...
33926                                  is doing very well 
Name: Eval, dtype: object


In [None]:
#separate training cohort
#everything whose 'Index' lies below or at/after the Site 3 range
outside_site3 = (text['Index'] < 3566) | (text['Index'] >= 4170)
text_train = text[outside_site3].copy() #pull out Sites 1, 2, 4
print(text_train.shape)
print(text_train['Eval'].head())


(46918, 4)
0                          Outstanding resident 
1                          Outstanding resident 
2                        Very easy to work with 
3                        Very easy to work with 
4    came very well prepared  as he always does 
Name: Eval, dtype: object


In [None]:
#process training data
#check for comments where both raters agreed
text_test=text_test.reset_index(drop=True)
text_train=text_train.reset_index(drop=True)

#keep row i when row i+1 carries the same comment text and the same sub-competency label
train_evals = text_train['Eval'].tolist()
train_labels = text_train['Milestone'].tolist()
retain = [
    pos
    for pos in range(len(text_train)-1)
    if train_evals[pos]==train_evals[pos+1] and train_labels[pos]==train_labels[pos+1]
]

text_train=text_train.iloc[retain,:]
print(text_train.shape)


(11960, 4)


In [None]:
#process validation data (full test dataset)
#keep row i when row i+1 repeats the same comment text, whatever labels the two rows carry
text_test=text_test.reset_index(drop=True)
test_evals = text_test['Eval'].tolist()
retain = [
    pos
    for pos in range(len(text_test)-1)
    if test_evals[pos]==test_evals[pos+1]
]

text_test=text_test.iloc[retain,:]
print(text_test.shape)


(2255, 4)


In [None]:
#process validation data (sensitivity analysis, just a subset of the data with 100% agreement)
#NOTE(review): this looks like an alternative to the full-validation cell above rather than a follow-up --
#it compares neighbouring rows, which are only the two ratings of one comment while text_test is still
#unreduced. Confirm the intended run order.
text_test=text_test.reset_index(drop=True)

#keep row i when row i+1 carries the same comment text and the same sub-competency label
test_evals = text_test['Eval'].tolist()
test_labels = text_test['Milestone'].tolist()
retain = [
    pos
    for pos in range(len(text_test)-1)
    if test_evals[pos]==test_evals[pos+1] and test_labels[pos]==test_labels[pos+1]
]

text_test=text_test.iloc[retain,:]
print(text_test.shape)

(1108, 4)


In [None]:
#put data into the form that fastText expects
#the class is marked with a '__label__' prefix; the startswith guard keeps this cell safe to re-run
#(without it a second run would turn '__label__X' into '__label____label__X')
text_train['Milestone']=text_train['Milestone'].apply(lambda x: x if x.startswith('__label__') else '__label__'+x)

In [None]:
##combine consecutive sentences with same sub-competency in training data
#Adjacent rows are merged into one example when they share the same label and the same integer 'Index'.
#Each entry of label_hold is (Index of the run's first row, label, concatenated text).
text_train=text_train.reset_index(drop=True)
#scratch lists kept from the original cell; only label_hold, unique and orig_index are filled below
combo_text_train_index=[]
label_hold=[]
new_data2=[]
unique=[]
index_dup = []
orig_index=[]

row_labels = text_train['Milestone'].to_numpy()
row_ids = text_train['Index'].to_numpy()
row_texts = text_train['Eval'].to_numpy()
data_length = len(text_train)

j=0
while j < data_length:
  run_start = j
  eval_text = row_texts[j]
  #grow the run while the next row matches; checking j+1 against data_length first means a run that
  #reaches the final row stops cleanly instead of indexing one past the end
  while j+1 < data_length and row_labels[j]==row_labels[j+1] and int(row_ids[j])==int(row_ids[j+1]):
    j+=1
    eval_text=eval_text+row_texts[j]
    unique.append(row_ids[j]) #Index of every row absorbed into an earlier one
  if j > run_start:
    orig_index.append(row_ids[run_start]) #Index of every row that absorbed at least one follower
  #a row with no matching follower (including the very last row) becomes an example on its own
  label_hold.append((row_ids[run_start],row_labels[run_start],eval_text))
  j+=1


In [None]:
#build final training dataset
#label_hold entries are (index, label, text); only the label and the text go into the training frame

milestones_final = [entry[1] for entry in label_hold]
evals_final = [entry[2] for entry in label_hold]

text_train_final = pd.DataFrame(
    {'Milestone': milestones_final, 'Eval': evals_final},
    columns=['Milestone','Eval'],
)

In [None]:
#from here on text_train is the merged training set (an independent copy of text_train_final)
text_train=text_train_final.copy()



---



**Part 3: Extract descriptive data on training and validation cohorts**

In [None]:
#demographic data for train and validation cohorts
#Prints, per sub-competency, the count and share of examples in the training and validation sets.
labels_ordered =['__label__PC1','__label__PC2','__label__PC3','__label__PC4','__label__PC5',
                 '__label__PC7','__label__PC8','__label__PC10','__label__MK1','__label__MK2',
                 '__label__P','__label__ICS','__label__PBLI','__label__SBP','__label__D','__label__N',]
label_prefix = '__label__'

train_dict = Counter(text_train['Milestone'])
test_dict = Counter(text_test['Milestone'])
print('Category','\t','Train Count (%)','\t','Validation Count (%)')
train_count=[]
test_count=[]
for l in labels_ordered:
  #slice the prefix off: str.strip('__label__') would treat its argument as a set of characters and
  #also eat any leading/trailing '_', 'l', 'a', 'b' or 'e' that belongs to the label itself
  index = l[len(label_prefix):]
  #training labels already carry the prefix here; validation labels are still bare
  train_count.append(train_dict[l])
  test_count.append(test_dict[index])
  print('{:s} \t {:d} ({:.2f}%) \t{:d} ({:.2f}%)'.format(index, train_dict[l],100*round(train_dict[l]/len(text_train),3), test_dict[index], 100*round(test_dict[index]/len(text_test),3)))
print('total train = ',sum(train_count))
print('total test = ',sum(test_count))

Category 	 Train Count (%) 	 Validation Count (%)
PC1 	 369 (3.60%) 	107 (4.70%)
PC2 	 443 (4.30%) 	155 (6.90%)
PC3 	 190 (1.90%) 	49 (2.20%)
PC4 	 601 (5.90%) 	149 (6.60%)
PC5 	 788 (7.70%) 	93 (4.10%)
PC7 	 244 (2.40%) 	81 (3.60%)
PC8 	 85 (0.80%) 	25 (1.10%)
PC10 	 733 (7.20%) 	75 (3.30%)
MK1 	 759 (7.40%) 	172 (7.60%)
MK2 	 187 (1.80%) 	67 (3.00%)
P 	 803 (7.90%) 	252 (11.20%)
ICS 	 724 (7.10%) 	108 (4.80%)
PBLI 	 387 (3.80%) 	156 (6.90%)
SBP 	 186 (1.80%) 	76 (3.40%)
D 	 1348 (13.20%) 	218 (9.70%)
N 	 2371 (23.20%) 	472 (20.90%)
total train =  10218
total test =  2255


In [None]:
#demographic data for train and validation cohorts after tie-breaking
#Only the validation count and share are printed per row; the training counts are still tallied
#for the total at the end.
labels_ordered =['__label__PC1','__label__PC2','__label__PC3','__label__PC4','__label__PC5',
                 '__label__PC7','__label__PC8','__label__PC10','__label__MK1','__label__MK2',
                 '__label__P','__label__ICS','__label__PBLI','__label__SBP','__label__D','__label__N',]
label_prefix = '__label__'

train_dict = Counter(text_train['Milestone'])
test_dict = Counter(text_test['Milestone'])
print('Category','\t','Train Count (%)','\t','Validation Count (%)')
train_count=[]
test_count=[]
for l in labels_ordered:
  #slice the prefix off: str.strip('__label__') would treat its argument as a set of characters and
  #also eat any leading/trailing '_', 'l', 'a', 'b' or 'e' that belongs to the label itself
  index = l[len(label_prefix):]
  train_count.append(train_dict[l])
  test_count.append(test_dict[index])
  print('{:s}\t{:d} ({:.2f}%)'.format(index, test_dict[index], 100*round(test_dict[index]/len(text_test),3)))
print('total train = ',sum(train_count))
print('total test = ',sum(test_count))

Category 	 Train Count (%) 	 Validation Count (%)
PC1	80 (3.50%)
PC2	153 (6.80%)
PC3	58 (2.60%)
PC4	191 (8.50%)
PC5	84 (3.70%)
PC7	90 (4.00%)
PC8	31 (1.40%)
PC10	78 (3.50%)
MK1	158 (7.00%)
MK2	66 (2.90%)
P	297 (13.20%)
ICS	119 (5.30%)
PBLI	149 (6.60%)
SBP	91 (4.00%)
D	222 (9.80%)
N	388 (17.20%)
total train =  10218
total test =  2255




---



**Part 4: Preprocess training and validation data**

In [None]:
#now let's do some pre-processing to training data
#lowercase
text_train['Eval']=text_train['Eval'].str.lower()

#stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stp = stopwords.words('english')
stp.append('throughout')
stop_words=stp.copy()
stop_word_lookup = frozenset(stop_words) #hashed copy so each token is checked in constant time

def remove_stop(line):
  """Tokenize `line` with nltk and return the tokens that are not stop words, joined by single spaces."""
  return ' '.join(word for word in nltk.word_tokenize(line) if word not in stop_word_lookup)

text_train['Eval']=text_train['Eval'].apply(remove_stop)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#now let's do some pre-processing to validation data
#lowercase
text_test['Eval']=text_test['Eval'].str.lower()

#stopwords
#remove_stop and the stop-word list it uses are defined once, in the training pre-processing cell.
#Reusing that definition guarantees both cohorts are filtered identically and avoids a second,
#shadowing copy of the same function.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

text_test['Eval']=text_test['Eval'].apply(remove_stop)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#format training data
#one example per row: the prefixed label, a space, then the cleaned text
new=text_train['Milestone']+' '+text_train['Eval']
text_train['New']=new

In [None]:
#final size of training dataset
#(new holds one '<label> <text>' line per training example)
len(new)

10218



---



**Part 5: Now for the fun part... Find the optimal hyperparameters**

In [None]:
#set up a grid search
#Every combination of character n-gram range, epochs, learning rate and word n-gram length is scored
#with 5-fold cross-validation. results maps the parameter string
#'(minn, maxn), epochs, learning rate, wordNgrams' to the mean macro F1 over the folds.
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

#shuffle data
text_train=text_train.sample(frac=1)
text_train=text_train.reset_index(drop=True)

#The fold's training split is written to and read from this one path. Writing one file and training
#on another would fit the model on whatever that other file happened to contain. The path is kept
#separate from /content/train_data.txt, which holds the full training set for the final model.
kf_train_path = '/content/train_data_kf.txt'

results = dict()
epochs = [5, 10, 25, 50]
learn_rate = [0.1, 0.25, 0.5, 1]
grams=[1, 2, 3]
short_long = [(0,0),(0,1),(0,2),(0,3),(1,2),(1,3),(2,3),(1,4),(2,4)] #(minn, maxn) pairs

for sl in short_long:
  for e in epochs:
    for l in learn_rate:
      for w in grams:
        params=str(sl)+', ' +str(e)+ ', ' +str(l)+ ', '+str(w)
        macro_holder=[] #macro F1 of each fold for this parameter combination
        kf = KFold(n_splits=5)
        for train, test in kf.split(text_train):
          np.savetxt(kf_train_path, text_train['New'].iloc[train], fmt='%s')
          #np.savetxt('/content/validation_data.txt', text_train[['Milestone','Eval']].iloc[test,:], fmt='%s')
          model = fasttext.train_supervised(input=kf_train_path,epoch=e,lr=l,wordNgrams=w,minn=sl[0],maxn=sl[1])
          #predict returns (labels, probabilities); [0][0] is the top label as a plain string
          preds = [model.predict(sentence)[0][0] for sentence in text_train['Eval'].iloc[test]]
          y_true = text_train['Milestone'].iloc[test]
          macro_holder.append(f1_score(y_true, preds, average='macro'))
        results[params] = sum(macro_holder)/len(macro_holder)
        #print(params, ' ',results[params])

In [None]:
#which hyperparameters gave the highest macro F1 score?
#ten best (params, mean macro F1) pairs, best first; params reads '(minn, maxn), epochs, learning rate, wordNgrams'
sorted(results.items(),key = lambda item: item[1],reverse=True)[0:10]

[('(1, 4), 25, 1, 2', 0.6835299950606571),
 ('(1, 4), 25, 1, 1', 0.6801583976137338),
 ('(1, 4), 50, 1, 3', 0.6798773125341171),
 ('(1, 4), 25, 1, 3', 0.679850739315856),
 ('(2, 4), 25, 1, 3', 0.6797712758232317),
 ('(0, 1), 50, 1, 1', 0.679667121085842),
 ('(1, 4), 50, 0.5, 3', 0.6795931780412124),
 ('(2, 4), 50, 0.5, 3', 0.679439256414293),
 ('(2, 4), 25, 1, 2', 0.6790211359778493),
 ('(2, 3), 50, 1, 3', 0.6787339201741046)]



---



**Part 6: Perform internal validation**

In [None]:
#maximize function (needed to perform one vs. all analysis)
def maximize(line):
  """Return a 0/1 list the same length as `line`, with 1 wherever the value equals the maximum.

  Tied maxima all receive a 1. An empty `line` raises ValueError (from max).
  """
  peak = max(line)
  return [1 if score == peak else 0 for score in line]


In [None]:
####Stratified bootstrap for internal validation
#Each replicate resamples every class with replacement, fits a fresh fastText model on the resample
#and scores it on the rows that were not drawn (out-of-bag). Per replicate:
#  roc_auc_dict[strap]                     dict: bare label -> one-vs-rest AUC of the top-1 prediction
#  precision_dict/recall_dict/F1_dict[strap]  lists ordered like `labels`

from sklearn.metrics import roc_curve, auc
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

straps = 500
labels =['__label__D','__label__ICS','__label__MK1','__label__MK2','__label__N','__label__P',
    '__label__PBLI','__label__PC1','__label__PC10','__label__PC2','__label__PC3',
    '__label__PC4','__label__PC5','__label__PC7','__label__PC8','__label__SBP']
label_prefix = '__label__'

#bootstrap code for internal validity (Fitting model each time)
#shuffle training data
text_train=text_train.sample(frac=1)
text_train=text_train.reset_index(drop=True)

#set up class-wise split: label -> row positions carrying that label
index_hold = dict()
for class_name in Counter(text_train['Milestone']):
  index_hold[class_name]=text_train.index[text_train['Milestone']==class_name].to_list()

roc_auc_dict=defaultdict(list)
recall_dict=defaultdict(list)
precision_dict=defaultdict(list)
F1_dict=defaultdict(list)

for strap in range(straps):
  #holders for bootstrapped data and results
  new_train_index = []
  new_test_index = []

  for lab in labels:
    boot = np.random.choice(index_hold[lab],size=(len(index_hold[lab])),replace=True)
    drawn = set(boot.tolist()) #hashed lookup instead of scanning the array for every row
    test_index = [x for x in index_hold[lab] if x not in drawn]

    #now build up entire train and test data
    new_train_index.extend(boot)
    new_test_index.extend(test_index)

  train_data = text_train.iloc[new_train_index]
  test_data = text_train.iloc[new_test_index]

  #write only the '<label> <text>' column, the same line format the final model is trained on;
  #saving the whole frame would put the label and the text on every line twice
  np.savetxt('/content/train_data_bootstrap.txt', train_data['New'], fmt='%s')
  model = fasttext.train_supervised(input='/content/train_data_bootstrap.txt',epoch=25,lr=1,wordNgrams=2,minn=1,maxn=4)

  #predict returns (labels, probabilities); [0][0] is the top label as a plain string
  y_true = test_data['Milestone']
  preds = [model.predict(sentence)[0][0] for sentence in test_data['Eval']]

  ##AUC:
  #truth and top-1 prediction as 0/1 indicator matrices, one column per entry of `labels`
  y_true_labels_array = np.array(label_binarize(y_true,classes=labels))
  preds_dict_array_modified = np.array([[1 if cat == top else 0 for cat in labels] for top in preds])

  roc_auc = dict() #class vs. rest AUC, keyed by the label without its prefix
  for i in range(len(labels)):
      fpr, tpr, _ = roc_curve(y_true_labels_array[:, i], preds_dict_array_modified[:, i])
      #slice the prefix off; str.strip('__label__') strips a character set, not a prefix
      roc_auc[labels[i][len(label_prefix):]] = auc(fpr, tpr)

  roc_auc_dict[strap]=roc_auc

  ##Precision, Recall, F1
  #read from the structured report (full precision, one entry per label guaranteed by labels=)
  #rather than splitting the printed table, which is rounded and shifts if a label is absent
  report = classification_report(y_true, preds, labels=labels, output_dict=True)
  for lab in labels:
    precision_dict[strap].append(float(report[lab]['precision']))
    recall_dict[strap].append(float(report[lab]['recall']))
    F1_dict[strap].append(float(report[lab]['f1-score']))

  #print(strap, roc_auc)
  

In [None]:
#Results for Internal Validation
#One row per label: precision, recall, F1 and AUC, each printed as mean (lower, upper) over the
#bootstrap replicates.
def _mean_and_interval(values):
  """Return (mean, lower, upper) of the per-replicate `values`.

  lower/upper average the 12th/13th and the 488th/489th smallest values, which brackets the
  central 95% only when there are 500 replicates.
  """
  ordered = sorted(values)
  return (sum(values)/straps, (ordered[11]+ordered[12])/2, (ordered[488]+ordered[487])/2)

#count is the label's position in `labels`; the per-replicate precision/recall/F1 lists use that order
for count, lab in enumerate(labels):
  #slice the prefix off; str.strip('__label__') strips a character set, not a prefix
  hold1=lab[len('__label__'):]
  add = [roc_auc_dict[i][hold1] for i in range(straps)]
  add_pre = [precision_dict[i][count] for i in range(straps)]
  add_rec = [recall_dict[i][count] for i in range(straps)]
  add_f1 = [F1_dict[i][count] for i in range(straps)]
  print('{:s}\t{:.2f} ({:.2f}, {:.2f})\t{:.2f} ({:.2f}, {:.2f})\t{:.2f} ({:.2f}, {:.2f})\t{:.2f} ({:.2f}, {:.2f})'.format(
      hold1, *_mean_and_interval(add_pre), *_mean_and_interval(add_rec), *_mean_and_interval(add_f1), *_mean_and_interval(add)))

D	0.78 (0.64, 0.94)	0.77 (0.55, 0.91)	0.76 (0.69, 0.82	0.87 (0.77, 0.92)
ICS	0.76 (0.52, 0.94)	0.60 (0.34, 0.79)	0.65 (0.50, 0.73	0.79 (0.67, 0.87)
MK1	0.83 (0.64, 0.95)	0.70 (0.51, 0.84)	0.75 (0.65, 0.81	0.84 (0.75, 0.91)
MK2	0.51 (0.27, 0.86)	0.33 (0.14, 0.53)	0.38 (0.21, 0.51	0.66 (0.57, 0.76)
N	0.68 (0.53, 0.84)	0.89 (0.77, 0.97)	0.76 (0.68, 0.82	0.87 (0.85, 0.90)
P	0.59 (0.38, 0.82)	0.63 (0.44, 0.79)	0.59 (0.50, 0.65	0.79 (0.71, 0.84)
PBLI	0.59 (0.35, 0.86)	0.52 (0.31, 0.71)	0.53 (0.42, 0.62	0.75 (0.65, 0.84)
PC1	0.72 (0.49, 0.93)	0.57 (0.33, 0.77)	0.61 (0.47, 0.71	0.78 (0.66, 0.87)
PC10	0.88 (0.72, 0.97)	0.77 (0.54, 0.91)	0.81 (0.69, 0.87	0.88 (0.77, 0.94)
PC2	0.58 (0.38, 0.81)	0.33 (0.08, 0.61)	0.38 (0.13, 0.54	0.66 (0.54, 0.79)
PC3	0.72 (0.48, 0.93)	0.53 (0.29, 0.73)	0.60 (0.45, 0.71	0.76 (0.65, 0.86)
PC4	0.58 (0.39, 0.82)	0.44 (0.11, 0.71)	0.46 (0.18, 0.57	0.71 (0.55, 0.83)
PC5	0.88 (0.76, 0.96)	0.77 (0.56, 0.89)	0.81 (0.69, 0.86	0.88 (0.78, 0.94)
PC7	0.44 (0.21, 0.88)	0.26 (0

In [None]:
#save train and test data
#train
#'New' holds one '<label> <text>' line per example; this file is the input for the final model
np.savetxt('/content/train_data.txt', text_train['New'], fmt='%s')
#test
#text only, without labels
np.savetxt('/content/test_data.txt', text_test['Eval'], fmt='%s')



---



**Part 7: Train the final NLP model!**

In [None]:
#Train the final model!!
#hyperparameters are the top-ranked grid-search combination: (minn, maxn)=(1, 4), 25 epochs, lr 1, word bigrams

model = fasttext.train_supervised(input='/content/train_data.txt',epoch=25,lr=1,wordNgrams=2,minn=1,maxn=4)


In [None]:
#Get the data into the format fastText expects
#the startswith guard keeps this cell safe to re-run: without it a second run would produce
#'__label____label__X', which no longer matches the label list used in the validation cells
text_test['Milestone']=text_test['Milestone'].apply(lambda x: x if x.startswith('__label__') else '__label__'+x)



---



**Part 8: Perform external validation**

In [None]:
#Bootstrap estimates for validation data
#The final model is fixed, so every validation sentence is scored once up front. Each stratified
#bootstrap replicate then only re-draws rows and recomputes the one-vs-rest AUC per label.
#roc_auc_dict[strap] is a dict: bare label -> AUC for that replicate.

from sklearn.metrics import roc_curve, auc
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

labels =['__label__D','__label__ICS','__label__MK1','__label__MK2','__label__N','__label__P',
    '__label__PBLI','__label__PC1','__label__PC10','__label__PC2','__label__PC3',
    '__label__PC4','__label__PC5','__label__PC7','__label__PC8','__label__SBP']
label_prefix = '__label__'

straps = 500
text_test=text_test.reset_index(drop=True)
#set up class-wise split: label -> row positions carrying that label
index_hold = dict()
for class_name in Counter(text_test['Milestone']):
  index_hold[class_name]=text_test.index[text_test['Milestone']==class_name].to_list()

#truth and prediction as 0/1 indicator matrices for ALL rows, one column per entry of `labels`
y_true_all = np.array(label_binarize(text_test['Milestone'],classes=labels))
pred_all = []
for sentence in text_test['Eval']:
  pred_labels, pred_probs = model.predict(sentence,k=len(labels)) #probs for each class
  pred_dict = dict(zip(pred_labels, pred_probs))
  #reorder the probabilities to match `labels`, then mark the most probable class with a 1
  pred_all.append(maximize([pred_dict[lab] for lab in labels]))
pred_all = np.array(pred_all)

roc_auc_dict=defaultdict(list)

for strap in range(straps):
  #stratified resample: every class is drawn with replacement at its original size
  test1_index = []
  for lab in labels:
    boot = np.random.choice(index_hold[lab],size=(len(index_hold[lab])),replace=True)
    test1_index.extend(boot)

  #pick the precomputed rows for this replicate (repeats included)
  y_true_labels_array = y_true_all[test1_index]
  preds_dict_array_modified = pred_all[test1_index]

  roc_auc = dict() #class vs. rest AUC, keyed by the label without its prefix
  for i in range(len(labels)):
      fpr, tpr, _ = roc_curve(y_true_labels_array[:, i], preds_dict_array_modified[:, i])
      #slice the prefix off; str.strip('__label__') strips a character set, not a prefix
      roc_auc[labels[i][len(label_prefix):]] = auc(fpr, tpr)

  roc_auc_dict[strap]=roc_auc
  #print(roc_auc)

In [None]:
##Results for validation data
#One row per label: mean AUC over the bootstrap replicates with a (lower, upper) interval.
for lab in labels_ordered:
  #slice the prefix off; str.strip('__label__') strips a character set, not a prefix
  hold1=lab[len('__label__'):]
  add = [roc_auc_dict[i][hold1] for i in range(straps)]
  mean_roc = sum(add)/straps
  #sorted once; positions 11/12 and 487/488 bracket the central 95% only when straps is 500
  ordered = sorted(add)
  roc_95 = ((ordered[11]+ordered[12])/2,(ordered[488]+ordered[487])/2)
  print('{:s}\t{:.2f} ({:.2f}, {:.2f})'.format(hold1,mean_roc,roc_95[0],roc_95[1]))

PC1	0.75 (0.69, 0.81)
PC2	0.73 (0.69, 0.77)
PC3	0.71 (0.65, 0.78)
PC4	0.66 (0.63, 0.70)
PC5	0.87 (0.83, 0.92)
PC7	0.64 (0.59, 0.69)
PC8	0.61 (0.55, 0.69)
PC10	0.86 (0.81, 0.90)
MK1	0.85 (0.82, 0.89)
MK2	0.62 (0.57, 0.68)
P	0.62 (0.60, 0.65)
ICS	0.78 (0.74, 0.83)
PBLI	0.70 (0.66, 0.74)
SBP	0.58 (0.54, 0.62)
D	0.82 (0.79, 0.86)
N	0.85 (0.83, 0.88)


In [None]:
#Bootstrap estimates for validation data, sensitivity analysis (only sentences with 100% agreement)
#The final model is fixed, so every validation sentence is scored once up front. Each stratified
#bootstrap replicate then only re-draws rows and recomputes the one-vs-rest AUC per label.
#roc_auc_dict[strap] is a dict: bare label -> AUC for that replicate.

from sklearn.metrics import roc_curve, auc
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

labels =['__label__D','__label__ICS','__label__MK1','__label__MK2','__label__N','__label__P',
    '__label__PBLI','__label__PC1','__label__PC10','__label__PC2','__label__PC3',
    '__label__PC4','__label__PC5','__label__PC7','__label__PC8','__label__SBP']
label_prefix = '__label__'

straps = 500
text_test=text_test.reset_index(drop=True)
#set up class-wise split: label -> row positions carrying that label
index_hold = dict()
for class_name in Counter(text_test['Milestone']):
  index_hold[class_name]=text_test.index[text_test['Milestone']==class_name].to_list()

#truth and prediction as 0/1 indicator matrices for ALL rows, one column per entry of `labels`
y_true_all = np.array(label_binarize(text_test['Milestone'],classes=labels))
pred_all = []
for sentence in text_test['Eval']:
  pred_labels, pred_probs = model.predict(sentence,k=len(labels)) #probs for each class
  pred_dict = dict(zip(pred_labels, pred_probs))
  #reorder the probabilities to match `labels`, then mark the most probable class with a 1
  pred_all.append(maximize([pred_dict[lab] for lab in labels]))
pred_all = np.array(pred_all)

roc_auc_dict=defaultdict(list)

for strap in range(straps):
  #stratified resample: every class is drawn with replacement at its original size
  test1_index = []
  for lab in labels:
    boot = np.random.choice(index_hold[lab],size=(len(index_hold[lab])),replace=True)
    test1_index.extend(boot)

  #pick the precomputed rows for this replicate (repeats included)
  y_true_labels_array = y_true_all[test1_index]
  preds_dict_array_modified = pred_all[test1_index]

  roc_auc = dict() #class vs. rest AUC, keyed by the label without its prefix
  for i in range(len(labels)):
      fpr, tpr, _ = roc_curve(y_true_labels_array[:, i], preds_dict_array_modified[:, i])
      #slice the prefix off; str.strip('__label__') strips a character set, not a prefix
      roc_auc[labels[i][len(label_prefix):]] = auc(fpr, tpr)

  roc_auc_dict[strap]=roc_auc
  #print(roc_auc)

In [None]:
##Results for validation data, sensitivity analysis (total agreement between raters on validation data)
#One row per label: mean AUC over the bootstrap replicates with a (lower, upper) interval.
for lab in labels_ordered:
  #slice the prefix off; str.strip('__label__') strips a character set, not a prefix
  hold1=lab[len('__label__'):]
  add = [roc_auc_dict[i][hold1] for i in range(straps)]
  mean_roc = sum(add)/straps
  #sorted once; positions 11/12 and 487/488 bracket the central 95% only when straps is 500
  ordered = sorted(add)
  roc_95 = ((ordered[11]+ordered[12])/2,(ordered[488]+ordered[487])/2)
  print('{:s}\t{:.2f} ({:.2f}, {:.2f})'.format(hold1,mean_roc,roc_95[0],roc_95[1]))


PC1	0.83 (0.76, 0.89)
PC2	0.75 (0.69, 0.82)
PC3	0.76 (0.65, 0.87)
PC4	0.75 (0.69, 0.81)
PC5	0.90 (0.86, 0.95)
PC7	0.72 (0.62, 0.81)
PC8	0.63 (0.53, 0.73)
PC10	0.87 (0.80, 0.93)
MK1	0.92 (0.89, 0.95)
MK2	0.65 (0.56, 0.78)
P	0.73 (0.67, 0.78)
ICS	0.87 (0.81, 0.92)
PBLI	0.80 (0.74, 0.87)
SBP	0.62 (0.50, 0.73)
D	0.88 (0.85, 0.91)
N	0.91 (0.89, 0.93)
