In [8]:
import xml.etree.cElementTree as ET
import os
import nltk
import string
import pandas as pd
import numpy as np

NOTE: The following functions, although used for generating train / test datasets, are left in here so the test files can be generated if needed.

### Get the files for parsing

In [3]:
# Locate the training XML files.  Adjust `datafolder` for your local checkout.
# NOTE: '__file__' is a string literal here, so abspath() resolves it against the current
# working directory and dirname() yields that working directory.
codefolder = os.path.dirname(os.path.abspath('__file__'))
# os.path.join produces the same "Dataset\..." text on Windows as the previous hard-coded
# backslashes, and a valid path on every other platform.
datafolder = codefolder.replace("Code", os.path.join("Dataset", "training-RiskFactors-Complete-Set1"))

filenames = []
xml_contents = []

# sorted(): os.listdir returns entries in arbitrary order, and later cells index into
# `filenames` by position, so the order has to be deterministic.
for file in sorted(os.listdir(datafolder)):
    filename = os.path.join(datafolder, file)
    if file.endswith('.xml'):  # select xml files
        filenames.append(filename)


### Define function to get tokens & their attributes

In [4]:
def spans(text):
    """
    Tokenize the lower-cased `text` and locate each token in it.

    Input:
    text: string to tokenize

    Returns:
    tokens, start_indices, end_indices -- three parallel lists.  start/end are character
    offsets into the lower-cased text (end is exclusive).  A token that cannot be located
    gets -1 for both offsets and does not move the search cursor.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    offset = 0
    start_indices = []
    end_indices = []
    for token in tokens:
        # search the lower-cased text: the tokens were produced from it
        start = lowered.find(token, offset)
        length = len(token)
        if token in ("``", "''"):
            # word_tokenize rewrites a double quote character as `` or '', so the token
            # text may not exist in the input; fall back to the nearest '"'
            quote_at = lowered.find('"', offset)
            if quote_at != -1 and (start == -1 or quote_at < start):
                start, length = quote_at, 1
        if start == -1:
            # keep the cursor where it is; resetting it would let later tokens match
            # earlier occurrences of the same word
            start_indices.append(-1)
            end_indices.append(-1)
            continue
        end = start + length
        start_indices.append(start)
        end_indices.append(end)
        offset = end
    return tokens, start_indices, end_indices

In [5]:
# Sanity check: show one of the collected paths (raises IndexError if fewer than 31 files were found)
print(filenames[30])

C:\Users\sudha\Documents\W266-NLP\Final-Project-W266\Dataset\training-RiskFactors-Complete-Set1\226-04.xml


### Define function to generate IO Coding (Model2)

IO coding is a technique for extracting entities, wherein the input sentence is tokenized and analyzed for occurrences of words that belong to an entity of interest.  It uses the following scheme:

* I - marks beginning/inside of the entity
* O - marks that the token is NOT part of any entity


In [6]:
def Generate_IO_Coding(file_path, tag, attribute):
    """
    Build token-level IO labels for one annotated XML file.

    The <TEXT> element is lower-cased and tokenized with spans().  Each child of <TAGS>
    whose element name equals `tag` contributes the character spans of its nested
    elements of the same name (those that carry a 'start' attribute).  When a token starts
    exactly at the start of such a span, the annotated phrase is tokenized and that many
    consecutive tokens are labelled "I-<tag>.<attribute value>" (lower-cased, spaces
    replaced by underscores).  Every other token is labelled "O".

    Input:
    file_path: path of the file to be read in for processing
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc.
    attribute: specific attribute within the tag, from which to extract the value from

    Returns:
    filename, tokens, bio_labels -- three lists of equal length: the file path repeated
    once per token, the tokens, and their IO labels.

    Raises:
    KeyError if a matching tag element does not have `attribute`.
    """

    tree = ET.ElementTree(file=file_path)
    root = tree.getroot()

    # findtext() returns None when <TEXT> is missing; treat that as an empty document
    text = (root.findtext('TEXT') or "").lower()

    tokens, start, _end = spans(text)

    # start offset -> (end offset, label).  Only spans of the requested tag are kept, so an
    # annotation of another tag at the same offset cannot hide it.  setdefault keeps the
    # first annotation seen for a given start offset.
    span_by_start = {}
    tags_node = root.find("TAGS")
    for item in (tags_node if tags_node is not None else ()):
        if item.tag != tag:
            continue
        label = (item.tag + "." + item.attrib[attribute]).lower().replace(" ", "_")
        for sub_item in item.findall(item.tag):
            if 'start' in sub_item.attrib:
                span_by_start.setdefault(
                    int(sub_item.attrib['start']),
                    (int(sub_item.attrib['end']), label),
                )

    # initialised outside the tag loop so a file without annotations still works
    bio_labels = []
    count = 0
    total = len(start)

    while count < total:
        match = span_by_start.get(start[count])
        if match is None:
            bio_labels.append("O")
            count += 1
            continue

        end_index, word_label = match
        phrase_tokens = nltk.word_tokenize(text[start[count]:end_index])
        # never label past the last token, so labels and tokens stay the same length
        covered = min(len(phrase_tokens), total - count)
        if covered == 0:
            # empty / inverted span: still consume the token, otherwise the loop never ends
            bio_labels.append("O")
            count += 1
            continue
        # IO coding: every token of the entity gets the same 'I-' prefix
        bio_labels.extend(["I-" + word_label] * covered)
        count += covered

    # one file path per token, used when evaluating model output per file
    filename = [file_path] * len(tokens)

    return filename, tokens, bio_labels

In [7]:
def getIOCoding_data(tag, attrib, filenames):
    """
    Run Generate_IO_Coding on every file in `filenames` and pool the results.

    Input:
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attrib: specific attribute within the tag, from which to extract the value from (string)
    filenames: list of paths of the files to process

    Returns:
    all_filenames, all_tokens, all_labels -- the per-file lists returned by
    Generate_IO_Coding, concatenated in the order of `filenames`
    """
    # one (filenames, tokens, labels) triple per input file
    per_file = [
        Generate_IO_Coding(file_path=path, tag=tag, attribute=attrib)
        for path in filenames
    ]

    all_filenames = [name for names, _, _ in per_file for name in names]
    all_tokens = [token for _, file_tokens, _ in per_file for token in file_tokens]
    all_labels = [label for _, _, file_labels in per_file for label in file_labels]

    return all_filenames, all_tokens, all_labels


### Form Test Data

In the script in cell below, we have chosen the test xml files that start with '11'.  This is just to do a sample prediction against the model built using BERT as classifier.

The condition should be removed to generate / test the complete set across all XML files in the 'testing' folder.

In [9]:
# Locate the test XML files.  Adjust `datafolder` for your local checkout.
codefolder = os.path.dirname(os.path.abspath('__file__'))
# os.path.join produces the same "Dataset\..." text on Windows as the previous hard-coded
# backslashes, and a valid path on every other platform.
datafolder = codefolder.replace("Code", os.path.join("Dataset", "testing-RiskFactors-Complete"))

testfilenames = []
xml_contents = []

# sorted(): the token rows built from these files are compared row-by-row with saved
# model output, so the file order must not depend on the platform's listdir order.
for file in sorted(os.listdir(datafolder)):
    filename = os.path.join(datafolder, file)
    # Sample run: only XML files whose name starts with '11'.
    # Drop the startswith() condition to process the complete test set.
    if file.endswith('.xml') and file.startswith('11'):
        testfilenames.append(filename)


### Get Test Tokens for 'Hypertension' (to test bert output)

In [10]:
# get data for model #5 (HYPERTENSION tag, 'indicator' attribute)
tag = 'HYPERTENSION'
attribute = 'indicator'

(
    test_hypertension_indicator_filenames,
    test_hypertension_indicator_tokens,
    test_hypertension_indicator_labels,
) = getIOCoding_data(tag=tag, attrib=attribute, filenames=testfilenames)
#hypertension_indicator_labels

In [11]:
# One row per token: source file path, token text and its IO label
test_df_hypertension = pd.DataFrame(
    {
        'filename': test_hypertension_indicator_filenames,
        'test_token': test_hypertension_indicator_tokens,
        'test_label': test_hypertension_indicator_labels,
    }
)

In [12]:
# actual counts of labels in test set for hypertension mention and high_bp
test_df_hypertension['test_label'].value_counts()

O                         23091
I-hypertension.mention       45
I-hypertension.high_bp       24
Name: test_label, dtype: int64

### Value Counts of Labels from BERT Classifier

Value counts of labels from BERT classifier (manually obtained from test_results file which holds probabilities for each class):

* Class 0: 23097
* Class 1: 47
* Class 2: 16

### Running manual count checks

In [13]:
# NOTE: plain assignment -- tdf is a second name for test_df_hypertension, not a copy
tdf = test_df_hypertension
tdf.shape

(23160, 3)

In [14]:
# Spot-check a slice of the extracted labels against the BERT output (bert_run1_test_results)

tdf[488:495]
# bert output predicted I-hypertension.mention for token in position 490
# NOTE(review): "position" looks 1-based -- the labelled row in this slice has index 489. Confirm.

Unnamed: 0,filename,test_token,test_label
488,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,),O
489,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,hypertension,I-hypertension.mention
490,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,.,O
491,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,under,O
492,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,good,O
493,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,control,O
494,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,.,O


#### Extract only filename from full path

In [15]:
def get_filename(fullpath):
    """
    Return the last component (the file name) of `fullpath`.

    Both backslash and forward slash are treated as separators, so the result does not
    depend on which machine or operating system produced the path.
    """
    return fullpath.replace("\\", "/").rsplit("/", 1)[-1]
    
tdf = test_df_hypertension
# Guarded so the cell can be re-run: 'filename' is removed the first time through.
if 'filename' in tdf.columns:
    tdf['file'] = tdf['filename'].apply(get_filename)
    # `columns=` keyword instead of a positional axis, which pandas 2.0 no longer accepts
    tdf.drop(columns='filename', inplace=True)
tdf.head(10)

Unnamed: 0,test_token,test_label,file
0,record,O,110-01.xml
1,date,O,110-01.xml
2,:,O,110-01.xml
3,2069-04-07,O,110-01.xml
4,mr.,O,110-01.xml
5,villegas,O,110-01.xml
6,is,O,110-01.xml
7,seen,O,110-01.xml
8,today,O,110-01.xml
9,.,O,110-01.xml


### Test LABELS for TOKENS in TEST Dataset against BERT Outputs

The BERT classifier has returned results for the tokens passed in the 'test.tsv' file.  The returned values are class probabilities, which need to be converted into class labels by taking the highest-probability class for each token.  The predicted label should then be compared against the actual label that the code above extracted (as IO coding) from the XML files.  This is a brute-force, manual way of verifying the validity of the predictions.


#### Checking I-hypertension.mention labels 

In [16]:
tdf[75:85]
# bert output predicted I-hypertension.mention for token in position 78
# NOTE(review): "position" looks 1-based -- the labelled row in this slice has index 77. Confirm.

Unnamed: 0,test_token,test_label,file
75,atenolol,O,110-01.xml
76,for,O,110-01.xml
77,hypertension,I-hypertension.mention,110-01.xml
78,and,O,110-01.xml
79,1,O,110-01.xml
80,hydroxychloroquine,O,110-01.xml
81,tablet,O,110-01.xml
82,.,O,110-01.xml
83,he,O,110-01.xml
84,is,O,110-01.xml


In [17]:
tdf[710:715]
# bert output predicted I-hypertension.mention for token in position 712
# NOTE(review): "position" looks 1-based -- the labelled row in this slice has index 711. Confirm.

Unnamed: 0,test_token,test_label,file
710,arthritis,O,110-03.xml
711,htn,I-hypertension.mention,110-03.xml
712,right,O,110-03.xml
713,carotid,O,110-03.xml
714,artery,O,110-03.xml


In [18]:
tdf[1990:2000]
# bert output predicted I-hypertension.mention for token in position 1991
# NOTE(review): "position" looks 1-based -- the labelled row in this slice has index 1990. Confirm.

Unnamed: 0,test_token,test_label,file
1990,hypertension,I-hypertension.mention,111-01.xml
1991,.,O,111-01.xml
1992,medications,O,111-01.xml
1993,:,O,111-01.xml
1994,zestril,O,111-01.xml
1995,",",O,111-01.xml
1996,zocor,O,111-01.xml
1997,",",O,111-01.xml
1998,hydrochlorothiazide,O,111-01.xml
1999,",",O,111-01.xml


In [19]:
tdf[3080:3090]
# bert output predicted I-hypertension.mention for token in position 3083
# NOTE(review): "position" looks 1-based -- the labelled row in this slice has index 3082. Confirm.

Unnamed: 0,test_token,test_label,file
3080,history,O,111-04.xml
3081,:,O,111-04.xml
3082,hypertension,I-hypertension.mention,111-04.xml
3083,",",O,111-04.xml
3084,diabetes,O,111-04.xml
3085,",",O,111-04.xml
3086,cad,O,111-04.xml
3087,",",O,111-04.xml
3088,and,O,111-04.xml
3089,peripheral,O,111-04.xml


In [20]:
tdf[6720:6730]
# bert output predicted I-hypertension.mention for token in position 6726
# NOTE(review): "position" looks 1-based -- the labelled row in this slice has index 6725. Confirm.

Unnamed: 0,test_token,test_label,file
6720,also,O,113-02.xml
6721,has,O,113-02.xml
6722,a,O,113-02.xml
6723,history,O,113-02.xml
6724,of,O,113-02.xml
6725,hypertension,I-hypertension.mention,113-02.xml
6726,who,O,113-02.xml
6727,has,O,113-02.xml
6728,been,O,113-02.xml
6729,experiencing,O,113-02.xml


#### Checking I-hypertension.high_bp labels

In [21]:
tdf[445:455]
# bert output predicted I-hypertension.high_bp for token in position 451
# NOTE(review): "position" looks 1-based -- the labelled rows in this slice have index 450 and 451
# (the value and the following "."). Confirm which one the BERT position refers to.

Unnamed: 0,test_token,test_label,file
445,well-appearing,O,110-02.xml
446,male,O,110-02.xml
447,.,O,110-02.xml
448,blood,O,110-02.xml
449,pressure,O,110-02.xml
450,142/74,I-hypertension.high_bp,110-02.xml
451,.,I-hypertension.high_bp,110-02.xml
452,chest,O,110-02.xml
453,clear,O,110-02.xml
454,.,O,110-02.xml


In [22]:
tdf[790:800]
# bert output predicted I-hypertension.high_bp for token in position 794
# NOTE(review): "position" looks 1-based -- the labelled rows in this slice have index 791-793,
# so position 794 would be the reading at index 793. Confirm.

Unnamed: 0,test_token,test_label,file
790,59,O,110-03.xml
791,bp,I-hypertension.high_bp,110-03.xml
792,:,I-hypertension.high_bp,110-03.xml
793,158/72,I-hypertension.high_bp,110-03.xml
794,rr,O,110-03.xml
795,:,O,110-03.xml
796,20,O,110-03.xml
797,sat,O,110-03.xml
798,:,O,110-03.xml
799,97,O,110-03.xml


In [23]:
tdf[2360:2370]
# bert output predicted I-hypertension.high_bp for token in position 2366
# NOTE(review): "position" looks 1-based -- the labelled row in this slice has index 2365. Confirm.

Unnamed: 0,test_token,test_label,file
2360,include,O,111-02.xml
2361,a,O,111-02.xml
2362,blood,O,111-02.xml
2363,pressure,O,111-02.xml
2364,of,O,111-02.xml
2365,179/85,I-hypertension.high_bp,111-02.xml
2366,",",O,111-02.xml
2367,a,O,111-02.xml
2368,pulse,O,111-02.xml
2369,of,O,111-02.xml


In [24]:
tdf[12570:12580]
# bert output predicted I-hypertension.high_bp for token in position 12574 and 12576
# NOTE(review): "position" looks 1-based -- the labelled rows in this slice have index 12573 and 12575. Confirm.

Unnamed: 0,test_token,test_label,file
12570,blood,O,115-01.xml
12571,pressure,O,115-01.xml
12572,:,O,115-01.xml
12573,150/86,I-hypertension.high_bp,115-01.xml
12574,",",O,115-01.xml
12575,148/84,I-hypertension.high_bp,115-01.xml
12576,.,O,115-01.xml
12577,pulse,O,115-01.xml
12578,78.,O,115-01.xml
12579,perrl,O,115-01.xml


In [25]:
tdf[18405:18418]
# bert output predicted I-hypertension.high_bp for token in position 18415
# NOTE(review): "position" looks 1-based -- the labelled row in this slice has index 18414. Confirm.

Unnamed: 0,test_token,test_label,file
18405,in,O,118-01.xml
18406,past,O,118-01.xml
18407,.,O,118-01.xml
18408,er,O,118-01.xml
18409,:,O,118-01.xml
18410,97.6,O,118-01.xml
18411,",",O,118-01.xml
18412,77,O,118-01.xml
18413,",",O,118-01.xml
18414,161/84,I-hypertension.high_bp,118-01.xml


### Extract Labeling by XML File

Using the dataset captured above, extract information on the count of I-hypertension.mention and I-hypertension.high_bp tags in each of the files passed in the test dataset and map them to the corresponding TAG and INDICATOR values.  This gives high-level counts of the tags identified in each of the files included in the test data.  This will be useful for error analysis and should be a starting point for constructing the tags and start/end points if needed.


In [26]:
tdf.groupby(['file', 'test_label']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,test_token
file,test_label,Unnamed: 2_level_1
110-01.xml,I-hypertension.mention,1
110-01.xml,O,333
110-02.xml,I-hypertension.high_bp,2
110-02.xml,I-hypertension.mention,1
110-02.xml,O,250
110-03.xml,I-hypertension.high_bp,3
110-03.xml,I-hypertension.mention,1
110-03.xml,O,510
110-04.xml,I-hypertension.mention,1
110-04.xml,O,787


In [27]:
# Preview the first 10 rows of the token / label / file frame.
# NOTE(review): the previous comment described per-label counts across all test files,
# which this cell does not compute.
tdf.head(10)

Unnamed: 0,test_token,test_label,file
0,record,O,110-01.xml
1,date,O,110-01.xml
2,:,O,110-01.xml
3,2069-04-07,O,110-01.xml
4,mr.,O,110-01.xml
5,villegas,O,110-01.xml
6,is,O,110-01.xml
7,seen,O,110-01.xml
8,today,O,110-01.xml
9,.,O,110-01.xml


In [29]:
# Save the token / label / file frame to CSV in the current working directory.
# No `index` argument is passed, so pandas' default applies and the row index is written too.
tdf.to_csv('test_results_withfilenames.csv')

### Read in results from BERT Predictions to the above dataset

The above dataset is derived from IO-Coding applied in the same way as on the training set, i.e. it reflects the labels from the annotation process. Now, we have to read in the predictions from BERT, which are a set of class probabilities across all 3 classes, and merge them with the above dataset for comparison and error analysis. 


In [30]:
# Load the BERT hypertension test results.  The file has no header row, so the three
# probability columns are named explicitly after loading.
bert_hypertension_results = pd.read_csv(
    "bert_output_results/hypertension/bert_run1_test_results.tsv",
    sep='\t',
    header=None,
)
bert_hypertension_results.columns = ["Class1", "Class2", "Class3"]

In [31]:
bert_hypertension_results.head(5)

Unnamed: 0,Class1,Class2,Class3
0,5.5e-05,9e-06,0.999937
1,5.4e-05,8e-06,0.999938
2,0.000189,2.8e-05,0.999783
3,5.4e-05,8e-06,0.999938
4,5.4e-05,9e-06,0.999937


#### Class Mappings

Class labels correspond as follows:

* Class1 --> I-hypertension.high_bp
* Class2 --> I-hypertension.mention
* Class3 --> O


In [32]:
import numpy as np

In [33]:
# Column index of the largest value in each row
ntest = np.array(bert_hypertension_results.values)
np.argmax(ntest, axis=1)

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [34]:
ntest.shape

(23160, 3)

In [35]:
# Name of the highest-probability class column per row.  Restricting idxmax to the three
# probability columns keeps this cell re-runnable after label columns are added to the frame.
bert_hypertension_results['classLabel'] = bert_hypertension_results[["Class1", "Class2", "Class3"]].idxmax(axis=1)

In [36]:
bert_hypertension_results.head(5)

Unnamed: 0,Class1,Class2,Class3,classLabel
0,5.5e-05,9e-06,0.999937,Class3
1,5.4e-05,8e-06,0.999938,Class3
2,0.000189,2.8e-05,0.999783,Class3
3,5.4e-05,8e-06,0.999938,Class3
4,5.4e-05,9e-06,0.999937,Class3


In [37]:
def set_labels(classlabel):
    """
    Translate a BERT output column name into its IO label.

    'Class1' -> 'I-hypertension.high_bp', 'Class2' -> 'I-hypertension.mention';
    any other value maps to 'O'.
    """
    for column_name, io_label in (
        ('Class1', 'I-hypertension.high_bp'),
        ('Class2', 'I-hypertension.mention'),
    ):
        if classlabel == column_name:
            return io_label
    return 'O'

# Map the winning class column name to its IO label, then discard the helper column
bert_hypertension_results['PredictedLabel'] = bert_hypertension_results['classLabel'].apply(set_labels)
# `columns=` keyword instead of a positional axis, which pandas 2.0 no longer accepts
bert_hypertension_results.drop(columns='classLabel', inplace=True)


In [38]:
bert_hypertension_results.head(10)

Unnamed: 0,Class1,Class2,Class3,PredictedLabel
0,5.5e-05,9e-06,0.999937,O
1,5.4e-05,8e-06,0.999938,O
2,0.000189,2.8e-05,0.999783,O
3,5.4e-05,8e-06,0.999938,O
4,5.4e-05,9e-06,0.999937,O
5,5.6e-05,9e-06,0.999935,O
6,0.000565,9.6e-05,0.99934,O
7,5.7e-05,9e-06,0.999933,O
8,0.000136,2.3e-05,0.999841,O
9,5.5e-05,9e-06,0.999936,O


In [39]:
# validating the counts by label
bert_hypertension_results['PredictedLabel'].value_counts()

O                         23097
I-hypertension.mention       47
I-hypertension.high_bp       16
Name: PredictedLabel, dtype: int64

In [40]:
# concat(axis=1) aligns rows on the index; if the two frames had different row counts the
# result would be silently padded with NaN instead of failing, so check first.
assert len(tdf) == len(bert_hypertension_results), "token rows and BERT prediction rows differ in count"
test_hypertension_combined = pd.concat([tdf, bert_hypertension_results['PredictedLabel']], axis=1)

In [41]:
test_hypertension_combined.head(10)

Unnamed: 0,test_token,test_label,file,PredictedLabel
0,record,O,110-01.xml,O
1,date,O,110-01.xml,O
2,:,O,110-01.xml,O
3,2069-04-07,O,110-01.xml,O
4,mr.,O,110-01.xml,O
5,villegas,O,110-01.xml,O
6,is,O,110-01.xml,O
7,seen,O,110-01.xml,O
8,today,O,110-01.xml,O
9,.,O,110-01.xml,O


In [42]:
# testing (spot-checking) where model predicted labels 1 & 2
test_hypertension_combined[70:85]

Unnamed: 0,test_token,test_label,file,PredictedLabel
70,that,O,110-01.xml,O
71,he,O,110-01.xml,O
72,continues,O,110-01.xml,O
73,taking,O,110-01.xml,O
74,his,O,110-01.xml,O
75,atenolol,O,110-01.xml,O
76,for,O,110-01.xml,O
77,hypertension,I-hypertension.mention,110-01.xml,I-hypertension.mention
78,and,O,110-01.xml,O
79,1,O,110-01.xml,O


In [43]:
test_hypertension_combined[485:495]

Unnamed: 0,test_token,test_label,file,PredictedLabel
485,.,O,110-02.xml,O
486,(,O,110-02.xml,O
487,3,O,110-02.xml,O
488,),O,110-02.xml,O
489,hypertension,I-hypertension.mention,110-02.xml,I-hypertension.mention
490,.,O,110-02.xml,O
491,under,O,110-02.xml,O
492,good,O,110-02.xml,O
493,control,O,110-02.xml,O
494,.,O,110-02.xml,O


In [44]:
test_hypertension_combined[450:460]

Unnamed: 0,test_token,test_label,file,PredictedLabel
450,142/74,I-hypertension.high_bp,110-02.xml,I-hypertension.high_bp
451,.,I-hypertension.high_bp,110-02.xml,O
452,chest,O,110-02.xml,O
453,clear,O,110-02.xml,O
454,.,O,110-02.xml,O
455,cor,O,110-02.xml,O
456,:,O,110-02.xml,O
457,no,O,110-02.xml,O
458,murmur,O,110-02.xml,O
459,.,O,110-02.xml,O


In [45]:
test_hypertension_combined[790:800]

Unnamed: 0,test_token,test_label,file,PredictedLabel
790,59,O,110-03.xml,O
791,bp,I-hypertension.high_bp,110-03.xml,O
792,:,I-hypertension.high_bp,110-03.xml,O
793,158/72,I-hypertension.high_bp,110-03.xml,I-hypertension.high_bp
794,rr,O,110-03.xml,O
795,:,O,110-03.xml,O
796,20,O,110-03.xml,O
797,sat,O,110-03.xml,O
798,:,O,110-03.xml,O
799,97,O,110-03.xml,O


### Interpreting the predictions compared against actual test labels

As seen above, the BERT predictions seem very accurate, and the model seems to predict only after it has seen the complete context.  Also, punctuation marks are not labeled as one of the relevant classes by the model, although the human annotators did label them, based on the instructions provided as part of the annotation process.  