In [24]:
import xml.etree.cElementTree as ET
import os
import nltk
import string

### Get the files for parsing

In [107]:
# set to the appropriate folder on your local drive
# The data folder is derived from the current working directory by swapping the
# "Code" path component for the dataset folder (assumes the notebook is run from
# a "Code" folder that sits next to "Dataset" -- adjust if your layout differs).
codefolder = os.getcwd()
datafolder = codefolder.replace("Code", os.path.join("Dataset", "training-RiskFactors-Complete-Set1"))

filenames = []
xml_contents = []

# os.listdir returns entries in arbitrary order; sorting keeps the file order
# (and therefore the order of the generated tokens/labels) reproducible.
for file in sorted(os.listdir(datafolder)):
    # os.path.join picks the right separator for the host OS
    filename = os.path.join(datafolder, file)
    if filename.endswith('.xml'):  # select xml files
        filenames.append(filename)


### Define function to get tokens & their attributes

In [44]:
def spans(text):
    """
    Tokenize a text and locate every token in it by character offset.

    The text is lower-cased once; both the tokenization and the offset search run
    on that lower-cased copy, so tokens are found regardless of the casing of the
    input.  The search for each token starts where the previous located token
    ended, so repeated words map to successive occurrences.

    Input:
    text: string to tokenize

    Returns:
    list of (lower-cased) tokens, list of start offsets, list of end offsets
    (end is exclusive).  The three lists always have the same length.  A token
    that cannot be located in the text is reported with start = end = -1 and does
    not move the search position, so the offsets of later tokens stay correct.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    offset = 0
    start_indices = []
    end_indices = []
    for token in tokens:
        surface = token
        position = lowered.find(surface, offset)
        if position < 0 and token in ("``", "''"):
            # NLTK rewrites straight double quotes as `` and ''; look for the
            # character that is actually present in the text instead.
            surface = '"'
            position = lowered.find(surface, offset)
        if position < 0:
            # Token text was altered by the tokenizer and is not in the source:
            # flag it and keep the cursor where it is.
            start_indices.append(-1)
            end_indices.append(-1)
            continue
        offset = position + len(surface)
        start_indices.append(position)
        end_indices.append(offset)
    return tokens, start_indices, end_indices

In [79]:
#print(filenames[30])

### Define function to generate BIO Coding (Model2)

BIO coding is a technique for extracting entities in which the input sentence is tokenized and analyzed for occurrences of words that belong to an entity of interest.  It uses the following scheme:

* B - marks beginning of the entity
* I - marks inside of the entity
* O - marks that the token is NOT part of any entity


In [108]:
def Generate_BIO_Coding(file_path, tag, attribute):
    """
    Produce BIO labels for the tokens of one annotated XML file.

    The file's TEXT element is lower-cased and tokenized with spans().  Under
    TAGS, every top-level element named `tag` contributes a label of the form
    "<tag>.<attribute value>" (lower-cased, spaces replaced by "_"); each of its
    nested same-named children that carries 'start'/'end' offsets marks where in
    the text that label applies.

    A token whose start offset equals an annotation's start is labelled "B-<label>";
    the following tokens that begin before the annotation's end are labelled
    "I-<label>"; every other token is labelled "O".  Exactly one label is emitted
    per token, so the two returned lists always have the same length.

    Input:
    file_path: path of the file to be read in for processing
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc.
    attribute: specific attribute within the tag, from which to extract the value from

    Returns:
    list of tokens, list of labels (BIO coding)

    Raises:
    KeyError if an element named `tag` lacks `attribute`, or an annotation that
    has 'start' lacks 'end'.
    """
    root = ET.parse(file_path).getroot()

    text = root.find('TEXT').text.lower()
    tokens, start, _ = spans(text)

    # start offset -> (end offset, label) for annotations of the requested tag only,
    # so annotations of other tags that share a start offset cannot hide it.
    entity_by_start = {}
    tags_node = root.find("TAGS")
    for item in (tags_node if tags_node is not None else ()):
        if item.tag != tag:
            continue
        label = (item.tag + "." + item.attrib[attribute]).lower().replace(" ", "_")
        for sub_item in item.findall(item.tag):
            if 'start' not in sub_item.attrib:
                continue
            span_start = int(sub_item.attrib['start'])
            span_end = int(sub_item.attrib['end'])
            # Skip empty spans; when several annotations share a start, keep the first.
            if span_end > span_start:
                entity_by_start.setdefault(span_start, (span_end, label))

    bio_labels = []
    active_label = None  # label of the entity currently being continued, if any
    active_end = -1      # exclusive end offset of that entity
    for tok_start in start:
        if active_label is not None and 0 <= tok_start < active_end:
            bio_labels.append("I-" + active_label)
        elif tok_start in entity_by_start:
            active_end, active_label = entity_by_start[tok_start]
            bio_labels.append("B-" + active_label)
        else:
            active_label = None
            bio_labels.append("O")

    return tokens, bio_labels

In [89]:
def getBIOCoding_data(tag, attrib, filenames):
    """
    Run Generate_BIO_Coding over a list of files and concatenate the results.

    Each path in `filenames` is processed in order with the same tag and
    attribute; the per-file token and label lists are appended to two running
    lists.

    Input:
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attrib: specific attribute within the tag, from which to extract the value from (string)
    filenames: list of paths of the files to be processed

    Returns:
    list of tokens, list of labels (BIO coding) accumulated across all files
    """
    merged_tokens, merged_labels = [], []

    for path in filenames:
        file_tokens, file_labels = Generate_BIO_Coding(file_path=path, tag=tag, attribute=attrib)
        merged_tokens += file_tokens
        merged_labels += file_labels

    return merged_tokens, merged_labels


### Generate Data for Models

The goal is to generate the tokens and the corresponding labels specific to the model of interest.  Below, we build the data for the 'indicator' attribute of the following tags:

* DIABETES
* CAD
* HYPERTENSION
* HYPERLIPIDEMIA
* OBESE
* FAMILY_HIST

In [90]:
# Model #2 data: tokens and BIO labels for the 'indicator' attribute of DIABETES tags
tag, attribute = 'DIABETES', 'indicator'

diabetes_indicator_tokens, diabetes_indicator_labels = getBIOCoding_data(
    tag=tag, attrib=attribute, filenames=filenames
)

In [91]:
# Model #4 data: tokens and BIO labels for the 'indicator' attribute of CAD tags
tag, attribute = 'CAD', 'indicator'

cad_indicator_tokens, cad_indicator_labels = getBIOCoding_data(
    tag=tag, attrib=attribute, filenames=filenames
)

In [105]:
# Model #5 data: tokens and BIO labels for the 'indicator' attribute of HYPERTENSION tags
tag, attribute = 'HYPERTENSION', 'indicator'

hypertension_indicator_tokens, hypertension_indicator_labels = getBIOCoding_data(
    tag=tag, attrib=attribute, filenames=filenames
)

In [104]:
# Model #6 data: tokens and BIO labels for the 'indicator' attribute of HYPERLIPIDEMIA tags
tag, attribute = 'HYPERLIPIDEMIA', 'indicator'

hyperlipidemia_indicator_tokens, hyperlipidemia_indicator_labels = getBIOCoding_data(
    tag=tag, attrib=attribute, filenames=filenames
)


In [103]:
# Model #7 data: tokens and BIO labels for the 'indicator' attribute of OBESE tags
tag, attribute = 'OBESE', 'indicator'

obese_indicator_tokens, obese_indicator_labels = getBIOCoding_data(
    tag=tag, attrib=attribute, filenames=filenames
)


In [102]:
# Model #8 data: tokens and BIO labels for the 'indicator' attribute of FAMILY_HIST tags
tag, attribute = 'FAMILY_HIST', 'indicator'

familyhist_indicator_tokens, familyhist_indicator_labels = getBIOCoding_data(
    tag=tag, attrib=attribute, filenames=filenames
)


In [14]:
import pandas as pd

In [100]:
# Pair each FAMILY_HIST token with its BIO label, one row per token
df = pd.DataFrame(
    data={
        'token': familyhist_indicator_tokens,
        'label': familyhist_indicator_labels,
    }
)

In [101]:
# show only the rows whose label is something other than 'O'
df.loc[df['label'].ne('O')]

Unnamed: 0,token,label
11496,father,B-family_hist.present
11497,:,I-family_hist.present
11498,extensive,I-family_hist.present
11499,cad,I-family_hist.present
11500,",",I-family_hist.present
11501,with,I-family_hist.present
11502,first,I-family_hist.present
11503,mi,I-family_hist.present
11504,in,I-family_hist.present
11505,50,I-family_hist.present
