In [25]:
#basics
import random
import pandas as pd
import torch
#extra:
import os
import nltk
import string
from glob import glob
from lxml import etree

In [37]:
def parse_data(data_dir):
    
    id2word = {}
    id2ner = {0:'none', 1:'group', 2:'drug_n', 3:'drug', 4:'brand'}
    data_list = []
    ner_list = []
    data_dir = glob("{}/*".format(data_dir)) #glob returns a possibly-empty list of path names that match data_dir 
                                            #...in this case a list with the two subdirectories 'Test' and 'Train'                                           
    for subdir in data_dir: #looping through 'Test' and 'Train'
        split = os.path.basename(subdir) #get the directory name without path
        subdir = glob("{}/*".format(subdir))
        if split == 'Train':
            for folder in subdir:
                folder = glob("{}/*".format(folder))
                for xml_file in folder:
                    token_instances, ner_instances, id2word = parse_xml(xml_file, split, id2word, id2ner)
                    #print("NER_INSTANCES (Train): ", ner_instances)
                    data_list = data_list + token_instances
                    for instance in ner_instances:
                        if instance:
                            ner_list.append(instance)
        elif split == 'Test':
            for folder in subdir:  #looping through 'Test for DDI Extraction task' and 'Test for DrugNER task'
                folder = glob("{}/*".format(folder))
                for subfolder in folder: #looping through 'DrugBank' and 'MedLine'
                    subfolder = glob("{}/*".format(subfolder))
                    for xml_file in subfolder:
                        token_instances, ner_instances, id2word = parse_xml(xml_file, split, id2word, id2ner)
                        #print("NER_INSTANCES (Test): ", ner_instances)
                        data_list = data_list + token_instances
                        for instance in ner_instances:
                            if instance:
                                ner_list.append(instance)
    
    vocab = list(id2word.values()) #keeping track of unique words in the data
    data_df, ner_df = list2df(data_list, ner_list) #turn lists into dataframes
    #df1 = data_df[data_df.isnull().any(axis=1)]
    #display(df1)
    #with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #    display(data_df)
    return data_df, ner_df
                    
def list2df(data_list, ner_list):
    data_df = pd.DataFrame.from_records(data_list, columns=['sentence_id', 'token_id', 'char_start_id', 'char_end_id', 'split'])
    #data_df = data_df[~data_df['token'].isin(list(string.punctuation))] #remove tokens that are just punctuation 
    #data_df.drop('token', inplace=True, axis=1) #remove 'token' column since it's not needed anymore
    #'inPlace=True' means we are working on the original df, 'axis=1' refers to the column axis
    train_samples = data_df[data_df['split']=='Train'].sample(frac=0.15) #sample 15 % of 'Train'-labeled rows
    train_samples.split='Val' #replace those 'Train' labels with 'Val'
    data_df.update(train_samples) #incorporate the modified train samples back into the original dataframe
    ner_df = pd.DataFrame.from_records(ner_list, columns=['sentence_id', 'ner_id', 'char_start_id', 'char_end_id'])
    return data_df, ner_df    
                        
def parse_xml(xml_file, split, id2word, id2ner):
    tree = etree.parse(xml_file)
    root = tree.getroot()
    
    token_instances = [] #save all token 
    ner_instances = []
    
    for elem in root: #loop over sentence tags
        if elem.tag == 'sentence':
            sent_id = elem.attrib['id'] #get sentence id
            text = elem.attrib['text']  #get the sentence as a string of text
            text = text.replace('-', ' ') #replaces all hyphens with whitespace for easier split of compound words
            char_pos = -1 #variable for keeping track of character-based positions of the words in the sentence
            nltk_tokens = nltk.word_tokenize(text)
            for token in nltk_tokens:
                char_pos, token_instance, id2word  = get_token_instance(char_pos, sent_id, token, split, id2word)
                token_instances.append(token_instance)
        for subelem in elem: #looping through children tags (i.e. 'entity', 'pair') of sentence_id
            if subelem.tag == 'entity':
                ner_instance = get_ner_instance(sent_id, subelem, id2ner)
                #print("ner_instance (parse_xml): ", ner_instance)
                for instance in ner_instance: #loop through list of returned NER instances
                    ner_instances.append(instance) #save them individually in the ner_instances list
    #print("NER_INSTANCES: ", ner_instances)
    return token_instances, ner_instances, id2word

def get_token_instance(char_pos, sent_id, token, split, id2word):
    char_pos += 1
    char_start = char_pos
    char_end = char_start + len(token)-1
    token_id, id2word = map_token_to_id(token, id2word)
    token_instance = [sent_id, int(token_id), int(char_start), int(char_end), split]
    #print("TOKEN INSTANCE: ", token_instance)(
    char_pos=char_end+1 #increase by 1 to account for the whitespace between the current and the next word
    return char_pos, token_instance, id2word

def get_ner_id_as_int(ner_id, id2ner):
    for key, value in id2ner.items(): 
         if ner_id == value: 
            return key 
    else:
        return "key doesn't exist"
    

def get_ner_instance(sent_id, entity, id2ner):
    #Problem of this approach: if a NER might be tokenized differently from the token dataframe
    ner_instances = []
    #ner_token = entity.attrib['text']
    #tokenized_ner = entity.attrib['text'].split() 
    charOffset = entity.attrib['charOffset']
    #HAPPY PATH: if the character span is a single span:
    if ';' not in charOffset:
        char_start = charOffset.split('-')[0]
        char_end = charOffset.split('-')[1]
        ner_id = get_ner_id_as_int(entity.attrib['type'], id2ner)
        #ner_id = entity.attrib['type'] #getting the label: 'brand', 'drug', 'drug_n' or 'group'
        ner_instance = [sent_id, ner_id, char_start, char_end]
        return [ner_instance]
    #PATH OF DOOM: for multiword entities with several character spans:
    if ';' in charOffset:
        for span in charOffset.split(';'):
            ner_id = entity.attrib['type'] #getting the label: 'brand', 'drug', 'drug_n' or 'group'
            char_start = span.split('-')[0]
            char_end = span.split('-')[1]
            ner_instance = [sent_id, ner_id, int(char_start), int(char_end)]
            ner_instances.append(ner_instance)
            #print("SPECIAL NER_INSTANCE: ", ner_instance)
    return ner_instances
    
def map_token_to_id(token, id2word):
    res = False
    for key in id2word: 
        if(id2word[key] == token):
            res = True
            return key, id2word
    if res == False:
        token_id = len(id2word)+1
        id2word[token_id] = token
        return token_id, id2word

In [38]:
parse_data('/home/guserbto@GU.GU.SE/lt2316-h20-aa/DDICorpus')

(                 sentence_id  token_id  char_start_id  char_end_id  split
 0       DDI-DrugBank.d610.s0       1.0            0.0         14.0   Test
 1       DDI-DrugBank.d610.s0       2.0           16.0         25.0   Test
 2       DDI-DrugBank.d610.s0       3.0           27.0         28.0   Test
 3       DDI-DrugBank.d610.s0       4.0           30.0         37.0   Test
 4       DDI-DrugBank.d610.s0       5.0           39.0         42.0   Test
 ...                      ...       ...            ...          ...    ...
 195940   DDI-MedLine.d113.s6     636.0          107.0        112.0    Val
 195941   DDI-MedLine.d113.s6       3.0          114.0        115.0  Train
 195942   DDI-MedLine.d113.s6     372.0          117.0        120.0  Train
 195943   DDI-MedLine.d113.s6     510.0          122.0        126.0  Train
 195944   DDI-MedLine.d113.s6      17.0          128.0        128.0  Train
 
 [195945 rows x 5 columns],
                 sentence_id ner_id char_start_id char_end_id
 0      