In [1]:
#basics
import random
import pandas as pd
import torch
#extra:
import os
import nltk
from glob import glob
from lxml import etree

In [24]:
#sentence_id : id for the sentence, can be found in the xml data
#token_id: id for the word in the vocabulary
#char_start_id: where the character starts in the sentence, can be found in the xml data
#char_end_id: where the character ends in the sentence, can be found in the xml data
#split: which data split the token belong to, e.g. TRAIN, VAL or TEST.
#ner_id: id of the NER label, e.g. if we have 3 labels there would be 3 ids.from glob import glob

def parse_data(data_dir):
    
    data_list = []
    ner_list = []
    vocab = [] #keeping track of unique words in the data
    data_dir = glob("{}/*".format(data_dir)) #glob returns a possibly-empty list of path names that match data_dir 
                                            #...in this case a list with the two subdirectories 'Test' and 'Train'                                           
    for subdir in data_dir: #looping through 'Test' and 'Train'
        split = os.path.basename(subdir) #get the directory name without path
        subdir = glob("{}/*".format(subdir))
        for folder in subdir:  #looping through 'Test for DDI Extraction task' and 'Test for DrugNER task'
            folder = glob("{}/*".format(folder))
            for subfolder in folder: #looping through 'DrugBank' and 'MedLine'
                subfolder = glob("{}/*".format(subfolder))
                for xml_file in subfolder:
                    token_instances, ner_instances, vocab = parse_xml(xml_file, split, vocab)
                    data_list = handle_lists(token_instances, data_list)
                    ner_list = handle_lists(ner_instances, ner_list)

def handle_lists(returned_instances, existing_list):
    if not existing_list: #check if list is empty
        existing_list = returned_instances #if it is, then populate it with the newly returned instances
    else:
        existing_list = existing_list + returned_instances #else concatenate the existing list with the newly returned instance list
    return existing_list
                        
def parse_xml(xml_file, split, vocab):    
    
    tree = etree.parse(xml_file)
    root = tree.getroot()
    token_instances = [] 
    ner_instances = []
    
    for elem in root: #loop over sentence tags
        if elem.tag == 'sentence':
            sent_id = elem.attrib['id'] #get sentence id
            text = elem.attrib['text']  #get the sentence as a string of text
            text = text.replace('-', ' ') #replaces all hyphens with whitespace for easier split of compound words
            char_pos = -1 #variable for keeping track of character-based positions of the words in the sentence
            nltk_tokens = nltk.word_tokenize(text)
            for token in nltk_tokens:
                char_pos += 1
                char_start = char_pos
                char_end = char_start + len(token)-1
                token_id, vocab = map_token_to_id(token, vocab)
                #TODO: ta bort 'token' efter testning:
                token_instance = [sent_id, token, token_id, char_start, char_end, split]
                token_instances.append(token_instance)
                char_pos=char_end+1 #increase by 1 to account for the whitespace between the current and the next word 
        for subelem in elem: #looping through children of sentence_id, such as 'entity' and 'pair' 
            if subelem.tag == 'entity':
                ner_id = subelem.attrib['text']
                char_start = subelem.attrib['charOffset'].split('-')[0]
                char_end = subelem.attrib['charOffset'].split('-')[1]
                ner_instance = [sent_id, ner_id, char_start, char_end]
                ner_instances.append(ner_instance)
    return token_instances, ner_instances, vocab
    
def map_token_to_id(token, vocab):
    vocab = vocab
    if token not in vocab:
        vocab.append(token)
    token_id = vocab.index(token)
    return token_id, vocab

In [25]:
parse_data('/home/guserbto@GU.GU.SE/lt2316-h20-aa/DDICorpus')

['DDI-DrugBank.d610.s0', 'abacavir', '30', '37']
