In [19]:
#basics
import random
import pandas as pd
import torch
#extra:
import os
import nltk
import string
from glob import glob
from lxml import etree

In [62]:
#sentence_id : id for the sentence, can be found in the xml data
#token_id: id for the word in the vocabulary
#char_start_id: where the character starts in the sentence, can be found in the xml data
#char_end_id: where the character ends in the sentence, can be found in the xml data
#split: which data split the token belong to, e.g. TRAIN, VAL or TEST.
#ner_id: id of the NER label, e.g. if we have 3 labels there would be 3 ids.from glob import glob

def parse_data(data_dir):
    
    data_list = []
    ner_list = []
    vocab = [] #keeping track of unique words in the data
    data_dir = glob("{}/*".format(data_dir)) #glob returns a possibly-empty list of path names that match data_dir 
                                            #...in this case a list with the two subdirectories 'Test' and 'Train'                                           
    for subdir in data_dir: #looping through 'Test' and 'Train'
        split = os.path.basename(subdir) #get the directory name without path
        subdir = glob("{}/*".format(subdir))
        for folder in subdir:  #looping through 'Test for DDI Extraction task' and 'Test for DrugNER task'
            folder = glob("{}/*".format(folder))
            for subfolder in folder: #looping through 'DrugBank' and 'MedLine'
                subfolder = glob("{}/*".format(subfolder))
                for xml_file in subfolder:
                    token_instances, ner_instances, vocab = parse_xml(xml_file, split, vocab)
                    data_list = handle_lists(token_instances, data_list)
                    ner_list = handle_lists(ner_instances, ner_list)
    
    data_df, ner_df = list2df(data_list, ner_list) #turn lists into dataframes
    #with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(data_df)
        
    #return display(data_df), print(ner_df)
                    
def list2df(data_list, ner_list):
    data_df = pd.DataFrame.from_records(data_list, columns=['sentence_id', 'token', 'token_id', 'char_start_id', 'char_end_id', 'split'])
    data_df = data_df[~data_df['token'].isin(list(string.punctuation))] #remove tokens that are just punctuation 
    data_df.drop('token', inplace=True, axis=1) #remove whole 'token' column since it's not needed anymore
    #'inPlace=True' means we are working on the original df, 'axis=1' refers to column
    
    #count number of 'Train' rows
    #replace 15 % of Train labels with 'Val' at random
    
    ner_df = pd.DataFrame.from_records(ner_list, columns=['sentence_id', 'ner_id', 'char_start_id', 'char_end_id'])
    return data_df, ner_df    
                    
def handle_lists(returned_instances, existing_list):
    if not existing_list: #check if list is empty
        existing_list = returned_instances #if it is, then populate it with the newly returned instances
    else:
        existing_list = existing_list + returned_instances #else concatenate the existing list with the newly returned instance list
    return existing_list
                        
def parse_xml(xml_file, split, vocab):    
    
    tree = etree.parse(xml_file)
    root = tree.getroot()
    token_instances = [] 
    ner_instances = []
    
    for elem in root: #loop over sentence tags
        if elem.tag == 'sentence':
            sent_id = elem.attrib['id'] #get sentence id
            text = elem.attrib['text']  #get the sentence as a string of text
            text = text.replace('-', ' ') #replaces all hyphens with whitespace for easier split of compound words
            char_pos = -1 #variable for keeping track of character-based positions of the words in the sentence
            nltk_tokens = nltk.word_tokenize(text)
            for token in nltk_tokens:
                char_pos, token_instance, vocab  = get_token_instance(char_pos, sent_id, token, split, vocab)
                token_instances.append(token_instance)
        for subelem in elem: #looping through children of sentence_id, such as 'entity' and 'pair' 
            if subelem.tag == 'entity':
                ner_instance = get_ner_instance(sent_id, subelem)
                ner_instances.append(ner_instance)
    return token_instances, ner_instances, vocab

def get_token_instance(char_pos, sent_id, token, split, vocab):
    char_pos += 1
    char_start = char_pos
    char_end = char_start + len(token)-1
    token_id, vocab = map_token_to_id(token, vocab)
    #TODO: ta bort 'token' efter dataframe-skapande
    token_instance = [sent_id, token, token_id, char_start, char_end, split]
    #print("TOKEN INSTACE: ", token_instance)
    char_pos=char_end+1 #increase by 1 to account for the whitespace between the current and the next word
    return char_pos, token_instance, vocab

def get_ner_instance(sent_id, entity):
    #TODO: take multispan NERs into consideration adding 
    #hyphen_count = entity.attrib['charOffset'].count('-')
    ner_id = entity.attrib['type']
    char_start = entity.attrib['charOffset'].split('-')[0]
    char_end = entity.attrib['charOffset'].split('-')[1]
    ner_instance = [sent_id, ner_id, char_start, char_end]
    #print("NER_INSTANCE: ", ner_instance)
    return ner_instance
    
def map_token_to_id(token, vocab):
    vocab = vocab
    if token not in vocab:
        vocab.append(token)
    token_id = vocab.index(token)
    return token_id, vocab

In [63]:
parse_data('/home/guserbto@GU.GU.SE/lt2316-h20-aa/DDICorpus')

                sentence_id  token_id  char_start_id  char_end_id split
0      DDI-DrugBank.d610.s0         0              0           14  Test
1      DDI-DrugBank.d610.s0         1             16           25  Test
2      DDI-DrugBank.d610.s0         2             27           28  Test
3      DDI-DrugBank.d610.s0         3             30           37  Test
4      DDI-DrugBank.d610.s0         4             39           42  Test
...                     ...       ...            ...          ...   ...
45551   DDI-MedLine.d153.s8       765            208          209  Test
45552   DDI-MedLine.d153.s8        57            211          212  Test
45553   DDI-MedLine.d153.s8        67            214          217  Test
45554   DDI-MedLine.d153.s8        40            219          220  Test
45555   DDI-MedLine.d153.s8       108            222          225  Test

[39620 rows x 5 columns]
