In [4]:
#basics
import random
import pandas as pd
import torch
#extra:
import os
import nltk
import string
from glob import glob
from lxml import etree

In [15]:
def parse_data(data_dir):
    
    data_list = []
    ner_list = []
    vocab = [] #keeping track of unique words in the data
    data_dir = glob("{}/*".format(data_dir)) #glob returns a possibly-empty list of path names that match data_dir 
                                            #...in this case a list with the two subdirectories 'Test' and 'Train'                                           
    for subdir in data_dir: #looping through 'Test' and 'Train'
        split = os.path.basename(subdir) #get the directory name without path
        subdir = glob("{}/*".format(subdir))
        if split == 'Train':
            for folder in subdir:
                folder = glob("{}/*".format(folder))
                for xml_file in folder:
                    token_instances, ner_instances, vocab = parse_xml(xml_file, split, vocab)
                    data_list = data_list + token_instances
                    ner_list = ner_list + ner_instances
        elif split == 'Test':
            for folder in subdir:  #looping through 'Test for DDI Extraction task' and 'Test for DrugNER task'
                folder = glob("{}/*".format(folder))
                for subfolder in folder: #looping through 'DrugBank' and 'MedLine'
                    subfolder = glob("{}/*".format(subfolder))
                    for xml_file in subfolder:
                        token_instances, ner_instances, vocab = parse_xml(xml_file, split, vocab)
                        data_list = data_list + token_instances
                        ner_list = ner_list + ner_instances

    data_df, ner_df = list2df(data_list, ner_list) #turn lists into dataframes
    #with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(data_df)
    return data_df, ner_df
                    
def list2df(data_list, ner_list):
    data_df = pd.DataFrame.from_records(data_list, columns=['sentence_id', 'token', 'token_id', 'char_start_id', 'char_end_id', 'split'])
    data_df = data_df[~data_df['token'].isin(list(string.punctuation))] #remove tokens that are just punctuation 
    data_df.drop('token', inplace=True, axis=1) #remove 'token' column since it's not needed anymore
    #'inPlace=True' means we are working on the original df, 'axis=1' refers to the column axis
    train_samples = data_df[data_df['split']=='Train'].sample(frac=0.15) #sample 15 % of 'Train'-labeled rows
    train_samples.split='Val' #replace those 'Train' labels with 'Val'
    data_df.update(train_samples) #incorporate the modified train samples back into the original dataframe
    ner_df = pd.DataFrame.from_records(ner_list, columns=['sentence_id', 'ner_id', 'char_start_id', 'char_end_id'])
    return data_df, ner_df    
                        
def parse_xml(xml_file, split, vocab):    
    
    tree = etree.parse(xml_file)
    root = tree.getroot()
    
    token_instances = [] #save all token 
    ner_instances = []
    
    for elem in root: #loop over sentence tags
        if elem.tag == 'sentence':
            sent_id = elem.attrib['id'] #get sentence id
            text = elem.attrib['text']  #get the sentence as a string of text
            text = text.replace('-', ' ') #replaces all hyphens with whitespace for easier split of compound words
            char_pos = -1 #variable for keeping track of character-based positions of the words in the sentence
            nltk_tokens = nltk.word_tokenize(text)
            for token in nltk_tokens:
                char_pos, token_instance, vocab  = get_token_instance(char_pos, sent_id, token, split, vocab)
                token_instances.append(token_instance)
        for subelem in elem: #looping through children tags (i.e. 'entity', 'pair') of sentence_id
            if subelem.tag == 'entity':
                ner_instance = get_ner_instance(sent_id, subelem)
                ner_instances.append(ner_instance)
    return token_instances, ner_instances, vocab

def get_token_instance(char_pos, sent_id, token, split, vocab):
    char_pos += 1
    char_start = char_pos
    char_end = char_start + len(token)-1
    token_id, vocab = map_token_to_id(token, vocab)
    token_instance = [sent_id, token, token_id, char_start, char_end, split]
    char_pos=char_end+1 #increase by 1 to account for the whitespace between the current and the next word
    return char_pos, token_instance, vocab

def get_ner_instance(sent_id, entity):
    #TODO: take multispan NERs into consideration?? 
    #hyphen_count = entity.attrib['charOffset'].count('-')
    ner_id = entity.attrib['type']
    char_start = entity.attrib['charOffset'].split('-')[0]
    char_end = entity.attrib['charOffset'].split('-')[1]
    ner_instance = [sent_id, ner_id, char_start, char_end]
    return ner_instance
    
def map_token_to_id(token, vocab):
    vocab = vocab
    if token not in vocab:
        vocab.append(token)
    token_id = vocab.index(token)
    return token_id, vocab

In [16]:
parse_data('/home/guserbto@GU.GU.SE/lt2316-h20-aa/DDICorpus')

Unnamed: 0,sentence_id,token_id,char_start_id,char_end_id,split
0,DDI-DrugBank.d610.s0,0.0,0.0,14.0,Test
1,DDI-DrugBank.d610.s0,1.0,16.0,25.0,Test
2,DDI-DrugBank.d610.s0,2.0,27.0,28.0,Test
3,DDI-DrugBank.d610.s0,3.0,30.0,37.0,Test
4,DDI-DrugBank.d610.s0,4.0,39.0,42.0,Test
5,DDI-DrugBank.d610.s0,5.0,44.0,46.0,Test
6,DDI-DrugBank.d610.s0,6.0,48.0,54.0,Test
7,DDI-DrugBank.d610.s0,7.0,56.0,57.0,Test
8,DDI-DrugBank.d610.s0,8.0,59.0,61.0,Test
9,DDI-DrugBank.d610.s0,9.0,63.0,70.0,Test


(                 sentence_id  token_id  char_start_id  char_end_id  split
 0       DDI-DrugBank.d610.s0       0.0            0.0         14.0   Test
 1       DDI-DrugBank.d610.s0       1.0           16.0         25.0   Test
 2       DDI-DrugBank.d610.s0       2.0           27.0         28.0   Test
 3       DDI-DrugBank.d610.s0       3.0           30.0         37.0   Test
 4       DDI-DrugBank.d610.s0       4.0           39.0         42.0   Test
 ...                      ...       ...            ...          ...    ...
 195939   DDI-MedLine.d113.s6    3099.0           91.0        105.0  Train
 195940   DDI-MedLine.d113.s6     635.0          107.0        112.0    Val
 195941   DDI-MedLine.d113.s6       2.0          114.0        115.0    Val
 195942   DDI-MedLine.d113.s6     371.0          117.0        120.0  Train
 195943   DDI-MedLine.d113.s6     509.0          122.0        126.0  Train
 
 [170445 rows x 5 columns],
                 sentence_id ner_id char_start_id char_end_id
 0      