In [1]:
#basics
import random
import pandas as pd
import torch
#extra:
import os
import nltk
import string
from glob import glob
from lxml import etree

In [2]:
def parse_data(data_dir):
    
    id2word = {}
    id2ner = {0:'none', 1:'group', 2:'drug_n', 3:'drug', 4:'brand'}
    data_list = []
    ner_list = []
    data_dir = glob("{}/*".format(data_dir)) #glob returns a possibly-empty list of path names that match data_dir 
                                            #...in this case a list with the two subdirectories 'Test' and 'Train'                                           
    for subdir in data_dir: #looping through 'Test' and 'Train'
        split = os.path.basename(subdir) #get the directory name without path
        subdir = glob("{}/*".format(subdir))
        if split == 'Train':
            for folder in subdir:
                folder = glob("{}/*".format(folder))
                for xml_file in folder:
                    token_instances, ner_instances, id2word = parse_xml(xml_file, split, id2word, id2ner)
                    #print("NER_INSTANCES (Train): ", ner_instances)
                    data_list = data_list + token_instances
                    for instance in ner_instances:
                        if instance:
                            ner_list.append(instance)
        elif split == 'Test':
            for folder in subdir:  #looping through 'Test for DDI Extraction task' and 'Test for DrugNER task'
                folder = glob("{}/*".format(folder))
                for subfolder in folder: #looping through 'DrugBank' and 'MedLine'
                    subfolder = glob("{}/*".format(subfolder))
                    for xml_file in subfolder:
                        token_instances, ner_instances, id2word = parse_xml(xml_file, split, id2word, id2ner)
                        #print("NER_INSTANCES (Test): ", ner_instances)
                        data_list = data_list + token_instances
                        for instance in ner_instances:
                            if instance:
                                ner_list.append(instance)
    
    vocab = list(id2word.values()) #keeping track of unique words in the data
    data_df, ner_df = list2df(data_list, ner_list) #turn lists into dataframes
    #df1 = data_df[data_df.isnull().any(axis=1)]
    #display(data_df)
    #display(ner_df)
    #with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #    display(data_df)
    return data_df, ner_df
                    
def list2df(data_list, ner_list):
    data_df = pd.DataFrame.from_records(data_list, columns=['sentence_id', 'token_id', 'char_start_id', 'char_end_id', 'split'])
    #data_df = data_df[~data_df['token'].isin(list(string.punctuation))] #remove tokens that are just punctuation 
    #data_df.drop('token', inplace=True, axis=1) #remove 'token' column since it's not needed anymore
    #'inPlace=True' means we are working on the original df, 'axis=1' refers to the column axis
    #train_samples = data_df[data_df['split']=='Train'].sample(frac=0.15) #sample 15 % of 'Train'-labeled rows --> SENTENCES
    #train_samples.split='Val' #replace those 'Train' labels with 'Val'
    #data_df.update(train_samples) #incorporate the modified train samples back into the original dataframe
    train_df = data_df[data_df['split']=='Train']
    unique_sent_in_train = list(train_df['sentence_id'].unique())
    print("UNIQUE TRAIN SENT: ", len(unique_sent_in_train))
    val_sample_sentences = unique_sent_in_train[:int(len(unique_sent_in_train) * .15)]
    print("UNIQUE VAL SENT: ", len(val_sample_sentences))
    val_df = data_df[data_df['sentence_id'].isin(val_sample_sentences)]
    val_df.split='Val'
    data_df.update(val_df)
    val_df = data_df[data_df.split=='Val']
    ner_df = pd.DataFrame.from_records(ner_list, columns=['sentence_id', 'ner_id', 'char_start_id', 'char_end_id'])
    counts = ner_df["ner_id"].value_counts()
    print(counts)
    return data_df, ner_df    
                        
def parse_xml(xml_file, split, id2word, id2ner):
    tree = etree.parse(xml_file)
    root = tree.getroot()
    
    token_instances = [] #save all token 
    ner_instances = []
    
    for elem in root: #loop over sentence tags
        if elem.tag == 'sentence':
            sent_id = elem.attrib['id'] #get sentence id
            text = elem.attrib['text']  #get the sentence as a string of text
            text = text.replace('-', ' ') #replaces all hyphens with whitespace for easier split of compound words
            char_pos = -1 #variable for keeping track of character-based positions of the words in the sentence
            nltk_tokens = nltk.word_tokenize(text)
            for token in nltk_tokens:
                char_pos, token_instance, id2word  = get_token_instance(char_pos, sent_id, token, split, id2word)
                token_instances.append(token_instance)
        for subelem in elem: #looping through children tags (i.e. 'entity', 'pair') of sentence_id
            if subelem.tag == 'entity':
                ner_instance = get_ner_instance(sent_id, subelem, id2ner)
                #print("ner_instance (parse_xml): ", ner_instance)
                for instance in ner_instance: #loop through list of returned NER instances
                    ner_instances.append(instance) #save them individually in the ner_instances list
    #print("NER_INSTANCES: ", ner_instances)
    return token_instances, ner_instances, id2word

def get_token_instance(char_pos, sent_id, token, split, id2word):
    char_pos += 1
    char_start = char_pos
    char_end = char_start + len(token)-1
    token_id, id2word = map_token_to_id(token, id2word)
    token_instance = [sent_id, int(token_id), int(char_start), int(char_end), split]
    #print("TOKEN INSTANCE: ", token_instance)(
    char_pos=char_end+1 #increase by 1 to account for the whitespace between the current and the next word
    return char_pos, token_instance, id2word

def get_ner_id_as_int(ner_id, id2ner):
    for key, value in id2ner.items(): 
         if ner_id == value: 
            return key 
    else:
        return "key doesn't exist"
    

def get_ner_instance(sent_id, entity, id2ner):
    #Problem of this approach: if a NER might be tokenized differently from the token dataframe
    ner_instances = []
    #ner_token = entity.attrib['text']
    #tokenized_ner = entity.attrib['text'].split() 
    charOffset = entity.attrib['charOffset']
    #HAPPY PATH: if the character span is a single span:
    if ';' not in charOffset:
        char_start = charOffset.split('-')[0]
        char_end = charOffset.split('-')[1]
        ner_id = get_ner_id_as_int(entity.attrib['type'], id2ner)
        #ner_id = entity.attrib['type'] #getting the label: 'brand', 'drug', 'drug_n' or 'group'
        ner_instance = [sent_id, int(ner_id), int(char_start), int(char_end)]
        return [ner_instance]
    #PATH OF DOOM: for multiword entities with several character spans:
    if ';' in charOffset:
        for span in charOffset.split(';'):
            ner_id = get_ner_id_as_int(entity.attrib['type'], id2ner) #getting the label: 'brand', 'drug', 'drug_n' or 'group'
            char_start = span.split('-')[0]
            char_end = span.split('-')[1]
            ner_instance = [sent_id, int(ner_id), int(char_start), int(char_end)]
            ner_instances.append(ner_instance)
            #print("SPECIAL NER_INSTANCE: ", ner_instance)
    return ner_instances
    
def map_token_to_id(token, id2word):
    res = False
    for key in id2word: 
        if(id2word[key] == token):
            res = True
            return key, id2word
    if res == False:
        token_id = len(id2word)+1
        id2word[token_id] = token
        return token_id, id2word


In [3]:
data_df, ner_df = parse_data('/home/guserbto@GU.GU.SE/lt2316-h20-aa/DDICorpus')

UNIQUE TRAIN SENT:  6905
UNIQUE VAL SENT:  1035
3    11656
1     4254
4     1865
2      766
Name: ner_id, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [4]:
def get_y(data_df, ner_df):
    # Should return a tensor containing the ner labels for all samples in each split.
    # the tensors should have the following following dimensions:
    # (NUMBER_SAMPLES, MAX_SAMPLE_LENGTH)
    # NOTE! the labels for each split should be on the GPU
    device = torch.device('cuda:1')
    
    # divide df by splits
    df_train = data_df[data_df.split=='Train']
    print("Unique sent in Train: ", len(list(df_train['sentence_id'].unique()))) 
    #print("df_train size: ", df_train.size)
    df_val = data_df[data_df.split=='Val']
    #print("df_val size: ", df_val.size)
    print("Unique sent in Val: ", len(list(df_val['sentence_id'].unique()))) 
    df_test = data_df[data_df.split=='Test']
    #print("df_test size: ", df_test.size)
    print("Unique sent in Test: ", len(list(df_test['sentence_id'].unique()))) 
    
    max_sample_length, sample_lengths_dict = get_sample_lengths(data_df)
    
    #get labels
    train_labels = label_tokens(df_train, ner_df, max_sample_length, sample_lengths_dict)
    #print("train labels:", len(train_labels))
    train_tensor = torch.LongTensor(train_labels)
    train_tensor = train_tensor.to(device)
    
    test_labels = label_tokens(df_test, ner_df, max_sample_length, sample_lengths_dict)
    test_tensor = torch.LongTensor(test_labels)
    test_tensor = test_tensor.to(device)
    
    val_labels = label_tokens(df_val, ner_df, max_sample_length, sample_lengths_dict)
    val_tensor = torch.LongTensor(val_labels)
    val_tensor = val_tensor.to(device)
  
    print("val labels:", len(val_labels))
    print("test labels:", len(test_labels))
    print("test labels:", len(train_labels))


    return train_tensor, val_tensor, test_tensor
    #return val_tensor

def label_tokens(df, ner_df, max_sample_length, sample_lengths_dict):
    labels = []
    
    df_as_list = df.values.tolist()
    #print("df_as_list[:50]: ", df_as_list[:50])
    ner_df_as_list = ner_df.values.tolist()
    
    #max_sample_length = 165
    
    sentence_labels = []
    match_found_count = 0
    for df_row in df_as_list:
        sentence_id = df_row[0]
        sentence_length = sample_lengths_dict[sentence_id]
        #print("SENTENCE_LENGTH: ", sentence_length)
        match_found = False 
        for ner_row in ner_df_as_list:
            #compare sentence_id, char_start, char_end between df_row and ner_rows: 
            if df_row[0] == ner_row[0]:
                if int(df_row[2]) == ner_row[2] and int(df_row[3]) == ner_row[3]:
                    label = ner_row[1]
                    match_found = True
                    #print("match found", df_row, "<3", ner_row) 
                    sentence_labels.append(label)
                    #continue
        if match_found == False:
            label = 0
            sentence_labels.append(label)
        if len(sentence_labels) == sentence_length:
            #print("************SENTENCE DONE************")
            #print("SENTENCE_LABELS BEFORE PADDING: ", sentence_labels)
            padded_sentence_labels = get_padding(sentence_labels, max_sample_length)
            #print("PADDED SENTENCE LABELS: ", sentence_labels)
            #if padded_sentence_labels is not None:
            labels.append(padded_sentence_labels)
            #print("LABELS AFTER ADDING PADDED SENT: ", labels)
            sentence_labels = []
            #else:
            #    print("omg it's a none", sentence_id)
    #print("LABELS BEFORE RETURN: ", labels)
            
    return labels 

def get_sample_lengths(data_df):
    max_sample_length = max(data_df.groupby('sentence_id').size())
    sample_lengths = data_df.groupby('sentence_id').size().tolist() 
    unique_sentences = data_df['sentence_id'].unique() 
    sentences_list = sorted(unique_sentences) 
    sample_length_dict = {sentences_list[i]: sample_lengths[i] for i in range(len(sentences_list))} 
    #display(data_df.groupby('sentence_id').size().nlargest(5))
    #display(data_df.groupby('sentence_id').size()) 
    return max_sample_length, sample_length_dict

def get_padding(sentence_labels, max_sample_length):
    #print("SENTENCE LABELS: ", sentence_labels)
    diff = max_sample_length - len(sentence_labels)
    #print("DIFF: ", diff)
    if int(diff) == 0:
        print("SENTENCE WITH NO DIFF: ", sentence_labels, "DIFF: ", diff)
        return sentence_labels
    else:
        padding = [0] * diff
        sentence_labels.extend(padding)
    #print("PADDED SENTENCE_LABELS BEFORE RETURN: ", sentence_labels)
    #if diff == 0:
    #    padded_sentence_labels = sentence_labels
    #    print("PAD_SENT AFTER DIFF==0: ", padded_sentence_labels)
    #else: 
    #padded_sentence_labels = sentence_labels.extend(padding)
    #print("PADDED_SENT BEFORE RETURN: ", padded_sentence_labels)
    #print("PADDED SENTENCE LABELS: ", padded_sentence_labels)
    return sentence_labels

In [5]:
get_y(data_df, ner_df)

Unique sent in Train:  5870
Unique sent in Val:  1035
Unique sent in Test:  1964
SENTENCE WITH NO DIFF:  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] DIFF:  0
val labels: 1035
test labels: 1684
test labels: 5870


(tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:1'),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:1'),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [3, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:1'))

### 