In [1]:
#basics
import random
import pandas as pd
import torch
#extra:
import os
import nltk
import string
from glob import glob
from lxml import etree

In [2]:
def parse_data(data_dir):
    
    id2word = {}
    id2ner = {0:'none', 1:'group', 2:'drug_n', 3:'drug', 4:'brand'}
    data_list = []
    ner_list = []
    data_dir = glob("{}/*".format(data_dir)) #glob returns a possibly-empty list of path names that match data_dir 
                                            #...in this case a list with the two subdirectories 'Test' and 'Train'                                           
    for subdir in data_dir: #looping through 'Test' and 'Train'
        split = os.path.basename(subdir) #get the directory name without path
        subdir = glob("{}/*".format(subdir))
        if split == 'Train':
            for folder in subdir:
                folder = glob("{}/*".format(folder))
                for xml_file in folder:
                    token_instances, ner_instances, id2word = parse_xml(xml_file, split, id2word, id2ner)
                    #print("NER_INSTANCES (Train): ", ner_instances)
                    data_list = data_list + token_instances
                    for instance in ner_instances:
                        if instance:
                            ner_list.append(instance)
        elif split == 'Test':
            for folder in subdir:  #looping through 'Test for DDI Extraction task' and 'Test for DrugNER task'
                folder = glob("{}/*".format(folder))
                for subfolder in folder: #looping through 'DrugBank' and 'MedLine'
                    subfolder = glob("{}/*".format(subfolder))
                    for xml_file in subfolder:
                        token_instances, ner_instances, id2word = parse_xml(xml_file, split, id2word, id2ner)
                        #print("NER_INSTANCES (Test): ", ner_instances)
                        data_list = data_list + token_instances
                        for instance in ner_instances:
                            if instance:
                                ner_list.append(instance)
    
    vocab = list(id2word.values()) #keeping track of unique words in the data
    data_df, ner_df = list2df(data_list, ner_list) #turn lists into dataframes
    #df1 = data_df[data_df.isnull().any(axis=1)]
    display(data_df)
    #display(ner_df)
    #with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #    display(data_df)
    return data_df, ner_df, vocab, id2word
                    
def list2df(data_list, ner_list):
    data_df = pd.DataFrame.from_records(data_list, columns=['sentence_id', 'token_id', 'char_start_id', 'char_end_id', 'split'])
    #data_df = data_df[~data_df['token'].isin(list(string.punctuation))] #remove tokens that are just punctuation 
    #data_df.drop('token', inplace=True, axis=1) #remove 'token' column since it's not needed anymore
    #'inPlace=True' means we are working on the original df, 'axis=1' refers to the column axis
    #train_samples = data_df[data_df['split']=='Train'].sample(frac=0.15) #sample 15 % of 'Train'-labeled rows --> SENTENCES
    #train_samples.split='Val' #replace those 'Train' labels with 'Val'
    #data_df.update(train_samples) #incorporate the modified train samples back into the original dataframe
    train_df = data_df[data_df['split']=='Train']
    unique_sent_in_train = list(train_df['sentence_id'].unique())
    print("UNIQUE TRAIN SENT: ", len(unique_sent_in_train))
    val_sample_sentences = unique_sent_in_train[:int(len(unique_sent_in_train) * .15)]
    print("UNIQUE VAL SENT: ", len(val_sample_sentences))
    val_df = data_df[data_df['sentence_id'].isin(val_sample_sentences)]
    val_df.split='Val'
    data_df.update(val_df)
    val_df = data_df[data_df.split=='Val']
    ner_df = pd.DataFrame.from_records(ner_list, columns=['sentence_id', 'ner_id', 'char_start_id', 'char_end_id'])
    counts = ner_df["ner_id"].value_counts()
    print(counts)
    return data_df, ner_df    
                        
def parse_xml(xml_file, split, id2word, id2ner):
    tree = etree.parse(xml_file)
    root = tree.getroot()
    
    token_instances = [] #save all token 
    ner_instances = []
    
    for elem in root: #loop over sentence tags
        if elem.tag == 'sentence':
            sent_id = elem.attrib['id'] #get sentence id
            text = elem.attrib['text']  #get the sentence as a string of text
            text = text.replace('-', ' ') #replaces all hyphens with whitespace for easier split of compound words
            char_pos = -1 #variable for keeping track of character-based positions of the words in the sentence
            nltk_tokens = nltk.word_tokenize(text)
            for token in nltk_tokens:
                char_pos, token_instance, id2word  = get_token_instance(char_pos, sent_id, token, split, id2word)
                token_instances.append(token_instance)
        for subelem in elem: #looping through children tags (i.e. 'entity', 'pair') of sentence_id
            if subelem.tag == 'entity':
                ner_instance = get_ner_instance(sent_id, subelem, id2ner)
                #print("ner_instance (parse_xml): ", ner_instance)
                for instance in ner_instance: #loop through list of returned NER instances
                    ner_instances.append(instance) #save them individually in the ner_instances list
    #print("NER_INSTANCES: ", ner_instances)
    return token_instances, ner_instances, id2word

def get_token_instance(char_pos, sent_id, token, split, id2word):
    char_pos += 1
    char_start = char_pos
    char_end = char_start + len(token)-1
    token_id, id2word = map_token_to_id(token, id2word)
    token_instance = [sent_id, int(token_id), int(char_start), int(char_end), split]
    #print("TOKEN INSTANC E: ", token_instance)(
    char_pos=char_end+1 #increase by 1 to account for the whitespace between the current and the next word
    return char_pos, token_instance, id2word

def get_ner_id_as_int(ner_id, id2ner):
    for key, value in id2ner.items(): 
         if ner_id == value: 
            return key 
    else:
        return "key doesn't exist"
    

def get_ner_instance(sent_id, entity, id2ner):
    #Problem of this approach: if a NER might be tokenized differently from the token dataframe
    ner_instances = []
    #ner_token = entity.attrib['text']
    #tokenized_ner = entity.attrib['text'].split() 
    charOffset = entity.attrib['charOffset']
    #HAPPY PATH: if the character span is a single span:
    if ';' not in charOffset:
        char_start = charOffset.split('-')[0]
        char_end = charOffset.split('-')[1]
        ner_id = get_ner_id_as_int(entity.attrib['type'], id2ner)
        #ner_id = entity.attrib['type'] #getting the label: 'brand', 'drug', 'drug_n' or 'group'
        ner_instance = [sent_id, int(ner_id), int(char_start), int(char_end)]
        return [ner_instance]
    #PATH OF DOOM: for multiword entities with several character spans:
    if ';' in charOffset:
        for span in charOffset.split(';'):
            ner_id = get_ner_id_as_int(entity.attrib['type'], id2ner) #getting the label: 'brand', 'drug', 'drug_n' or 'group'
            char_start = span.split('-')[0]
            char_end = span.split('-')[1]
            ner_instance = [sent_id, int(ner_id), int(char_start), int(char_end)]
            ner_instances.append(ner_instance)
            #print("SPECIAL NER_INSTANCE: ", ner_instance)
    return ner_instances
    
def map_token_to_id(token, id2word):
    res = False
    for key in id2word: 
        if(id2word[key] == token):
            res = True
            return key, id2word
    if res == False:
        token_id = len(id2word)+1
        id2word[token_id] = token
    return token_id, id2word


In [3]:
data_df, ner_df, vocab, id2word = parse_data('/home/guserbto@GU.GU.SE/lt2316-h20-aa/DDICorpus')

UNIQUE TRAIN SENT:  6905
UNIQUE VAL SENT:  1035
3    11656
1     4254
4     1865
2      766
Name: ner_id, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,sentence_id,token_id,char_start_id,char_end_id,split
0,DDI-DrugBank.d610.s0,1.0,0.0,14.0,Test
1,DDI-DrugBank.d610.s0,2.0,16.0,25.0,Test
2,DDI-DrugBank.d610.s0,3.0,27.0,28.0,Test
3,DDI-DrugBank.d610.s0,4.0,30.0,37.0,Test
4,DDI-DrugBank.d610.s0,5.0,39.0,42.0,Test
5,DDI-DrugBank.d610.s0,6.0,44.0,46.0,Test
6,DDI-DrugBank.d610.s0,7.0,48.0,54.0,Test
7,DDI-DrugBank.d610.s0,8.0,56.0,57.0,Test
8,DDI-DrugBank.d610.s0,9.0,59.0,61.0,Test
9,DDI-DrugBank.d610.s0,10.0,63.0,70.0,Test


In [4]:
def get_y(data_df, ner_df):
    # Should return a tensor containing the ner labels for all samples in each split.
    # the tensors should have the following following dimensions:
    # (NUMBER_SAMPLES, MAX_SAMPLE_LENGTH)
    # NOTE! the labels for each split should be on the GPU
    device = torch.device('cuda:1')
    
    # divide df by splits
    df_train = data_df[data_df.split=='Train']
    print("Unique sent in Train: ", len(list(df_train['sentence_id'].unique()))) 
    #print("df_train size: ", df_train.size)
    df_val = data_df[data_df.split=='Val']
    #print("df_val size: ", df_val.size)
    print("Unique sent in Val: ", len(list(df_val['sentence_id'].unique()))) 
    df_test = data_df[data_df.split=='Test']
    #print("df_test size: ", df_test.size)
    print("Unique sent in Test: ", len(list(df_test['sentence_id'].unique()))) 
    
    max_sample_length, sample_lengths_dict = get_sample_lengths(data_df)
    
    #get labels
    train_labels = label_tokens(df_train, ner_df, max_sample_length, sample_lengths_dict)
    #print("train labels:", len(train_labels))
    train_tensor = torch.LongTensor(train_labels)
    train_tensor = train_tensor.to(device)
    
    test_labels = label_tokens(df_test, ner_df, max_sample_length, sample_lengths_dict)
    test_tensor = torch.LongTensor(test_labels)
    test_tensor = test_tensor.to(device)
    
    val_labels = label_tokens(df_val, ner_df, max_sample_length, sample_lengths_dict)
    val_tensor = torch.LongTensor(val_labels)
    val_tensor = val_tensor.to(device)
  
    print("val labels:", len(val_labels))
    print("test labels:", len(test_labels))
    print("test labels:", len(train_labels))


    return train_tensor, val_tensor, test_tensor

def label_tokens(df, ner_df, max_sample_length, sample_lengths_dict):
    labels = []
    
    df_as_list = df.values.tolist()
    #print("df_as_list[:50]: ", df_as_list[:50])
    ner_df_as_list = ner_df.values.tolist()
    
    #max_sample_length = 165
    
    sentence_labels = []
    match_found_count = 0
    for df_row in df_as_list:
        sentence_id = df_row[0]
        sentence_length = sample_lengths_dict[sentence_id]
        #print("SENTENCE_LENGTH: ", sentence_length)
        match_found = False 
        for ner_row in ner_df_as_list:
            #compare sentence_id, char_start, char_end between df_row and ner_rows: 
            if df_row[0] == ner_row[0]:
                if int(df_row[2]) == ner_row[2] and int(df_row[3]) == ner_row[3]:
                    label = ner_row[1]
                    match_found = True
                    #print("match found", df_row, "<3", ner_row) 
                    sentence_labels.append(label)
                    #continue
        if match_found == False:
            label = 0
            sentence_labels.append(label)
        if len(sentence_labels) == sentence_length:
            #print("************SENTENCE DONE************")
            #print("SENTENCE_LABELS BEFORE PADDING: ", sentence_labels)
            padded_sentence_labels = get_padding(sentence_labels, max_sample_length)
            #print("PADDED SENTENCE LABELS: ", sentence_labels)
            #if padded_sentence_labels is not None:
            labels.append(padded_sentence_labels)
            #print("LABELS AFTER ADDING PADDED SENT: ", labels)
            sentence_labels = []
            #else:
            #    print("omg it's a none", sentence_id)
    #print("LABELS BEFORE RETURN: ", labels)
            
    return labels 

def get_sample_lengths(data_df):
    max_sample_length = max(data_df.groupby('sentence_id').size())
    sample_lengths = data_df.groupby('sentence_id').size().tolist() 
    unique_sentences = data_df['sentence_id'].unique() 
    sentences_list = sorted(unique_sentences) 
    sample_length_dict = {sentences_list[i]: sample_lengths[i] for i in range(len(sentences_list))} 
    #display(data_df.groupby('sentence_id').size().nlargest(5))
    #display(data_df.groupby('sentence_id').size()) 
    return max_sample_length, sample_length_dict

def get_padding(sentence_labels, max_sample_length):
    #print("SENTENCE LABELS: ", sentence_labels)
    diff = max_sample_length - len(sentence_labels)
    #print("DIFF: ", diff)
    if int(diff) == 0:
        #print("SENTENCE WITH NO DIFF: ", sentence_labels, "DIFF: ", diff)
        return sentence_labels
    else:
        padding = [0] * diff
        sentence_labels.extend(padding)
    return sentence_labels

In [5]:
get_y(data_df, ner_df)

Unique sent in Train:  5870
Unique sent in Val:  1035
Unique sent in Test:  1964
val labels: 1035
test labels: 1684
test labels: 5870


(tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:1'),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:1'),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [3, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:1'))

In [6]:
def plot_split_ner_distribution(data_df, ner_df):
    
    id2ner = {0:'none', 1:'group', 2:'drug_n', 3:'drug', 4:'brand'}
     
    # divide df by splits and get unique sentences:
    df_train = data_df[data_df.split=='Train']
    train_sent = list(df_train['sentence_id'].unique())
    df_val = data_df[data_df.split=='Val']
    val_sent = list(df_val['sentence_id'].unique())
    df_test = data_df[data_df.split=='Test']
    test_sent = list(df_test['sentence_id'].unique())
    
    ner_df_as_list = ner_df.values.tolist()
    
    counts = {'Train': {'group': 0, 'drug_n': 0, 'drug': 0, 'brand':0}, 
              'Val': {'group': 0, 'drug_n': 0, 'drug': 0, 'brand':0}, 
              'Test': {'group': 0, 'drug_n': 0, 'drug': 0, 'brand':0}}
    for ner in ner_df_as_list:
        sent_id = ner[0]
        ner_label = id2ner[ner[1]]
        if sent_id in train_sent:
            counts['Train'][ner_label] += 1
        elif sent_id in val_sent:
            counts['Val'][ner_label] += 1
        elif sent_id in test_sent:
            counts['Test'][ner_label] += 1
    
    #print(counts)
    df = pd.DataFrame(counts)
    df_to_plot = df.transpose()
    df_to_plot.plot.bar()

In [7]:
plot_split_ner_distribution(data_df, ner_df)

In [25]:
import nltk

def get_padding(sentence, max_sample_length):
    #print("SENTENCE LABELS: ", sentence_labels)
    diff = max_sample_length - len(sentence)
    #print("DIFF: ", diff)
    if int(diff) == 0:
        #print("SENTENCE WITH NO DIFF: ", sentence_labels, "DIFF: ", diff)
        return sentence
    else:
        padding = [0] * diff
        sentence.extend(padding)
    return sentence

def get_sample_lengths(data_df):
    max_sample_length = max(data_df.groupby('sentence_id').size())
    sample_lengths = data_df.groupby('sentence_id').size().tolist() 
    unique_sentences = data_df['sentence_id'].unique() 
    sentences_list = sorted(unique_sentences) 
    sample_length_dict = {sentences_list[i]: sample_lengths[i] for i in range(len(sentences_list))} 
    #display(data_df.groupby('sentence_id').size().nlargest(5))
    #display(data_df.groupby('sentence_id').size()) 
    return max_sample_length, sample_length_dict

def extract_features(data_df):
    device = torch.device('cuda:1')
    print("extracting features")
    id2pos = {}
    # divide df by splits
    df_train = data_df[data_df.split=='Train']
    unique_sent_train = list(df_train['sentence_id'].unique())
    print("Unique sent in Train: ", len(list(df_train['sentence_id'].unique()))) 
    #print("df_train size: ", df_train.size)
    df_val = data_df[data_df.split=='Val']
    #print("df_val size: ", df_val.size)
    print("Unique sent in Val: ", len(list(df_val['sentence_id'].unique()))) 
    df_test = data_df[data_df.split=='Test']
    #print("df_test size: ", df_test.size)
    print("Unique sent in Test: ", len(list(df_test['sentence_id'].unique()))) 
    
    max_sample_length, sample_length_dict = get_sample_lengths(data_df)
    
    train_pos, id2pos = get_pos(df_train, max_sample_length, sample_length_dict, id2pos)
    test_pos, id2pos = get_pos(df_test, max_sample_length, sample_length_dict, id2pos)
    val_pos, id2pos = get_pos(df_val, max_sample_length, sample_length_dict, id2pos)
    
    print("val embeddings:", len(val_pos))
    print("test embeddings:", len(test_pos))
    print("train embeddings:", len(train_pos))
    
    #print("train labels:", len(train_labels))
    train_tensor = torch.LongTensor(train_pos)
    train_tensor = train_tensor.to(device)
    
    
    test_tensor = torch.LongTensor(test_pos)
    test_tensor = test_tensor.to(device)
    
   
    val_tensor = torch.LongTensor(val_pos)
    val_tensor = val_tensor.to(device)
    
    print("id2pos: ", id2pos)
    
    return val_tensor, test_tensor, train_tensor
    
    #return three tensor of the following dimensions: NUMBER_SAMPLES, MAX_SAMPLE_LENGTH, FEATURE_DIM

def get_pos(df, max_sample_length, sample_lengths_dict, id2pos):
    
    pos_sentences = []
    
    df_as_list = df.values.tolist()
    
    
    sentence = []
    sentence_count = 0
    for df_row in df_as_list:
        sentence_id = df_row[0]
        sentence_length = sample_lengths_dict[sentence_id]
        token_id = df_row[1]
        sentence.append(token_id)
        if len(sentence) == sentence_length:
            #print("SENT:", len(sentence))
            padded_sent = get_padding(sentence, max_sample_length)
            wordified_sent = []
            for token_id in padded_sent:
                if token_id in id2word.keys():
                    wordified_sent.append(id2word[token_id])
                else:
                    wordified_sent.append("PADDXNG")              
            #print("WORDIFIED:", wordified_sent)
            pos_tagged_sent = nltk.pos_tag(wordified_sent)
            #print("POS_TAGGED SENT:", pos_tagged_sent) 
            pos_id_sent = []
            for pos_tuple in pos_tagged_sent:
                pos_id, id2pos = map_pos_to_id(pos_tuple[1], id2pos)
                #print("pos_tuple[1]", pos_tuple[1])
                pos_id_sent.append(pos_id)
            #print("POS_ID SENT:", pos_id_sent)
            pos_sentences.append(pos_id_sent)
            sentence_count +=1
            sentence = []
    
    return pos_sentences, id2pos

def map_pos_to_id(pos, id2pos):
    res = False
    for key in id2pos:
        if(id2pos[key] == pos):
            res = True
            return key, id2pos
    if res == False:
        pos_id = len(id2pos)+1
        id2pos[pos_id] = pos
    return pos_id, id2pos

In [26]:
extract_features(data_df)

extracting features
Unique sent in Train:  5870
Unique sent in Val:  1035
Unique sent in Test:  1964
val embeddings: 1035
test embeddings: 1964
train embeddings: 5870
id2pos:  {1: 'NN', 2: ':', 3: 'IN', 4: 'NNS', 5: 'CC', 6: 'DT', 7: 'VBN', 8: '.', 9: 'NNP', 10: 'JJ', 11: 'VBZ', 12: 'RB', 13: 'TO', 14: 'PRP$', 15: ',', 16: 'MD', 17: 'VB', 18: 'VBD', 19: 'VBG', 20: 'VBP', 21: 'WDT', 22: 'CD', 23: '(', 24: ')', 25: 'EX', 26: 'JJS', 27: 'WP', 28: 'JJR', 29: 'FW', 30: '#', 31: 'PRP', 32: '$', 33: 'WRB', 34: 'POS', 35: 'RBR', 36: 'UH', 37: 'RP', 38: 'WP$', 39: 'NNPS', 40: 'PDT', 41: 'LS', 42: 'RBS', 43: '``', 44: "''"}


(tensor([[ 6,  1, 15,  ...,  9,  9,  9],
         [ 6, 10,  1,  ...,  9,  9,  9],
         [ 9,  9, 22,  ...,  9,  9,  9],
         ...,
         [ 4,  3, 10,  ...,  9,  9,  9],
         [12, 15,  6,  ...,  9,  9,  9],
         [ 1,  2,  1,  ...,  9,  9,  9]], device='cuda:1'),
 tensor([[10,  4,  3,  ...,  9,  9,  9],
         [ 6, 12, 10,  ...,  9,  9,  9],
         [ 9, 11,  6,  ...,  9,  9,  9],
         ...,
         [ 6,  1,  1,  ...,  9,  9,  9],
         [ 6,  1,  3,  ...,  9,  9,  9],
         [ 6,  1, 18,  ...,  9,  9,  9]], device='cuda:1'),
 tensor([[ 1,  2,  1,  ...,  9,  9,  9],
         [ 6, 10,  1,  ...,  9,  9,  9],
         [12, 15,  4,  ...,  9,  9,  9],
         ...,
         [10,  4, 15,  ...,  9,  9,  9],
         [ 6,  4, 20,  ...,  9,  9,  9],
         [ 6,  1, 18,  ...,  9,  9,  9]], device='cuda:1'))