# Generates the train-drugbank_filtered.csv (DDI-Dataset-Preprocess generates train-drugbank.csv)

Given e1_pos, e2_pos, sentence. Update the sentence to say DRUG and OTHER_DRUG and update the positions

'The bear_e1 ran home_e2 to_e2 the mountains'

In [1]:
%load_ext autoreload

In [2]:
%autoreload
import sys
sys.path.append('../../../../')
import ast
import relation_extraction.data.utils as utils
import os
import pandas as pd

RESOURCE_PATH = "/data/medg/misc/semeval_2010/medical-data/DDICorpus/pre-processed/extraction/"
def res(path): return os.path.join(RESOURCE_PATH, path)

[nltk_data] Downloading package wordnet to
[nltk_data]     /afs/csail.mit.edu/u/g/geeticka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Clean up rows where e1 doesn't happen before e2 and the two entities are the same

In [656]:
df = pd.read_csv(res('Train/MedLine/train-medline.csv')) # where you read from

In [657]:
len(df)

1546

In [658]:
# seems like when you read the csv file, the entity_number row becomes string instead of being a list of tuples,
# so need to segment with regular expressions
def get_problematic_entity_rows(df):
    problematic_entity_rows = [] # those which have e's appearing beyond the sentence 
    flipped_entity_rows = [] # (automatic) non overlapping entities but e0 appears after e1, can be flipped and that's it 
    flipped_entity_rows_non_adjacent = [] # (semi-automatic) can be flipped and then should be manually fixed
    overlapping_entity_rows = [] # (manual) overlapping entities, which needs to be manually fixed
    same_entity_rows = [] # (automatic) entities which are exactly the same, needs to be deleted
    non_adjacent_entity_rows = [] # (manual) entities which are not overlapping, but don't have continuous entity numbers, 
    # also needs to be manually fixed along with the sentence - could just let it be and not fix, but 
    # better to fix for accurate predictions. 
    def find_problematic_entity(row):
        sentence_len = len(row.tokenized_sentence)
        entity_number = ast.literal_eval(row.entity_number)
        e1 = entity_number[0]
        e2 = entity_number[1]
        range_e1 = [i for i in range(e1[0], e1[-1] + 1)]
        range_e2 = [i for i in range(e2[0], e2[-1] + 1)]
        if e1[0] >= sentence_len or e2[0] >= sentence_len:
            problematic_entity_rows.append(row.row_num)
        elif e1[0] == e2[0] or e1[-1] == e2[-1]: # this could either be overlap or totally equal entity
            if e1 == e2: # think about the asynchronous cases where 23, 25 is present
                same_entity_rows.append(row.row_num)
            else: # in this case there must be overlap
                overlapping_entity_rows.append(row.row_num)
        elif e1[0] > e2[0] or e1[-1] > e2[0]:
            if not set(range_e1).intersection(set(range_e2)): # don't overlap in any way
                if range_e1 != e1 or range_e2 != e2:
                    flipped_entity_rows_non_adjacent.append(row.row_num)
                else:
                    flipped_entity_rows.append(row.row_num) # this could also 
                #have non adjacent entity rows, but let it be
            else:
                overlapping_entity_rows.append(row.row_num)
        elif range_e1 != e1 or range_e2 != e2:
            non_adjacent_entity_rows.append(row.row_num)
    temp_df = df.copy()
    temp_df.insert(0, 'row_num', range(0, len(temp_df)))
    temp_df.apply(find_problematic_entity, axis=1)
    return problematic_entity_rows, flipped_entity_rows, flipped_entity_rows_non_adjacent, \
overlapping_entity_rows, same_entity_rows, non_adjacent_entity_rows

def get_problematic_entity_rows_array(problematic_entity_rows, df):
    problematic_entity_rows_array = []
    for index in problematic_entity_rows:
        e1 = df.iloc[index].e1
        e2 = df.iloc[index].e2
        sentence_text = df.iloc[index].sentence_text
        tokenized_sentence = df.iloc[index].tokenized_sentence
        entity_number = df.iloc[index].entity_number
        problematic_entity_rows_array.append([index, e1, e2, sentence_text, tokenized_sentence, entity_number])
    new_df = pd.DataFrame(data=problematic_entity_rows_array,    # values
             columns=['index_original', 'e1', 'e2', 'sentence_text', 'tokenized_sentence', 'entity_number'])
    return problematic_entity_rows_array, new_df

def get_problematic_entity_vals(df):
    problematic_entity_rows, flipped_entity_rows, flipped_entity_rows_non_adjacent, \
    overlapping_entity_rows, same_entity_rows, non_adjacent_entity_rows = get_problematic_entity_rows(df)
    _, df_problematic = get_problematic_entity_rows_array(problematic_entity_rows, df)
    _, df_flipped = get_problematic_entity_rows_array(flipped_entity_rows, df)
    _, df_flipped_non_adjacent = get_problematic_entity_rows_array(flipped_entity_rows_non_adjacent, df)
    _, df_overlapping = get_problematic_entity_rows_array(overlapping_entity_rows, df)
    _, df_same_entity = get_problematic_entity_rows_array(same_entity_rows, df)
    _, df_non_adjacent = get_problematic_entity_rows_array(non_adjacent_entity_rows, df)
    return df_problematic, df_flipped, df_flipped_non_adjacent, df_overlapping, df_same_entity, df_non_adjacent

In [659]:
df_problematic, df_flipped, df_flipped_non_adjacent, df_overlapping, \
df_same_entity, df_non_adjacent = get_problematic_entity_vals(df)

### Make sure that df_problematic is length 0

In [660]:
print(len(df_problematic), len(df_flipped), len(df_flipped_non_adjacent), 
     len(df_overlapping), len(df_same_entity), len(df_non_adjacent))

0 114 0 8 154 3


### perform flipping

In [661]:
def flip_entity_number(row):  
    curr_entity_nums = ast.literal_eval(row.old_entity_number)
    curr_e1_nums = curr_entity_nums[0]
    curr_e2_nums = curr_entity_nums[1]
    return curr_e2_nums, curr_e1_nums
def fix_flipped(df_flipped):
    new_df = df_flipped.copy()
    new_df = new_df.rename(columns={'entity_number': 'old_entity_number'})
    new_df['entity_number'] = new_df.apply(flip_entity_number, axis=1)
    new_df = new_df.rename(columns={'e1': 'e2', 'e2':'e1'})
    return new_df

for now, we are just going to delete the rows df_overlapping, df_same_entity, and we can keep df_non_adjacent because it probably doesn't hurt

In [662]:
# update based on entity number and e1 and e2 - no changes made to tokenized sentence
def update_df_with_fixed_vals(fromdf, todf):
    todf = todf.copy()
    for i in range(0, len(fromdf)):
        row = fromdf.iloc[i]
        idx = row.index_original
        entity_number = row.entity_number
        e1 = row.e1
        e2 = row.e2
        todf.at[idx, 'entity_number'] = entity_number
        todf.at[idx, 'e1'] = e1
        todf.at[idx, 'e2'] = e2
    return todf

In [663]:
fixed_df = df.copy()
if len(df_flipped) > 0 : 
    fixed_df_flipped = fix_flipped(df_flipped)
    fixed_df = update_df_with_fixed_vals(fixed_df_flipped, df)
if len(df_flipped_non_adjacent) > 0: 
    fixed_df_flipped_non_adjacent = fix_flipped(df_flipped_non_adjacent)
    fixed_df = update_df_with_fixed_vals(fixed_df_flipped_non_adjacent, fixed_df)

In [664]:
len(fixed_df)

1546

## Manual fixes performed for the test data

In [522]:
# def update(fixed_df, data):
#     def update_vals(row, data=data):
#         if row.index_original == data['index_original']:
#             row.entity_number = data['entity_number']
#         return row

#     fixed_df = fixed_df.apply(update_vals, axis=1)
#     return fixed_df

In [523]:
# df_overlapping.iloc[4]['tokenized_sentence']

'The objective of this study was to evaluate the effect of oral administration of ginseng stem - and - leaf saponins ( GSLS ) on the humoral immune responses of chickens to inactivated ND and AI vaccines .'

needed to label inactivated ND vaccines and inactivated AI vaccines

In [524]:
# # Below is for the df_overlapping of test data of MedLine for the extraction case

# data = [{   'index_original' :  142, 
#             'entity_number': ([10], [12,13])
#         }, 
#        {
#            'index_original': 145, 
#            'entity_number': ([15], [17,18])
#        },
#        {
#            'index_original': 151, 
#            'entity_number': ([12], [14,15])
#        },
#        {
#            'index_original': 163, 
#            'entity_number': ([23], [29])
#        },
#        {
#            'index_original': 170, 
#            'entity_number': ([32,33], [35,36])
#        }]
# fixed_df_overlapping = df_overlapping.copy()
# for d in data:
#     fixed_df_overlapping = update(fixed_df_overlapping, d)

In [495]:
# # Below is for the df_overlapping of test data of DrugBank for extraction case
# data = [{   'index_original' :  974, 
#             'entity_number': ([2,3,4,5], [16])
#         }, 
#        {
#            'index_original': 1807, 
#            'entity_number': ([8,9], [18])
#        },
#        {
#            'index_original': 3544, 
#            'entity_number': ([7,8], [30])
#        },
#        {
#            'index_original': 3545, 
#            'entity_number': ([7,8], [51])
#        },
#        {
#            'index_original': 3830, 
#            'entity_number': ([3], [5,6,7,8])
#        },
#        {
#            'index_original': 3933, 
#            'entity_number': ([23,24], [26,27,28])
#        },
#        {
#            'index_original': 4047, 
#            'entity_number': ([2,3,4,5], [14])
#        }]

# fixed_df_overlapping = df_overlapping.copy()
# for d in data:
#     fixed_df_overlapping = update(fixed_df_overlapping, d)

In [496]:
df_non_adjacent.iloc[8]['tokenized_sentence']

'Examples of some of the more potent CYP 3A4 inhibitors include macrolide antibiotics ( e.g. , erythromycin , troleandomycin , clarithromycin ) , HIV protease or reverse transcriptase inhibitors ( e.g. , ritonavir , indinavir , nelfinavir , delavirdine ) or azole antifungals ( e.g. , ketoconazole , itraconazole , voriconazole ) .'

In [525]:
# # Below is for the df_non_adjacent of test data of MedLine for the extraction case

# data = [{   'index_original' :  140, 
#             'entity_number': ([0], [10])
#         }, 
#        {
#            'index_original': 143, 
#            'entity_number': ([0], [15])
#        },
#        {
#            'index_original': 149, 
#            'entity_number': ([8], [12])
#        },
#        {
#            'index_original': 161, 
#            'entity_number': ([9], [23])
#        },
#        {
#            'index_original': 162, 
#            'entity_number': ([9], [29])
#        },
#        {
#            'index_original': 166, 
#            'entity_number': ([14, 15, 16, 17, 18, 19, 20], [32,33])
#        },
#        {
#            'index_original': 167, 
#            'entity_number': ([14, 15, 16, 17, 18, 19, 20], [35,36])
#        },
#        {
#            'index_original': 168, 
#            'entity_number': ([22], [32,33])
#        },
#        {
#            'index_original': 169, 
#            'entity_number': ([22], [35,36])
#        }]
# fixed_df_non_adjacent = df_non_adjacent.copy()
# for d in data:
#     fixed_df_non_adjacent = update(fixed_df_non_adjacent, d)

In [498]:
# # Below is for the df_non_adjacent of test data of DrugBank for extraction case
# def update_df_non_adj_test_drugbank(row):
#     entity_number = ast.literal_eval(row.entity_number)
#     e1 = entity_number[0]
#     e2 = entity_number[1]
#     if e1 == [23,24,28]:
#         e1 = [23,24]
#     if e2 == [23,24,28]:
#         e2 = [23,24]
#     return e1, e2

# fixed_df_non_adjacent = df_non_adjacent.copy()
# fixed_df_non_adjacent['entity_number'] = df_non_adjacent.apply(update_df_non_adj_test_drugbank, axis=1)

In [526]:
# if len(fixed_df_overlapping) > 0: 
#     fixed_df = update_df_with_fixed_vals(fixed_df_overlapping, fixed_df)
# if len(fixed_df_non_adjacent) > 0:
#     fixed_df = update_df_with_fixed_vals(fixed_df_non_adjacent, fixed_df)

## Now we are going to do deletions (just for the train data) - for the test data, just fix the entity numbering and do not edit the sentence

delete df_overlapping and df_same_entity

In [665]:
list_of_indexes_to_drop = df_overlapping['index_original'].tolist() + df_same_entity['index_original'].tolist()

In [666]:
len(list_of_indexes_to_drop)

162

In [667]:
fixed_df_with_dropped = fixed_df.drop(list_of_indexes_to_drop)

In [668]:
len(fixed_df_with_dropped)

1384

In [669]:
len(fixed_df_with_dropped) + len(list_of_indexes_to_drop) == len(fixed_df)

True

## Write this fixed dataframe down

In [670]:
# fixed_df_with_dropped.to_csv(res('Train/MedLine/train-medline_filtered.csv'), encoding='utf-8', index=False) #(for train data)

In [536]:
# fixed_df.to_csv(res('Test/MedLine/test-medline_filtered.csv'), encoding='utf-8', index=False) # for test data

### Combine the Train/Test data of MedLine and DrugBank

In [539]:
directory2 = 'Test/MedLine/'
file2 = 'test-medline_filtered'
directory1 = 'Test/DrugBank/'
file1 = 'test-drugbank_filtered'
outfile = 'test_filtered.txt'

In [540]:
# https://stackoverflow.com/questions/13613336/python-concatenate-text-files
filenames = [res(directory1 + file1+'.txt'), res(directory2 + file2+'.txt')]
with open(res(outfile), 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)

## Now handle the entities

In [189]:
# df_copy = pd.read_csv(res('Train/DrugBank/train-drugbank.csv'))

In [1]:
sentence = 'The bear ran home to the mountains'

In [2]:
e1_pos = (1,1)
e2_pos = (3,4)

In [18]:
# given a tokenized and splitted sentence
def sentence_replace(sentence, positions, string_update):
    return sentence[:positions[0]] + [string_update] + sentence[positions[1]+1:]

In [20]:
sentence_replace(['The', 'bear', 'ran', 'home', 'to', 'the', 'mountains'], (3,4), 'DRUG')

['The', 'bear', 'ran', 'DRUG', 'the', 'mountains']

In [24]:
# sentence is the sentence to update and entity positions is a list of entity positions
def per_sentence_replacement_ddi(sentence, entity_positions):
    # if entity position is updated, then all positions after it also have to be updated
    sentence = sentence.split() # no need to do this for the code
    
    e0_pos = entity_positions[0]
    sentence = sentence_replace(sentence, e0_pos, 'DRUG1')
    new_e0_pos = (e0_pos[0], e0_pos[0])
   
    entity_positions[0] = new_e0_pos
    diff = e0_pos[1] - e0_pos[0] # if the entity is 2 word, then move every other e_pos down by 1
    if diff > 0:
        for i in range(1, len(entity_positions)):
            e_pos = entity_positions[i]
            if e_pos[0] > e0_pos[1]:
                entity_positions[i] = (entity_positions[i][0] - diff, entity_positions[i][1] - diff)
     
    e1_pos = entity_positions[1]
    sentence = sentence_replace(sentence, e1_pos, 'DRUG2')
    new_e1_pos = (e1_pos[0], e1_pos[0])
    
    entity_positions[1] = new_e1_pos
    diff = e1_pos[1] - e1_pos[0]
    if diff > 0 and len(entity_positions) > 2:
        for i in range(2, len(entity_positions)):
            e_pos = entity_positions[i]
            if e_pos[0] > e1_pos[1]:
                entity_positions[i] = (entity_positions[i][0] - diff, entity_positions[i][1] - diff)
    # then should handle for the case when there are more than entity 1 and entity 2 i.e. drug0 (any other drug)
    return sentence, entity_positions

In [27]:
per_entity_replacement('The bear ran home to the mountains', [(1,2), (3,4)])

(['The', 'DRUG0', 'DRUG1', 'the', 'mountains'], [(1, 1), (2, 2)])

In [551]:
res('Train/DrugBank/train-drugbank.txt')

'/data/medg/misc/semeval_2010/medical-data/DDICorpus/pre-processed/extraction/Train/DrugBank/train-drugbank.txt'

Despite fixes, there is still data in DrugBank that is messed up, need to just manually delete those rows

In [689]:
train_data = open(res('Train/DrugBank/train-drugbank_filtered.txt'))
train_data = utils.split_data_cut_sentence(train_data)

In [690]:
new_train_data = utils.replace_by_drug_ddi(train_data)

In [691]:
indexes = []
sentences, relations, e1_pos, e2_pos = new_train_data
for i in range(len(sentences)):
    sent = sentences[i]
    pos1 = e1_pos[i]
    pos2 = e2_pos[i]
    sentence_len = len(sent)
    if(pos1[0] >= sentence_len or pos2[0] >= sentence_len):
        print(len(sent), i)
        indexes.append(i)
        print(sent, pos1, pos2)

In [692]:
sentences, relations, e1_pos, e2_pos = train_data
for i in range(len(sentences)):
    if i in indexes:
        print(sentences[i], e1_pos[i], e2_pos[i])

Looking at these sentences in the csv files, it looks like they have that issue where they will say 'loop diuretics' is the entity but there are a bunch of words in between them in the sentence. This is a problem because for the predictive model, it expects entities to be next to each other, so while my algorithm is able to detect these entities correctly, unless I manually edit the sentences myself (which I can do), there is no way for the model to learn. For future reference, these exist in drugbank csv around 9371 if I want to fix it. Might actually make sense to edit these, and edit them for the test data as well.

issues present in train_drugbank.csv and test_medline.csv and fixed