# Types of examples for which the stop words model predicts the wrong relation while the baseline predicts the correct relation

In [1]:
%load_ext autoreload

In [2]:
%autoreload
import os
from sys import path
import re
import pandas as pd
path.append('../../..')
import numpy as np
from scipy.stats import ttest_rel
from relation_extraction.data.summarize import *
from relation_extraction.data.converters.converter_semeval2010 import relation_dict
from relation_extraction.data.utils import split_data_cut_sentence
# Directory holding every input and output file used by this analysis.
output_path = '/scratch/geeticka/relation-extraction-result/semeval-analyze/'


def res(path):
    """Return `path` joined onto `output_path`.

    The parameter shadows the `path` name imported from `sys`, but only
    inside this function.
    """
    return os.path.join(output_path, path)


# Sentence files that are compared line by line further down.
original_sentences_path = res('test_original_for_stop_word_analysis.txt')
stop_words_sentences_path = res('test_punct_stop_digit_for_stop_word_analysis.txt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /afs/csail.mit.edu/u/g/geeticka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Display the label-id -> relation-name mapping imported from the SemEval 2010 converter.
relation_dict

{0: 'Component-Whole(e2,e1)',
 1: 'Instrument-Agency(e2,e1)',
 2: 'Member-Collection(e1,e2)',
 3: 'Cause-Effect(e2,e1)',
 4: 'Entity-Destination(e1,e2)',
 5: 'Content-Container(e1,e2)',
 6: 'Message-Topic(e1,e2)',
 7: 'Product-Producer(e2,e1)',
 8: 'Member-Collection(e2,e1)',
 9: 'Entity-Origin(e1,e2)',
 10: 'Cause-Effect(e1,e2)',
 11: 'Component-Whole(e1,e2)',
 12: 'Message-Topic(e2,e1)',
 13: 'Product-Producer(e1,e2)',
 14: 'Entity-Origin(e2,e1)',
 15: 'Content-Container(e2,e1)',
 16: 'Instrument-Agency(e1,e2)',
 17: 'Entity-Destination(e2,e1)',
 18: 'Other'}

Semeval uses border size 50 as default, and the method split_data_cut_sentence drops some sentences. We need to find out which ones are dropped so that we can remove the same lines from the original sentences and have them match the stop words sentences line by line.

In [4]:
# below is a modified version of the split_data_cut_sentence which prints the lines that are dropped
def print_dropped_sentences(stop_words_sentences_path):
    """Print the 1-based number of every line that would be dropped.

    Each line is stripped, lower-cased and split on whitespace. A line is
    reported when the integer in field 1 (the left start position) is
    greater than or equal to the integer in field 4 (the right end position).

    Args:
        stop_words_sentences_path: path of the pre-processed sentence file.

    Returns:
        None; the result is printed.
    """
    with open(stop_words_sentences_path, 'r') as sentences_file:
        # enumerate(start=1) yields the human-friendly 1-based line number.
        for line_number, raw_line in enumerate(sentences_file, start=1):
            fields = raw_line.strip().lower().split()
            first_start = int(fields[1])
            last_end = int(fields[4])
            if not first_start < last_end:
                print("Dropping line (starting from 1) %d \n" % line_number)

In [5]:
# Report which lines of the pre-processed punct/stop/digit test file would be dropped.
print_dropped_sentences("/crimea/geeticka/data/relation_extraction/semeval2010/pre-processed/punct_stop_digit/test_punct_stop_digit.txt")

Dropping line (starting from 1) 82 

Dropping line (starting from 1) 1129 

Dropping line (starting from 1) 1255 



Now that we know which lines were dropped, we can do an exact match and drop the corresponding lines from the train data. I am going to do this manually and write the resulting original sentences to a separate folder, and also delete the predictions that were made for those lines. We only care about comparing the lines where the stop words model predicted wrong, so if there isn't even a prediction for a line, it makes no sense to keep it.

In [6]:
def read_answers_line(line):
    """Split one answers-file line into its two whitespace-separated fields.

    Args:
        line: a string holding exactly two whitespace-separated fields.

    Returns:
        A `(linenum, relation)` tuple of strings.

    Raises:
        ValueError: if the line does not contain exactly two fields.
    """
    # str.split() with no separator already ignores leading/trailing whitespace.
    (linenum, relation) = line.split()
    return linenum, relation

In [7]:
def asstring(list_of_strings):
    """Return the given strings joined by single spaces."""
    separator = " "
    return separator.join(list_of_strings)

In [8]:
def read_sentence_and_entities(line):
    """Parse one sentence line into its relation, both entities and the sentence.

    The line is split on whitespace. Field 0 is an integer key into
    `relation_dict`, fields 1-2 are the inclusive start/end token indices of
    the first entity, fields 3-4 those of the second entity, and every
    remaining field is a sentence token.

    Returns:
        A tuple `(relation, entity1, entity2, sentence)` in which the last
        three items are space-joined strings.
    """
    fields = line.strip().split()
    tokens = fields[5:]
    relation = relation_dict[int(fields[0])]
    e1_start, e1_end = int(fields[1]), int(fields[2])
    e2_start, e2_end = int(fields[3]), int(fields[4])
    # End indices are inclusive, hence the +1 when slicing.
    entity1_tokens = tokens[e1_start:e1_end + 1]
    entity2_tokens = tokens[e2_start:e2_end + 1]

    return relation, asstring(entity1_tokens), asstring(entity2_tokens), asstring(tokens)

In [9]:
# Maps a 0-based sentence index to (baseline_relation, stop_words_relation);
# filled in by the answer-file comparison below.
needed_linenum_and_relation = {}

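In this case the correct model is the baseline and the incorrect one is the stop words model. When opening the answer files, put the incorrect model's file first (on the left) and the correct model's file second (on the right).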

Error analysis here would not be accurate because the border size parameter cuts out a couple of sentences. Skip this analysis and mention that. 

In [10]:
# Read the stop words predictions, the baseline predictions and the gold labels
# in lockstep, and remember every line where the baseline matches gold but the
# stop words model does not.
with open(res('answers_for_dev-stop-words.txt')) as stop_words_answers, \
        open(res("answers_for_dev-baseline_for_stop_word_analysis.txt")) as baseline_answers, \
        open(res('answers_for_dev-stop-words_gold.txt')) as gold_answers:
    for stop_words_line, baseline_line, gold_line in zip(stop_words_answers, baseline_answers, gold_answers):
        linenum, stop_words_relation = read_answers_line(stop_words_line)
        baseline_relation = read_answers_line(baseline_line)[1]
        gold_relation = read_answers_line(gold_line)[1]
        baseline_is_right = baseline_relation == gold_relation
        stop_words_is_right = stop_words_relation == gold_relation
        if baseline_is_right and not stop_words_is_right:
            # Shift the id from the answers file down by one so it lines up with
            # the 0-based counter used when the sentence files are scanned.
            needed_linenum_and_relation[int(linenum) - 1] = (baseline_relation, stop_words_relation)

In [11]:
# Number of sentences the baseline got right and the stop words model got wrong.
len(needed_linenum_and_relation)

364

In [12]:
# Percentage of all gold-labelled lines on which the baseline was right and the
# stop words model was wrong. The gold file is opened in a `with` block so the
# handle is closed deterministically, and its lines are counted while streaming
# instead of being loaded into a list.
with open(res('answers_for_dev-stop-words_gold.txt')) as gold_answers_file:
    total_gold_lines = sum(1 for _line in gold_answers_file)
len(needed_linenum_and_relation) / total_gold_lines * 100

13.411938098747237

Note that we list the baseline first and then the stop words version; here the prediction of the baseline is correct (gold) whereas the prediction of the stop words version is incorrect.

In [44]:
# One tally per relation label, all starting at zero. Integers are immutable,
# so chained assignment does not alias the counters.
cw_e2e1 = ia_e2e1 = mc_e1e2 = ce_e2e1 = ed_e1e2 = cc_e1e2 = mt_e1e2 = 0
pp_e2e1 = mc_e2e1 = eo_e1e2 = ce_e1e2 = cw_e1e2 = mt_e2e1 = pp_e1e2 = 0
eo_e2e1 = cc_e2e1 = ia_e1e2 = ed_e2e1 = other = 0
# Counter name for each relation_dict entry:
#   0: 'Component-Whole(e2,e1)'      -> cw_e2e1
#   1: 'Instrument-Agency(e2,e1)'    -> ia_e2e1
#   2: 'Member-Collection(e1,e2)'    -> mc_e1e2
#   3: 'Cause-Effect(e2,e1)'         -> ce_e2e1
#   4: 'Entity-Destination(e1,e2)'   -> ed_e1e2
#   5: 'Content-Container(e1,e2)'    -> cc_e1e2
#   6: 'Message-Topic(e1,e2)'        -> mt_e1e2
#   7: 'Product-Producer(e2,e1)'     -> pp_e2e1
#   8: 'Member-Collection(e2,e1)'    -> mc_e2e1
#   9: 'Entity-Origin(e1,e2)'        -> eo_e1e2
#  10: 'Cause-Effect(e1,e2)'         -> ce_e1e2
#  11: 'Component-Whole(e1,e2)'      -> cw_e1e2
#  12: 'Message-Topic(e2,e1)'        -> mt_e2e1
#  13: 'Product-Producer(e1,e2)'     -> pp_e1e2
#  14: 'Entity-Origin(e2,e1)'        -> eo_e2e1
#  15: 'Content-Container(e2,e1)'    -> cc_e2e1
#  16: 'Instrument-Agency(e1,e2)'    -> ia_e1e2
#  17: 'Entity-Destination(e2,e1)'   -> ed_e2e1
#  18: 'Other'                       -> other

In [45]:
# For every sentence the baseline classified correctly and the stop words model
# misclassified, print both predictions, both entity pairs and both sentences,
# and tally the baseline (gold) relation.
print('We print the baseline first and then the stop words version. Gold relation corresponds to baseline\n\n')

# Relation names in the order of the 19 counter variables assigned at the end of this cell.
counter_relation_order = (
    'Component-Whole(e2,e1)',      # cw_e2e1
    'Instrument-Agency(e2,e1)',    # ia_e2e1
    'Member-Collection(e1,e2)',    # mc_e1e2
    'Cause-Effect(e2,e1)',         # ce_e2e1
    'Entity-Destination(e1,e2)',   # ed_e1e2
    'Content-Container(e1,e2)',    # cc_e1e2
    'Message-Topic(e1,e2)',        # mt_e1e2
    'Product-Producer(e2,e1)',     # pp_e2e1
    'Member-Collection(e2,e1)',    # mc_e2e1
    'Entity-Origin(e1,e2)',        # eo_e1e2
    'Cause-Effect(e1,e2)',         # ce_e1e2
    'Component-Whole(e1,e2)',      # cw_e1e2
    'Message-Topic(e2,e1)',        # mt_e2e1
    'Product-Producer(e1,e2)',     # pp_e1e2
    'Entity-Origin(e2,e1)',        # eo_e2e1
    'Content-Container(e2,e1)',    # cc_e2e1
    'Instrument-Agency(e1,e2)',    # ia_e1e2
    'Entity-Destination(e2,e1)',   # ed_e2e1
    'Other',                       # other
)

# Tally built from scratch on every run, so re-executing this cell cannot
# double count the way in-place `+= 1` on the counters did.
relation_counts = {}
with open(original_sentences_path) as original_sentences, open(stop_words_sentences_path) as stop_words_sentences:
    for curr_linenum, (x, y) in enumerate(zip(original_sentences, stop_words_sentences)):
        # Direct dict membership is O(1); rebuilding a key list per line was O(n) each time.
        if curr_linenum not in needed_linenum_and_relation:
            continue
        _, e1_b, e2_b, s_b = read_sentence_and_entities(x.strip())
        _, e1_c, e2_c, s_c = read_sentence_and_entities(y.strip())
        r_b, r_c = needed_linenum_and_relation[curr_linenum]
        relation_counts[r_b] = relation_counts.get(r_b, 0) + 1
        print('Predicted Relation: \t {0}, {1} \nEntities: \t {2}, {3} \t {4}, {5} \nSentences: \n\t{6} \n\t {7}'.format(
                r_b, r_c, e1_b, e1_c, e2_b, e2_c, s_b, s_c))
        print('\n')

# Publish the tallies under the counter names that the following cells read.
# Relations outside counter_relation_order are ignored, as they were before.
(cw_e2e1, ia_e2e1, mc_e1e2, ce_e2e1, ed_e1e2, cc_e1e2, mt_e1e2,
 pp_e2e1, mc_e2e1, eo_e1e2, ce_e1e2, cw_e1e2, mt_e2e1, pp_e1e2,
 eo_e2e1, cc_e2e1, ia_e1e2, ed_e2e1, other) = (
    relation_counts.get(relation_name, 0) for relation_name in counter_relation_order
)

We print the baseline first and then the stop words version. Gold relation corresponds to baseline


Predicted Relation: 	 Message-Topic(e1,e2), Entity-Destination(e1,e2) 
Entities: 	 audits, audits 	 waste, waste 
Sentences: 
	The most common audits were about waste and recycling . 
	 common audits waste recycling


Predicted Relation: 	 Instrument-Agency(e2,e1), Component-Whole(e2,e1) 
Entities: 	 master, master 	 stick, stick 
Sentences: 
	The school master teaches the lesson with a stick . 
	 school master teaches lesson stick


Predicted Relation: 	 Component-Whole(e1,e2), Other 
Entities: 	 ear, ear 	 elephant, elephant 
Sentences: 
	The ear of the African elephant is significantly larger -- measuring 183 cm by 114 cm in the bush elephant . 
	 ear African elephant significantly larger measuring NUMBER cm NUMBER cm bush elephant


Predicted Relation: 	 Cause-Effect(e1,e2), Cause-Effect(e2,e1) 
Entities: 	 viruses, viruses 	 infections, infections 
Sentences: 
	Of the hundreds of s

In [46]:
# Print the per-relation tallies, space separated, in relation_dict id order (0..18).
relation_tallies = (
    cw_e2e1, ia_e2e1, mc_e1e2, ce_e2e1, ed_e1e2, cc_e1e2, mt_e1e2,
    pp_e2e1, mc_e2e1, eo_e1e2, ce_e1e2, cw_e1e2, mt_e2e1, pp_e1e2,
    eo_e2e1, cc_e2e1, ia_e1e2, ed_e2e1, other,
)
print(*relation_tallies)

20 28 3 19 38 16 16 18 25 49 15 18 9 15 0 5 2 0 68


In [48]:
# Sanity check: the per-relation tallies should add up to the number of selected sentences.
sum((
    cw_e2e1, ia_e2e1, mc_e1e2, ce_e2e1, ed_e1e2, cc_e1e2, mt_e1e2,
    pp_e2e1, mc_e2e1, eo_e1e2, ce_e1e2, cw_e1e2, mt_e2e1, pp_e1e2,
    eo_e2e1, cc_e2e1, ia_e1e2, ed_e2e1, other,
))

364