# Knowledge extraction dataset
In this dataset, we have extracted tuples `arg1, relation, arg2` with confidence scores. We add POS-tags as additional dimensions.

Data source: http://reverb.cs.washington.edu

In [1]:
import nltk
import pandas as pd
import numpy as np

from utils import string_utils

In [None]:
filename = 'woe_parse'
extractions = pd.read_csv('data/reverb_emnlp2011_data/extractions/{}.txt'.format(filename), sep='\t', header=None)
extractions.columns = ['id', 'arg1', 'relation', 'arg2', 'confidence']

# The original sentences
sentences = pd.read_csv('data/reverb_emnlp2011_data/sentences.txt', sep='\t', header=None)
sentences.columns = ['sentenceId', 'sentence']

# Correctness labels
labels = pd.read_csv('data/reverb_emnlp2011_data/labels.txt', sep='\t', header=None)
labels.columns = ['truth', 'sentenceId', 'arg1', 'relation', 'arg2']

### Data Preprocessing
We add POS-tags to `arg1`, `relation` and `arg2` by tagging the original sentence and searching for the extracted tokens in the tagged sentences.

In [6]:
sentences['tokenized'] = sentences['sentence'].apply(nltk.word_tokenize)
sentences['tagged'] = sentences['tokenized'].apply(lambda x: nltk.pos_tag(x, tagset='universal'))

extractions = extractions.merge(labels, on=['sentenceId', 'arg1', 'relation', 'arg2'], how='inner')
extractions = extractions.merge(sentences, on='sentenceId', how='left')

extractions['tokenized-arg1'] = extractions['arg1'].apply(nltk.word_tokenize)
extractions['tokenized-relation'] = extractions['relation'].apply(nltk.word_tokenize)
extractions['tokenized-arg2'] = extractions['arg2'].apply(nltk.word_tokenize)

In [8]:
old_shape = extractions.shape[0]

extractions['tagged-arg1'] = extractions.apply(lambda t: string_utils.find_subsequence(t['tokenized'], t['tokenized-arg1'], t['tagged']), axis=1)
extractions['tagged-relation'] = extractions.apply(lambda t: string_utils.find_subsequence(t['tokenized'], t['tokenized-relation'], t['tagged']), axis=1)
extractions['tagged-arg2'] = extractions.apply(lambda t: string_utils.find_subsequence(t['tokenized'], t['tokenized-arg2'], t['tagged']), axis=1)

# Remove tuples where we couldn't find all POS tags
extractions = extractions[extractions['tokenized-arg1'].apply(len) == extractions['tagged-arg1'].apply(len)]
extractions = extractions[extractions['tokenized-relation'].apply(len) == extractions['tagged-relation'].apply(len)]
extractions = extractions[extractions['tokenized-arg2'].apply(len) == extractions['tagged-arg2'].apply(len)]

print('Ignoring {} tuples'.format(old_shape - extractions.shape[0]))

Ignoring 52 tuples


### Formatting for Data X-Ray
Now, we format the tuples such that they can be read by Data X-Ray to find a cause assumption.

In [9]:
extractions['pos-arg1'] = extractions['tagged-arg1'].apply(lambda t: set(list(x[1] for x in t)))
extractions['pos-relation'] = extractions['tagged-relation'].apply(lambda t: set(list(x[1] for x in t)))
extractions['pos-arg2'] = extractions['tagged-arg2'].apply(lambda t: set(list(x[1] for x in t)))

extractions.head()

out = extractions[['confidence', 'pos-arg1', 'pos-relation', 'pos-arg2', 'tokenized-arg1', 'tokenized-relation', 'tokenized-arg2', 'truth']].copy()
out['confidence'] = 'a_' + out['confidence'].round(1).astype(str) + '_:'
out['arg1'] = 'a_' + out['pos-arg1'].apply(lambda x: '-'.join(x)) + '_' + out['tokenized-arg1'].apply(lambda x: '-'.join(x)) + '_:'
out['relation'] = 'a_' + out['pos-relation'].apply(lambda x: '-'.join(x)) + '_' + out['tokenized-relation'].apply(lambda x: '-'.join(x)) + '_:'
out['arg2'] = 'a_' + out['pos-arg2'].apply(lambda x: '-'.join(x)) + '_' + out['tokenized-arg2'].apply(lambda x: '-'.join(x)) + '_:'

out = out[['confidence', 'arg1', 'relation', 'arg2', 'truth']]
out.head()

Unnamed: 0,confidence,arg1,relation,arg2,truth
0,a_0.6_:,a_PRON_they_:,a_VERB_plan-raise_:,a_NOUN_premiums_:,0
1,a_0.6_:,a_PRON_they_:,a_VERB_plan-reduce_:,a_NOUN_benefits_:,0
2,a_0.7_:,a_ADJ-NOUN-PRT-DET_The-two-year-note-'s-yield_:,a_ADJ-VERB-ADP_was-unchanged-at_:,a_NOUN-NUM_5.95-percent_:,1
3,a_0.9_:,a_DET-NUM_The-12_:,a_VERB_is_:,a_.-NUM_$-70.00_:,0
4,a_0.9_:,a_ADJ-NOUN-DET_The-principal-opposition-parties_:,a_VERB_boycotted_:,a_NOUN-DET_the-polls_:,1


In [11]:
feature_vector = 'a:a:a:a:'
structure_vector = '1:1:1:1:'
max_dims = '1:2:2:2:'
error_rate = out['truth'].mean()
cost = 0.0

top_row = feature_vector + '\t' + max_dims + ';' + str(error_rate) + ';' + str(cost) + ';false;' + feature_vector + ';' + structure_vector + ';' + str(len(out)) + ';0;'
top_row

'a:a:a:a:\t1:2:2:2:;0.37876960193003617;0.0;false;a:a:a:a:;1:1:1:1:;829;0;'

In [12]:
out['input-str'] = out['confidence'] + out['arg1'] + out['relation'] + out['arg2']
out['truth'] = out['truth'] == 1

In [25]:
# write Data X-Ray input file
with open('./data/{}-posAsSet-input.txt'.format(filename), 'w') as f:
    f.write(top_row)
    list(f.write('{}%{}%{}='.format(i, out['truth'].iloc[i], out['input-str'].iloc[i])) for i in range(len(out)))