# Reverb knowledge extraction dataset
Data source: http://reverb.cs.washington.edu

In [1]:
import nltk
import pandas as pd
import numpy as np

nltk.download(['punkt', 'averaged_perceptron_tagger', 'universal_tagset'])

[nltk_data] Downloading package punkt to /home/mgauch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mgauch/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/mgauch/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [2]:
filename = 'woe_parse' #'reverb'
reverb = pd.read_csv('data/reverb_emnlp2011_data/extractions/{}.txt'.format(filename), sep='\t', header=None)
reverb.columns = ['sentenceId', 'arg1', 'relation', 'arg2', 'confidence']
reverb.head()

Unnamed: 0,sentenceId,arg1,relation,arg2,confidence
0,0,they,plan raise,premiums,0.643515
1,0,they,plan reduce,benefits,0.643515
2,1,The two-year note 's yield,was unchanged at,5.95 percent,0.663007
3,2,The 12,is,$ 70.00,0.932481
4,3,The principal opposition parties,boycotted,the polls,0.861337


In [3]:
sentences = pd.read_csv('data/reverb_emnlp2011_data/sentences.txt', sep='\t', header=None)
sentences.columns = ['sentenceId', 'sentence']
sentences.head()

Unnamed: 0,sentenceId,sentence
0,0,The nation 's health maintenance organizations...
1,1,The two-year note 's yield was unchanged at 5....
2,2,The 12 to 18-month target price is $ 70.00 per...
3,3,The principal opposition parties boycotted the...
4,4,Gallery hours are 11 a.m. to 6 p.m. daily .


In [4]:
labels = pd.read_csv('data/reverb_emnlp2011_data/labels.txt', sep='\t', header=None)
labels.columns = ['truth', 'sentenceId', 'arg1', 'relation', 'arg2']
labels.head()

Unnamed: 0,truth,sentenceId,arg1,relation,arg2
0,0,437,the lawyers,dismissed by,he
1,0,53,many as 57 deputies,let,Tuesday night
2,1,159,people,stay in,bad marriages
3,1,182,you,can still buy,a single quart
4,0,264,a water-based liquid,are lined up beside richly colored berries soa...,a balsamic vinegar-brown sugar sauce


In [6]:
sentences['tokenized'] = sentences['sentence'].apply(nltk.word_tokenize)
sentences['tagged'] = sentences['tokenized'].apply(lambda x: nltk.pos_tag(x, tagset='universal'))

In [7]:
reverb = reverb.merge(labels, on=['sentenceId', 'arg1', 'relation', 'arg2'], how='inner')
reverb = reverb.merge(sentences, on='sentenceId', how='left')

reverb['tokenized-arg1'] = reverb['arg1'].apply(nltk.word_tokenize)
reverb['tokenized-relation'] = reverb['relation'].apply(nltk.word_tokenize)
reverb['tokenized-arg2'] = reverb['arg2'].apply(nltk.word_tokenize)

In [8]:
def lcs(sentence, subsentence, pos_tags):
    """ Get POS tags for arg1/relation/arg2 in a best-effort way """
    s1, s2 = sentence, subsentence
    matrix = [[list() for x in range(len(s2))] for x in range(len(s1))]
    pos_matrix = [[list() for x in range(len(s2))] for x in range(len(s1))]
    for i in range(len(s1)):
        for j in range(len(s2)):
            if s1[i] == s2[j]:
                if i == 0 or j == 0:
                    matrix[i][j] = [s1[i]]
                    pos_matrix[i][j] = [pos_tags[i]]
                else:
                    matrix[i][j] = matrix[i-1][j-1] + [s1[i]]
                    pos_matrix[i][j] = pos_matrix[i-1][j-1] + [pos_tags[i]]
            else:
                matrix[i][j] = max(matrix[i-1][j], matrix[i][j-1], key=len)
                pos_matrix[i][j] = max(pos_matrix[i-1][j], pos_matrix[i][j-1], key=len)

    return pos_matrix[-1][-1]

In [9]:
reverb['tagged-arg1'] = reverb.apply(lambda t: list(lcs(t['tokenized'], t['tokenized-arg1'], t['tagged'])), axis=1)
reverb['tagged-relation'] = reverb.apply(lambda t: list(lcs(t['tokenized'], t['tokenized-relation'], t['tagged'])), axis=1)
reverb['tagged-arg2'] = reverb.apply(lambda t: list(lcs(t['tokenized'], t['tokenized-arg2'], t['tagged'])), axis=1)

# Remove tuples where we couldn't find all POS tags
reverb = reverb[reverb['tokenized-arg1'].apply(len) == reverb['tagged-arg1'].apply(len)]
reverb = reverb[reverb['tokenized-relation'].apply(len) == reverb['tagged-relation'].apply(len)]
reverb = reverb[reverb['tokenized-arg2'].apply(len) == reverb['tagged-arg2'].apply(len)]

In [10]:
reverb['pos-arg1'] = reverb['tagged-arg1'].apply(lambda t: set(list(x[1] for x in t)))
reverb['pos-relation'] = reverb['tagged-relation'].apply(lambda t: set(list(x[1] for x in t)))
reverb['pos-arg2'] = reverb['tagged-arg2'].apply(lambda t: set(list(x[1] for x in t)))

reverb.head()

out = reverb[['confidence', 'pos-arg1', 'pos-relation', 'pos-arg2', 'tokenized-arg1', 'tokenized-relation', 'tokenized-arg2', 'truth']].copy()
out['confidence'] = 'a_' + out['confidence'].round(1).astype(str) + '_:'
out['arg1'] = 'a_' + out['pos-arg1'].apply(lambda x: '-'.join(x)) + '_' + out['tokenized-arg1'].apply(lambda x: '-'.join(x)) + '_:'
out['relation'] = 'a_' + out['pos-relation'].apply(lambda x: '-'.join(x)) + '_' + out['tokenized-relation'].apply(lambda x: '-'.join(x)) + '_:'
out['arg2'] = 'a_' + out['pos-arg2'].apply(lambda x: '-'.join(x)) + '_' + out['tokenized-arg2'].apply(lambda x: '-'.join(x)) + '_:'

out = out[['confidence', 'arg1', 'relation', 'arg2', 'truth']]
out.head()

Unnamed: 0,confidence,arg1,relation,arg2,truth
0,a_0.6_:,a_PRON_they_:,a_VERB_plan-raise_:,a_NOUN_premiums_:,0
1,a_0.6_:,a_PRON_they_:,a_VERB_plan-reduce_:,a_NOUN_benefits_:,0
2,a_0.7_:,a_PRT-DET-ADJ-NOUN_The-two-year-note-'s-yield_:,a_ADJ-VERB-ADP_was-unchanged-at_:,a_NOUN-NUM_5.95-percent_:,1
3,a_0.9_:,a_DET-NUM_The-12_:,a_VERB_is_:,a_.-NUM_$-70.00_:,0
4,a_0.9_:,a_DET-ADJ-NOUN_The-principal-opposition-parties_:,a_VERB_boycotted_:,a_DET-NOUN_the-polls_:,1


In [11]:
feature_vector = 'a:a:a:a:'
structure_vector = '1:1:1:1:'
max_dims = '1:2:2:2:'
error_rate = out['truth'].mean()
cost = 0.0

In [12]:
top_row = feature_vector + '\t' + max_dims + ';' + str(error_rate) + ';' + str(cost) + ';false;' + feature_vector + ';' + structure_vector + ';' + str(len(out)) + ';0;'
top_row

'a:a:a:a:\t1:2:2:2:;0.37876960193003617;0.0;false;a:a:a:a:;1:1:1:1:;829;0;'

In [13]:
out['input-str'] = out['confidence'] + out['arg1'] + out['relation'] + out['arg2']
out['truth'] = out['truth'] == 1

In [14]:
# write Data X-Ray input file
with open('./data/{}-posAsSet-input.txt'.format(filename), 'w') as f:
    f.write(top_row)
    list(f.write('{}%{}%{}='.format(i, out['truth'].iloc[i], out['input-str'].iloc[i])) for i in range(len(out)))

In [15]:
print(reverb[reverb['pos-arg2'].apply(set) == set(['DET','NUM'])]['arg2'].values)

['the 19th']
