## Find additional tuples for causal analysis of knowledge extraction.
In the notebook `causal-analysis-knowledge-extraction`, we found that we don't have enough data for some strata.
Here, we try to find suitable tuples that would fall into these strata. Then, we label the tuples as correct/incorrect, so that we can use them to resume the analysis.

Data source: http://reverb.cs.washington.edu/

In [1]:
import pandas as pd
import numpy as np
import nltk
from urllib.request import urlopen
from joblib import Parallel, delayed

from utils import string_utils

In [37]:
# Read a 100K-row window of the raw ReVerb ClueWeb dump (tab-separated, no header line),
# starting after the first 400K lines and keeping only four of its columns.
extractions = pd.read_csv(
    '/home/mgauch/scratch/reverb_clueweb_tuples-1.1.txt',
    sep='\t',
    header=None,
    usecols=[1, 2, 3, 8],
    skiprows=400000,
    nrows=100000,
)
extractions.columns = ['arg1', 'relation', 'arg2', 'confidence']

# Cast the three text fields to str so that the string concatenation and
# tokenization further down only ever see strings.
for text_field in ('arg1', 'relation', 'arg2'):
    extractions[text_field] = extractions[text_field].astype(str)

extractions.head()

Unnamed: 0,arg1,relation,arg2,confidence
0,93.5 % of Americans,listen to,terrestrial radio,0.92645
1,93.5 % of residents,speak Spanish at,home,0.93811
2,93.5 % of the people,voted for,Lukashenka,0.96718
3,93.502to,describe the functions of,the ALJ,0.90323
4,93.7 %,are younger than,24,0.93811


## Data Preprocessing
Here, we tokenize and POS-tag the extraction tuples (using NLTK's universal tagset).

In [40]:
# Treat each tuple as one sentence: join arg1, relation and arg2 with single spaces,
# tokenize the result, then POS-tag the tokens with the universal tagset.
full_text = extractions['arg1'] + ' ' + extractions['relation'] + ' ' + extractions['arg2']
extractions['tokenized'] = full_text.apply(nltk.word_tokenize)
extractions['tagged'] = extractions['tokenized'].apply(
    lambda tokens: nltk.pos_tag(tokens, tagset='universal')
)

# Additionally tokenize every component on its own.
for part in ('arg1', 'relation', 'arg2'):
    extractions['tokenized-' + part] = extractions[part].apply(nltk.word_tokenize)

In [42]:
# Map the POS tags of the whole tuple back onto its three components, drop the tuples
# for which that mapping is incomplete, and reduce each component to its distinct tags.
old_shape = extractions.shape[0]

components = ('arg1', 'relation', 'arg2')

# string_utils.find_subsequence is a project helper that is not visible here; presumably
# it returns the entries of t['tagged'] that line up with the component's tokens inside
# t['tokenized'] -- TODO confirm against utils/string_utils.
for part in components:
    extractions['tagged-' + part] = extractions.apply(
        # Bind `part` as a default argument so each lambda keeps its own component name.
        lambda t, part=part: string_utils.find_subsequence(
            t['tokenized'], t['tokenized-' + part], t['tagged']),
        axis=1)

# Remove tuples where we couldn't find all POS tags, i.e. where some component has a
# different number of tagged entries than tokens.
tag_counts_match = [
    extractions['tokenized-' + part].apply(len) == extractions['tagged-' + part].apply(len)
    for part in components
]
# Filter once with the combined mask and take an explicit copy: assigning new columns to
# a boolean-filtered slice is what makes pandas emit SettingWithCopyWarning.
extractions = extractions[tag_counts_match[0] & tag_counts_match[1] & tag_counts_match[2]].copy()

print('Ignoring {} tuples'.format(old_shape - extractions.shape[0]))

# For each component keep only the sorted list of distinct POS tags (second element of
# every tagged entry); these lists are what the strata are defined on.
for part in components:
    extractions['pos-' + part] = extractions['tagged-' + part].apply(
        lambda tagged: sorted({entry[1] for entry in tagged}))

Ignoring 76 tuples


In [18]:
# Cache the tokenized / POS-tagged frame on disk so that later sessions can skip the
# preprocessing above.
# NOTE(review): the file name says "9M+100K" while the load cell currently skips 400K
# rows -- presumably this cell was last run for a different window; confirm the name
# before re-running it.
pos_tagged_csv = 'data/extractions-clueweb-posTagged-9M+100K.csv'
extractions.to_csv(pos_tagged_csv)

In [44]:
# Keep only the tuple texts, the confidence and the per-component POS-tag lists.
data = extractions[['arg1', 'relation', 'arg2', 'confidence', 'pos-arg1', 'pos-relation', 'pos-arg2']]


def _contains_pron(tags):
    """Return True if the tag list contains the PRON tag."""
    return 'PRON' in tags


def _is_adp_verb(tags):
    """Return True if the tag list is exactly ['ADP', 'VERB']."""
    return tags == ['ADP', 'VERB']


# Treatment indicator: a tuple is treated when arg2 contains a PRON tag.
pron_in_arg2 = data['pos-arg2'].apply(_contains_pron)
treated = data[pron_in_arg2]
untreated = data[~pron_in_arg2]

# Strata that need more data:
#   treated tuples whose relation is ADP-VERB,
adp_verb = treated[treated['pos-relation'].apply(_is_adp_verb)]
#   treated tuples with PRON in arg1,
pron = treated[treated['pos-arg1'].apply(_contains_pron)]
#   untreated tuples whose relation is ADP-VERB.
adp_verb_untreated = untreated[untreated['pos-relation'].apply(_is_adp_verb)]

In [None]:
# Manual labelling of the treated candidates (ADP-VERB relation, or PRON in arg1) that
# are used to resume the causal analysis.
additional_data = pd.concat([adp_verb, pron], ignore_index=True)

# Rename to the analysis schema: *_1 columns hold the tuple text, *_0 the POS-tag lists.
additional_data.columns = ['arg1_1', 'relation_1', 'arg2_1', 'confidence', 'arg1_0', 'relation_0', 'arg2_0']
additional_data = additional_data[['confidence', 'arg1_0', 'arg1_1', 'relation_0', 'relation_1', 'arg2_0', 'arg2_1']]

for component in ('arg1', 'relation', 'arg2'):
    tags_col = component + '_0'
    text_col = component + '_1'
    # Tag lists become dash-joined strings, e.g. ['ADP', 'VERB'] -> 'ADP-VERB'.
    additional_data[tags_col] = additional_data[tags_col].apply(lambda tags: '-'.join(tags))
    # In the text, every single space is turned into a dash.
    additional_data[text_col] = additional_data[text_col].apply(lambda text: text.replace(' ', '-'))

additional_data['confidence'] = additional_data['confidence'].round(1)

# O is the outcome label (True = tuple judged correct); everything starts as False and
# the manually chosen rows are switched on below. The commented lines record the labels
# of earlier runs on other windows of the raw data.
additional_data['O'] = False
#additional_data['O'].iloc[[12,13,19,20,29,44,46,51,54,58,60,63,65,84,87,93,168]] = True # For first 500K tuples
#additional_data['O'].iloc[[6,7,8,10,11,12,13,14,25,38,60,85,92,103,111]] = True # For 500K + 100K tuples
#additional_data['O'].iloc[[8,10,11,12,18]] = True # For 9M + 100K tuples

# T is the treatment indicator; all tuples in this cell are treated.
additional_data['T'] = True
#additional_data.to_csv('data/knowledge-extraction-additionalData-9M+100K.csv')
#for i in range(len(additional_data)):
#    print(additional_data.iloc[i,[2,4,6]])

In [None]:
# Manual labelling of untreated candidates: the last 100 untreated tuples whose relation
# is ADP-VERB. The index is reset so that row positions and row labels coincide.
additional_data = adp_verb_untreated.tail(100).reset_index(drop=True)

# Rename to the analysis schema: *_1 columns hold the tuple text, *_0 the POS-tag lists.
additional_data.columns = ['arg1_1', 'relation_1', 'arg2_1', 'confidence', 'arg1_0', 'relation_0', 'arg2_0']
# Explicit copy so that the column assignments below operate on an independent frame
# (avoids SettingWithCopyWarning).
additional_data = additional_data[['confidence', 'arg1_0', 'arg1_1', 'relation_0', 'relation_1', 'arg2_0', 'arg2_1']].copy()

for component in ('arg1', 'relation', 'arg2'):
    tags_col = component + '_0'
    text_col = component + '_1'
    # Tag lists become dash-joined strings, e.g. ['ADP', 'VERB'] -> 'ADP-VERB'.
    additional_data[tags_col] = additional_data[tags_col].apply(lambda tags: '-'.join(tags))
    # In the text, every single space is turned into a dash.
    additional_data[text_col] = additional_data[text_col].apply(lambda text: '-'.join(text.split(' ')))

additional_data['confidence'] = additional_data['confidence'].round(1)

# O is the outcome label (True = tuple judged correct); everything starts as False.
additional_data['O'] = False
# Labels of earlier runs on other windows of the raw data, kept as a record:
#additional_data['O'].iloc[[0,1,3,4,5,6,7,10,11,12,13,15,16,17,19,20,21,23,24,27,28,30,32,33,34,37,40,42,43,46,47,49,50,51,52,55,56,58,61,64,66,68,70,71,72,73,74,76,77,78,81,83,84,85,86,87,88,89,91,92,93,94,95,96,97,98,99]] = True # For 9M head 100 tuples
#additional_data['O'].iloc[[1,2,3,6,8,9,12,13,19,21,22,23,25,26,27,28,29,31,32,33,35,36,38,39,42,43,45,46,48,49,51,52,53,54,55,56,57,58,59,60,61,63,64,65,67,69,70,71,72,73,76,77,80,82,83,84,85,86,87,88,89,90,91,94,96,97,98,99]] = True # For 9M tail 100 tuples
#additional_data['O'].iloc[[2,4,5,6,8,9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,69,70,71,72,73,74,76,77,78,79,80,81,82,83,85,86,87,89,90,91,92,93,94,95,96,97,98,99]] = True # For 500K + 100 tuples

# Row positions judged correct, for the 400K+100K tail 100 tuples.
# NOTE(review): 29 appears out of order between 19 and 21 and 20 is absent -- possibly a
# typo for 20; the list is reproduced unchanged, verify against the labelling notes.
correct_rows = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,29,21,22,23,25,26,27,28,31,32,33,36,38,39,40,42,45,46,48,49,51,52,53,54,55,57,58,59,60,61,67,70,71,75,77,78,79,80,81,82,83,84,85,86,87,88,89,90,92,93,94,95,96,97,98,99]
# Assign through the frame in one indexing step. The chained form
# additional_data['O'].iloc[...] = True writes to an intermediate Series, which warns on
# classic pandas and leaves the frame untouched under Copy-on-Write.
additional_data.iloc[correct_rows, additional_data.columns.get_loc('O')] = True

# T is the treatment indicator; all tuples in this cell are untreated.
additional_data['T'] = False
additional_data.to_csv('data/knowledge-extraction-additionalData-untreated-400K+100Ktail100.csv')
#for i in range(len(additional_data)):
#    print(additional_data.iloc[i,[2,4,6]])