In [None]:
### notebook to analyze textual similarity of clueweb instances that share the same entity-relationship-entity triple
### uses the 5000 most common triples which are stored in 
### /Users/corbinrosset/Dropbox/Arora/QA-data/VanDurme_FB_annotations/annotated_clueweb/ClueWeb09_English_1/processed/grouped_textual_triples.pkl


'''
Only look at "clueweb_FB15k_all-<lhs, rhs, rel, sent>.pkl" files
or perhaps the raw versions thereof. 

Investigate overlap of Clueweb textual mentions and FB15k triples
- How many unique textual mentions are associated with each triple
 	- with the most common triples (have the most examples)
 	- then investigate how paraphrastic these unique textual mentions are for 
 	  a given triple (triple is the label - distant supervision)
- which and how many unique text strings are associated with many triples?
	- if there are many, it confounds the assumption that text is 
	  representative of the triples they are mentions of

'''
from __future__ import division
import sys
import matplotlib.pyplot as plt
import cPickle as pickle
import numpy as np
import scipy.sparse as sp
import operator
import time
import random
import itertools
from collections import Counter

###############################################################################
###                         	Globals                                     ###
###############################################################################

# --- filesystem locations -----------------------------------------------------
# NOTE: absolute, machine-specific paths; edit these when running elsewhere.
data_path = '/Users/corbinrosset/Dropbox/Arora/QA-data/FB15k_Clueweb/processed_text/'
execute_path = '/Users/corbinrosset/Dropbox/Arora/QA-code/src/process_clueweb/'
# FB15k_path = '/Users/corbinrosset/Dropbox/Arora/QA-code/src/TransE_Text/data/'
manifest_file = 'manifest.txt'

# Prefix shared by the grouped-triple pickles under data_path, e.g.
#   grouped_FB15k_clueweb_triples-counts.pkl
#   grouped_FB15k_clueweb_triples-text_sets.pkl
input_prefix = 'grouped_FB15k_clueweb_triples'
datatyp = 'all' # which .pkl files to load

# --- dataset statistics (hard-coded) --------------------------------------------
# presumably totals measured on the processed corpus -- TODO confirm provenance
NUM_FB1K_TRIPLES = 47291696
NUM_UNIQUE_SENTENCES = 6194853

# --- tunables -----------------------------------------------------------------
USE_FINAL = False #True
min_words_per_text = 2 ### min number of words needed to count a textual triple
num_triples = 5000 ### put top num_triples into the triples_to_extract below

# --- placeholders, given empty defaults here ----------------------------------
entity2idx = idx2entity = None
num_entities_union_rels = 0 # sum of |Entities| + |Relationships| in FB15k

# sentence <-> index lookup tables (two distinct dict objects)
unique_sent_map, idx_2_sent_map = {}, {}

text_per_triple_cntr = {} # count total number of textual instances f.e. triple
unique_text_per_triple = {} # set of unique text mentions f.e. triple



In [None]:
### load data
start = time.clock() 

text_per_triple_cntr = pickle.load(open(data_path + input_prefix + '-counts.pkl', 'r'))
unique_text_per_triple = pickle.load(open(data_path + input_prefix + '-text_sets.pkl', 'r'))
triple_per_text = pickle.load(open(data_path + input_prefix + '-triple_per_text_count.pkl', 'r'))
entity2idx = pickle.load(open(data_path + 'FB15k_entity2idx.pkl', 'r'))
idx2entity = pickle.load(open(data_path + 'FB15k_idx2entity.pkl', 'r'))
unique_sent_map = pickle.load(open(data_path + 'clueweb_FB15k_' + 'all-sent2idx.pkl', 'r'))
idx_2_sent_map = pickle.load(open(data_path + 'clueweb_FB15k_' + 'all-idx2sent.pkl', 'r'))
num_entities_union_rels = np.max(entity2idx.values()) + 1
srtd_triples = sorted(text_per_triple_cntr, key=text_per_triple_cntr.__getitem__, reverse=True)

elapsed = time.clock()
elapsed = elapsed - start
print "loaded data in: ", elapsed
assert len(unique_text_per_triple.keys()) == len(text_per_triple_cntr.keys())
print 'Number of unique triples: ' + str(len(unique_text_per_triple.keys()))


In [None]:
rels_I_want = ['food', 'medicine', 'spaceflight', 'architecture', 'education', \
               'business', 'biology', 'olympics', 'aviation', \
               'music', 'chemistry', 'law', 'religion', \
               'spaceflight']

counter = 0
print 'starting'
for i in srtd_triples:
    if rels_I_want[14] not in idx2entity[i[1]]:
        continue
    if counter > 1000:
        break
    counter += 1
    avg_triple_per_text = np.average([triple_per_text[j] for j in unique_text_per_triple[i]])
    print '\t' + str(idx2entity[i[0]]) + ' ' + str(idx2entity[i[1]]) \
        + ' ' + str(idx2entity[i[2]]) + ' count: ' + str(text_per_triple_cntr[i]) \
        + ' num unique texts: ' + str(len(unique_text_per_triple[i])) \
        + ' average number of other triples associated with the sentences here: ' + str(avg_triple_per_text)

In [None]:
### show hist of number of different TRIPLES assigned to each text instance
triples_per_text_srtd = [np.ceil(np.log2(i)) for i in triple_per_text.values()]
cntr = Counter(triples_per_text_srtd)
plt.hist(triples_per_text_srtd, bins=20)
plt.xlabel('log base 2 of the number of distinct triples assigned to a textual instance (rounded up)')
plt.ylabel('Number of Textual Instances')
plt.title('Histogram of the number of triples assigned to each textual instance')
plt.show()
print cntr

In [None]:
### show hist of number of different RELATIONS assigned to each text instance
triples_per_text_srtd = [np.ceil(np.log2(i)) for i in triple_per_text.values()]
cntr = Counter(triples_per_text_srtd)
plt.hist(triples_per_text_srtd, bins=20)
plt.xlabel('log base 2 of the number of distinct triples assigned to a textual instance (rounded up)')
plt.ylabel('Number of Textual Instances')
plt.title('Histogram of the number of triples assigned to each textual instance')
plt.show()
print cntr