In [1]:
import pandas as pd
import numpy as np
import os
import glob
import json
import re
from collections import defaultdict

In [7]:
# Data with Entities
#
# The data root defaults to the original location on the shared server but can
# be redirected by setting the ENTITY_LINKING_ROOT environment variable, so the
# notebook can run on another machine without editing this cell.
ROOT_FOLDER = os.environ.get(
    "ENTITY_LINKING_ROOT",
    "/raid6/home/ramraj/2021/ir/entity-reranking/Entity-Linking/",
)

# Earlier data locations, kept for reference:
# BENCHMARK_TRAIN_FOLD_FOLDER = os.path.join(ROOT_FOLDER, "benchmark-train-relevance-v2.0")
# BENCHMARK_TEST_FILE = os.path.join(ROOT_FOLDER, "test-data", "ramraj-test-data-top100-BM25.json")

# Folder holding the per-fold training JSON files ("fold-*.json").
BENCHMARK_TRAIN_FOLD_FOLDER = os.path.join(ROOT_FOLDER, "Train-with-entities")
# Single JSON file holding the test set.
BENCHMARK_TEST_FILE = os.path.join(ROOT_FOLDER, "Test-with-entities/ramraj-test-data-top100-BM25-opt.json")

# 1. Load train data

In [3]:
# Gather every per-fold training file; sorting in place keeps the fold order
# deterministic regardless of the order the filesystem returns them in.
train_files = glob.glob(
    os.path.join(BENCHMARK_TRAIN_FOLD_FOLDER, "fold-*.json")
)
train_files.sort()
len(train_files)


5

In [4]:
# Concatenate the records of every fold file into one flat list.
train_data = []
for train_file in train_files:
    # `with` closes each handle as soon as the fold is parsed instead of
    # leaving it to the garbage collector. The encoding is pinned to UTF-8
    # (the JSON interchange encoding) so decoding does not depend on the
    # platform default.
    # NOTE(review): the sample shown below contains 'â\x80\x93', which is what
    # UTF-8 bytes look like when decoded as Latin-1 — confirm against the raw
    # files whether that came from the loader or is baked into the data.
    with open(train_file, 'r', encoding='utf-8') as fold_fh:
        train_data.extend(json.load(fold_fh))

len(train_data)

1937

In [5]:
# Display the first training record to inspect the structure of one entry.
train_data[0]

{'qString': 'Ice bath Techniques Ice baths versus cold baths',
 'RelevantDocuments': [{'docScore': 1.0,
   'docID': '54bacb1c81f70a1db3d4d8e16fb551a5298eafbf',
   'docText': 'Several sources suggest that cold baths (60â\x80\x9375 degrees Fahrenheit) were preferable to ice baths. Physiotherapist Tony Wilson of the University of Southampton said that extremely cold temperatures were unnecessary and a "cold bath" would be just as effective as an ice bath. Another agreed that a mere cold bath is preferable to ice baths which are "unnecessary." A third report suggested that cool water (60â\x80\x9375 degrees Fahrenheit) was just as good as cold water (54â\x80\x9360 degrees Fahrenheit) and that eight to ten minutes should be sufficient time, and warned against exceeding ten minutes.',
   'dEntities': [{'mention': 'Fahrenheit',
     'entity_title': 'Fahrenheit',
     'score': 0.2771810293197632,
     'entity_id': 11524},
    {'mention': 'ice',
     'entity_title': 'Ice',
     'score': 0.196264

In [6]:
# Total number of documents listed under 'RelevantDocuments' across all
# training records.
doc_cnt = sum(
    1
    for sample in train_data
    for _doc in sample['RelevantDocuments']
)

doc_cnt

4863

# 2. Load test data

In [8]:
# Load the whole test set from its single JSON file. The context manager
# closes the handle once parsing is done, and the explicit UTF-8 encoding
# keeps decoding independent of the platform default.
with open(BENCHMARK_TEST_FILE, 'r', encoding='utf-8') as test_fh:
    test_data = json.load(test_fh)
len(test_data)

2254

In [None]:
# Peek at one test record to check its format: print only the first entry
# (nothing is printed if the test set is empty).
for first_test_sample in test_data[:1]:
    print(first_test_sample)

# 3. Analyze vocab in train & test data

In [10]:
def tokenise(text):
    """Normalise ``text`` so that ``str.split()`` yields word/punctuation tokens.

    Steps, in order:
      1. every run of non-ASCII characters becomes a single space;
      2. LaTeX-style quotes (`` and '') become a double quote;
      3. each punctuation character is padded with spaces on both sides
         (hyphens are left alone, so "well-known" stays one token);
      4. a space is inserted before every single quote ("it's" -> "it 's");
      5. tabs and newlines become spaces, the text is lower-cased and
         leading/trailing whitespace is stripped.

    Args:
        text: the raw string to normalise.

    Returns:
        The normalised, lower-cased string. Tokens may be separated by more
        than one space, so callers are expected to use ``.split()``.
    """
    # Replace annoying unicode with a space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # The following replacements are suggested in the paper
    # BidAF (Seo et al., 2016)
    text = text.replace("''", '" ')
    text = text.replace("``", '" ')

    # Space out punctuation. '-' is intentionally absent from the list so
    # hyphenated words are kept together.
    space_list = "!\"#$%&()*+,./:;<=>?@[\\]^_`{|}~"
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in space_list}))

    # space out singlequotes a bit better (and like stanford)
    text = text.replace("'", " '")

    # Tabs and newlines separate words, so turn them into spaces; deleting
    # them would glue the neighbouring words together ("a\nb" -> "ab") and
    # under-count tokens.
    text = text.replace('\t', ' ').replace('\n', ' ').lower().strip()
    return text

# 4. Count Entities in Q & D

In [18]:
# Distinct entity titles linked in the training queries.
train_q_ent_vocab = {
    ent['entity_title']
    for sample in train_data
    for ent in sample['qEntities']
}

# Distinct entity titles linked in the documents attached to those queries.
train_d_ent_vocab = {
    ent['entity_title']
    for sample in train_data
    for doc in sample['RelevantDocuments']
    for ent in doc['dEntities']
}

len(train_q_ent_vocab), len(train_d_ent_vocab)

(1576, 19642)

In [19]:
# Distinct entity titles linked in the test queries.
test_q_ent_vocab = {
    ent['entity_title']
    for sample in test_data
    for ent in sample['qEntities']
}

# Distinct entity titles linked in the documents attached to those queries.
test_d_ent_vocab = {
    ent['entity_title']
    for sample in test_data
    for doc in sample['RelevantDocuments']
    for ent in doc['dEntities']
}

len(test_q_ent_vocab), len(test_d_ent_vocab)

(1830, 168710)

# 5. Sample-wise entity & word count

In [24]:
# Per-query statistics: number of linked entities and number of tokens in the
# tokenised query string.
train_q_ent_cnt = [len(sample['qEntities']) for sample in train_data]
train_q_word_cnt = [
    len(tokenise(sample['qString']).split()) for sample in train_data
]

# Per-document statistics, flattened over the 'RelevantDocuments' of every query.
train_d_ent_cnt = [
    len(doc['dEntities'])
    for sample in train_data
    for doc in sample['RelevantDocuments']
]
train_d_word_cnt = [
    len(tokenise(doc['docText']).split())
    for sample in train_data
    for doc in sample['RelevantDocuments']
]

# Report the observed range (min / max) of each statistic.
for label, counts in (
    ("Entities -> Query : Min: ", train_q_ent_cnt),
    ("Entities -> Doc   : Min: ", train_d_ent_cnt),
    ("Words    -> Query : Min: ", train_q_word_cnt),
    ("Words    -> Doc   : Min: ", train_d_word_cnt),
):
    print(label, min(counts), " Max : ", max(counts))

Entities -> Query : Min:  0  Max :  11
Entities -> Doc   : Min:  0  Max :  117
Words    -> Query : Min:  1  Max :  29
Words    -> Doc   : Min:  2  Max :  404


In [25]:
# Count how many samples sit exactly at the lower bounds being checked:
# queries with 0 entities / 1 token, documents with 0 entities / 2 tokens.
train_q_0_ent_samples = sum(
    1 for sample in train_data if len(sample['qEntities']) == 0
)
train_q_1_word_samples = sum(
    1
    for sample in train_data
    if len(tokenise(sample['qString']).split()) == 1
)
train_d_0_ent_samples = sum(
    1
    for sample in train_data
    for doc in sample['RelevantDocuments']
    if len(doc['dEntities']) == 0
)
train_d_2_word_samples = sum(
    1
    for sample in train_data
    for doc in sample['RelevantDocuments']
    if len(tokenise(doc['docText']).split()) == 2
)

for label, n_samples in (
    ("0 Entities -> Query : ", train_q_0_ent_samples),
    ("0 Entities -> Doc   : ", train_d_0_ent_samples),
    ("1 Words    -> Query : ", train_q_1_word_samples),
    ("2 Words    -> Doc   : ", train_d_2_word_samples),
):
    print(label, n_samples)

0 Entities -> Query :  41
0 Entities -> Doc   :  31
1 Words    -> Query :  48
2 Words    -> Doc   :  6
