In [42]:
import pandas as pd
import numpy as np
import os
import glob
import json
import re
from collections import defaultdict

In [3]:
# Root of the dataset tree. The original absolute path stays as the default;
# set CONTEXTUAL_RERANKING_DATA_DIR to run the notebook on another machine
# without editing this cell.
ROOT_FOLDER = os.environ.get(
    "CONTEXTUAL_RERANKING_DATA_DIR",
    "/raid6/home/ramraj/2021/ir/Contextual-Reranking/Data/",
)

# Folder holding the per-fold training files, and the single test file.
BENCHMARK_TRAIN_FOLD_FOLDER = os.path.join(ROOT_FOLDER, "benchmark-train-relevance-v2.0")
BENCHMARK_TEST_FILE = os.path.join(ROOT_FOLDER, "test-data", "ramraj-test-data-top100-BM25.json")

# 1. Load train data

In [5]:
# Find the training fold files; sorting gives a deterministic order.
train_fold_pattern = os.path.join(
    BENCHMARK_TRAIN_FOLD_FOLDER,
    "fold-*-train.pages.cbor-hierarchical.benchmark.json",
)
train_files = sorted(glob.glob(train_fold_pattern))
len(train_files)


5

In [12]:
# Read every training fold and concatenate the samples into one list.
train_data = []
for train_file in train_files:
    # `with` closes the file handle as soon as the fold has been parsed,
    # instead of leaving it open until garbage collection.
    with open(train_file, 'r') as fold_file:
        train_data.extend(json.load(fold_file))

len(train_data)

1937

In [13]:
# Display the first training sample to inspect the record layout.
train_data[0]

{'qString': 'Ice bath Techniques Ice baths versus cold baths',
 'RelevantDocuments': [{'docScore': 1.0,
   'docID': '54bacb1c81f70a1db3d4d8e16fb551a5298eafbf',
   'docText': 'Several sources suggest that cold baths (60â\x80\x9375 degrees Fahrenheit) were preferable to ice baths. Physiotherapist Tony Wilson of the University of Southampton said that extremely cold temperatures were unnecessary and a "cold bath" would be just as effective as an ice bath. Another agreed that a mere cold bath is preferable to ice baths which are "unnecessary." A third report suggested that cool water (60â\x80\x9375 degrees Fahrenheit) was just as good as cold water (54â\x80\x9360 degrees Fahrenheit) and that eight to ten minutes should be sufficient time, and warned against exceeding ten minutes.'}],
 'qID': 'enwiki:Ice%20bath/Techniques/Ice%20baths%20versus%20cold%20baths'}

In [78]:
# Total number of documents listed under 'RelevantDocuments' across all training samples.
doc_cnt = sum(len(sample['RelevantDocuments']) for sample in train_data)

doc_cnt

4863

# 2. Load test data

In [14]:
# Load the test file; `with` guarantees the file handle is closed after parsing.
with open(BENCHMARK_TEST_FILE, 'r') as test_file:
    test_data = json.load(test_file)
len(test_data)

225156

In [16]:
# The test data is a flat list of records; print the first one (if any)
# to show its layout.
for first_record in test_data[:1]:
    print(first_record)

{'DocID': 'a28ff3028b5669ed187a0a7138350af332ec7ed1', 'Feedback': '', 'QueryID': 'enwiki:Aftertaste', 'DocText': "In wine tasting the aftertaste or finish of a wine, is an important part of the evaluation. After tasting a wine, a taster will determine the wine's aftertaste, which is a major determinant of the wine's quality. The aftertaste of a wine can be described as bitter, persistent, short, sweet, smooth, or even non-existent. Included in assessing the aftertaste of a wine is consideration of the aromas still present after swallowing. High quality wines typically have long finishes accompanied by pleasant aromas. By assessing the combination of olfactory and aftertaste sensations, wine tasting actually determines not only the aftertaste profile of a wine, but its flavor profile as well.", 'QueryText': 'Aftertaste'}


# 3. Analyze vocab in train & test data

In [18]:
def tokenise(text):
    """Normalise raw text into a lowercase, space-delimited token string.

    Steps, in order: non-ASCII runs become a space, doubled quote marks
    (``''`` and two backticks) become a double quote, listed punctuation is
    padded with spaces, single quotes get a leading space, tabs/newlines
    become spaces, and the result is lowercased and stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string. Call ``.split()`` on it to get the tokens;
        consecutive spaces may appear between tokens.
    """
    # Replace every run of non-ASCII characters with a single space.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # The following replacements are suggested in the paper
    # BidAF (Seo et al., 2016)
    text = text.replace("''", '" ')
    text = text.replace("``", '" ')

    # Space out punctuation. The hyphen is not in this list, so hyphenated
    # words such as "well-known" stay as one token.
    space_list = "!\"#$%&()*+,./:;<=>?@[\\]^_`{|}~"
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in space_list}))

    # space out singlequotes a bit better (and like stanford)
    text = text.replace("'", " '")

    # Turn tabs and newlines into spaces (rather than deleting them) so that
    # words on either side are not glued into one token, then lowercase/trim.
    text = text.replace('\t', ' ').replace('\n', ' ').lower().strip()
    return text

In [31]:
# Distinct tokens appearing in the training queries.
train_q_vocab = {
    token
    for sample in train_data
    for token in tokenise(sample['qString']).split()
}

# Distinct tokens appearing in the documents attached to those queries.
train_d_vocab = {
    token
    for sample in train_data
    for doc in sample['RelevantDocuments']
    for token in tokenise(doc['docText']).split()
}

len(train_q_vocab), len(train_d_vocab)

(1988, 26990)

In [40]:
# Distinct tokens in the test queries and in the test documents.
test_q_vocab, test_d_vocab = set(), set()

for position, record in enumerate(test_data):
    query_tokens = tokenise(record['QueryText']).split()
    doc_tokens = tokenise(record['DocText']).split()

    test_q_vocab |= set(query_tokens)
    test_d_vocab |= set(doc_tokens)

    # Progress message every 10000 records.
    if not position % 10000:
        print("Finished : ", position)

len(test_q_vocab), len(test_d_vocab)

(2400, 189721)

# 4. Format test data into a convenient format

In [45]:

# Pull the four fields of every test record into parallel columns,
# then assemble them into one flat table (one row per query/document pair).
qID_list = [record['QueryID'] for record in test_data]
qText_list = [record['QueryText'] for record in test_data]
dID_list = [record['DocID'] for record in test_data]
dText_list = [record['DocText'] for record in test_data]

df = pd.DataFrame(
    {
        "qID": qID_list,
        "docID": dID_list,
        "qText": qText_list,
        "docText": dText_list,
    }
)
df.shape

(225156, 4)

In [70]:
# Number of distinct query IDs and distinct document IDs in the table.
tuple(len(set(df[column])) for column in ('qID', 'docID'))

(2254, 93124)

In [76]:
# Regroup the flat table into one record per query, mirroring the training
# layout ('qID', 'qString', 'RelevantDocuments'). Each document's 'docScore'
# is its 0-based position within the query's rows.
grouped_df = df.groupby('qID')

test_qd_map = []
missing_100_retrievals_queries_cnt = 0

for group_key, query_rows in grouped_df:
    first_row = query_rows.iloc[0]

    # Count (and report) queries whose group does not have exactly 100 rows.
    if len(query_rows) != 100:
        print('... Not 100 search retrievals')
        missing_100_retrievals_queries_cnt += 1

    ranked_docs = [
        {'docID': doc_id, 'docText': doc_text, 'docScore': rank}
        for rank, (doc_id, doc_text) in enumerate(
            zip(query_rows['docID'], query_rows['docText'])
        )
    ]

    test_qd_map.append(
        {
            'qID': first_row['qID'],
            'qString': first_row['qText'],
            'RelevantDocuments': ranked_docs,
        }
    )

missing_100_retrievals_queries_cnt

... Not 100 search retrievals
... Not 100 search retrievals
... Not 100 search retrievals
... Not 100 search retrievals
... Not 100 search retrievals


5

In [77]:
# Write the per-query records to a new JSON file in the test-data folder.
SAVE_NEW_FORMAT_TEST_FILE = os.path.join(
    ROOT_FOLDER,
    "test-data",
    "ramraj-test-data-top100-BM25-opt.json",
)

with open(SAVE_NEW_FORMAT_TEST_FILE, 'w') as out_file:
    json.dump(test_qd_map, out_file, indent=4)