In [1]:
import pandas as pd
import numpy as np
import os
import glob
import json
import re
from collections import defaultdict

import torchtext
from torchtext.legacy.data import Example

from wikipedia2vec import Wikipedia2Vec

In [2]:
# Data with Entities

ROOT_FOLDER = "/raid6/home/ramraj/2021/ir/entity-reranking/Entity-Linking/"

# BENCHMARK_TRAIN_FOLD_FOLDER = os.path.join(ROOT_FOLDER, "benchmark-train-relevance-v2.0")
# BENCHMARK_TEST_FILE = os.path.join(ROOT_FOLDER, "test-data", "ramraj-test-data-top100-BM25.json")
# BENCHMARK_TRAIN_FOLD_FOLDER = os.path.join(ROOT_FOLDER, "Train-with-entities")
# BENCHMARK_TEST_FILE = os.path.join(ROOT_FOLDER, "Test-with-entities/ramraj-test-data-top100-BM25-opt.json")

BENCHMARK_TRAIN_FOLD_FOLDER = os.path.join(ROOT_FOLDER, "Train-with-entities-sepEntAndWords")
BENCHMARK_TEST_FILE = os.path.join(ROOT_FOLDER, "Test-with-entities/ramraj-test-data-top100-BM25-opt-entWordSEP.json")

# laod train

In [3]:
train_files = sorted(glob.glob(os.path.join(BENCHMARK_TRAIN_FOLD_FOLDER, "fold-*.json")))
print( len(train_files) )

train_data = []
for train_file in train_files:
    tmp_data = json.load(open(train_file, 'r'))
    train_data.extend(tmp_data)
    
len(train_data)

1


1937

# load test

In [4]:
test_data = json.load(open(BENCHMARK_TEST_FILE, 'r'))
len(test_data)

2254

# utils

In [5]:
def tokenise(text):
    # Replace annoying unicode with a space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # The following replacements are suggested in the paper
    # BidAF (Seo et al., 2016)
    text = text.replace("''", '" ')
    text = text.replace("``", '" ')

    # Space out punctuation
    space_list = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~"
    space_list = "!\"#$%&()*+,./:;<=>?@[\\]^_`{|}~"
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in space_list}))

    # space out singlequotes a bit better (and like stanford)
    text = text.replace("'", " '")
    
    # use any APIs
    text = text.replace('\t', '').replace('\n', '').lower().strip()
    return text

In [6]:
def alpha_numeric_filtering(text):
    regex = re.compile('[^a-zA-Z]')
    #First parameter is the replacement, second parameter is your input string
    return regex.sub(' ', text)     

# load wikie

In [7]:
import torchtext.vocab as vocab

MODEL_FILE = "../entity-embedding-modelling/data/enwiki_20180420_300d.txt"
# https://github.com/pytorch/text/issues/722

custom_embeddings = vocab.Vectors(name = MODEL_FILE, cache="/raid6/home/ramraj/2021/ir/entity-reranking/entity-embedding-modelling/data/cache-enwiki")

In [8]:
PKL_MODEL_FILE = "../entity-embedding-modelling/data/enwiki_20180420_100d.pkl"
wiki2vec = Wikipedia2Vec.load(PKL_MODEL_FILE)

# vocab

In [10]:
PAD_TOKEN = '<PAD>'
SOS_TOKEN = '<SOS>'
UNK_TOKEN = '<UNK>'
EOS_TOKEN = '<EOS>'

MAX_Q_LENGTH = 20
MAX_D_LEN = 200

Q_TEXT = torchtext.legacy.data.Field(sequential=True, 
                                tokenize=lambda x: x.split('<SEP>'),
                                eos_token=EOS_TOKEN,
                                pad_token=PAD_TOKEN,
                                include_lengths=True,
                                batch_first=True,
                                fix_length=MAX_Q_LENGTH,
                                lower=True)

DOC_TEXT = torchtext.legacy.data.Field(sequential=True, 
                                tokenize=lambda x: x.split('<SEP>'),
                                eos_token=EOS_TOKEN,
                                pad_token=PAD_TOKEN,
                                include_lengths=True,
                                batch_first=True,
                                fix_length=MAX_D_LEN,
                                lower=True)

In [11]:
example_texts = []

FIELDS = [('doc_text', DOC_TEXT), ('q_text', Q_TEXT)]
# https://github.com/pytorch/text/issues/722


for train_data_sample in train_data:
    q_text = train_data_sample['qString']
    
    for rel_docs in train_data_sample['RelevantDocuments']:
        doc_text = rel_docs['docText']
        
        example_texts.append( Example.fromlist([doc_text, q_text] , FIELDS ) )
        
for test_data_sample in test_data:
    q_text = test_data_sample['qString']
    
    for rel_docs in test_data_sample['RelevantDocuments']:
        doc_text = rel_docs['docText']
        
        example_texts.append( Example.fromlist([doc_text, q_text] , FIELDS ) )

In [12]:
torchtext_dataset = torchtext.legacy.data.Dataset(example_texts, fields=FIELDS)

Q_TEXT.build_vocab(torchtext_dataset, vectors=custom_embeddings, vectors_cache="/raid6/home/ramraj/2021/ir/entity-reranking/entity-embedding-modelling/data/cache-enwiki")
DOC_TEXT.build_vocab(torchtext_dataset, vectors=custom_embeddings, vectors_cache="/raid6/home/ramraj/2021/ir/entity-reranking/entity-embedding-modelling/data/cache-enwiki")

print("======= Query ========")
print(len(Q_TEXT.vocab) )
print(Q_TEXT.vocab.vectors.size() )
print("======= Doc ========")
print( len(DOC_TEXT.vocab))
print( DOC_TEXT.vocab.vectors.size())

4004
torch.Size([4004, 300])
229696
torch.Size([229696, 300])


In [13]:
print( DOC_TEXT.vocab.stoi['university of southampton'], DOC_TEXT.vocab.itos[14] )
print( DOC_TEXT.vocab.stoi['fahrenheit'], DOC_TEXT.vocab.itos[3] )

18345 new
16342 


# dataset

In [14]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch

from pytorch_transformers import WarmupLinearSchedule, AdamW
from transformers import get_linear_schedule_with_warmup

def map_to_torch_float(encoding):
    encoding = torch.FloatTensor(encoding)
    encoding.requires_grad_(False)
    return encoding

def map_to_torch(encoding):
    encoding = torch.LongTensor(encoding)
    encoding.requires_grad_(False)
    return encoding


class TestTRECDataset(Dataset):
    def __init__(self, raw_data, q_field, d_field, query_max_len=20, max_len=512, is_train=True):
        self.is_train = is_train
        self.max_len = max_len
        self.query_max_len = query_max_len
        
        self.q_field = q_field
        self.d_field = d_field

        self.qLookup = defaultdict()
        self.dLookup = defaultdict()
        self.data_df = self.load_data(raw_data)
        self.len = len(self.data_df)
        
    def load_data(self, data):
        return_data = {'qID': [], 'qText': [], 'dID': [], 'dText': [], 'label': []}
        for data_sample in data:
            qID = data_sample['qID']
            qText = data_sample['qString']

            if qText == '': continue
            self.qLookup[qID] = qText

            for rel_docs in data_sample['RelevantDocuments']:
                dID = rel_docs['docID']
                docText = rel_docs['docText']
                
                if docText == '': continue
                self.dLookup[dID] = docText

                return_data['qID'].append(qID)
                return_data['dID'].append(dID)
                # return_data['qText'].append( qText )
                # return_data['dText'].append( docText )

        # df = pd.DataFrame(return_data, columns=["qID", "qText", "dID", "dText"])
        df = pd.DataFrame(return_data, columns=["qID", "dID"])
        return df
    
    def __len__(self):
        return self.len
    
    def __getitem__(self, index):
        index = index % self.len
        inst = self.data_df.loc[index]
        docID = inst['dID']
        qID = inst['qID']
        docText = self.dLookup[docID] # self.dLookup[docID] # inst['dText']
        qText = self.qLookup[qID] # self.qLookup[qID] # inst['qText']
        
        # print(qID)
        # print(qText)
        # print()
        # print(docID)
        # print(docText)

        # =============== prepare text before feed ===============
        q_text_processed = self.q_field.numericalize( self.q_field.pad([ [l for l in self.q_field.preprocess(qText) if l.strip()] ]) )
        d_text_processed = self.d_field.numericalize( self.d_field.pad([ [l for l in self.d_field.preprocess(docText) if l.strip()] ]) )

        return q_text_processed, d_text_processed, qID, docID
    

test_trec_dataset = TestTRECDataset(test_data, Q_TEXT, DOC_TEXT)
print( test_trec_dataset.__len__() )

q, d, qID, dID = test_trec_dataset.__getitem__(1)

print(q[0].shape)
print(d[0].shape)
# print(q[0][0].shape)
# print(out[1][0].shape)
# print(out[2][0].shape)
# print()
# print(out[0][0].shape)

225156
torch.Size([1, 20])
torch.Size([1, 200])


In [15]:
q_str = []
for qq in q[0][0]:
    q_str.append(Q_TEXT.vocab.itos[qq])
print(' '.join( q_str) )
print()

q_str = []
for qq in d[0][0]:
    q_str.append(DOC_TEXT.vocab.itos[qq])
print(' '.join( q_str) )
print()

aftertaste <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

aftertaste taste intensity food drink perception immediately food drink removed mouth aftertaste different food drink tin can vary intensity time unifying feature aftertaste perception food drink either swallowed spat neuroscience evolution taste aftertaste signal transduction taste receptor mouth brain yet fully understood however primary taste food processing area located insular cortex observed involved aftertaste perception <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

# Model

In [18]:
import typing
import torch.nn as nn
import torch.nn.functional as F


class SimScorer(nn.Module):
    def __init__(self, q_text_field, d_text_field, with_multi_layer_perceptron=True, mlp_num_layers=2, mlp_num_units=300, mlp_num_fan_out=1, padding_idx=1, embedding_freeze=True):
        super(SimScorer, self).__init__()
        self.with_multi_layer_perceptron = with_multi_layer_perceptron

        self.mlp_num_layers = mlp_num_layers
        self.mlp_num_units = mlp_num_units
        self.mlp_num_fan_out = mlp_num_fan_out
        self.q_text_field = q_text_field
        self.d_text_field = d_text_field
        self.padding_idx = padding_idx
        self.embedding_freeze = embedding_freeze
        self.build()
            
    def build(self):
        """
        Build model structure.
        """
        self.q_embedding = nn.Embedding.from_pretrained(self.q_text_field.vocab.vectors, padding_idx=self.padding_idx, freeze=self.embedding_freeze)
        self.d_embedding = nn.Embedding.from_pretrained(self.d_text_field.vocab.vectors, padding_idx=self.padding_idx, freeze=self.embedding_freeze)

    def forward(self, inputs):
        input_left, input_right = inputs['text_left'], inputs['text_right']
        
        input_left_emb = self.q_embedding(input_left)
        input_right_emb = self.d_embedding(input_right)
        
        # print("input_left_emb : ", input_left_emb.shape)
        # print("input_right_emb : ", input_right_emb.shape)
        
        input_left_emb_avg = torch.mean(input_left_emb, dim=1)
        input_right_emb_avg = torch.mean(input_right_emb, dim=1)
        
        # print("input_left_emb_avg : ", input_left_emb_avg.shape)
        # print("input_right_emb_avg : ", input_right_emb_avg.shape)
    
        # Dot product with cosine similarity.
        x = F.cosine_similarity(input_left_emb_avg, input_right_emb_avg)
        # x = x.unsqueeze(dim=1)
        # print("x : ", x.shape)

        return x


net = SimScorer(Q_TEXT, DOC_TEXT)
net.to(device)
print('Models built and ready to go!')

# tmp_q_text = torch.randint(low=1, high=30, size=(16, 20))
# tmp_d_text = torch.randint(low=1, high=30, size=(16, 200))
# print(tmp_q_text.shape)
# print(tmp_d_text.shape)

# input_text = {'text_left': tmp_q_text, 'text_right': tmp_d_text}

# output = net.forward(input_text)
# print(output)
# output.shape

Models built and ready to go!


In [24]:
# import config
import random
from bunch import Bunch
import time
import datetime

CONFIG_FILE = "config.json"

def format_time(elapsed_time):
    """
    Takes a time in seconds and returns a string hh:mm:ss
    """
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed_time)))
    return str(datetime.timedelta(seconds=elapsed_rounded)) # Format as hh:mm:ss

def get_config_from_json(json_file):
    """
        Get the config from a json file
        :param json_file:
        :return: config(namespace) or config(dictionary)
        """
    # parse the configurations from the config json file provided
    with open(json_file, 'r') as config_file:
        config_dict = json.load(config_file)

    # convert the dictionary to a namespace using bunch lib
    config = Bunch(config_dict)

    return config, config_dict

import torch
config, _ = get_config_from_json(CONFIG_FILE)
seed_val = config.cmd_args['seed']
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
os.makedirs(config.data['results_dir'], exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


test_dataloader = DataLoader(dataset=test_trec_dataset,
                              batch_size=config.training["test_batch_size"],
                              pin_memory=device,
                              num_workers=config.training['num_workers'],
                              shuffle=True)
n_test_batches = len(test_dataloader)
print("Number of test batches : ", n_test_batches)
net.eval()

cuda
Number of test batches :  880


SimScorer(
  (q_embedding): Embedding(4004, 300, padding_idx=1)
  (d_embedding): Embedding(229696, 300, padding_idx=1)
)

# test

In [25]:
qID_list = []
paraID_list = []
pScore_list = []
t1 = time.time()
for batch_idx, test_batch_data in enumerate(test_dataloader):
    # Converting these to cuda tensors
    q_text, d_text, qID, passageID = test_batch_data
#     print(type(q_text))
#     x = q_text[0]
#     y = q_text[1]
#     print(x.shape)
#     print(y.shape)
#     break
    q_text, d_text = torch.squeeze(q_text[0].to(device)), torch.squeeze(d_text[0]).to(device)

    with torch.no_grad():
        net_output = net({'text_left': q_text, 'text_right': d_text})
        net_output = net_output.detach().cpu().numpy()

        for i in range(len(qID)):
            qID_list.append(qID[i])
            paraID_list.append(passageID[i])
            pScore_list.append(net_output[i])
    elapsed = format_time(time.time() - t1)
    if batch_idx % 50 == 0:
        print('  Batch {:>5,}  of  {:>5,}  :  processed    Elapsed: {:}.'.format(batch_idx,
                                                                                 n_test_batches,
                                                                                 elapsed))

pScore_list = [float(e) for e in pScore_list]
predicted_df = pd.DataFrame({"qID": qID_list,
                             "pID": paraID_list,
                             "pScore": pScore_list}, columns=["qID", "pID", "pScore"])
if not config.cmd_args['mode'] == "experiment":
    predicted_df.to_csv(os.path.join(config.data['results_dir'], "predictions.csv"))
print()

  Batch     0  of    880  :  processed    Elapsed: 0:00:00.
  Batch    50  of    880  :  processed    Elapsed: 0:00:06.
  Batch   100  of    880  :  processed    Elapsed: 0:00:12.
  Batch   150  of    880  :  processed    Elapsed: 0:00:18.
  Batch   200  of    880  :  processed    Elapsed: 0:00:24.
  Batch   250  of    880  :  processed    Elapsed: 0:00:31.
  Batch   300  of    880  :  processed    Elapsed: 0:00:37.
  Batch   350  of    880  :  processed    Elapsed: 0:00:43.
  Batch   400  of    880  :  processed    Elapsed: 0:00:50.
  Batch   450  of    880  :  processed    Elapsed: 0:00:56.
  Batch   500  of    880  :  processed    Elapsed: 0:01:02.
  Batch   550  of    880  :  processed    Elapsed: 0:01:08.
  Batch   600  of    880  :  processed    Elapsed: 0:01:14.
  Batch   650  of    880  :  processed    Elapsed: 0:01:21.
  Batch   700  of    880  :  processed    Elapsed: 0:01:26.
  Batch   750  of    880  :  processed    Elapsed: 0:01:32.
  Batch   800  of    880  :  processed  

In [26]:
# ================================================
#               Reverse Sorting Relevance
# ================================================
predicted_df = predicted_df[['qID', 'pID', 'pScore']]
grouped_pred_df = predicted_df.groupby(["qID"])
num_queries = len(grouped_pred_df)
missing_q_sets = 0
save_ranked_file = os.path.join(config.data['results_dir'], "ranked.test.relevance.txt")
with open(save_ranked_file, 'w') as write_file:
    q_cnt = 1
    for name, row_group in grouped_pred_df:
        rank_cnt = 1

        # ======= SORTING =======
        sorted_row_group = row_group.sort_values(by='pScore', ascending=False, inplace=False)
        # =======================

        if len(sorted_row_group) != 100:
            # print(">>>>>>>>>>> Missing query %s with shape %s" % (name, sorted_row_group.shape))
            # print(">>>>>>>>>>> Missing query with size %s" % sorted_row_group.shape[0])
            missing_q_sets += 1

        for i, row in sorted_row_group.iterrows():
            write_file.write("%s\tQ0\t%s\t%s\t%s\trchan31\n" % \
                             (row["qID"], row["pID"], rank_cnt, row["pScore"]))
            rank_cnt += 1

        print("Finished composing for query number : %s / %s" % (q_cnt, num_queries))
        q_cnt += 1
print()
print("Missing query-doc pairs : ", missing_q_sets)
print("Done train, val, and test !!!")

Finished composing for query number : 1 / 2254
Finished composing for query number : 2 / 2254
Finished composing for query number : 3 / 2254
Finished composing for query number : 4 / 2254
Finished composing for query number : 5 / 2254
Finished composing for query number : 6 / 2254
Finished composing for query number : 7 / 2254
Finished composing for query number : 8 / 2254
Finished composing for query number : 9 / 2254
Finished composing for query number : 10 / 2254
Finished composing for query number : 11 / 2254
Finished composing for query number : 12 / 2254
Finished composing for query number : 13 / 2254
Finished composing for query number : 14 / 2254
Finished composing for query number : 15 / 2254
Finished composing for query number : 16 / 2254
Finished composing for query number : 17 / 2254
Finished composing for query number : 18 / 2254
Finished composing for query number : 19 / 2254
Finished composing for query number : 20 / 2254
Finished composing for query number : 21 / 2254
F

Finished composing for query number : 182 / 2254
Finished composing for query number : 183 / 2254
Finished composing for query number : 184 / 2254
Finished composing for query number : 185 / 2254
Finished composing for query number : 186 / 2254
Finished composing for query number : 187 / 2254
Finished composing for query number : 188 / 2254
Finished composing for query number : 189 / 2254
Finished composing for query number : 190 / 2254
Finished composing for query number : 191 / 2254
Finished composing for query number : 192 / 2254
Finished composing for query number : 193 / 2254
Finished composing for query number : 194 / 2254
Finished composing for query number : 195 / 2254
Finished composing for query number : 196 / 2254
Finished composing for query number : 197 / 2254
Finished composing for query number : 198 / 2254
Finished composing for query number : 199 / 2254
Finished composing for query number : 200 / 2254
Finished composing for query number : 201 / 2254
Finished composing f

Finished composing for query number : 360 / 2254
Finished composing for query number : 361 / 2254
Finished composing for query number : 362 / 2254
Finished composing for query number : 363 / 2254
Finished composing for query number : 364 / 2254
Finished composing for query number : 365 / 2254
Finished composing for query number : 366 / 2254
Finished composing for query number : 367 / 2254
Finished composing for query number : 368 / 2254
Finished composing for query number : 369 / 2254
Finished composing for query number : 370 / 2254
Finished composing for query number : 371 / 2254
Finished composing for query number : 372 / 2254
Finished composing for query number : 373 / 2254
Finished composing for query number : 374 / 2254
Finished composing for query number : 375 / 2254
Finished composing for query number : 376 / 2254
Finished composing for query number : 377 / 2254
Finished composing for query number : 378 / 2254
Finished composing for query number : 379 / 2254
Finished composing f

Finished composing for query number : 540 / 2254
Finished composing for query number : 541 / 2254
Finished composing for query number : 542 / 2254
Finished composing for query number : 543 / 2254
Finished composing for query number : 544 / 2254
Finished composing for query number : 545 / 2254
Finished composing for query number : 546 / 2254
Finished composing for query number : 547 / 2254
Finished composing for query number : 548 / 2254
Finished composing for query number : 549 / 2254
Finished composing for query number : 550 / 2254
Finished composing for query number : 551 / 2254
Finished composing for query number : 552 / 2254
Finished composing for query number : 553 / 2254
Finished composing for query number : 554 / 2254
Finished composing for query number : 555 / 2254
Finished composing for query number : 556 / 2254
Finished composing for query number : 557 / 2254
Finished composing for query number : 558 / 2254
Finished composing for query number : 559 / 2254
Finished composing f

Finished composing for query number : 712 / 2254
Finished composing for query number : 713 / 2254
Finished composing for query number : 714 / 2254
Finished composing for query number : 715 / 2254
Finished composing for query number : 716 / 2254
Finished composing for query number : 717 / 2254
Finished composing for query number : 718 / 2254
Finished composing for query number : 719 / 2254
Finished composing for query number : 720 / 2254
Finished composing for query number : 721 / 2254
Finished composing for query number : 722 / 2254
Finished composing for query number : 723 / 2254
Finished composing for query number : 724 / 2254
Finished composing for query number : 725 / 2254
Finished composing for query number : 726 / 2254
Finished composing for query number : 727 / 2254
Finished composing for query number : 728 / 2254
Finished composing for query number : 729 / 2254
Finished composing for query number : 730 / 2254
Finished composing for query number : 731 / 2254
Finished composing f

Finished composing for query number : 890 / 2254
Finished composing for query number : 891 / 2254
Finished composing for query number : 892 / 2254
Finished composing for query number : 893 / 2254
Finished composing for query number : 894 / 2254
Finished composing for query number : 895 / 2254
Finished composing for query number : 896 / 2254
Finished composing for query number : 897 / 2254
Finished composing for query number : 898 / 2254
Finished composing for query number : 899 / 2254
Finished composing for query number : 900 / 2254
Finished composing for query number : 901 / 2254
Finished composing for query number : 902 / 2254
Finished composing for query number : 903 / 2254
Finished composing for query number : 904 / 2254
Finished composing for query number : 905 / 2254
Finished composing for query number : 906 / 2254
Finished composing for query number : 907 / 2254
Finished composing for query number : 908 / 2254
Finished composing for query number : 909 / 2254
Finished composing f

Finished composing for query number : 1060 / 2254
Finished composing for query number : 1061 / 2254
Finished composing for query number : 1062 / 2254
Finished composing for query number : 1063 / 2254
Finished composing for query number : 1064 / 2254
Finished composing for query number : 1065 / 2254
Finished composing for query number : 1066 / 2254
Finished composing for query number : 1067 / 2254
Finished composing for query number : 1068 / 2254
Finished composing for query number : 1069 / 2254
Finished composing for query number : 1070 / 2254
Finished composing for query number : 1071 / 2254
Finished composing for query number : 1072 / 2254
Finished composing for query number : 1073 / 2254
Finished composing for query number : 1074 / 2254
Finished composing for query number : 1075 / 2254
Finished composing for query number : 1076 / 2254
Finished composing for query number : 1077 / 2254
Finished composing for query number : 1078 / 2254
Finished composing for query number : 1079 / 2254


Finished composing for query number : 1231 / 2254
Finished composing for query number : 1232 / 2254
Finished composing for query number : 1233 / 2254
Finished composing for query number : 1234 / 2254
Finished composing for query number : 1235 / 2254
Finished composing for query number : 1236 / 2254
Finished composing for query number : 1237 / 2254
Finished composing for query number : 1238 / 2254
Finished composing for query number : 1239 / 2254
Finished composing for query number : 1240 / 2254
Finished composing for query number : 1241 / 2254
Finished composing for query number : 1242 / 2254
Finished composing for query number : 1243 / 2254
Finished composing for query number : 1244 / 2254
Finished composing for query number : 1245 / 2254
Finished composing for query number : 1246 / 2254
Finished composing for query number : 1247 / 2254
Finished composing for query number : 1248 / 2254
Finished composing for query number : 1249 / 2254
Finished composing for query number : 1250 / 2254


Finished composing for query number : 1404 / 2254
Finished composing for query number : 1405 / 2254
Finished composing for query number : 1406 / 2254
Finished composing for query number : 1407 / 2254
Finished composing for query number : 1408 / 2254
Finished composing for query number : 1409 / 2254
Finished composing for query number : 1410 / 2254
Finished composing for query number : 1411 / 2254
Finished composing for query number : 1412 / 2254
Finished composing for query number : 1413 / 2254
Finished composing for query number : 1414 / 2254
Finished composing for query number : 1415 / 2254
Finished composing for query number : 1416 / 2254
Finished composing for query number : 1417 / 2254
Finished composing for query number : 1418 / 2254
Finished composing for query number : 1419 / 2254
Finished composing for query number : 1420 / 2254
Finished composing for query number : 1421 / 2254
Finished composing for query number : 1422 / 2254
Finished composing for query number : 1423 / 2254


Finished composing for query number : 1572 / 2254
Finished composing for query number : 1573 / 2254
Finished composing for query number : 1574 / 2254
Finished composing for query number : 1575 / 2254
Finished composing for query number : 1576 / 2254
Finished composing for query number : 1577 / 2254
Finished composing for query number : 1578 / 2254
Finished composing for query number : 1579 / 2254
Finished composing for query number : 1580 / 2254
Finished composing for query number : 1581 / 2254
Finished composing for query number : 1582 / 2254
Finished composing for query number : 1583 / 2254
Finished composing for query number : 1584 / 2254
Finished composing for query number : 1585 / 2254
Finished composing for query number : 1586 / 2254
Finished composing for query number : 1587 / 2254
Finished composing for query number : 1588 / 2254
Finished composing for query number : 1589 / 2254
Finished composing for query number : 1590 / 2254
Finished composing for query number : 1591 / 2254


Finished composing for query number : 1746 / 2254
Finished composing for query number : 1747 / 2254
Finished composing for query number : 1748 / 2254
Finished composing for query number : 1749 / 2254
Finished composing for query number : 1750 / 2254
Finished composing for query number : 1751 / 2254
Finished composing for query number : 1752 / 2254
Finished composing for query number : 1753 / 2254
Finished composing for query number : 1754 / 2254
Finished composing for query number : 1755 / 2254
Finished composing for query number : 1756 / 2254
Finished composing for query number : 1757 / 2254
Finished composing for query number : 1758 / 2254
Finished composing for query number : 1759 / 2254
Finished composing for query number : 1760 / 2254
Finished composing for query number : 1761 / 2254
Finished composing for query number : 1762 / 2254
Finished composing for query number : 1763 / 2254
Finished composing for query number : 1764 / 2254
Finished composing for query number : 1765 / 2254


Finished composing for query number : 1917 / 2254
Finished composing for query number : 1918 / 2254
Finished composing for query number : 1919 / 2254
Finished composing for query number : 1920 / 2254
Finished composing for query number : 1921 / 2254
Finished composing for query number : 1922 / 2254
Finished composing for query number : 1923 / 2254
Finished composing for query number : 1924 / 2254
Finished composing for query number : 1925 / 2254
Finished composing for query number : 1926 / 2254
Finished composing for query number : 1927 / 2254
Finished composing for query number : 1928 / 2254
Finished composing for query number : 1929 / 2254
Finished composing for query number : 1930 / 2254
Finished composing for query number : 1931 / 2254
Finished composing for query number : 1932 / 2254
Finished composing for query number : 1933 / 2254
Finished composing for query number : 1934 / 2254
Finished composing for query number : 1935 / 2254
Finished composing for query number : 1936 / 2254


Finished composing for query number : 2094 / 2254
Finished composing for query number : 2095 / 2254
Finished composing for query number : 2096 / 2254
Finished composing for query number : 2097 / 2254
Finished composing for query number : 2098 / 2254
Finished composing for query number : 2099 / 2254
Finished composing for query number : 2100 / 2254
Finished composing for query number : 2101 / 2254
Finished composing for query number : 2102 / 2254
Finished composing for query number : 2103 / 2254
Finished composing for query number : 2104 / 2254
Finished composing for query number : 2105 / 2254
Finished composing for query number : 2106 / 2254
Finished composing for query number : 2107 / 2254
Finished composing for query number : 2108 / 2254
Finished composing for query number : 2109 / 2254
Finished composing for query number : 2110 / 2254
Finished composing for query number : 2111 / 2254
Finished composing for query number : 2112 / 2254
Finished composing for query number : 2113 / 2254


In [28]:
! ls exp1_sim

ranked.test.relevance.txt


In [32]:
! ../Eval/trec_eval-master/trec_eval ../Eval/qrelsY1-test.V2.0/automatic-test.pages.cbor-hierarchical.qrels exp1_sim/ranked.test.relevance.txt -m all_trec
# > exp1_sim_trec_results_withTestVocab.txt



runid                 	all	rchan31
num_q                 	all	2254
num_ret               	all	225156
num_rel               	all	6192
num_rel_ret           	all	2375
map                   	all	0.0670
gm_map                	all	0.0025
Rprec                 	all	0.0395
bpref                 	all	0.4689
recip_rank            	all	0.1058
iprec_at_recall_0.00  	all	0.1086
iprec_at_recall_0.10  	all	0.1082
iprec_at_recall_0.20  	all	0.1006
iprec_at_recall_0.30  	all	0.0866
iprec_at_recall_0.40  	all	0.0720
iprec_at_recall_0.50  	all	0.0692
iprec_at_recall_0.60  	all	0.0500
iprec_at_recall_0.70  	all	0.0486
iprec_at_recall_0.80  	all	0.0439
iprec_at_recall_0.90  	all	0.0433
iprec_at_recall_1.00  	all	0.0433
P_5                   	all	0.0307
P_10                  	all	0.0255
P_15                  	all	0.0219
P_20                  	all	0.0197
P_30                  	all	0.0171
P_100                 	all	0.0105
P_200                 	all	0.0053
P_500                 	al

In [33]:
! ../Eval/trec_eval-master/trec_eval ../Eval/qrelsY1-test.V2.0/automatic-test.pages.cbor-hierarchical.qrels exp1_sim/ranked.test.relevance.txt  -m all_trec > exp1_sim_trec_results_withTestVocab.txt

