# Download NER Models

In [None]:
# Install the scispaCy pipeline en_core_sci_scibert (v0.5.1); it is loaded later with spacy.load('en_core_sci_scibert').
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz

In [None]:
# Download the spaCy pipeline en_core_web_trf; it is loaded later with spacy.load('en_core_web_trf').
!python -m spacy download en_core_web_trf

In [None]:
# Install this specific transformers version if you hit an error in the
# "Decision of Similarity Score" step further down.
!pip install transformers==4.30.2

In [None]:
# Restart the kernel so the packages installed above are picked up.
# HTML is imported from IPython.display, the public location of the display
# classes; the IPython.core.display import path is deprecated.
from IPython.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# Set Parameters

In [None]:
dataset = "nfcorpus" # Dataset to run on: nfcorpus, scifact, arguana, scidocs, climate-fever, trec-covid, quora, germanquad, vihealthqa, ma-amazon
query_aug = "qgen" # Query augmentation method: crop, rm3, axiomaticqe, summarization, flan, open_llama, qgen. Note that qgen is the only one tested on the Vietnamese and German datasets.
ques_per_passage = 1 # Number of synthetic queries generated per document
retrieval_model_name = "msmarco-distilbert-base-v3" # Retrieval model to fine-tune; suggested choices per language:
# For English, all-mpnet-base-v2, msmarco-distilbert-base-v3, Muennighoff/SGPT-125M-weightedmean-msmarco, sentence-transformers/distiluse-base-multilingual-cased-v2
# For German, svalabs/bi-electra-ms-marco-german-uncased, T-Systems-onsite/cross-en-de-roberta-sentence-transformer, sentence-transformers/distiluse-base-multilingual-cased-v2
# For Vietnamese, VoVanPhuc/sup-SimCSE-VietNamese-phobert-base, keepitreal/vietnamese-sbert, sentence-transformers/distiluse-base-multilingual-cased-v2

gamma = 0.7 # Threshold on the term-entropy ratio: above it a pre-trained LM is used as similarity model, otherwise TF-IDF
delta = 0.4 # Similarity-score threshold: documents judged "General" use delta, the others use 1 - delta

In [None]:
# Only needed when retrieval_model_name is Muennighoff/SGPT-125M-weightedmean-msmarco:
# installs the sentence-transformers fork from the sgpt_poolings_specb branch.
!pip install --upgrade git+https://github.com/Muennighoff/sentence-transformers.git@sgpt_poolings_specb

In [None]:
# Only needed when query_aug is "open_llama": pin protobuf to 3.20.
!pip install protobuf==3.20

In [None]:
# Only needed when query_aug is "rm3" or "axiomaticqe": both use PyTerrier.
!pip install python-terrier


In [None]:
# Before re-running rm3 or axiomaticqe, remove the ./index folder that the
# previous run's indexer wrote.
!rm index -r

# Download Dataset

In [None]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.generation import QueryGenerator as QGen
from beir.generation.models import QGenModel
from beir.retrieval.train import TrainRetriever
from sentence_transformers import SentenceTransformer, losses, models
import sentence_transformers.util as utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pathlib, os
from time import time
from tqdm import tqdm
import json
import re
import pandas as pd

# Load the corpus, queries and qrels of the selected dataset and record its language.
if dataset != "ma-amazon":

  # BEIR-hosted datasets are downloaded and unzipped under ./datasets.
  url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
  data_path = util.download_and_unzip(url, 'datasets')

  document, queries, qrels = GenericDataLoader(data_path).load(split="test")

  # Detect the language of a dataset
  if dataset == "vihealthqa":
    lang = "Vietnamese"
  elif dataset == "germanquad":
    lang = "German"
  else:
    lang = "English"

else:

  data_path = "esci-data/recall/" ### Please check the saved location of ma-amazon dataset if you meet the FileNotFoundError

  # NOTE(review): each loop keeps only the LAST line of its file, so these files
  # are presumably a single JSON object on one line - confirm against the data.
  with open(data_path + "corpus.jsonl", "r", encoding="utf-8") as file:
    for line in file:
      document = json.loads(line)

  with open(data_path + "queries.jsonl", "r", encoding="utf-8") as file:
    for line in file:
      queries = json.loads(line)

  with open(data_path + "qrels.jsonl", "r", encoding="utf-8") as file:
    for line in file:
      qrels = json.loads(line)

  lang = "English"

# Keep a handle on the full corpus for BOTH dataset kinds: the non-qgen query
# augmentations restore `document` from `ori_document` after generation, and
# previously this name was only bound in the BEIR branch (NameError on ma-amazon).
ori_document = document

print("Language of dataset:", lang)


In [None]:
# Synthetic queries are generated from only the first 500 documents so that
# the experiment can be reproduced quickly. Running on the full corpus works
# as well, it simply takes longer.
new_document = {}
for doc_id, doc_body in document.items():
  if len(new_document) >= 500:
    break
  new_document[doc_id] = doc_body

document = new_document
print("Size of Document:", len(document))

In [None]:
# Build two parallel lists: the document ids and, at the same position,
# the "<title>. <text>" string of that document.
key_loc = list(document.keys())
document_title_text = [entry['title'] + '. ' + entry['text'] for entry in document.values()]

print(len(key_loc), len(document_title_text))

# Decision of Similarity Model

In [None]:
# First fit with an unrestricted vocabulary to find out how many terms the corpus has.
tr_idf_model = TfidfVectorizer(use_idf=True)
tf_idf_vector = tr_idf_model.fit_transform(document_title_text)

over_size = 36000 # max_features in TF-IDF
vocabulary_size = tf_idf_vector.shape[1]
if vocabulary_size > over_size:
  # Too many terms: remember the original vocabulary size in over_size (it is
  # used afterwards to rescale the entropy) and refit with a capped vocabulary.
  print('Original size: ', vocabulary_size)
  over_size = vocabulary_size
  tr_idf_model = TfidfVectorizer(use_idf=True, max_features=36000) # helpful to prevent the sparsity problem in TF-IDF
  tf_idf_vector = tr_idf_model.fit_transform(document_title_text)

# Dense (documents x terms) matrix used by the following cells.
tf_idf_array = tf_idf_vector.toarray()
print(tf_idf_array.shape)


In [None]:
from scipy.stats import entropy
import numpy as np

# Decide which similarity model to use.
# 1. Measure (# of terms with entropy > 1) / (# of terms with entropy < 1).
#    Terms whose entropy is exactly 1 are counted in neither group.
#    NOTE(review): the earlier comment said "<= 1" for the denominator while the
#    code has always used "< 1"; the code's behaviour is kept - confirm which is intended.
# 2. If the ratio is higher than gamma, a pre-trained LM is used for cosine
#    similarity. If not, TF-IDF is used.

# scipy computes the entropy along axis 0 by default, i.e. one value per TF-IDF
# column (term). The factor rescales it when the vocabulary was capped at 36000.
shannon = entropy(tf_idf_array, base=2) * (36000 / over_size)

#### Replace nan to zero #######
shannon[np.isnan(shannon)] = 0

high_entropy_terms = int(np.sum(shannon > 1))
low_entropy_terms = int(np.sum(shannon < 1))

# Handle an empty denominator explicitly instead of relying on NumPy's
# divide-by-zero warning and the resulting inf / nan.
if low_entropy_terms:
  shannon_selection = high_entropy_terms / low_entropy_terms
elif high_entropy_terms:
  shannon_selection = float("inf") # only high-entropy terms -> always above gamma
else:
  shannon_selection = 0.0 # no evidence either way -> falls back to TF-IDF

choice = ""
if shannon_selection > gamma:
  choice = "Pre-trained LM"
  # The TF-IDF objects are not needed any more; free their memory.
  del tf_idf_array
  del tf_idf_vector
  del tr_idf_model
else:
  choice = "TF-IDF"

print("Similarity Model Decision:", choice)

In [None]:
if choice == "Pre-trained LM":
  # Sentence encoder per dataset language; English is the fallback.
  entropy_model_path = {
    "Vietnamese": 'keepitreal/vietnamese-sbert',
    "German": 'bert-base-german-cased',
  }.get(lang, 'all-mpnet-base-v2')

  model = SentenceTransformer(entropy_model_path)
  print("Pre-trained LM for similarity measurement:", entropy_model_path)

  # Encode the documents one at a time and stack the embeddings row-wise.
  transformer_list = [model.encode(sample) for sample in document_title_text]
  transformer_array = np.array(transformer_list)




# Decision of Similarity Score

In [None]:
# Translation for non-English datasets: the NER pipelines used in the next
# cell are English ones, so Vietnamese / German documents are translated first.
# Work on a copy so that document_title_text keeps the original-language text
# (previously both names pointed at the same list, which was overwritten in
# place and would be translated a second time if this cell was re-run).
keyword_document = list(document_title_text)

if lang in ["Vietnamese", "German"]:
  from googletrans import Translator
  translator = Translator()

  source_language = "vi" if lang == "Vietnamese" else "de"
  keyword_document = [
    translator.translate(text, src=source_language, dest="en").text
    for text in keyword_document
  ]


In [None]:
import spacy
import spacy_transformers
import re

# Decide whether the corpus looks "General" or "Specialized" by comparing how
# many entities a general-domain and a scientific NER pipeline find in it.
spacy.require_gpu() # comment this, if you meet the GPU error
standard = spacy.load('en_core_web_trf')
prof = spacy.load('en_core_sci_scibert')
vocab_standard = 50265 # size of vocab is shared in https://github.com/explosion/spacy-models/releases/tag/en_core_web_trf-3.7.3
vocab_prof = 785000 # size of vocab is shared in https://allenai.github.io/scispacy/

# Everything that is not an ASCII letter is replaced by a space before NER.
# Compiled once with the flag given by keyword: passing count / flags to
# re.sub positionally is deprecated.
non_letters = re.compile("[^A-Z]", flags=re.IGNORECASE)

# One [general entity count, scientific entity count] pair per document.
collect_no_filter = []

for sentence in tqdm(keyword_document):
  sentence = non_letters.sub(" ", sentence)
  std_num = len(standard(sentence).ents)
  prof_num = len(prof(sentence).ents)
  collect_no_filter.append([std_num, prof_num])

# Sum the counts, skipping documents where the scientific pipeline found nothing.
val_std = 0
val_prof = 0
for std_num, prof_num in tqdm(collect_no_filter):
  if prof_num != 0:
    val_std += std_num
    val_prof += prof_num

# Average per document; an empty corpus would otherwise divide by zero.
if collect_no_filter:
  val_std /= len(collect_no_filter)
  val_prof /= len(collect_no_filter)

# Cross-multiplied comparison of (entities / vocabulary size) of both pipelines.
if (val_std * vocab_prof) > (val_prof * vocab_standard):
  doc_type = "General"
else:
  doc_type = "Specialized"

print("Document is closed to", doc_type)

# Link Documents

In [None]:
import numpy as np

# Pairwise cosine similarity between all documents, using the representation
# selected in the "Decision of Similarity Model" step.
if choice == "TF-IDF":
  similarity = cosine_similarity(tf_idf_array)
  print("TF-IDF is used for similarity model")
else:
  similarity = cosine_similarity(transformer_array)
  print("Pre-trained LM is used for similarity model")

# For every row keep the three largest similarity values and their column indices.
sort_loc = np.argsort(similarity, axis=1)[:,-3:]
val_loc = np.sort(similarity, axis=1)[:,-3:]

# The threshold depends on the document type decided earlier.
key_thres = delta if doc_type == "General" else 1 - delta

print("Similarity score is", key_thres)

# Collect [row, neighbour] index pairs whose similarity exceeds the threshold,
# leaving out a document paired with itself.
high_similarity = [
  [row, neighbour]
  for row, (neighbours, scores) in enumerate(zip(sort_loc, val_loc))
  for neighbour, score in zip(neighbours, scores)
  if row != neighbour and score > key_thres
]



In [None]:
# Universal document linking: for every linked pair add one new document whose
# title and text are the concatenation of the pair's titles and texts. When
# both sides hold the same value, it is kept once instead of being duplicated.
# new_cor_key_loc remembers which two source documents each new document came from.
new_cor_key_loc = {}

for num, (first_pos, second_pos) in enumerate(high_similarity):
  denote = "NEW_DOCUMENT-" + str(num)
  first_key, second_key = key_loc[first_pos], key_loc[second_pos]
  first_doc, second_doc = document[first_key], document[second_key]

  merged = {}
  for field in ('title', 'text'):
    if first_doc[field] != second_doc[field]:
      merged[field] = first_doc[field] + ' ' + second_doc[field]
    else:
      merged[field] = first_doc[field]

  document[denote] = merged
  new_cor_key_loc[denote] = [first_key, second_key]

print("Total size of documents after universal document linking:", len(document))


# Generate Synthetic Queries

In [None]:
from time import time

# Generate synthetic queries (gen_queries) and their relevance labels
# (gen_qrels) from `document` with the augmentation selected in `query_aug`.
# A query generated from a linked document (NEW_DOCUMENT-*) is labelled as
# relevant to both source documents of that link.

print("Type of Query Augmentation:", query_aug)
prefix = "gen"
start_time = time()


def _linked_qrels(doc_key):
  """Return the qrels entry for a query generated from ``doc_key``.

  Linked documents map to their two source documents; every other document
  maps to itself.
  """
  if doc_key in new_cor_key_loc:
    first_key, second_key = new_cor_key_loc[doc_key]
    return {first_key: 1, second_key: 1} # Link Query-Documents
  return {doc_key: 1}


def _next_query_id():
  """Return the id the next registered synthetic query will get."""
  return "NEW_QUERY-" + str(len(gen_queries))


def _register_query(query_text, relevance):
  """Store one synthetic query and its qrels under the next NEW_QUERY-<n> id."""
  query_id = _next_query_id()
  gen_queries[query_id] = query_text
  gen_qrels[query_id] = relevance


def _clip_to_limit(input_ids, limit):
  """Keep at most ``limit`` tokens of a (1, seq_len) tensor of token ids."""
  if input_ids.shape[1] > limit:
    return input_ids[:, :limit]
  return input_ids


if query_aug == "qgen":
  if lang == "Vietnamese":
    model_path = "doc2query/msmarco-vietnamese-mt5-base-v1"
  elif lang == "German":
    model_path = "doc2query/msmarco-german-mt5-base-v1"
  else:
    model_path = "BeIR/query-gen-msmarco-t5-base-v1"

  print("Model for QGen:", model_path)

  generator = QGen(model=QGenModel(model_path))
  generator.generate(document, output_dir=data_path, ques_per_passage=ques_per_passage, prefix=prefix, batch_size=4)

  # Load the synthetic queries and qrels written by the generator. The corpus
  # returned by the loader replaces `document` except for ma-amazon.
  loaded_document, gen_queries, gen_qrels = GenericDataLoader(data_path, prefix=prefix).load(split="train")
  if dataset != "ma-amazon":
    document = loaded_document

  # Re-point queries generated from linked documents to the two source documents.
  for query_id in gen_qrels:
    for doc_id in gen_qrels[query_id]:
      if doc_id in new_cor_key_loc:
        gen_qrels[query_id] = _linked_qrels(doc_id) # Link Query-Documents

elif query_aug == "summarization":
  from transformers import PegasusForConditionalGeneration, PegasusTokenizer
  import torch
  device = "cuda" if torch.cuda.is_available() else "cpu"

  gen_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to(device)
  tok = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

  gen_queries = {}
  gen_qrels = {}

  for key in tqdm(document.keys()):
    # Tokenize once, then clip to the tokenizer's single-sentence limit.
    encoded = tok(document[key]['text'], truncation=True, padding="longest", return_tensors="pt")
    batch = _clip_to_limit(encoded['input_ids'], tok.max_len_single_sentence).to(device)

    generated_ids = gen_model.generate(batch, num_return_sequences=ques_per_passage, do_sample=True)
    summaries = tok.batch_decode(generated_ids, skip_special_tokens=True)

    for i in range(ques_per_passage):
      _register_query(summaries[i], _linked_qrels(key))

  document = ori_document

elif query_aug == "open_llama":
  from transformers import LlamaTokenizer, LlamaForCausalLM
  import torch
  device = "cuda" if torch.cuda.is_available() else "cpu"

  gen_model = LlamaForCausalLM.from_pretrained("openlm-research/open_llama_3b_v2").to(device)
  tok = LlamaTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")

  gen_queries = {}
  gen_qrels = {}

  for key in tqdm(document.keys()):
    merge_title_txt = document[key]['title'] + '. ' + document[key]['text']
    prompt = 'Q: ' + merge_title_txt + '\nA:'

    input_ids = tok(prompt, return_tensors="pt").input_ids
    chunk = _clip_to_limit(input_ids, tok.max_len_single_sentence).to(device)
    # Length of the prompt actually fed to the model. The untruncated length
    # was used before, which cut away the whole answer for clipped prompts.
    prompt_length = chunk.shape[1]

    for _ in range(ques_per_passage):
      generation_output = gen_model.generate(input_ids=chunk, do_sample=True, max_new_tokens=32)
      sample = tok.decode(generation_output[0][prompt_length:])
      # Keep the first line of the answer. split() returns the whole string when
      # there is no newline (slicing at find() == -1 dropped the last character).
      _register_query(sample.split('\n', 1)[0], _linked_qrels(key))

  document = ori_document

elif query_aug == "flan":

  from transformers import T5Tokenizer, T5ForConditionalGeneration
  import torch
  device = "cuda" if torch.cuda.is_available() else "cpu"

  tok = T5Tokenizer.from_pretrained("google/flan-t5-base")
  gen_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(device)

  gen_queries = {}
  gen_qrels = {}

  for key in tqdm(document.keys()):
    merge_title_txt = document[key]['title'] + '. ' + document[key]['text']
    source_text = '<generate_query> paragraph: ' + merge_title_txt

    encoded_ids = tok(source_text, return_tensors="pt")['input_ids']
    chunk = _clip_to_limit(encoded_ids, tok.max_len_single_sentence).to(device)
    generated_ids = gen_model.generate(chunk, num_return_sequences=ques_per_passage, do_sample=True)
    candidates = tok.batch_decode(generated_ids, skip_special_tokens=True)

    for i in range(ques_per_passage):
      _register_query(candidates[i], _linked_qrels(key))

  document = ori_document

elif query_aug == "crop":
  from sentence_splitter import split_text_into_sentences
  import random

  gen_queries = {}
  gen_qrels = {}

  for key in document.keys():
    if document[key]['text'] != '':
      split_txt = split_text_into_sentences(document[key]['text'], language="en")
      if not split_txt:
        # Nothing to sample from; random.choices would raise on an empty list.
        continue

      # Sample sentences (with replacement) and use each one as a query.
      for sentence in random.choices(split_txt, k=ques_per_passage):
        _register_query(sentence, _linked_qrels(key))

  document = ori_document

elif query_aug in ("rm3", "axiomaticqe"):
  import pyterrier as pt
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"]) # If you call multiple time, comment this out

  docs = [{'docno':key, 'text':document[key]['text'], 'title':document[key]['title']} for key in document.keys()]
  indexer = pt.index.IterDictIndexer('./index', meta={'docno':33})
  indexref = indexer.index(docs, fields=('text', 'title', 'docno'))
  index = pt.IndexFactory.of(indexref)
  bm25 = pt.BatchRetrieve(index, wmodel="BM25")
  # Both methods share the same pipeline; only the query rewriter differs.
  rewriter = pt.rewrite.RM3(index) if query_aug == "rm3" else pt.rewrite.AxiomaticQE(index)
  expansion_pipe = bm25 >> rewriter >> bm25

  gen_queries = {}
  gen_qrels = {}

  for doc in tqdm(docs):
    # The document text, stripped to letters, digits and spaces, is the seed query.
    tmp_query = re.sub('[^A-Za-z0-9 ]+', '', doc['text'])
    expanded = expansion_pipe.transform(pd.DataFrame({"qid": [_next_query_id()], "query": [tmp_query]}))

    if len(expanded["query"]) != 0: # not empty
      expanded_query = expanded["query"].unique()[0]
      # The rewritten query is split on "^"; the last word of each inner piece
      # is kept as an expansion term.
      syn_query = ' '.join(val.split()[-1] for val in expanded_query.split("^")[1:-1])

      for rank in range(ques_per_passage):
        _register_query(syn_query, {expanded["docno"][rank]: 1})

  document = ori_document

else:
  # Fail here instead of only printing: gen_queries / gen_qrels would be
  # undefined below, or silently reused from an earlier run of this cell.
  raise ValueError("Query Augmentation is not properly defined: {!r}".format(query_aug))

print()
print("Time computation for Query augmentation:", int(time()-start_time), "seconds")
# Each label now reports the matching collection (they were swapped before).
print("Size of synthetic queries:",len(gen_queries))
print("Size of synthetic qrels:",len(gen_qrels))
print("Size of original document:",len(document))

# Fine-tune Retrieval Model

In [None]:
# Fine-tune the selected retrieval model on the synthetic query/document pairs.
print("Retrieval model for fine-tuning:",retrieval_model_name)
model = SentenceTransformer(retrieval_model_name)
retriever = TrainRetriever(model=model, batch_size=8)

# Training data, loss and evaluator.
train_samples = retriever.load_train(document, gen_queries, gen_qrels)
train_dataloader = retriever.prepare_train(train_samples, shuffle=True)
train_loss = losses.MultipleNegativesRankingLoss(model=retriever.model)
ir_evaluator = retriever.load_dummy_evaluator()

#### Provide model save path
model_save_path = os.path.join("output", f"{retrieval_model_name}-v1-{dataset}")
os.makedirs(model_save_path, exist_ok=True)

#### Hyperparameters for fine-tuning the retrieval model
num_epochs = 1
evaluation_steps = 5000
# Warm up over the first 10% of the training steps.
total_training_steps = len(train_samples) * num_epochs / retriever.batch_size
warmup_steps = int(total_training_steps * 0.1)

fit_arguments = dict(
  train_objectives=[(train_dataloader, train_loss)],
  evaluator=ir_evaluator,
  epochs=num_epochs,
  output_path=model_save_path,
  warmup_steps=warmup_steps,
  evaluation_steps=evaluation_steps,
  use_amp=True,
)
retriever.fit(**fit_arguments)

# Evaluate Retrieval Model

In [None]:
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval import models
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES


# Evaluate the fine-tuned model saved in model_save_path.
# BEIR datasets are evaluated once on their test split (NDCG@10 and Recall@100
# are printed). ma-amazon is evaluated twice, on the "recall" and on the "ndcg"
# copy of the data (Recall@100 / Recall@500 and NDCG@50 are printed).
if dataset != "ma-amazon":
  # Load document / queries / qrels of test set
  test_document, test_queries, test_qrels = GenericDataLoader(data_path).load(split="test")

  # Exact dense search over the fine-tuned model, scored with cosine similarity.
  model = DRES(models.SentenceBERT(model_save_path), batch_size=32)
  eval_retriever = EvaluateRetrieval(model, score_function="cos_sim")

  print("Size of test queries:",len(test_queries))
  print("Size of test qrles:",len(test_qrels))
  print("Size of test document:",len(test_document))

  #### Retrieve dense results (format of results is identical to qrels)
  start_time = time()
  results = eval_retriever.retrieve(test_document, test_queries)
  end_time = time()
  # k_values is assigned only after retrieve() has run.
  # NOTE(review): confirm that retrieve() returned enough results per query for the largest k below.
  eval_retriever.k_values = [10, 100]

  print("Time taken to retrieve: {:.2f} seconds".format(end_time - start_time))
  print("Retriever evaluation for k in: {}".format(eval_retriever.k_values))
  ndcg, _map, recall, precision = eval_retriever.evaluate(test_qrels, results, eval_retriever.k_values)

  # These three custom metrics are computed but not printed below.
  mrr = eval_retriever.evaluate_custom(test_qrels, results, eval_retriever.k_values, metric="mrr")
  recall_cap = eval_retriever.evaluate_custom(test_qrels, results, eval_retriever.k_values, metric="r_cap")
  hole = eval_retriever.evaluate_custom(test_qrels, results, eval_retriever.k_values, metric="hole")


  # Only the metrics listed here are reported, as percentages.
  list_per = ['NDCG@10','Recall@100']

  for key in ndcg.keys():
    if key in list_per:
      print(key, ndcg[key]*100)

  for key in recall.keys():
    if key in list_per:
      print(key, recall[key]*100)


else:
  for test_data_loc in ["esci-data/recall/", "esci-data/ndcg/"]:
    # Load document / queries / qrels of test set
    # NOTE(review): each loop keeps only the last line of its file, so the files
    # are presumably a single JSON object on one line - confirm against the data.
    with open(test_data_loc + "corpus.jsonl", "r") as file:
      for line in file:
        test_document = json.loads(line)

    with open(test_data_loc + "queries.jsonl", "r") as file:
      for line in file:
        test_queries = json.loads(line)

    with open(test_data_loc + "qrels.jsonl", "r") as file:
      for line in file:
        test_qrels = json.loads(line)

    if "ndcg" in test_data_loc:
      ## gain score should be int
      # The gains of the ndcg set are multiplied by 100 before the int conversion.
      for key in test_qrels.keys():
        for key2 in test_qrels[key]:
          test_qrels[key][key2] = int(100*test_qrels[key][key2])

    else:
      ## gain score should be int
      for key in test_qrels.keys():
        for key2 in test_qrels[key]:
          test_qrels[key][key2] = int(test_qrels[key][key2])

    # The fine-tuned model is loaded again for each of the two test sets.
    model = DRES(models.SentenceBERT(model_save_path), batch_size=32)
    eval_retriever = EvaluateRetrieval(model, score_function="cos_sim")

    print("Size of test queries:",len(test_queries))
    print("Size of test qrles:",len(test_qrels))
    print("Size of test document:",len(test_document))

    #### Retrieve dense results (format of results is identical to qrels)
    start_time = time()
    results = eval_retriever.retrieve(test_document, test_queries)
    end_time = time()
    # k_values is assigned only after retrieve() has run.
    # NOTE(review): confirm that retrieve() returned enough results per query for k = 500.
    eval_retriever.k_values = [50, 100, 500]

    print("Time taken to retrieve: {:.2f} seconds".format(end_time - start_time))
    print("Retriever evaluation for k in: {}".format(eval_retriever.k_values))
    ndcg, _map, recall, precision = eval_retriever.evaluate(test_qrels, results, eval_retriever.k_values)

    # Report recall for the "recall" set and NDCG for the "ndcg" set, as percentages.
    if "recall" in test_data_loc:
      list_per = ['Recall@100', 'Recall@500']
      for key in recall.keys():
        if key in list_per:
          print(key, recall[key]*100)
    else:
      list_per = ['NDCG@50']
      for key in ndcg.keys():
        if key in list_per:
          print(key, ndcg[key]*100)
