In [None]:
# Environment setup: install the packages this notebook needs.
# pygaggle is taken straight from its GitHub repository (no tag or commit is pinned)
# and is force-reinstalled on every run; the remaining packages are pinned to exact versions.
!pip install --upgrade --force-reinstall git+https://github.com/castorini/pygaggle
!pip install faiss-cpu==1.7.2 --quiet
!pip install jsonlines==3.0.0 --quiet
!pip install beir==1.0.0 --quiet
!pip install protobuf==3.20.1 --quiet
!pip install ir_measures==0.3.0 --quiet
!pip install ir-datasets==0.5.1 --quiet

In [None]:
import torch

# Select the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    dev = "cuda:0"
    print(dev, torch.cuda.get_device_name(0))
else:
    dev = "cpu"
    print(dev)

# Build the torch.device outside the branches so `device` is defined on CPU-only
# runtimes as well (it used to be assigned only when CUDA was available).
device = torch.device(dev)

from pyserini.search import SimpleSearcher
from pygaggle.rerank.base import hits_to_texts
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5
from transformers import T5ForConditionalGeneration
from pygaggle.rerank.transformer import SentenceTransformersReranker
import jsonlines
import os
import math
from tqdm.notebook import tqdm
from beir import util
from beir.datasets.data_loader import GenericDataLoader
import pandas as pd
import ir_datasets
import ir_measures
from ir_measures import *
from IPython.display import clear_output

In [None]:
def download_cqadupstack():
    """
    Load every BEIR CQADupStack subset through ir_datasets and pool the results.

    Returns:
      Tuple (list_queries, list_corpus, list_qrels): the queries, documents and
      qrels of all twelve subsets, concatenated in the order the subsets are listed
    """
    subset_names = (
        'android', 'english', 'gaming', 'gis', 'mathematica', 'physics',
        'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress',
    )
    list_queries, list_corpus, list_qrels = [], [], []

    for name in subset_names:
        dataset = ir_datasets.load("beir/cqadupstack/{}".format(name))

        # Drain each iterator straight into the pooled lists
        list_queries.extend(dataset.queries_iter())
        list_corpus.extend(dataset.docs_iter())
        list_qrels.extend(dataset.qrels_iter())

    return list_queries, list_corpus, list_qrels


def download_robust04():
    """
    Load the five TREC Robust04 folds through ir_datasets and pool the results.

    Only queries and qrels are collected here; no documents are read.

    Returns:
      Tuple (list_queries, list_qrels): the queries and qrels of fold1..fold5,
      concatenated in fold order
    """
    fold_names = ('fold1', 'fold2', 'fold3', 'fold4', 'fold5')
    list_queries, list_qrels = [], []

    for fold in fold_names:
        dataset = ir_datasets.load("trec-robust04/{}".format(fold))

        # Drain each iterator straight into the pooled lists
        list_queries.extend(dataset.queries_iter())
        list_qrels.extend(dataset.qrels_iter())

    return list_queries, list_qrels


def index_corpus(corpus):
    """
    Index corpus to be retrieved by BM25

    Args:
      corpus: Corpus (dict)
      
    Returns:
      Searcher object to initialize BM25
    """

    !rm -r candidates
    !rm -r tmp_candidates
    !mkdir tmp_candidates

    for docs in corpus:  
        indexed_dict = { "id": str(docs[0]), "contents": docs[2] + ' ' + docs[1]}
        with jsonlines.open('/content/tmp_candidates/candidate.jsonl', mode='a') as writer:
            writer.write(indexed_dict)
         
    !python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
    -threads 1 -input /content/tmp_candidates \
    -index /content/candidates/indexes -storePositions -storeDocvectors -storeRaw

    searcher = SimpleSearcher('/content/candidates/indexes')

    return searcher


def prepare_qrels(list_qrels):
    """
    Write the qrels to ``qrel.tsv`` in TREC qrels layout.

    Every output row is ``query-id <TAB> 0 <TAB> corpus-id <TAB> score``; the file
    has no header and no index column and is created in the current working
    directory.

    Args:
      list_qrels: list of qrels (list); positions 0, 1 and 2 of each entry are
        written as the query id, the document id and the relevance score

    """
    column_names = ['query-id', 'zero', 'corpus-id', 'score']

    # Stringify every field and insert the constant '0' second column
    trec_rows = [
        [str(entry[0]), '0', str(entry[1]), str(entry[2])]
        for entry in list_qrels
    ]

    qrels_frame = pd.DataFrame(trec_rows, columns=column_names)
    qrels_frame.to_csv('qrel.tsv', sep='\t', header=None, index=False)


def run_retrieval(list_queries, searcher, dataset, model_name):
    """
    Run BM25 and reranker retrieval. Save the outputs as txt files

    For every query the top 1000 BM25 hits are fetched, reranked, and both
    rankings are appended in TREC run format to
    ``/content/run_BM25_<dataset>.txt`` and
    ``/content/run_<model_name>_<dataset>.txt``.

    The reranker itself is not a parameter: the module-level ``reranker``
    object is used, so it must be defined before this function is called.

    Args:
      list_queries: list of queries; position 0 of each entry is used as the
        query id and position 1 as the query text
      searcher: Pyserini object to perform retrieval
      dataset: dataset name (string)
      model_name: model name (string)

    """
    bm25_path = "/content/run_BM25_{}.txt".format(dataset)
    reranker_path = "/content/run_{}_{}.txt".format(model_name, dataset)

    # Context managers guarantee both run files are flushed and closed even when
    # searching or reranking raises part-way through the query list.
    with open(bm25_path, 'a') as run_bm25, open(reranker_path, 'a') as run_reranker:
        for query in tqdm(list_queries):

            # Only the BM25 query is cut to its first 1024 characters;
            # the reranker receives the full query text.
            hits = searcher.search(query[1][0:1024], k=1000)
            texts = hits_to_texts(hits)
            query_ = Query(query[1])
            reranked = reranker.rerank(query_, texts)
            reranked.sort(key=lambda x: x.score, reverse=True)

            for idx in range(len(hits)):

                run_bm25.write(str(query[0])+' Q0 '+ str(hits[idx].docid) + ' ' + str(idx+1) + ' ' + str(hits[idx].score) + ' BM25\n')
                # The reranker score is exponentiated and scaled by 100 before being written
                run_reranker.write(str(query[0])+' Q0 '+ str(reranked[idx].metadata["docid"]) + ' ' + str(idx+1) + ' ' + str(math.exp(reranked[idx].score) * 100) + ' ' + model_name+'\n')


def evaluation(dataset, model_name):
    """
    Score the BM25 run and the reranker run of a dataset with nDCG@10.

    Args:
      dataset: Dataset name (string)
      model_name: model name (string)

    Returns:
      Tuple (result_bm25, result) holding what ``ir_measures.calc_aggregate``
      returned for the BM25 run and for the reranker run, in that order
    """
    def _score_run(run_path):
        # The qrels file is read again for every run that gets scored
        run = ir_measures.read_trec_run(run_path)
        qrels = ir_measures.read_trec_qrels('/content/qrel.tsv')
        return ir_measures.calc_aggregate([nDCG@10], qrels, run)

    result_bm25 = _score_run('/content/run_BM25_{}.txt'.format(dataset))
    result = _score_run('/content/run_{}_{}.txt'.format(model_name, dataset))

    return result_bm25, result

In [None]:
# Reranker to benchmark. "MiniLM" selects the cross-encoder reranker; any other
# value is inserted into the MonoT5 checkpoint name when the model is loaded.
model_name = 'MiniLM' #@param ["monot5-small", "monot5-base", "monot5-3B", "MiniLM"]

In [None]:
## List of datasets to be evaluated 
datasets = ['cqadupstack', 'robust04']

## Download model
# "MiniLM" loads the MS MARCO cross-encoder; any other value is expanded into a
# castorini MonoT5 checkpoint name.
if model_name == "MiniLM":
    reranker = SentenceTransformersReranker(pretrained_model_name_or_path='cross-encoder/ms-marco-MiniLM-L-6-v2')
else:
    reranker = MonoT5(pretrained_model_name_or_path='castorini/{}-msmarco-10k'.format(model_name), token_false='▁false', token_true ='▁true')

list_results = []
for dataset in tqdm(datasets):

    # run_retrieval() opens its run files in append mode, so anything left over
    # from an earlier execution (including a run with a different model, which
    # shares the BM25 run file) would be scored together with the new rankings.
    # Remove the old qrels and both run files before producing fresh ones.
    stale_paths = [
        '/content/qrel.tsv',
        '/content/run_BM25_{}.txt'.format(dataset),
        '/content/run_{}_{}.txt'.format(model_name, dataset),
    ]
    for stale_path in stale_paths:
        if os.path.exists(stale_path):
            os.remove(stale_path)

    if dataset == 'cqadupstack':
        # Download data
        list_queries, list_corpus, list_qrels = download_cqadupstack()
        # Index corpus
        searcher = index_corpus(list_corpus)
    else:
        # Download data
        list_queries, list_qrels = download_robust04()
        # Load indexed corpus
        searcher = SimpleSearcher.from_prebuilt_index(dataset)

    # Run retrieval models
    run_retrieval(list_queries, searcher, dataset, model_name)
    # prepare qrels
    prepare_qrels(list_qrels)
    # Evaluate
    result_bm25, result = evaluation(dataset, model_name)
    # Each result holds a single measure (nDCG@10); keep just its value
    list_results.append((dataset, list(result_bm25.values())[0], list(result.values())[0]))

    # Clear the cell output before moving on to the next dataset
    clear_output(wait=True)

pd.DataFrame(list_results, columns = ['Dataset', 'BM25', 'Reranker']).to_csv('BEIR_results.csv', index = False)