In [None]:
# Python packages used by the cells below: beir, jsonlines and ir_measures.
# NOTE(review): only ir_measures is version-pinned; beir and jsonlines are not.
!pip install beir
!pip install jsonlines
!pip install ir_measures==0.3.0 --quiet

# Clone Tevatron (including submodules) and install it in editable mode; its
# tokenization scripts and encode/retrieve modules are invoked further down.
!git clone --recursive https://github.com/texttron/tevatron
%cd tevatron
!pip install --editable .
# Go back to /content/, the directory the helper functions use for their files.
%cd /content/

In [None]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
from tqdm.notebook import tqdm
import jsonlines
import pandas as pd
import ir_measures
from ir_measures import *
from IPython.display import clear_output


def download_dataset(dataset):
    """
    Fetch a BEIR dataset archive and load its test split.

    The archive is downloaded and unzipped into the "datasets" directory
    (relative to the current working directory) and then read with
    GenericDataLoader.

    Args:
      dataset: Dataset name (string), used to build the archive URL

    Returns:
      The (corpus, queries, qrels) tuple loaded for the "test" split
    """
    print('Downloading', dataset)
    archive_url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
    extracted_dir = util.download_and_unzip(archive_url, "datasets")

    loader = GenericDataLoader(data_folder=extracted_dir)
    corpus, queries, qrels = loader.load(split="test")
    return corpus, queries, qrels


def tokenize_corpus(dataset, splits): 
    list_corpus = []
    with jsonlines.open('/content/datasets/{}/corpus.jsonl'.format(dataset),'r') as reader:
        for obj in reader:
            list_corpus.append({'text_id':obj['_id'], 'title':obj['title'], 'text':obj['text']})
            
    df_corpus_dev = pd.DataFrame(list_corpus)
    df_corpus_dev.to_csv('/content/corpus.tsv', sep='\t', index=False, header=None)

    !python /content/tevatron/examples/coCondenser-marco/tokenize_passages.py --tokenizer_name bert-base-uncased --file /content/corpus.tsv --save_to /content/corpus --n_splits {splits}


def encode_corpus():
    sh = """
    mkdir -p /content/encoding/corpus/
    cd tevatron/src

    for i in $(seq -f "%02g" 0 9)
    do
    python -m tevatron.driver.encode \
      --output_dir /content/ \
      --model_name_or_path Luyu/co-condenser-marco-retriever \
      --fp16 \
      --per_device_eval_batch_size 128 \
      --encode_in_path /content/corpus/split${i}.json \
      --encoded_save_path /content/encoding/corpus/split${i}.pt
    done
    cd /content/
    """
    with open('script.sh', 'w') as file:
      file.write(sh)

    !bash script.sh
    

def tokenize_queries(dataset):
    !mkdir query
    list_queries = []
    with jsonlines.open('/content/datasets/{}/queries.jsonl'.format(dataset),'r') as reader:
        for obj in reader:    
            list_queries.append({'query_id':obj['_id'], 'text':obj['text'][:900]})

    f = open('/content/dev.query.txt','w')
    for query in list_queries:
        f.write(str(query['query_id']) + '\t' + str(query['text']) + ' \n')
    f.close()

    !python /content/tevatron/examples/coCondenser-marco/tokenize_queries.py --tokenizer_name bert-base-uncased --query_file /content/dev.query.txt --save_to /content/query/dev.query.json     


def encode_queries():
    """
    Encode the tokenized queries with the Luyu/co-condenser-marco-retriever checkpoint.

    Runs tevatron.driver.encode in query mode on /content/query/dev.query.json and
    saves the result to /content/encoding/query/qry.pt.

    Note: the working directory is changed to tevatron/src (resolved relative to
    the current directory) and is NOT restored by this function; in this
    notebook retrieval() switches back to /content/ afterwards.
    """
    !mkdir -p /content/encoding/query/
    %cd tevatron/src

    !python -m tevatron.driver.encode \
      --output_dir /content/ \
      --model_name_or_path Luyu/co-condenser-marco-retriever \
      --fp16 \
      --q_max_len 32 \
      --encode_is_qry \
      --per_device_eval_batch_size 128 \
      --encode_in_path /content/query/dev.query.json \
      --encoded_save_path /content/encoding/query/qry.pt


def retrieval(dataset, k):
    """
    Run tevatron.faiss_retriever over the encoded queries and corpus splits.

    Uses /content/encoding/query/qry.pt as query representations and every
    /content/encoding/corpus/*.pt file as passage representations, and saves the
    ranking to /content/rank_<dataset>.tsv. Afterwards the working directory is
    set to /content/.

    Args:
      dataset: Dataset name (string), used in the ranking file name
      k: Value passed to the retriever as --depth

    """
    !python -m tevatron.faiss_retriever \
      --query_reps /content/encoding/query/qry.pt \
      --passage_reps /content/encoding/corpus/'*.pt' \
      --depth {k} \
      --batch_size -1 \
      --save_text \
      --save_ranking_to /content/rank_{dataset}.tsv
    %cd /content/


def prepare_qrels(dataset):
    """
    Convert the dataset's test qrels to TREC eval format.

    Reads /content/datasets/<dataset>/qrels/test.tsv and writes /content/qrel.tsv
    as headerless, tab-separated rows: query-id, constant '0', corpus-id, score.

    Args:
      dataset: Dataset name (string)

    """
    beir_qrels = pd.read_csv(f'/content/datasets/{dataset}/qrels/test.tsv', sep='\t')

    # Insert the constant '0' column and put the columns in the output order.
    trec_columns = ['query-id', 'zero', 'corpus-id', 'score']
    trec_qrels = beir_qrels.assign(zero='0')[trec_columns]

    trec_qrels.to_csv('/content/qrel.tsv', sep='\t', header=None, index=False)


def prepare_topics(dataset, model_name="CoCondenser"):
    """
    Convert the retrieval ranking of a dataset into a TREC run file.

    Reads /content/rank_<dataset>.tsv (headerless, tab-separated; column 0 is the
    query id, column 1 the document id, column 2 the score) and writes
    /content/run_<model_name>_<dataset>.txt with one line per hit:
    "<query_id> Q0 <doc_id> <rank> <score> <model_name>". Ranks start at 1 for
    each query and follow the row order of the ranking file.

    Args:
      dataset: Dataset name (string)
      model_name: Run tag, also used in the output file name (string,
        default "CoCondenser"). This is the file evaluation() reads when it is
        given the same model_name.

    """
    df_rank = pd.read_csv('/content/rank_{}.tsv'.format(dataset), sep='\t', header=None)
    run_path = "/content/run_{}_{}.txt".format(model_name, dataset)

    # 'w' instead of 'a': re-running a dataset must replace its run file, not
    # append a second copy of every line. The context manager closes the file
    # even if writing fails.
    with open(run_path, 'w') as run_file:
        # sort=False keeps the queries in order of first appearance, and a single
        # groupby pass avoids re-filtering the whole frame for every query.
        for query_id, hits in df_rank.groupby(0, sort=False):
            doc_ids = hits[1].tolist()
            scores = hits[2].tolist()
            for rank, (doc_id, score) in enumerate(zip(doc_ids, scores), start=1):
                fields = [str(query_id), 'Q0', str(doc_id), str(rank), str(score), model_name]
                run_file.write(' '.join(fields) + '\n')


def evaluation(dataset, model_name):
    """
    Score a TREC run file against the prepared qrels with nDCG@10.

    Args:
      dataset: Dataset name (string)
      model_name: model name (string); together with dataset it selects the run
        file /content/run_<model_name>_<dataset>.txt

    Returns:
      The value returned by ir_measures.calc_aggregate for [nDCG@10], computed
      from that run file and /content/qrel.tsv
    """
    run_path = f'/content/run_{model_name}_{dataset}.txt'
    trec_run = ir_measures.read_trec_run(run_path)
    trec_qrels = ir_measures.read_trec_qrels('/content/qrel.tsv')

    return ir_measures.calc_aggregate([nDCG@10], trec_qrels, trec_run)


def delete_temp_data():
    !rm -r /content/corpus
    !rm /content/encoding/corpus/split.pt
    !rm /content/encoding/query/qry.pt
    !rm /content/query/dev.query.json
    !rm /content/corpus.tsv
    !rm /content/dev.query.txt
    !rm /content/qrel.tsv

In [None]:
# Name passed to evaluation(), which reads /content/run_<model_name>_<dataset>.txt.
model_name = "CoCondenser"  
# BEIR dataset names, processed one after another by the loop below.
datasets = ["trec-covid","nfcorpus","scifact","scidocs","fiqa","arguana","nq","webis-touche2020","quora","dbpedia-entity","climate-fever","fever","hotpotqa"] 

# One {'Dataset', 'nDCG'} record is appended per finished dataset.
list_results = []
for dataset in datasets:
    # Clear the cell output left by the previous dataset.
    clear_output(wait=True)

    # Download the dataset into datasets/<dataset>.
    # NOTE(review): corpus, queries and qrels are not used further in this cell;
    # the steps below read the downloaded files from disk instead.
    corpus, queries, qrels = download_dataset(dataset)

    # Tokenize and encode corpus
    # NOTE(review): encode_corpus() iterates over split indices 00-09, which
    # matches splits=10 here.
    tokenize_corpus(dataset, splits=10) 
    encode_corpus()

    # Tokenize and encode queries
    tokenize_queries(dataset)
    encode_queries()

    # Retrieval (depth 10; the metric computed below is nDCG@10)
    retrieval(dataset, k=10)

    # Evaluation: convert qrels and ranking to TREC files, then score the run
    prepare_qrels(dataset)
    prepare_topics(dataset)
    result = evaluation(dataset, model_name)

    # Rewrite the results CSV after every dataset so it always contains all
    # datasets finished so far.
    list_results.append({'Dataset':dataset,'nDCG':result[nDCG@10]})
    df_save = pd.DataFrame(list_results)
    df_save.to_csv('/content/coCondenser-results.csv')
    
    # Delete temporary data created during execution
    delete_temp_data()