Install the required Python libraries

In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the environment of the running kernel.
%pip install -r /kaggle/input/requirements/requirements.txt

Install Java, which PyTerrier requires

In [None]:
# Refresh the package lists first so the installs resolve against current package indexes,
# then install the Java runtime/JDK and refresh the CA certificates in a single chain.
!apt-get update && \
    apt-get install -y default-jre openjdk-11-jdk ca-certificates-java && \
    apt-get clean && \
    update-ca-certificates -f

Importing all the necessary libraries

In [37]:
import pyterrier as pt
# Initialise PyTerrier only when it has not already been started in this kernel,
# so that re-running this cell does not call pt.init() a second time.
if not pt.started():
    pt.init()
pt.logging("INFO")

# NOTE(review): the star import presumably supplies the measure objects (P, R, nDCG)
# used later in `retrieval` — confirm before replacing it with explicit names.
from pyterrier.measures import *
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer

ModuleNotFoundError: No module named 'pyt_deepimpact'

In [None]:
def load_dataset(dataset_name):
    """Return the PyTerrier dataset registered under ``dataset_name``.

    Thin wrapper around ``pt.get_dataset``; the identifier is passed through
    unchanged (e.g. ``"irds:cord19/trec-covid/round1"``).
    """
    return pt.get_dataset(dataset_name)

In [None]:
""" 
    Iterate over the docs, skipping empty and duplicate ones.
    Code adapted from: https://github.com/terrierteam/pyterrier_deepimpact/blob/main/cord19_example.py
"""
def text_iter(doc_iter):
    """Yield one ``{"docno", "text"}`` record per usable document.

    A document is skipped when its title or its abstract is empty or
    whitespace-only. Only the first usable document seen for each ``docno``
    is emitted; later ones with the same ``docno`` are dropped. The emitted
    ``text`` is the title and the abstract joined by a single space.

    Args:
        doc_iter: iterable of dicts providing at least the ``docno``,
            ``title`` and ``abstract`` keys.

    Yields:
        dict: ``{"docno": <docno>, "text": "<title> <abstract>"}``.
    """
    seen = set()
    for record in doc_iter:
        # Guard 1: both fields must contain something other than whitespace.
        if not record['title'].strip() or not record['abstract'].strip():
            continue
        # Guard 2: keep only the first occurrence of each docno.
        docno = record['docno']
        if docno in seen:
            continue
        seen.add(docno)
        # Merge the two fields into the single text field that gets indexed.
        yield {"docno": docno, "text": '{title} {abstract}'.format(**record)}

Setting up some constants for the CORD-19 dataset for each round

In [None]:
# ir_datasets identifier of the full TREC-COVID collection; appending a round
# number to the "/round" prefix below gives the per-round dataset identifier.
FULL_TREC_COVID_DATASET_NAME = "irds:cord19/trec-covid"
ROUND_TREC_COVID_DATASET_NAME = FULL_TREC_COVID_DATASET_NAME + "/round"

# Directory-name prefixes for the on-disk indexes; the round number is appended.
STAND_INDEX_NAME = "standard_index_round"
DOC2QUERY_INDEX_NAME = "doc2query--_index_round"

Creating the index

In [None]:
def indexing(trec_covid_round):
    """Build the standard index for one TREC-COVID round.

    Args:
        trec_covid_round: round number; it is appended to
            ``ROUND_TREC_COVID_DATASET_NAME`` to select the dataset and to
            ``STAND_INDEX_NAME`` to name the index directory.

    Returns:
        tuple: ``(index_ref, round_dataset)`` — the reference returned by the
        indexer and the dataset object the corpus was read from.
    """
    dataset_id = f'{ROUND_TREC_COVID_DATASET_NAME}{trec_covid_round}'
    index_path = f'./{STAND_INDEX_NAME}{trec_covid_round}'

    round_dataset = load_dataset(dataset_id)
    indexer = pt.IterDictIndexer(index_path)

    # Documents are filtered/de-duplicated and flattened to {"docno", "text"} by text_iter.
    documents = text_iter(round_dataset.get_corpus_iter())
    return indexer.index(documents), round_dataset

In [None]:
def retrieval(index_ref, round_dataset):
    """Evaluate TF_IDF, BM25 and DirichletLM retrieval over one index.

    Args:
        index_ref: index reference to retrieve from.
        round_dataset: dataset object providing the topics (``title`` variant)
            and the qrels used for evaluation.

    Returns:
        The result of ``pt.Experiment`` reporting P@20, R@20, MAP and nDCG@20
        for the three models (``round=4`` is forwarded to the experiment).
    """
    # The weighting-model identifiers double as the system names in the results.
    model_names = ["TF_IDF", "BM25", "DirichletLM"]
    retrievers = [pt.BatchRetrieve(index_ref, wmodel=model) for model in model_names]

    topics = round_dataset.get_topics(variant='title')
    qrels = round_dataset.get_qrels()

    return pt.Experiment(
        retrievers,
        topics,
        qrels,
        eval_metrics=[P@20, R@20, 'map', nDCG@20],
        round=4,
        names=model_names,
    )

Standard Indexing and Retrieval for each round

In [None]:
# Round 1: build the standard index, then evaluate the retrieval models on it.
trec_covid_round = 1
index_ref_1, round_dataset_1 = indexing(trec_covid_round)
experiment_1 = retrieval(index_ref_1,round_dataset_1)
print(experiment_1)

In [None]:
# Round 2: build the standard index, then evaluate the retrieval models on it.
trec_covid_round = 2
index_ref_2, round_dataset_2 = indexing(trec_covid_round)
experiment_2 = retrieval(index_ref_2,round_dataset_2)
print(experiment_2)

In [None]:
# Round 3: build the standard index, then evaluate the retrieval models on it.
trec_covid_round = 3
index_ref_3, round_dataset_3 = indexing(trec_covid_round)
experiment_3 = retrieval(index_ref_3,round_dataset_3)
print(experiment_3)

In [None]:
# Round 4: build the standard index, then evaluate the retrieval models on it.
trec_covid_round = 4
index_ref_4, round_dataset_4 = indexing(trec_covid_round)
experiment_4 = retrieval(index_ref_4,round_dataset_4)
print(experiment_4)

In [None]:
# Round 5: build the standard index, then evaluate the retrieval models on it.
trec_covid_round = 5
index_ref_5, round_dataset_5 = indexing(trec_covid_round)
experiment_5 = retrieval(index_ref_5,round_dataset_5)
print(experiment_5)

Doc2Query-- indexing

In [None]:
def doc2query_minus_minus_indexing(trec_covid_round, doc2query, scorer, filter_threshold=3.21484375):
    """Build the Doc2Query-- index for one TREC-COVID round.

    The corpus is passed through ``doc2query`` (query generation), then
    ``QueryScorer(scorer)`` (scoring of the generated queries), then
    ``QueryFilter`` (filtering by score, with ``append=True``), and finally
    the indexer.

    Args:
        trec_covid_round: round number; it is appended to
            ``ROUND_TREC_COVID_DATASET_NAME`` to select the dataset and to
            ``DOC2QUERY_INDEX_NAME`` to name the index directory.
        doc2query: query-generation transformer placed first in the pipeline.
        scorer: scoring model wrapped by ``QueryScorer``.
        filter_threshold: value passed as ``t`` to ``QueryFilter``. The default
            is the value previously hard-coded in this function.
            NOTE(review): presumably the cut-off recommended for the ELECTRA
            scorer in the Doc2Query-- material — confirm against its docs.

    Returns:
        tuple: ``(index_ref, round_dataset)`` — the reference returned by the
        indexing pipeline and the dataset object the corpus was read from.
    """
    round_dataset = load_dataset(f'{ROUND_TREC_COVID_DATASET_NAME}{trec_covid_round}')
    indexer = pt.IterDictIndexer(f'./{DOC2QUERY_INDEX_NAME}{trec_covid_round}')
    pipeline = (
        doc2query
        >> QueryScorer(scorer)
        >> QueryFilter(append=True, t=filter_threshold)
        >> indexer
    )

    # Documents are filtered/de-duplicated and flattened to {"docno", "text"} by text_iter.
    index_ref = pipeline.index(text_iter(round_dataset.get_corpus_iter()))
    return index_ref, round_dataset


# Initialize a Doc2Query object with a pre-trained Doc2Query model based on t5-base and trained on MS MARCO (default).
# It generates the queries, but they are not appended here (append=False) because the non-relevant
# ones are removed later by the filtering step of the indexing pipeline.
doc2query = Doc2Query(append=False, num_samples=20)
# The generated queries will be scored with the "crystina-z/monoELECTRA_LCE_nneg31" pre-trained model
# using the Electra scorer, since it has the best scores in the Doc2Query-- research.
scorer = ElectraScorer('crystina-z/monoELECTRA_LCE_nneg31')

In [None]:
# Round 1: build the Doc2Query-- index, then evaluate the retrieval models on it.
# NOTE: this rebinds index_ref_1 / round_dataset_1, which the standard-index cell for round 1 also assigns.
trec_covid_round = 1
index_ref_1, round_dataset_1 = doc2query_minus_minus_indexing(trec_covid_round, doc2query, scorer)
exp_doc_1 = retrieval(index_ref_1, round_dataset_1)
print(exp_doc_1)

In [None]:
# Round 2: build the Doc2Query-- index, then evaluate the retrieval models on it.
# NOTE: this rebinds index_ref_2 / round_dataset_2, which the standard-index cell for round 2 also assigns.
trec_covid_round = 2
index_ref_2, round_dataset_2 = doc2query_minus_minus_indexing(trec_covid_round, doc2query, scorer)
exp_doc_2 = retrieval(index_ref_2, round_dataset_2)
print(exp_doc_2)

In [None]:
# Re-run the round-2 evaluation on the already-built index without re-indexing.
# `retrieval` takes exactly (index_ref, round_dataset); the former extra leading
# argument made this call raise a TypeError.
exp_doc_2 = retrieval(index_ref_2, round_dataset_2)
print(exp_doc_2)

In [None]:
# Round 3: build the Doc2Query-- index, then evaluate the retrieval models on it.
# NOTE: this rebinds index_ref_3 / round_dataset_3, which the standard-index cell for round 3 also assigns.
trec_covid_round = 3
index_ref_3, round_dataset_3 = doc2query_minus_minus_indexing(trec_covid_round, doc2query, scorer)
exp_doc_3 = retrieval(index_ref_3, round_dataset_3)
print(exp_doc_3)

In [None]:
# Round 4: build the Doc2Query-- index, then evaluate the retrieval models on it.
# NOTE: this rebinds index_ref_4 / round_dataset_4, which the standard-index cell for round 4 also assigns.
trec_covid_round = 4
index_ref_4, round_dataset_4 = doc2query_minus_minus_indexing(trec_covid_round, doc2query, scorer)
exp_doc_4 = retrieval(index_ref_4, round_dataset_4)
print(exp_doc_4)

In [None]:
# Round 5: build the Doc2Query-- index, then evaluate the retrieval models on it.
trec_covid_round = 5
# The comma between the two unpacking targets was missing, which made this cell a SyntaxError.
index_ref_5, round_dataset_5 = doc2query_minus_minus_indexing(trec_covid_round, doc2query, scorer)
exp_doc_5 = retrieval(index_ref_5, round_dataset_5)
print(exp_doc_5)

In [None]:
def zip_index(trec_covid_round, standard_index_name):
    """Zip one index directory with the shell ``zip`` command.

    Archives ``/kaggle/working/<standard_index_name><trec_covid_round>``
    recursively into ``<standard_index_name><trec_covid_round>.zip``, created
    relative to the current working directory.

    Args:
        trec_covid_round: round number appended to the index-name prefix.
        standard_index_name: index-name prefix. Despite the parameter name,
            any prefix can be passed (the caller in this cell passes
            ``DOC2QUERY_INDEX_NAME``).
    """
    !zip -r {standard_index_name}{trec_covid_round}.zip /kaggle/working/{standard_index_name}{trec_covid_round}
# Round whose index is archived and offered for download below.
trec_covid_round = 2  
# Zip the Doc2Query-- index of the selected round
zip_index(trec_covid_round, DOC2QUERY_INDEX_NAME)
# Display a link to download the zipped index
from IPython.display import FileLink
FileLink(f'./{DOC2QUERY_INDEX_NAME}{trec_covid_round}.zip')