Install the required libraries

In [None]:
!pip install -r /kaggle/input/requirements/requirements.txt

Installing Java as it is needed for pyterrier

In [None]:
# Refresh the package lists before installing anything, then install all Java
# packages in a single pass.
!apt-get update && \
    apt-get install -y default-jre openjdk-11-jdk ca-certificates-java && \
    apt-get clean && \
    update-ca-certificates -f

Importing all the necessary libraries

In [None]:
import pyterrier as pt
# Initialise PyTerrier only when it has not been started yet, so this cell can be re-run.
if not pt.started():
    pt.init()
# Passes the level name "INFO" to PyTerrier's logging setup.
pt.logging("INFO")

# The wildcard import presumably supplies the measure objects (P, R, nDCG)
# that the evaluation cells use in expressions such as P@20.
from pyterrier.measures import *
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer

In [None]:
def load_dataset(dataset_name):
    """Look up `dataset_name` through `pt.get_dataset` and return the dataset object."""
    return pt.get_dataset(dataset_name)

In [None]:
""" 
    Iterates over the documents, dropping empty and duplicate ones.
    Code adapted from: https://github.com/terrierteam/pyterrier_deepimpact/blob/main/cord19_example.py 
"""
def text_iter(doc_iter):
    """
    Yield one {"docno", "text"} dict per usable document in `doc_iter`.

    A document is skipped when its title or its abstract is blank after
    stripping whitespace, or when its docno has already been yielded.
    The "text" value is the title and abstract joined by a single space.
    """
    seen = set()
    for record in doc_iter:
        # Blank title or blank abstract: nothing worth indexing.
        if not record['title'].strip() or not record['abstract'].strip():
            continue
        docno = record['docno']
        # Only the first occurrence of each docno is kept.
        if docno in seen:
            continue
        seen.add(docno)
        yield {"docno": docno, "text": '{title} {abstract}'.format(**record)}

Setting up some constants for the CORD-19 dataset


In [None]:
# Dataset identifier passed to pt.get_dataset().
FULL_TREC_COVID_DATASET_NAME = "irds:cord19/trec-covid"
# Directory names (created under ./indexes/) for each index built in this notebook.
STAND_INDEX_NAME = 'standard_index'
DOC2QUERY_INDEX_NAME = 'doc2query--_index'
# Read by deep_impact_indexing() to locate the DeepImpact index.
DEEPIMPACT_INDEX_NAME = 'deepimpact_index'

Creating the index

In [None]:
"""
Standard indexing with pyterrier
"""
def indexing(overwrite=False):
    """
    Build, or reuse, the standard PyTerrier index over the TREC-COVID corpus.

    The index lives in ./indexes/<STAND_INDEX_NAME>. When a finished index is
    already there (its data.properties file exists) it is reused, so the cell
    can be re-run without failing on the existing index directory.

    Args:
        overwrite: when True, rebuild the index even if one already exists.

    Returns:
        A tuple (index_ref, loaded_dataset).
    """
    import os

    loaded_dataset = load_dataset(FULL_TREC_COVID_DATASET_NAME)
    index_dir = f'./indexes/{STAND_INDEX_NAME}'
    properties_path = os.path.join(index_dir, 'data.properties')

    if os.path.isfile(properties_path) and not overwrite:
        # A finished index is already on disk: point at it instead of rebuilding.
        index_ref = pt.IndexRef.of(properties_path)
    else:
        # overwrite=True lets the indexer replace leftovers of an earlier run.
        indexer = pt.IterDictIndexer(index_dir, overwrite=True)
        index_ref = indexer.index(text_iter(loaded_dataset.get_corpus_iter()))
    return index_ref, loaded_dataset

In [None]:
def retrieval(index_ref, loaded_dataset, variant='title'):
    """
    Compare TF_IDF, BM25 and DirichletLM retrieval over one index.

    Args:
        index_ref: index (reference) handed to each pt.BatchRetrieve.
        loaded_dataset: dataset object providing get_topics() and get_qrels().
        variant: topic variant passed to get_topics(); defaults to 'title'.

    Returns:
        Whatever pt.Experiment returns for the three systems, evaluated with
        P@20, R@20, 'map' and nDCG@20 and rounded to 4 places.
    """
    # Display name -> Terrier weighting model, in the order they are reported.
    weighting_models = {
        "TF IDF": "TF_IDF",
        "BM25": "BM25",
        "DirichletLM": "DirichletLM",
    }
    systems = [
        pt.BatchRetrieve(index_ref, wmodel=wmodel)
        for wmodel in weighting_models.values()
    ]

    topics = loaded_dataset.get_topics(variant=variant)
    qrels = loaded_dataset.get_qrels()
    return pt.Experiment(
        systems,
        topics,
        qrels,
        eval_metrics=[P@20, R@20, 'map', nDCG@20],
        round=4,
        names=list(weighting_models),
    )

Standard indexing and retrieval evaluation

In [None]:
index_ref1, loaded_dataset = indexing()
experiment = retrieval(index_ref1, loaded_dataset)
# Leave the results as the cell's last expression so the notebook renders them
# as a table instead of plain printed text.
experiment

Doc2Query-- indexing and retrieval evaluation

In [None]:
"""
Doc2Query-- indexing
"""
def doc2query_minus_minus_indexing(overwrite=False):
    """
    Build, or reuse, the Doc2Query-- index over the TREC-COVID corpus.

    The index lives in ./indexes/<DOC2QUERY_INDEX_NAME>. When a finished index
    is already there (its data.properties file exists) it is reused and neither
    neural model is loaded, so the cell can be re-run cheaply and without
    failing on the existing index directory.

    Args:
        overwrite: when True, rebuild the index even if one already exists.

    Returns:
        A tuple (index_ref, loaded_dataset).
    """
    import os

    loaded_dataset = load_dataset(FULL_TREC_COVID_DATASET_NAME)
    index_dir = f'./indexes/{DOC2QUERY_INDEX_NAME}'
    properties_path = os.path.join(index_dir, 'data.properties')

    if os.path.isfile(properties_path) and not overwrite:
        return pt.IndexRef.of(properties_path), loaded_dataset

    # Doc2Query with its default pre-trained model (t5-base trained on MS MARCO).
    # append=False: the 20 generated queries are not added to the text yet,
    # because the non-relevant ones are filtered out first.
    doc2query = Doc2Query(append=False, num_samples=20)
    # The generated queries are scored with the
    # "crystina-z/monoELECTRA_LCE_nneg31" pre-trained model, using the Electra
    # scorer since it has the best scores in the Doc2Query-- research.
    scorer = ElectraScorer('crystina-z/monoELECTRA_LCE_nneg31')

    # overwrite=True lets the indexer replace leftovers of an earlier run.
    indexer = pt.IterDictIndexer(index_dir, overwrite=True)
    # QueryFilter presumably keeps queries scoring at least t and appends them
    # to the document text; confirm the source of the threshold t.
    pipeline = doc2query >> QueryScorer(scorer) >> QueryFilter(append=True, t=3.21484375) >> indexer

    index_ref = pipeline.index(text_iter(loaded_dataset.get_corpus_iter()))
    return index_ref, loaded_dataset

In [None]:
index_ref_doc2query, loaded_dataset = doc2query_minus_minus_indexing()
exp_doc = retrieval(index_ref_doc2query, loaded_dataset)
# Leave the results as the cell's last expression so the notebook renders them
# as a table instead of plain printed text.
exp_doc

Deep Impact Indexing and Retrieval

In [None]:
def deep_impact_indexing(overwrite=False):
    """
    Build, or reuse, the DeepImpact index over the TREC-COVID corpus.

    The index lives in ./indexes/<DEEPIMPACT_INDEX_NAME>; that constant must be
    defined alongside the other index-name constants. When a finished index is
    already there (its data.properties file exists) it is reused, so the cell
    can be re-run without failing on the existing index directory.

    Args:
        overwrite: when True, rebuild the index even if one already exists.

    Returns:
        A tuple (index_di, loaded_dataset), where index_di is the opened index
        obtained from pt.IndexFactory.
    """
    import os

    loaded_dataset = load_dataset(FULL_TREC_COVID_DATASET_NAME)

    index_path = f'./indexes/{DEEPIMPACT_INDEX_NAME}'
    properties_path = index_path + "/data.properties"

    if overwrite or not os.path.isfile(properties_path):
        # Imported here so the name is always in scope for this function; the
        # package is only needed when an index actually has to be built.
        from pyterrier_deepimpact import DeepImpactIndexer

        # overwrite=True lets the indexer replace leftovers of an earlier run.
        parent = pt.IterDictIndexer(index_path, overwrite=True)
        # Clears the "termpipelines" property of the parent indexer.
        parent.setProperty("termpipelines", "")

        # Set base model with 'gsarti/covidbert-nli' pre-trained model on CORD-19 dataset
        # https://huggingface.co/gsarti/covidbert-nli
        indexer = DeepImpactIndexer(parent, batch_size=32, base_model='gsarti/covidbert-nli')
        indexer.index(text_iter(loaded_dataset.get_corpus_iter()))

    index_ref = pt.IndexRef.of(properties_path)
    index_di = pt.IndexFactory.of(index_ref)
    return index_di, loaded_dataset


In [None]:
def retrieval_deep_impact(index_ref, loaded_dataset, variant='title'):
    """
    Evaluate retrieval over the DeepImpact index with the "Tf" weighting model.

    Args:
        index_ref: index (or index reference) handed to pt.BatchRetrieve.
        loaded_dataset: dataset object providing get_topics() and get_qrels().
        variant: topic variant passed to get_topics(); defaults to 'title'.

    Returns:
        Whatever pt.Experiment returns for the single "Deep Impact" system,
        evaluated with P@20, R@20, 'map' and nDCG@20 and rounded to 4 places.
    """
    impact_retriever = pt.BatchRetrieve(index_ref, wmodel="Tf")

    topics = loaded_dataset.get_topics(variant=variant)
    qrels = loaded_dataset.get_qrels()
    return pt.Experiment(
        [impact_retriever],
        topics,
        qrels,
        eval_metrics=[P@20, R@20, 'map', nDCG@20],
        round=4,
        names=["Deep Impact"],
    )


In [None]:
index, loaded_dataset = deep_impact_indexing()
exp_deep_impact = retrieval_deep_impact(index, loaded_dataset)
# Leave the results as the cell's last expression so the notebook renders them
# as a table instead of plain printed text.
exp_deep_impact