Install the required libraries

In [1]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel rather than whichever pip is first on PATH.
%pip install -r /kaggle/input/requirements/requirements.txt

Collecting git+https://github.com/terrierteam/pyterrier_doc2query.git (from -r /kaggle/input/requirements/requirements.txt (line 2))
  Cloning https://github.com/terrierteam/pyterrier_doc2query.git to /tmp/pip-req-build-oz8doyvr
  Running command git clone --filter=blob:none --quiet https://github.com/terrierteam/pyterrier_doc2query.git /tmp/pip-req-build-oz8doyvr
  Resolved https://github.com/terrierteam/pyterrier_doc2query.git to commit 572da2d6c61fc4f46d96a1bb15235789e629d5d9
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting git+https://github.com/terrierteam/pyterrier_dr.git (from -r /kaggle/input/requirements/requirements.txt (line 3))
  Cloning https://github.com/terrierteam/pyterrier_dr.git to /tmp/pip-req-build-dv90ijd4
  Running command git clone --filter=blob:none --quiet https://github.com/terrierteam/pyterrier_dr.git /tmp/pip-req-build-dv90ijd4
  Resolved https://github.com/terrierteam/pyterrier_dr.git to commit c620231ebc5dba55486302aaee92aa9033a3c69e
  Prepa

Install Java, which PyTerrier requires

In [2]:
# Refresh the package lists *before* installing, then install the JRE/JDK and
# the Java CA certificates in a single non-interactive (-y) step.
# The previous version installed before updating and repeated the openjdk
# install a second time without -y.
!apt-get update && \
    apt-get install -y default-jre openjdk-11-jdk ca-certificates-java && \
    apt-get clean && \
    update-ca-certificates -f

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
default-jre is already the newest version (2:1.11-72build2).
0 upgraded, 0 newly installed, 0 to remove and 46 not upgraded.
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [632 kB]
Get:3 https://packages.cloud.google.com/apt cloud-sdk InRelease [6361 B]       
Get:4 https://packages.cloud.google.com/apt google-fast-socket InRelease [5015 B]
Get:5 https://packages.cloud.google.com/apt cloud-sdk/main amd64 Packages [553 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]      
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Get:8 http://packages.cloud.google.com/apt gcsfuse-focal InRelease [1301 B]    
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]        
Get:10 http:

In [3]:
import pyterrier as pt
if not pt.started():
    pt.init()
pt.logging("INFO")

from pyterrier.measures import *

from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
from pyterrier_pisa import PisaIndex # needs 3.7> <3.10 python version

terrier-assemblies 5.8 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
def load_dataset(dataset_name):
    """Return the PyTerrier dataset registered under ``dataset_name``.

    Thin wrapper around ``pt.get_dataset``; the name is passed through
    unchanged (e.g. ``"irds:cord19/trec-covid/round1"``).
    """
    return pt.get_dataset(dataset_name)

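Indexing
    We use PisaIndex (Performant Indexes and Search for Academia) because of its efficient query-time performance.
    Relevant paper: https://ceur-ws.org/Vol-2409/docker08.pdf
    NOTE: the indexing cells shown below build and load Terrier indices via `pt.IterDictIndexer`; `PisaIndex` is imported but not used in them.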

In [5]:
""" 
    Iterating over docs to remove duplicate and empty docs
    Code retrieved by : https://github.com/terrierteam/pyterrier_deepimpact/blob/main/cord19_example.py 
"""
def text_iter(doc_iter):
    """Yield indexable records from a corpus iterator, dropping unusable docs.

    Args:
        doc_iter: iterable of dict-like documents exposing the keys
            ``'docno'``, ``'title'`` and ``'abstract'``.

    Yields:
        ``{"docno": ..., "text": "<title> <abstract>"}`` for every document
        that has a non-blank title and abstract, at most once per ``docno``
        (the first usable occurrence wins).
    """
    encountered_docnos = set()
    for doc in doc_iter:
        # Skipping over empty docs
        if len(doc['title'].strip()) == 0 or len(doc['abstract'].strip()) == 0:
            continue
        # Skipping over duplicate docs
        if doc['docno'] in encountered_docnos:
            continue
        # Remember the docno; without this the duplicate check never fires
        encountered_docnos.add(doc['docno'])
        # Merging fields
        yield {"docno": doc['docno'], "text": '{title} {abstract}'.format(**doc)}


Setting up some constants

In [6]:
# Dataset identifiers: the full TREC-COVID collection and its per-round variants,
# each derived by appending "/roundN" to the base identifier.
FULL_TREC_COVID_DATASET_NAME = "irds:cord19/trec-covid"
ROUND_1_TREC_COVID_DATASET_NAME = FULL_TREC_COVID_DATASET_NAME + "/round1"
ROUND_2_TREC_COVID_DATASET_NAME = FULL_TREC_COVID_DATASET_NAME + "/round2"
ROUND_3_TREC_COVID_DATASET_NAME = FULL_TREC_COVID_DATASET_NAME + "/round3"
ROUND_4_TREC_COVID_DATASET_NAME = FULL_TREC_COVID_DATASET_NAME + "/round4"

In [None]:
# Round 1: lexical baselines (TF_IDF, BM25, DirichletLM) on the round-1 index

round1 = load_dataset(ROUND_1_TREC_COVID_DATASET_NAME)

# The index is expected to exist on disk already. To (re)build it, run:
# indexer1 = pt.IterDictIndexer('./indices/cord19_trec-covid_round1')
# index_ref1 = indexer1.index(text_iter(round1.get_corpus_iter()))

# Open the index and create one retriever per weighting model
index_ref1 = pt.IndexRef.of('./indices/cord19_trec-covid_round1')
index1 = pt.IndexFactory.of(index_ref1)
tfidf_1, bm25_1, dir_1 = (
    pt.BatchRetrieve(index1, wmodel=wmodel_name)
    for wmodel_name in ("TF_IDF", "BM25", "DirichletLM")
)

# Evaluate the three retrievers on the round-1 title topics (values rounded to 4 decimals)
exper1 = pt.Experiment(
    [tfidf_1, bm25_1, dir_1],
    round1.get_topics(variant='title'),
    round1.get_qrels(),
    eval_metrics=[P@20, R@20, 'map', nDCG@20],
    names=["TF_IDF", "BM25", "DirichletLM"],
    round=4,
)
# print(exper1)

In [None]:
#Evaluation with the metrics from TREC for round1
pt.Experiment(
    [tfidf_1,bm25_1,dir_1],
    round1.get_topics(variant='title'),
    round1.get_qrels(),
    eval_metrics=[P@5,nDCG@10,'map',Bpref],
    round = 4,
    names=["TF_IDF", "BM25","DirichletLM"])

In [None]:
# Second round

round2 = pt.get_dataset('irds:cord19/trec-covid/round2')

# Index cord19/round2
# indexer2 = pt.IterDictIndexer('./indices/cord19_trec-covid_round2')
# index_ref2 = indexer2.index(text_iter(round2.get_corpus_iter()))


# Preparing the models for round 2
index_ref2 = pt.IndexRef.of('./indices/cord19_trec-covid_round2')
index2 = pt.IndexFactory.of(index_ref2)
tfidf_2 = pt.BatchRetrieve(index2, wmodel="TF_IDF")
bm25_2 = pt.BatchRetrieve(index2, wmodel="BM25")
dir_2 = pt.BatchRetrieve(index2, wmodel="DirichletLM")

# Evaluation for round 2
exper2 = pt.Experiment(
    [tfidf_2,bm25_2,dir_2],
    round2.get_topics(variant='title'),
    round2.get_qrels(),
    eval_metrics=[P@20,R@20,'map',nDCG@20],
    round = 4,
    names=["TF_IDF", "BM25","DirichletLM"])
# print(exper2)

In [None]:
# Third round

round3 = pt.get_dataset('irds:cord19/trec-covid/round3')

# Index cord19/round3
# indexer3 = pt.IterDictIndexer('./indices/cord19_trec-covid_round3')
# index_ref3 = indexer3.index(text_iter(round3.get_corpus_iter()))

# Preparing the models for round 3
index_ref3 = pt.IndexRef.of('./indices/cord19_trec-covid_round3')
index3 = pt.IndexFactory.of(index_ref3)
tfidf_3 = pt.BatchRetrieve(index3, wmodel="TF_IDF")
bm25_3 = pt.BatchRetrieve(index3, wmodel="BM25")
dir_3 = pt.BatchRetrieve(index3, wmodel="DirichletLM")

# Evaluation for round 3
exper3 = pt.Experiment(
    [tfidf_3,bm25_3,dir_3],
    round3.get_topics(variant='title'),
    round3.get_qrels(),
    eval_metrics=[P@20,R@20,'map',nDCG@20],
    round = 4,
    names=["TF_IDF", "BM25","DirichletLM"])
# print(exper3)

In [None]:
# Fourth round

round4 = pt.get_dataset('irds:cord19/trec-covid/round4')

# Index cord19/round4
# indexer4 = pt.IterDictIndexer('./indices/cord19_trec-covid_round4')
# index_ref4 = indexer4.index(text_iter(round4.get_corpus_iter()))

# Preparing the models for round 4
index_ref4 = pt.IndexRef.of('./indices/cord19_trec-covid_round4')
index4 = pt.IndexFactory.of(index_ref4)
tfidf_4 = pt.BatchRetrieve(index4, wmodel="TF_IDF")
bm25_4 = pt.BatchRetrieve(index4, wmodel="BM25")
dir_4 = pt.BatchRetrieve(index4, wmodel="DirichletLM")

# Evaluation for round 4
exper4 = pt.Experiment(
    [tfidf_4,bm25_4,dir_4],
    round4.get_topics(variant='title'),
    round4.get_qrels(),
    eval_metrics=[P@20,R@20,'map',nDCG@20],
    round = 4,
    names=["TF_IDF", "BM25","DirichletLM"])
# print(exper4)

In [None]:
# Fifth round

round5 = pt.get_dataset('irds:cord19/trec-covid/round5')

# Index cord19/round5
# indexer5 = pt.IterDictIndexer('./indices/cord19_trec-covid_round5')
# index_ref5 = indexer5.index(text_iter(round5.get_corpus_iter()))

# Preparing the models for round 5
index_ref5 = pt.IndexRef.of('./indices/cord19_trec-covid_round5')
index5 = pt.IndexFactory.of(index_ref5)
tfidf_5 = pt.BatchRetrieve(index5, wmodel="TF_IDF")
bm25_5 = pt.BatchRetrieve(index5, wmodel="BM25")
dir_5 = pt.BatchRetrieve(index5, wmodel="DirichletLM")

# Evaluation for round 5
exper5 = pt.Experiment(
    [tfidf_5,bm25_5,dir_5],
    round5.get_topics(variant='title'),
    round5.get_qrels(),
    eval_metrics=[P@20,R@20,'map',nDCG@20],
    round = 4,
    names=["TF_IDF", "BM25","DirichletLM"])
# print(exper5)

Load trec-covid dataset

In [7]:
# Round-1 TREC-COVID dataset; supplies the corpus for the Doc2Query-- indexing
# pipeline and the topics/qrels for the final experiment.
dataset = load_dataset(ROUND_1_TREC_COVID_DATASET_NAME)

 Initialize a Doc2Query object with the default pre-trained Doc2Query model, which is based on t5-base and trained on MS MARCO.
 It generates the queries, but they are not appended to the documents yet (`append=False`) because non-relevant queries will be filtered out first.

In [8]:
# append=False: generated queries are not appended to the document text at this
# stage, so they can be scored and filtered first.
# num_samples=20 — presumably the number of queries generated per document; TODO confirm.
doc2query = Doc2Query(append=False, num_samples=20)

  warn('consider setting fast_tokenizer=True; it speeds up inference considerably')


Downloading tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

The generated queries will be scored with the pre-trained "crystina-z/monoELECTRA_LCE_nneg31" model via the ELECTRA scorer, since it achieved the best scores in the Doc2Query-- research

In [9]:
 # Relevance model handed to QueryScorer in the pipeline to score the generated queries.
 scorer = ElectraScorer('crystina-z/monoELECTRA_LCE_nneg31')

Downloading tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [10]:
# Terrier indexer (despite the name `index`) used as the final stage of the
# Doc2Query-- pipeline; it writes to ./doc2query--_index_round1.
index = pt.IterDictIndexer('./doc2query--_index_round1')

In [11]:
# Indexing pipeline: generate queries -> score them -> filter by score -> index.
# QueryFilter(append=True, ...): presumably appends the surviving queries to the
# document text before indexing — TODO confirm against pyterrier_doc2query.
# t=3.21484375 is the filtering threshold. NOTE(review): the pyterrier_doc2query
# README appears to describe this value as the 70th percentile of generated-query
# scores on MS MARCO — confirm before reusing it on another collection.
pipeline = doc2query >> QueryScorer(scorer) >> QueryFilter(append=True, t=3.21484375) >> index

In [None]:
# Run the whole pipeline over the de-duplicated round-1 corpus and build the index.
# Long-running: the captured progress bar estimates several hours for the
# 51,078 round-1 documents.
idx = pipeline.index(text_iter(dataset.get_corpus_iter()))

[INFO] [starting] building docstore
[INFO] If you have a local copy of https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/42a21f386be86c24647a41bedde34046
[INFO] [starting] https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv
docs_iter: 0doc [00:00, ?doc/s]
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv: 0.0%| 0.00/77.3M [00:00<?, ?B/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv: 0.0%| 16.4k/77.3M [00:00<11:58, 108kB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv: 0.1%| 57.3k/77.3M [00:00<06:56, 185kB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv: 0.1%| 98.3k/77.3M [00:00<06:06, 211kB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020

14:38:29.927 [ForkJoinPool-1-worker-3] INFO org.terrier.structures.indexing.Indexer - Indexer using 1 fields
14:38:29.986 [ForkJoinPool-1-worker-3] INFO org.terrier.structures.indexing.Indexer - creating the data structures data_stream0_1
14:38:30.035 [ForkJoinPool-1-worker-3] INFO org.terrier.structures.indexing.LexiconBuilder - LexiconBuilder active - flushing every 100000 documents, or when memory threshold hit


cord19/trec-covid/round1 documents:   0%|          | 104/51078 [00:50<6:51:26,  2.06it/s]
ELECTRA scoring:   0%|          | 0/2000 [00:00<?, ?record/s][A
ELECTRA scoring:   1%|          | 16/2000 [00:00<00:19, 100.40record/s][A
ELECTRA scoring:   2%|▏         | 32/2000 [00:00<00:21, 90.55record/s] [A
ELECTRA scoring:   2%|▏         | 48/2000 [00:00<00:22, 87.13record/s][A
ELECTRA scoring:   3%|▎         | 64/2000 [00:00<00:20, 93.62record/s][A
ELECTRA scoring:   4%|▍         | 80/2000 [00:00<00:18, 105.75record/s][A
ELECTRA scoring:   5%|▍         | 96/2000 [00:00<00:17, 105.96record/s][A
ELECTRA scoring:   6%|▌         | 112/2000 [00:01<00:21, 86.80record/s][A
ELECTRA scoring:   6%|▋         | 128/2000 [00:01<00:24, 77.63record/s][A
ELECTRA scoring:   7%|▋         | 144/2000 [00:01<00:23, 78.13record/s][A
ELECTRA scoring:   8%|▊         | 160/2000 [00:01<00:23, 77.26record/s][A
ELECTRA scoring:   9%|▉         | 176/2000 [00:02<00:20, 86.99record/s][A
ELECTRA scoring:  10%|

In [None]:
# Same three weighting models as the baselines, now on the Doc2Query-- expanded index
tfidf_doc, bm25_doc, dir_doc = (
    pt.BatchRetrieve(idx, wmodel=wmodel_name)
    for wmodel_name in ("TF_IDF", "BM25", "DirichletLM")
)

# Evaluate on the round-1 title topics; left as the last expression so the table is displayed
pt.Experiment(
    [tfidf_doc, bm25_doc, dir_doc],
    dataset.get_topics(variant='title'),
    dataset.get_qrels(),
    names=["TF_IDF", "BM25", "DirichletLM"],
    eval_metrics=[P@20, R@20, 'map', nDCG@20],
)