# Setup

# Setup

## Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so the shared-drive files can be read;
# force_remount=True requests a fresh mount even if one already exists.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


## Installs

In [None]:
!pip install pyserini -q
!pip install faiss-cpu -q
!pip install jsonlines -q
!pip install datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Paths

In [None]:
# Local path of the JSONL corpus that get_documents() reads.
DATASET_FILE = "./expanded_docs.jsonl"

In [None]:
!cp './gdrive/Shareddrives/UNICAMP/DOUTORADO/IA368/[5] Doc2Query/expanded_docs.jsonl' . 

# Helper Function

In [None]:
from tqdm.auto import tqdm
import json

def get_documents(expanded=False):
  """Yield one indexable document per line of the JSONL file at DATASET_FILE.

  Every non-blank line must be a JSON object containing the keys `_id`,
  `title`, `text`, `expanded_text` and `queries`. Those five keys are removed
  and replaced by `id` (the former `_id`) and `contents`; any other keys in
  the object are passed through unchanged.

  Args:
    expanded: when True, `contents` is the document's `expanded_text`;
      otherwise it is `title + ' ' + text`.

  Yields:
    dict: the rewritten document, with `id` and `contents` set.

  Raises:
    KeyError: if a line lacks one of the expected keys.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  # Read as UTF-8 explicitly instead of relying on the platform default encoding.
  with open(DATASET_FILE, 'r', encoding='utf-8') as fp:
    for line in fp:
      # Skip blank lines (e.g. a stray newline at the end of the file)
      # instead of failing inside json.loads.
      if not line.strip():
        continue

      line_dict = json.loads(line)

      del line_dict['queries']
      line_dict['id'] = line_dict.pop('_id')

      # Remove the source fields up front; both modes drop all three.
      title = line_dict.pop('title')
      text = line_dict.pop('text')
      expanded_text = line_dict.pop('expanded_text')

      if expanded:
        line_dict['contents'] = expanded_text
      else:
        line_dict['contents'] = title + ' ' + text

      yield line_dict

# Qrels

In [None]:
from datasets import load_dataset

# Load the relevance judgments from the "BeIR/trec-covid-qrels" dataset; the
# eval cells read its 'test' split.
qrels_dataset = load_dataset("BeIR/trec-covid-qrels")
qrels_dataset

Downloading readme:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading and preparing dataset csv/BeIR--trec-covid-qrels to /root/.cache/huggingface/datasets/BeIR___csv/BeIR--trec-covid-qrels-1766e3af5b0b856a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/981k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/BeIR___csv/BeIR--trec-covid-qrels-1766e3af5b0b856a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

# Queries

In [None]:
# Load the "queries" configuration of "BeIR/trec-covid"; the ranking cells
# iterate over its 'queries' split and read each row's '_id' and 'text'.
queries_dataset = load_dataset("BeIR/trec-covid", "queries")
queries_dataset

Downloading builder script:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading and preparing dataset trec-covid/queries to /root/.cache/huggingface/datasets/BeIR___trec-covid/queries/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599...


Downloading data:   0%|          | 0.00/4.70k [00:00<?, ?B/s]

Generating queries split: 0 examples [00:00, ? examples/s]

Dataset trec-covid downloaded and prepared to /root/.cache/huggingface/datasets/BeIR___trec-covid/queries/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    queries: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 50
    })
})

# BM25 - Plain Doc

In [None]:
# Create the folder that will hold the non-expanded JSONL collection
# (-p: no error if it already exists).
!mkdir -p collections/non_expanded

In [None]:
import jsonlines

# Write the whole non-expanded collection in a single pass.
# The file is opened once in 'w' mode rather than re-opened in append mode for
# every document: this avoids one open/close per document and keeps the cell
# idempotent, so re-running it does not duplicate documents in the collection
# that is indexed afterwards.
with jsonlines.open("collections/non_expanded/non_expanded_docs.jsonl", mode='w') as writer:
  writer.write_all(get_documents(expanded=False))

## Build Index

In [None]:
%%time
# Build a Lucene index from the JSON collection in collections/non_expanded/
# and write it to indexes/lucene-index-trec-covid-non-expanded.
# NOTE(review): --threads 9 is hard-coded — confirm it suits the runtime's CPU count.
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input collections/non_expanded/ \
  --index indexes/lucene-index-trec-covid-non-expanded \
  --generator DefaultLuceneDocumentGenerator \
  --threads 9 \
  --storePositions --storeDocvectors --storeRaw

2023-04-10 18:52:02,931 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-04-10 18:52:02,934 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-04-10 18:52:02,935 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: collections/non_expanded/
2023-04-10 18:52:02,936 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-04-10 18:52:02,936 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-04-10 18:52:02,937 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 9
2023-04-10 18:52:02,937 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-04-10 18:52:02,938 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-04-10 18:52:02,938 INFO  [main] index.IndexCollection (IndexCollection.java:391)

In [None]:
from pyserini.index import IndexReader

# Open the non-expanded index and print the 'total_terms' figure from its stats.
# NOTE(review): the printed label says "entries" while the value read is
# stats()["total_terms"] — confirm the wording is what is intended.
reader = IndexReader('indexes/lucene-index-trec-covid-non-expanded')
total_terms = reader.stats()["total_terms"]
print(f"Total number of entries in the index: {total_terms:,}")

Total number of entries in the index: 20,822,821


## Rank

In [None]:
from pyserini.search import get_topics, get_qrels, SimpleSearcher
from tqdm.auto import tqdm
from pyserini.search.lucene import LuceneSearcher
from collections import defaultdict

# BM25 retrieval over the non-expanded index: for every query keep up to 1000
# hits, stored per query id in retrieval order.
searcher = LuceneSearcher('indexes/lucene-index-trec-covid-non-expanded')
searcher.set_bm25(k1=0.9, b=0.4)

scored_output = defaultdict(list)

for topic in tqdm(queries_dataset['queries'], desc='Running Queries'):
  topic_id = topic["_id"]
  for position, hit in enumerate(searcher.search(topic['text'], 1000)):
    # 'bm25_position' is the zero-based position of the hit in the result list.
    entry = {'doc_id': hit.docid, 'bm25_score': hit.score, 'bm25_position': position}
    scored_output[topic_id].append(entry)

Running Queries:   0%|          | 0/50 [00:00<?, ?it/s]

## Eval

In [None]:
!pip install trectools -q
!pip install evaluate -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for trectools (setup.py) ... [?25l[?25hdone
  Building wheel for bs4 (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Turn the qrels 'test' split into column-oriented lists
# (query / q0 / docid / rel), one entry per judgment.
qrels_format = defaultdict(list)

for judgment in qrels_dataset['test']:
  row = {
      'query': judgment["query-id"],
      'q0': "q0",
      'docid': str(judgment["corpus-id"]),  # doc ids are compared as strings
      'rel': judgment["score"],
  }
  for column, value in row.items():
    qrels_format[column].append(value)

In [None]:
# Flatten the per-query hit lists into column-oriented lists
# (query / q0 / docid / rank / score / system); ranks start at 1 for each query.
run_format = defaultdict(list)

for query_id, ranked_hits in scored_output.items():
  for rank, ranked_hit in enumerate(ranked_hits, start=1):
    run_format['query'].append(query_id)
    run_format['q0'].append("q0")
    run_format['docid'].append(str(ranked_hit['doc_id']))
    run_format['rank'].append(rank)
    run_format['score'].append(ranked_hit['bm25_score'])
    run_format['system'].append("Reranker")

In [None]:
from evaluate import load
# Load the "trec_eval" metric through evaluate.load.
trec_eval = load("trec_eval")

Downloading builder script:   0%|          | 0.00/5.51k [00:00<?, ?B/s]

In [None]:
# Score the non-expanded BM25 run against the qrels; each argument is a
# one-element list wrapping the column-oriented dict built above.
results = trec_eval.compute(predictions=[run_format], references=[qrels_format])

In [None]:
# Display the 'NDCG@10' entry of the results for the non-expanded (plain) documents.
results['NDCG@10']

0.5946917010118077

# BM25 - Expanded Docs

In [None]:
# Create the folder that will hold the expanded JSONL collection
# (-p: no error if it already exists).
!mkdir -p collections/expanded

In [None]:
import jsonlines

# Write the whole expanded collection in a single pass.
# The file is opened once in 'w' mode rather than re-opened in append mode for
# every document: this avoids one open/close per document and keeps the cell
# idempotent, so re-running it does not duplicate documents in the collection
# that is indexed afterwards.
with jsonlines.open("collections/expanded/expanded_docs.jsonl", mode='w') as writer:
  writer.write_all(get_documents(expanded=True))

## Build Index

In [None]:
%%time
# Build a Lucene index from the JSON collection in collections/expanded/
# and write it to indexes/lucene-index-trec-covid-expanded.
# NOTE(review): --threads 9 is hard-coded — confirm it suits the runtime's CPU count.
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input collections/expanded/ \
  --index indexes/lucene-index-trec-covid-expanded \
  --generator DefaultLuceneDocumentGenerator \
  --threads 9 \
  --storePositions --storeDocvectors --storeRaw

2023-04-10 19:14:33,212 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-04-10 19:14:33,221 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-04-10 19:14:33,222 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: collections/expanded/
2023-04-10 19:14:33,222 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-04-10 19:14:33,223 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-04-10 19:14:33,223 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 9
2023-04-10 19:14:33,224 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-04-10 19:14:33,224 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-04-10 19:14:33,224 INFO  [main] index.IndexCollection (IndexCollection.java:391) - K

## Rank

In [None]:
from pyserini.search import get_topics, get_qrels, SimpleSearcher
from tqdm.auto import tqdm
from pyserini.search.lucene import LuceneSearcher
from collections import defaultdict

# BM25 retrieval over the expanded index: for every query keep up to 1000
# hits, stored per query id in retrieval order.
searcher = LuceneSearcher('indexes/lucene-index-trec-covid-expanded')
searcher.set_bm25(k1=0.9, b=0.4)

scored_output = defaultdict(list)

for topic in tqdm(queries_dataset['queries'], desc='Running Queries'):
  topic_id = topic["_id"]
  for position, hit in enumerate(searcher.search(topic['text'], 1000)):
    # 'bm25_position' is the zero-based position of the hit in the result list.
    entry = {'doc_id': hit.docid, 'bm25_score': hit.score, 'bm25_position': position}
    scored_output[topic_id].append(entry)

Running Queries:   0%|          | 0/50 [00:00<?, ?it/s]

## Eval

In [None]:
# Flatten the per-query hit lists into column-oriented lists
# (query / q0 / docid / rank / score / system); ranks start at 1 for each query.
run_format = defaultdict(list)

for query_id, ranked_hits in scored_output.items():
  for rank, ranked_hit in enumerate(ranked_hits, start=1):
    run_format['query'].append(query_id)
    run_format['q0'].append("q0")
    run_format['docid'].append(str(ranked_hit['doc_id']))
    run_format['rank'].append(rank)
    run_format['score'].append(ranked_hit['bm25_score'])
    run_format['system'].append("Reranker")

In [None]:
from evaluate import load
# Load the "trec_eval" metric through evaluate.load.
# NOTE(review): the same load already appears in the "BM25 - Plain Doc" eval
# section; this repeat looks redundant when the notebook is run top to bottom.
trec_eval = load("trec_eval")

In [None]:
# Score the expanded-document BM25 run against the qrels; this rebinds `results`,
# replacing the value computed for the non-expanded run.
results = trec_eval.compute(predictions=[run_format], references=[qrels_format])

In [None]:
# Display the 'NDCG@10' entry of the results for the expanded documents.
results['NDCG@10']

0.6556285378554335