# BM25 on TREC-COVID: plain vs. Doc2Query-expanded documents

# Setup

## Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive, forcing a fresh mount even if one is already active.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

## Installs

In [None]:
!pip install pyserini -q
!pip install faiss-cpu -q
!pip install jsonlines -q
!pip install datasets -q

## Paths

In [None]:
# Local working copy of the dataset (one JSON object per line); read by get_documents().
DATASET_FILE = './expanded_docs.jsonl'

In [None]:
!cp './gdrive/Shareddrives/UNICAMP/DOUTORADO/IA368/[5] Doc2Query/expanded_docs.jsonl' . 

# Helper Function

In [None]:
from tqdm.auto import tqdm
import json

def get_documents(expanded=False, dataset_file=None):
  """Yield the dataset's documents as `{'id': ..., 'contents': ...}` records.

  Each line of the JSONL file is parsed and reshaped into the `id`/`contents`
  layout that is written to the collections indexed later in this notebook.
  Fields other than `_id`, `title`, `text`, `expanded_text` and `queries`
  are passed through unchanged.

  Args:
    expanded: If True, `contents` is the document's `expanded_text`.
      Otherwise it is `title + ' ' + text`.
    dataset_file: Path of the JSONL file to read. Defaults to the
      module-level `DATASET_FILE`, resolved at call time.

  Yields:
    dict: One record per non-blank input line.

  Raises:
    KeyError: If a document lacks `_id`, or lacks the field that `contents`
      is built from (`expanded_text` when expanded, `text` otherwise).
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  path = DATASET_FILE if dataset_file is None else dataset_file

  # Read as UTF-8 explicitly instead of relying on the platform's locale encoding.
  with open(path, 'r', encoding='utf-8') as fp:
    for line in fp:
      # Tolerate blank lines (e.g. a trailing newline at the end of the file).
      if not line.strip():
        continue

      line_dict = json.loads(line)

      # `queries` is never indexed on its own; it may legitimately be absent.
      line_dict.pop('queries', None)
      doc_id = line_dict.pop('_id')
      title = line_dict.pop('title', '')

      if expanded:
        contents = line_dict.pop('expanded_text')
        line_dict.pop('text', None)
      else:
        contents = title + ' ' + line_dict.pop('text')
        line_dict.pop('expanded_text', None)

      line_dict['id'] = doc_id
      line_dict['contents'] = contents
      yield line_dict

# Qrels

In [None]:
from datasets import load_dataset

# Relevance judgments; the evaluation cells further down read the 'test' split.
qrels_dataset = load_dataset("BeIR/trec-covid-qrels")
qrels_dataset  # bare expression: show the loaded dataset as the cell output

# Queries

In [None]:
# The "queries" configuration of BeIR/trec-covid; the ranking cells use each record's `_id` and `text`.
queries_dataset = load_dataset("BeIR/trec-covid", "queries")
queries_dataset  # bare expression: show the loaded dataset as the cell output

# BM25 - Plain Doc

In [None]:
# Directory the plain-document collection is written to; -p makes re-runs a no-op.
!mkdir -p collections/non_expanded

In [None]:
import jsonlines

# Open the output once, in write mode. Re-opening it in append mode for every
# document is slow, and appending duplicates the whole collection (and therefore
# every indexed document) each time this cell is re-run.
with jsonlines.open("collections/non_expanded/non_expanded_docs.jsonl", mode='w') as writer:
  writer.write_all(get_documents(expanded=False))

## Build Index

In [None]:
%%time
# Index the collection under collections/non_expanded/ into
# indexes/lucene-index-trec-covid-non-expanded (%%time must stay the first line of the cell).
# NOTE(review): --threads 9 is hard-coded; confirm it suits the runtime's CPU count.
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input collections/non_expanded/ \
  --index indexes/lucene-index-trec-covid-non-expanded \
  --generator DefaultLuceneDocumentGenerator \
  --threads 9 \
  --storePositions --storeDocvectors --storeRaw

In [None]:
from pyserini.index import IndexReader

# Sanity-check the freshly built index.
reader = IndexReader('indexes/lucene-index-trec-covid-non-expanded')
index_stats = reader.stats()
total_terms = index_stats["total_terms"]
# `total_terms` counts term occurrences, not index entries, so label it as such
# and report the number of indexed documents separately.
print(f"Documents in the index: {index_stats['documents']:,}")
print(f"Total number of terms in the index: {total_terms:,}")

## Rank

In [None]:
from pyserini.search import get_topics, get_qrels, SimpleSearcher
from tqdm.auto import tqdm
from pyserini.search.lucene import LuceneSearcher
from collections import defaultdict

# BM25 retrieval over the plain-document index.
searcher = LuceneSearcher('indexes/lucene-index-trec-covid-non-expanded')
searcher.set_bm25(k1=0.9, b=0.4)

# query id -> list of hit records, kept in the order the searcher returned them
scored_output = defaultdict(list)

for query in tqdm(queries_dataset['queries'], desc='Running Queries'):
  query_id = query["_id"]
  hits = searcher.search(query['text'], 1000)  # request 1000 hits per query

  for position, hit in enumerate(hits):
    record = dict(doc_id=hit.docid, bm25_score=hit.score, bm25_position=position)
    scored_output[query_id].append(record)

## Eval

In [None]:
!pip install trectools -q
!pip install evaluate -q

In [None]:
# Column-oriented qrels: one list per column, one entry per relevance judgment.
# NOTE(review): 'query' keeps the dataset's own id type while the run uses the ids
# from the queries dataset; confirm both sides use the same type.
qrels_format = defaultdict(list)

for judgment in qrels_dataset['test']:
  row = {
    'query': judgment["query-id"],
    'q0': "q0",
    'docid': str(judgment["corpus-id"]),
    'rel': judgment["score"],
  }
  for column, value in row.items():
    qrels_format[column].append(value)

In [None]:
# Column-oriented run: one list per column, one entry per retrieved document.
run_format = defaultdict(list)

for query_id, results in scored_output.items():
  # Ranks are 1-based and follow the stored hit order.
  for rank, result in enumerate(results, start=1):
    row = {
      'query': query_id,
      'q0': "q0",
      'docid': str(result['doc_id']),
      'rank': rank,
      'score': result['bm25_score'],
      'system': "Reranker",
    }
    for column, value in row.items():
      run_format[column].append(value)

In [None]:
from evaluate import load
# Load the "trec_eval" metric through the `evaluate` library.
trec_eval = load("trec_eval")

In [None]:
# Score the plain-document run against the qrels; each argument is a
# single-element list wrapping the column dict (run_format / qrels_format).
results = trec_eval.compute(predictions=[run_format], references=[qrels_format])

In [None]:
# NDCG@10 of the BM25 run over the plain (non-expanded) documents.
results['NDCG@10']

# BM25 - Expanded Docs

In [None]:
# Directory the expanded-document collection is written to; -p makes re-runs a no-op.
!mkdir -p collections/expanded

In [None]:
import jsonlines

# Open the output once, in write mode. Re-opening it in append mode for every
# document is slow, and appending duplicates the whole collection (and therefore
# every indexed document) each time this cell is re-run.
with jsonlines.open("collections/expanded/expanded_docs.jsonl", mode='w') as writer:
  writer.write_all(get_documents(expanded=True))

## Build Index

In [None]:
%%time
# Index the collection under collections/expanded/ into
# indexes/lucene-index-trec-covid-expanded (%%time must stay the first line of the cell).
# NOTE(review): --threads 9 is hard-coded; confirm it suits the runtime's CPU count.
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input collections/expanded/ \
  --index indexes/lucene-index-trec-covid-expanded \
  --generator DefaultLuceneDocumentGenerator \
  --threads 9 \
  --storePositions --storeDocvectors --storeRaw

## Rank

In [None]:
from pyserini.search import get_topics, get_qrels, SimpleSearcher
from tqdm.auto import tqdm
from pyserini.search.lucene import LuceneSearcher
from collections import defaultdict

# BM25 retrieval over the expanded-document index.
searcher = LuceneSearcher('indexes/lucene-index-trec-covid-expanded')
searcher.set_bm25(k1=0.9, b=0.4)

# query id -> list of hit records, kept in the order the searcher returned them
scored_output = defaultdict(list)

for query in tqdm(queries_dataset['queries'], desc='Running Queries'):
  query_id = query["_id"]
  hits = searcher.search(query['text'], 1000)  # request 1000 hits per query

  for position, hit in enumerate(hits):
    record = dict(doc_id=hit.docid, bm25_score=hit.score, bm25_position=position)
    scored_output[query_id].append(record)

## Eval

In [None]:
# Column-oriented run: one list per column, one entry per retrieved document.
run_format = defaultdict(list)

for query_id, results in scored_output.items():
  # Ranks are 1-based and follow the stored hit order.
  for rank, result in enumerate(results, start=1):
    row = {
      'query': query_id,
      'q0': "q0",
      'docid': str(result['doc_id']),
      'rank': rank,
      'score': result['bm25_score'],
      'system': "Reranker",
    }
    for column, value in row.items():
      run_format[column].append(value)

In [None]:
from evaluate import load
# Load the "trec_eval" metric through the `evaluate` library.
trec_eval = load("trec_eval")

In [None]:
# Score the expanded-document run against the qrels; each argument is a
# single-element list wrapping the column dict (run_format / qrels_format).
results = trec_eval.compute(predictions=[run_format], references=[qrels_format])

In [None]:
# NDCG@10 of the BM25 run over the expanded documents.
results['NDCG@10']