In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from beir.datasets.data_loader import GenericDataLoader
from rank_bm25 import BM25Okapi
from ranx import Qrels, Run, evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input: folder handed to the BEIR loader. Output: folder where the run and metrics files are written.
# NOTE(review): both are absolute paths on one machine — adjust before running elsewhere.
DATA_PATH = Path(
    r"D:\projects\search-ranking-project\data\beir\scifact\scifact"
)
RESULTS_DIR = Path(
    r"D:\projects\search-ranking-project\results\bm25"
)

# Number of highest-scoring documents kept per query in the run.
TOP_K = 100
# Metric names passed to the evaluation call.
METRICS = [
    "ndcg@10",
    "precision@10",
]


In [3]:
def build_doc_text(doc: dict) -> str:
    """Concatenate a document's title and body into one searchable string.

    Args:
        doc: Mapping that may contain ``"title"`` and ``"text"`` entries.
            A missing key, ``None`` or an empty value is treated as "".

    Returns:
        The stripped title and stripped text joined by a single space,
        with the combined result stripped again so that an empty title
        or empty text leaves no leading/trailing space.
    """
    pieces = [(doc.get(field) or "").strip() for field in ("title", "text")]
    return " ".join(pieces).strip()


def tokenize(text: str):
    """Lower-case ``text`` and split it on runs of whitespace.

    This is the deliberately minimal baseline tokenizer: punctuation is
    left attached to the neighbouring token and no stemming or stop-word
    removal is applied.

    Returns:
        A list of lower-cased tokens (empty for empty or whitespace-only input).
    """
    lowered = text.lower()
    return lowered.split()


In [4]:
# Fail fast with the fully resolved path so a wrong location is easy to spot.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Dataset path not found: {DATA_PATH.resolve()}")

# Load the "test" split; the loader is given the folder as a plain string.
loader = GenericDataLoader(str(DATA_PATH))
corpus, queries, qrels = loader.load(split="test")

# qrels is a mapping whose values are sized collections; the total below is
# the sum of their lengths across all keys.
n_judgments = sum(len(judged) for judged in qrels.values())

print(f"corpus: {len(corpus)}")
print(f"queries: {len(queries)}")
print(f"qrels: {n_judgments}")


100%|██████████| 5183/5183 [00:00<00:00, 114972.91it/s]

corpus: 5183
queries: 300
qrels: 339





In [5]:
# doc_ids and tokenized_docs are index-aligned: both follow the iteration
# order of `corpus`, so position i in each list refers to the same document.
doc_ids = list(corpus.keys())
tokenized_docs = [
    tokenize(build_doc_text(doc))
    for doc in tqdm(corpus.values(), desc="Preparing corpus")
]

bm25 = BM25Okapi(tokenized_docs)
print("BM25 index built")


Preparing corpus: 100%|██████████| 5183/5183 [00:00<00:00, 65679.35it/s]


BM25 index built


In [6]:
# Score every corpus document against each query and keep the TOP_K best.
run_dict = {}  # {qid: {docid: score}}

for qid, qtext in tqdm(queries.items(), desc="Retrieving"):
    q_tokens = tokenize(qtext)
    # Named `doc_scores` rather than `scores`: the name `scores` is used later
    # in the notebook for the evaluation metrics, and re-running this cell must
    # not overwrite those metrics with a per-query score array.
    doc_scores = bm25.get_scores(q_tokens)

    # Sort descending by negating the scores. kind="stable" makes tied documents
    # keep corpus order, so the selected top-k is deterministic across runs and
    # numpy versions (the default sort gives no ordering guarantee for ties).
    top_idx = np.argsort(-doc_scores, kind="stable")[:TOP_K]
    # float(...) converts numpy scalars to plain Python floats so the run can be
    # serialised with json.
    run_dict[qid] = {doc_ids[i]: float(doc_scores[i]) for i in top_idx}

print(f"Run created for {len(run_dict)} queries")


Retrieving: 100%|██████████| 300/300 [00:03<00:00, 77.64it/s]

Run created for 300 queries





In [7]:
# Wrap the raw dictionaries in ranx containers before scoring.
qrels_obj, run_obj = Qrels(qrels), Run(run_dict)

# Compute every metric listed in METRICS; the result is displayed as the cell output.
scores = evaluate(qrels=qrels_obj, run=run_obj, metrics=METRICS)
scores


  scores[i] = _ndcg(qrels[i], run[i], k, rel_lvl, jarvelin)


{'ndcg@10': 0.5597016150134456, 'precision@10': 0.07633333333333332}

In [8]:
# Create the output folder (and any missing parents); a no-op if it already exists.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

run_path = RESULTS_DIR / "run.json"
metrics_json_path = RESULTS_DIR / "metrics.json"
metrics_csv_path = RESULTS_DIR / "metrics.csv"

# Full run: {qid: {docid: score}} in compact JSON.
run_path.write_text(json.dumps(run_dict), encoding="utf-8")

# Metrics are stored twice: pretty-printed JSON and a one-row CSV.
metrics_json_path.write_text(json.dumps(scores, indent=2), encoding="utf-8")
pd.DataFrame([scores]).to_csv(metrics_csv_path, index=False)

print("Results saved to:", RESULTS_DIR.resolve())


Results saved to: D:\projects\search-ranking-project\results\bm25
