In [40]:
import os
import time
import pandas as pd
import pyterrier as pt
import numpy as np
from sklearn.model_selection import train_test_split
import pytrec_eval
import string
from tqdm.auto import tqdm

# Initialize PyTerrier
if not pt.started():
    pt.init()

  if not pt.started():


In [38]:
BASE_IDX = "indexes/stopwords_removed"
QUERIES = "data/train_queries.csv"
QRELS   = "data/train_qrels.csv"

# Load queries and qrels
qs = pd.read_csv(QUERIES, sep="\t", names=["qid", "query"], header=0)
qrels = pd.read_csv(QRELS, sep="\t")

# Strip out all punctuation
qs['query'] = qs['query'] \
    .str.translate(str.maketrans('', '', string.punctuation))

# Split into train/validation
train_qs, val_qs = train_test_split(qs, test_size=0.2, random_state=42)

In [30]:
# Prepare qrels dict for pytrec_eval
qrels_dict = {}
for _, row in qrels.iterrows():
    qrels_dict.setdefault(str(row.qid), {})[row["docno"]] = int(row["relevance"])

In [27]:
# Build index reference
abs_idx_dir = os.path.abspath(BASE_IDX)   # BASE_IDX = "indexes/stopwords_removed"
index = pt.IndexFactory.of(abs_idx_dir)

# Define retrieval models
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
lm_dir = pt.BatchRetrieve(index, wmodel="DirichletLM")

# Parameter grids
bm25_params = {
    'bm25.k_1': [0.5, 1.0, 1.5, 2.0],
    'bm25.b':   [0.0, 0.25, 0.5, 0.75, 1.0]
}
lm_params = {
    'DirichletLM.mu': [500, 1000, 1500, 2000]
}

  bm25 = pt.BatchRetrieve(index, wmodel="BM25")
  lm_dir = pt.BatchRetrieve(index, wmodel="DirichletLM")


In [31]:
def evaluate(run_df, qrels_dict):
    """
    Evaluate a run (DataFrame with qid, docno, score) against qrels_dict
    using pytrec_eval, returning a dict of metrics per query.
    """
    # Format for pytrec_eval
    run = {}
    for qid, group in run_df.groupby('qid'):
        run[qid] = {doc: float(score) for doc, score in zip(group['docno'], group['score'])}
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrels_dict, {'ndcg_cut.5','ndcg_cut.10','ndcg_cut.20',
                     'recip_rank','P.5','P.10','P.20',
                     'recall.5','recall.10','recall.20'}
    )
    results = evaluator.evaluate(run)
    # Aggregate
    metrics = {m: np.mean([results[q][m] for q in results]) for m in next(iter(results.values())).keys()}
    return metrics

In [44]:
bm25_results = []
for k1 in tqdm(bm25_params['bm25.k_1'], desc='k1'):
    for b in tqdm(bm25_params['bm25.b'], desc='b', leave=False):
        # create a fresh BM25 retriever with your chosen k1 and b
        model = pt.terrier.Retriever(
            index,
            wmodel="BM25",
            controls={
                "bm25.k_1": k1,
                "bm25.b":   b
            }
        )
        start = time.time()
        run = model.transform(train_qs[['qid','query']])
        metrics = evaluate(run, qrels_dict)
        elapsed = (time.time() - start) / len(train_qs)
        bm25_results.append({
            'model': 'BM25',
            'k1': k1, 'b': b,
            **metrics,
            'avg_eval_time_s': elapsed
        })

bm25_df = pd.DataFrame(bm25_results)
best_bm25 = bm25_df.sort_values('ndcg_cut_10', ascending=False).iloc[0]
best_bm25

k1:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
lm_results = []
for mu in tqdm(lm_params['DirichletLM.mu'], desc='mu'):
    model = pt.terrier.Retriever(
        index,
        wmodel="DirichletLM",
        controls={
            "mu": mu
        }
    )
    start = time.time()
    run = model.transform(train_qs[['qid','query']])
    metrics = evaluate(run, qrels_dict)
    elapsed = (time.time() - start) / len(train_qs)
    lm_results.append({
        'model': 'DirichletLM',
        'mu': mu,
        **metrics,
        'avg_eval_time_s': elapsed
    })

lm_df = pd.DataFrame(lm_results)
best_lm = lm_df.sort_values('ndcg_cut_10', ascending=False).iloc[0]
best_lm

In [None]:
best_models = [
    (pt.BatchRetrieve(index, wmodel="BM25", bm25_k_1=best_bm25.k1, bm25_b=best_bm25.b), 'BM25'),
    (pt.BatchRetrieve(index, wmodel="DirichletLM", DirichletLM_mu=int(best_lm.mu)), 'DirichletLM')
]

val_results = []
for model, name in best_models:
    start = time.time()
    run = model.transform(val_qs)
    metrics = evaluate(run, qrels_dict)
    elapsed = (time.time() - start) / len(val_qs)
    val_results.append({
        'model': name,
        **metrics,
        'avg_eval_time_s': elapsed
    })

val_df = pd.DataFrame(val_results)

# Combine and display
print("=== BM25 Grid Search (train) ===")
print(bm25_df.sort_values('ndcg_cut.10', ascending=False).reset_index(drop=True))
print("\n=== Dirichlet LM Grid Search (train) ===")
print(lm_df.sort_values('ndcg_cut.10', ascending=False).reset_index(drop=True))
print("\nBest BM25 params:", best_bm25.to_dict())
print("Best LM params:", best_lm.to_dict())
print("\n=== Validation Results ===")
print(val_df)