In [40]:
import os
import time
import pandas as pd
import pyterrier as pt
import numpy as np
from sklearn.model_selection import train_test_split
import pytrec_eval
import string
from tqdm.auto import tqdm

# Start PyTerrier's Java runtime once per kernel.
# `pt.started()` / `pt.init()` are the deprecated entry points in the
# PyTerrier releases that provide `pt.terrier.Retriever` (used further down);
# `pt.java.started()` / `pt.java.init()` are their replacements.
if not pt.java.started():
    pt.java.init()

  if not pt.started():


In [38]:
# --- Input locations ---------------------------------------------------
BASE_IDX = "indexes/stopwords_removed"
QUERIES = "data/train_queries.csv"
QRELS   = "data/train_qrels.csv"

# Read the tab-separated inputs. For the query file the first row is
# consumed as a header (header=0) and the columns are renamed qid / query.
qs = pd.read_csv(QUERIES, sep="\t", header=0, names=["qid", "query"])
qrels = pd.read_csv(QRELS, sep="\t")

# Delete every character listed in string.punctuation from the query text
qs['query'] = qs['query'].str.translate(
    str.maketrans('', '', string.punctuation)
)

# Hold out 20% of the queries for validation; the fixed seed makes the
# split reproducible across kernel restarts.
train_qs, val_qs = train_test_split(qs, random_state=42, test_size=0.2)

In [30]:
# Build the nested mapping handed to pytrec_eval:
#   {qid (str): {docno (str): relevance (int)}}
# The columns are zipped directly rather than walked with DataFrame.iterrows():
# iterrows builds a Series per row (slow on large qrels) and does not preserve
# column dtypes across a row, which can change how a numeric qid is stringified.
# docno is cast to str as well so that both key levels are strings.
qrels_dict = {}
for qid, docno, relevance in zip(qrels["qid"], qrels["docno"], qrels["relevance"]):
    qrels_dict.setdefault(str(qid), {})[str(docno)] = int(relevance)

In [None]:
# Open the on-disk Terrier index; the relative BASE_IDX path is resolved to
# an absolute one before being handed to IndexFactory.
abs_idx_dir = os.path.abspath(BASE_IDX)
index = pt.IndexFactory.of(abs_idx_dir)

# Default-parameter retrievers.
# pt.terrier.Retriever is used instead of the deprecated pt.BatchRetrieve
# alias, matching the class the grid-search cells instantiate.
bm25 = pt.terrier.Retriever(index, wmodel="BM25")
lm_dir = pt.terrier.Retriever(index, wmodel="DirichletLM")

# Candidate values explored by the grid-search cells
bm25_params = {
    'bm25.k_1': [1.0, 1.25, 1.5, 1.75, 2.0],
    'bm25.b':   [0.0, 0.25, 0.5, 0.75, 1.0]
}
lm_params = {
    'DirichletLM.mu': [500, 1000, 1500, 2000]
}

  bm25 = pt.BatchRetrieve(index, wmodel="BM25")
  lm_dir = pt.BatchRetrieve(index, wmodel="DirichletLM")


In [31]:
def evaluate(run_df, qrels_dict):
    """
    Score a retrieval run against relevance judgements with pytrec_eval.

    Parameters
    ----------
    run_df : pandas.DataFrame
        Run with at least the columns ``qid``, ``docno`` and ``score``.
    qrels_dict : dict
        Judgements shaped ``{qid: {docno: relevance}}``; passed unchanged to
        ``pytrec_eval.RelevanceEvaluator``.

    Returns
    -------
    dict
        Metric name (as reported by pytrec_eval) -> mean of that metric over
        the queries present in pytrec_eval's result.

    Raises
    ------
    ValueError
        If ``run_df`` has no rows, or if pytrec_eval returns no per-query
        results. Previously this surfaced as a bare ``StopIteration``.
    """
    measures = {'ndcg_cut.5', 'ndcg_cut.10', 'ndcg_cut.20',
                'recip_rank', 'P.5', 'P.10', 'P.20',
                'recall.5', 'recall.10', 'recall.20'}

    # Format for pytrec_eval: {qid: {docno: score}}.
    # Keys are forced to str so they line up with string-keyed qrels.
    run = {}
    for qid, group in run_df.groupby('qid'):
        run[str(qid)] = {
            str(doc): float(score)
            for doc, score in zip(group['docno'], group['score'])
        }
    if not run:
        raise ValueError("evaluate(): run_df contains no rows to evaluate")

    evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, measures)
    results = evaluator.evaluate(run)
    if not results:
        raise ValueError(
            "evaluate(): pytrec_eval returned no per-query results; "
            "check that run qids match the qrels qids"
        )

    # Aggregate: average each metric over the queries that were scored.
    # NOTE(review): queries absent from `results` (e.g. no retrieved docs) do
    # not contribute a 0 to the mean - confirm this is the intended averaging.
    metric_names = next(iter(results.values())).keys()
    metrics = {
        m: np.mean([per_query[m] for per_query in results.values()])
        for m in metric_names
    }
    return metrics

In [46]:
# Exhaustive grid search over BM25's k1 and b on the training queries.
# One result row is recorded per (k1, b) pair.
bm25_results = []
for k1 in tqdm(bm25_params['bm25.k_1'], desc='k1'):
    for b in tqdm(bm25_params['bm25.b'], desc='b', leave=False):
        # Fresh retriever configured with this grid point
        model = pt.terrier.Retriever(
            index,
            wmodel="BM25",
            controls={
                "bm25.k_1": k1,
                "bm25.b":   b
            }
        )
        # perf_counter() is monotonic and high-resolution, so the interval
        # cannot be distorted by system clock adjustments (unlike time.time()).
        start = time.perf_counter()
        run = model.transform(train_qs[['qid','query']])
        metrics = evaluate(run, qrels_dict)
        # Per-query average; the interval covers retrieval AND metric computation
        elapsed = (time.perf_counter() - start) / len(train_qs)
        bm25_results.append({
            'model': 'BM25',
            'k1': k1, 'b': b,
            **metrics,
            'avg_eval_time_s': elapsed
        })

# Rank the grid points by nDCG@10 and keep the top row as the best setting
bm25_df = pd.DataFrame(bm25_results)
best_bm25 = bm25_df.sort_values('ndcg_cut_10', ascending=False).iloc[0]
best_bm25

k1: 100%|██████████| 4/4 [52:50<00:00, 792.52s/it]


model                  BM25
k1                      2.0
b                      0.75
recip_rank          0.21382
P_5                0.065715
P_10               0.039266
P_20               0.020847
recall_5           0.328577
recall_10          0.392665
recall_20          0.416948
ndcg_cut_5         0.234097
ndcg_cut_10        0.255082
ndcg_cut_20        0.261284
avg_eval_time_s     0.01726
Name: 18, dtype: object

In [47]:
# Grid search over the Dirichlet smoothing parameter mu on the training queries.
lm_results = []
for mu in tqdm(lm_params['DirichletLM.mu'], desc='mu'):
    model = pt.terrier.Retriever(
        index,
        wmodel="DirichletLM",
        controls={
            # Same control name the validation cell uses for the tuned model.
            # The previous key "mu" did not match it.
            # NOTE(review): if "mu" is not a control Terrier recognises, the
            # earlier runs all used the default mu - re-run and compare rows.
            "dirichletlm.mu": mu
        }
    )
    # perf_counter() is monotonic, so the interval is immune to clock changes
    start = time.perf_counter()
    run = model.transform(train_qs[['qid','query']])
    metrics = evaluate(run, qrels_dict)
    # Per-query average; the interval covers retrieval AND metric computation
    elapsed = (time.perf_counter() - start) / len(train_qs)
    lm_results.append({
        'model': 'DirichletLM',
        'mu': mu,
        **metrics,
        'avg_eval_time_s': elapsed
    })

# Rank the grid points by nDCG@10 and keep the top row as the best setting
lm_df = pd.DataFrame(lm_results)
best_lm = lm_df.sort_values('ndcg_cut_10', ascending=False).iloc[0]
best_lm

mu: 100%|██████████| 4/4 [10:05<00:00, 151.26s/it]


model              DirichletLM
mu                         500
recip_rank            0.177736
P_5                   0.055802
P_10                  0.037276
P_20                   0.02056
recall_5              0.279009
recall_10             0.372763
recall_20              0.41119
ndcg_cut_5            0.190705
ndcg_cut_10            0.22117
ndcg_cut_20            0.23111
avg_eval_time_s       0.019782
Name: 0, dtype: object

In [None]:
# TODO: Repeat the BM25 and DirichletLM grid searches for every index that was built with stopwords removed.

In [52]:
# Re-instantiate each model with its best training-set parameters and score
# it on the held-out validation queries.
# pt.terrier.Retriever is used instead of the deprecated pt.BatchRetrieve
# alias, matching the class the grid-search cells instantiate.
best_models = [
    (
        pt.terrier.Retriever(
            index,
            wmodel="BM25",
            controls={
                "bm25.k_1": float(best_bm25["k1"]),
                "bm25.b":  float(best_bm25["b"])
            }
        ),
        "BM25"
    ),
    (
        pt.terrier.Retriever(
            index,
            wmodel="DirichletLM",
            controls={
                "dirichletlm.mu": int(best_lm["mu"])
            }
        ),
        "DirichletLM"
    )
]

val_results = []
for model, name in best_models:
    # perf_counter() is monotonic, so the interval is immune to clock changes
    start = time.perf_counter()
    # Same column projection as the training runs
    run = model.transform(val_qs[['qid','query']])
    metrics = evaluate(run, qrels_dict)
    # Per-query average; the interval covers retrieval AND metric computation
    elapsed = (time.perf_counter() - start) / len(val_qs)
    val_results.append({
        'model': name,
        **metrics,
        'avg_eval_time_s': elapsed
    })

val_df = pd.DataFrame(val_results)

# Combine and display
print("=== BM25 Grid Search (train) ===")
print(bm25_df.sort_values('ndcg_cut_10', ascending=False).reset_index(drop=True))
print("\n=== Dirichlet LM Grid Search (train) ===")
print(lm_df.sort_values('ndcg_cut_10', ascending=False).reset_index(drop=True))
print("\nBest BM25 params:", best_bm25.to_dict())
print("Best LM params:", best_lm.to_dict())
print("\n=== Validation Results ===")
print(val_df)

  pt.BatchRetrieve(
  pt.BatchRetrieve(


=== BM25 Grid Search (train) ===
   model   k1     b  recip_rank       P_5      P_10      P_20  recall_5  \
0   BM25  2.0  0.75    0.213820  0.065715  0.039266  0.020847  0.328577   
1   BM25  1.5  0.75    0.213096  0.065115  0.039191  0.020747  0.325573   
2   BM25  2.0  1.00    0.211604  0.064614  0.038803  0.020635  0.323069   
3   BM25  1.0  0.75    0.210496  0.063988  0.038678  0.020585  0.319940   
4   BM25  1.5  1.00    0.210790  0.064113  0.038578  0.020603  0.320566   
5   BM25  1.0  1.00    0.207635  0.063537  0.038190  0.020459  0.317687   
6   BM25  2.0  0.50    0.204821  0.063838  0.038929  0.020772  0.319189   
7   BM25  1.5  0.50    0.204303  0.063638  0.038891  0.020697  0.318188   
8   BM25  0.5  0.75    0.204767  0.062160  0.037564  0.020146  0.310802   
9   BM25  1.0  0.50    0.201846  0.062486  0.038403  0.020522  0.312430   
10  BM25  0.5  1.00    0.203525  0.061910  0.037351  0.020084  0.309551   
11  BM25  0.5  0.50    0.196933  0.060859  0.037264  0.020071  0.30