In [2]:
import os
import time
import pandas as pd
import pyterrier as pt
import numpy as np
import pytrec_eval
import re
import string
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_rel, t
from statsmodels.stats.multitest import multipletests
from pyterrier.measures import *

In [3]:
BASE_IDX = "indexes/stopwords_removed"
QUERIES = "data/train_queries.csv"
QRELS   = "data/train_qrels.csv"

# Load queries and qrels
qs = pd.read_csv(QUERIES, sep="\t", names=["qid", "query"], header=0)
qrels = pd.read_csv(QRELS, sep="\t")

# Strip out all punctuation
qs['query'] = qs['query'].str.replace(rf"[{re.escape(string.punctuation)}]", " ", regex=True)

# # Make the qid an str
qs['qid'] = qs['qid'].astype(str)
qrels['qid'] = qrels['qid'].astype(str)

train_qs, val_qs = train_test_split(
    qs,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# now get the corresponding qrels
train_qids = set(train_qs['qid'])
val_qids   = set(val_qs  ['qid'])

train_qrels = qrels[qrels['qid'].isin(train_qids)]
val_qrels   = qrels[qrels['qid'].isin(val_qids)]

In [4]:
import os

index_base_path = "indexes"
all_index_dirs = [
    os.path.join(index_base_path, d)
    for d in os.listdir(index_base_path)
    if os.path.isdir(os.path.join(index_base_path, d))
]

In [9]:
# Parameter grids
bm25_params = {
    'bm25.k_1': [1.0, 1.25, 1.5, 1.75, 1.99, 2.5, 3.0, 3.5, 4.0, 5.0],
    'bm25.b':   [0.0, 0.25, 0.5, 0.75, 1.0]
}

for idx_dir in all_index_dirs:
    print(idx_dir)
    abs_idx_dir = os.path.abspath(idx_dir)
    index = pt.IndexFactory.of(abs_idx_dir)
    BM25 = pt.terrier.Retriever(index, wmodel="BM25", controls={"bm25.b" : 0.75, "bm25.k_1": 1.2})
    pt.GridSearch(
        BM25,
        {BM25: bm25_params},
        train_qs,
        train_qrels,
        "recip_rank",
        verbose=True,
    )

indexes/stemming_only


GridScan: 100%|██████████| 15/15 [14:48<00:00, 59.25s/it]


Best recip_rank is 0.102950
Best setting is ['TerrierRetr(BM25) bm25.k_1=4.0', 'TerrierRetr(BM25) bm25.b=0.75']
indexes/stopwords_removed_trimmed


GridScan: 100%|██████████| 15/15 [06:36<00:00, 26.44s/it]


Best recip_rank is 0.136786
Best setting is ['TerrierRetr(BM25) bm25.k_1=2.5', 'TerrierRetr(BM25) bm25.b=0.75']
indexes/stopwords_and_stemming_trimmed


GridScan: 100%|██████████| 15/15 [06:40<00:00, 26.72s/it]


Best recip_rank is 0.137748
Best setting is ['TerrierRetr(BM25) bm25.k_1=2.5', 'TerrierRetr(BM25) bm25.b=1.0']
indexes/stemming_only_trimmed


GridScan:  53%|█████▎    | 8/15 [08:15<07:13, 61.93s/it]


KeyboardInterrupt: 

In [5]:
bm25_best_params = {
    "indexes/stemming_only": {
        "bm25.k_1":  4.0,
        "bm25.b": 0.75
    },
    "indexes/stopwords_removed_trimmed": {
        "bm25.k_1":  2.5,
        "bm25.b": 0.75
    },
    "indexes/stopwords_and_stemming_trimmed": {
        "bm25.k_1":  2.5,
        "bm25.b": 1
    },
    "indexes/stopwords_and_stemming": {
        "bm25.k_1":  3.0,
        "bm25.b": 1
    },
    "indexes/stemming_only_trimmed": {
        "bm25.k_1":  5.0,
        "bm25.b": 0.75
    },
    "indexes/stopwords_removed": {
        "bm25.k_1":  3.50,
        "bm25.b": 0.75
    },
    "indexes/full_index": {
        "bm25.k_1": 2.5,
        "bm25.b": 0.75
    },
    "indexes/full_index_trimmed": {
        "bm25.k_1":  2.5,
        "bm25.b": 0.75
    },
}

In [None]:
lm_params = {
    'dirichletlm.mu':  [50, 100, 250, 500, 1000, 1500, 3000]
}

for idx_dir in all_index_dirs:

    print(idx_dir)
    abs_idx_dir = os.path.abspath(idx_dir)
    index = pt.IndexFactory.of(abs_idx_dir)
    LM = pt.terrier.Retriever(index, wmodel="DirichletLM", controls={"dirichletlm.mu": 1000})
    pt.GridSearch(
        LM,
        {LM: lm_params},
        train_qs[:1000],
        train_qrels,
        'recip_rank',
        verbose=True,
    )

indexes/stemming_only


GridScan: 100%|██████████| 4/4 [04:29<00:00, 67.38s/it]


Best recip_rank is 0.126779
Best setting is ['TerrierRetr(DirichletLM) dirichletlm.mu=250']
indexes/stopwords_removed_trimmed


GridScan:   0%|          | 0/4 [00:14<?, ?it/s]


KeyboardInterrupt: 

In [6]:
lm_best_params = {
    "indexes/stemming_only": {
        "mu": 250
    },
    "indexes/stopwords_removed_trimmed": {
        "mu": 100
    },
    "indexes/stopwords_and_stemming_trimmed": {
        "mu": 100
    },
    "indexes/stopwords_and_stemming": {
        "mu": 100
    },
    "indexes/stopwords_removed_trimmed": {
        "mu": 100
    },
    "indexes/stemming_only_trimmed": {
        "mu": 250
    },
    "indexes/stopwords_removed": {
        "mu": 100
    },
    "indexes/full_index": {
        "mu": 250
    },
    "indexes/full_index_trimmed": {
       "mu": 250
    },
}

In [7]:
retrievers = []
names      = []

for idx_path, p in lm_best_params.items():
    abs_dir = os.path.abspath(idx_path)
    idx = pt.IndexFactory.of(abs_dir)
    retrievers.append(
        pt.terrier.Retriever(
            idx,
            wmodel="DirichletLM",
            controls={ "dirichletlm.mu": p["mu"] }
        )
    )
    names.append(f"LM - {os.path.basename(idx_path)}")

for idx_path, p in bm25_best_params.items():
    abs_dir = os.path.abspath(idx_path)
    idx = pt.IndexFactory.of(abs_dir)
    retrievers.append(
        pt.terrier.Retriever(
            idx,
            wmodel="BM25",
            controls={ "bm25.k_1": p["bm25.k_1"], "bm25.b": p["bm25.b"] }
        )
    )
    names.append(f"BM25 - {os.path.basename(idx_path)}")


Java started (triggered by IndexFactory.of) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [8]:
from pyterrier.measures import *
from table_helpers import format_and_style, build_latex_table

eval_measures = [
    'ndcg_cut.5', 'ndcg_cut.10', 'ndcg_cut.20',
    RR@5, RR@10, RR@20,
    'P.5', 'P.10', 'P.20',
    'recall.5', 'recall.10', 'recall.20'
]

eval_measures_str = [
    'ndcg_cut.5', 'ndcg_cut.10', 'ndcg_cut.20',
    'RR@5', 'RR@10', 'RR@20',
    'P.5', 'P.10', 'P.20',
    'recall.5', 'recall.10', 'recall.20'
]

In [16]:
result_df = pt.Experiment(
    retrievers, train_qs, train_qrels,
    eval_metrics=eval_measures,
    filter_by_qrels=True,
    verbose=True,
    perquery=True,
    names=names
)

display(format_and_style(result_df, decimals=3, measures_order=eval_measures_str))

latex_code = build_latex_table(result_df, decimals=3, measures_order=eval_measures_str)
print(latex_code)

pt.Experiment: 100%|██████████| 16/16 [1:02:10<00:00, 233.14s/system]


Unnamed: 0_level_0,ndcg_cut.5,ndcg_cut.10,ndcg_cut.20,RR@5,RR@10,RR@20,P.5,P.10,P.20,recall.5,recall.10,recall.20
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BM25 - full_index,0.420 ± 0.013,0.467 ± 0.012,0.484 ± 0.011,0.360 ± 0.013,0.380 ± 0.012,0.385 ± 0.012,0.121 ± 0.003,0.075 ± 0.001,0.041 ± 0.001,0.605 ± 0.016,0.748 ± 0.014,0.814 ± 0.013
BM25 - full_index_trimmed,0.410 ± 0.013,0.457 ± 0.012,0.472 ± 0.011,0.351 ± 0.013,0.371 ± 0.012,0.375 ± 0.012,0.118 ± 0.003,0.073 ± 0.001,0.040 ± 0.001,0.591 ± 0.016,0.734 ± 0.015,0.797 ± 0.013
BM25 - stemming_only,0.387 ± 0.013,0.434 ± 0.012,0.452 ± 0.011,0.329 ± 0.012,0.349 ± 0.012,0.354 ± 0.012,0.113 ± 0.003,0.071 ± 0.001,0.039 ± 0.001,0.566 ± 0.016,0.710 ± 0.015,0.782 ± 0.014
BM25 - stemming_only_trimmed,0.379 ± 0.013,0.424 ± 0.012,0.440 ± 0.011,0.322 ± 0.012,0.340 ± 0.012,0.345 ± 0.012,0.112 ± 0.003,0.069 ± 0.002,0.038 ± 0.001,0.558 ± 0.016,0.694 ± 0.015,0.758 ± 0.014
BM25 - stopwords_and_stemming,0.535 ± 0.012,0.582 ± 0.011,0.596 ± 0.010,0.465 ± 0.013,0.485 ± 0.012,0.489 ± 0.012,0.150 ± 0.003,0.089 ± 0.001,0.047 ± 0.000,0.748 ± 0.014,0.891 ± 0.010,0.947 ± 0.007
BM25 - stopwords_and_stemming_trimmed,0.535 ± 0.012,0.581 ± 0.011,0.595 ± 0.010,0.466 ± 0.013,0.485 ± 0.012,0.489 ± 0.012,0.149 ± 0.003,0.089 ± 0.001,0.047 ± 0.000,0.745 ± 0.014,0.886 ± 0.010,0.940 ± 0.008
BM25 - stopwords_removed,0.531 ± 0.012,0.581 ± 0.011,0.595 ± 0.010,0.463 ± 0.013,0.484 ± 0.012,0.488 ± 0.012,0.148 ± 0.003,0.089 ± 0.001,0.047 ± 0.000,0.742 ± 0.014,0.895 ± 0.010,0.948 ± 0.007
BM25 - stopwords_removed_trimmed,0.533 ± 0.012,0.581 ± 0.011,0.594 ± 0.010,0.465 ± 0.013,0.485 ± 0.012,0.488 ± 0.012,0.149 ± 0.003,0.089 ± 0.001,0.047 ± 0.000,0.743 ± 0.014,0.889 ± 0.010,0.939 ± 0.008
LM - full_index,0.485 ± 0.013,0.539 ± 0.011,0.559 ± 0.010,0.421 ± 0.013,0.444 ± 0.012,0.449 ± 0.012,0.136 ± 0.003,0.085 ± 0.001,0.046 ± 0.000,0.682 ± 0.015,0.846 ± 0.012,0.922 ± 0.009
LM - full_index_trimmed,0.485 ± 0.013,0.538 ± 0.011,0.557 ± 0.010,0.421 ± 0.013,0.443 ± 0.012,0.448 ± 0.012,0.136 ± 0.003,0.084 ± 0.001,0.046 ± 0.000,0.680 ± 0.015,0.844 ± 0.012,0.917 ± 0.009


\begin{table*}[h]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{l c c c c c c c c c c c c}
\toprule
Name & ndcg\_cut.5 $\uparrow$ & ndcg\_cut.10 $\uparrow$ & ndcg\_cut.20 $\uparrow$ & RR@5 $\uparrow$ & RR@10 $\uparrow$ & RR@20 $\uparrow$ & P.5 $\uparrow$ & P.10 $\uparrow$ & P.20 $\uparrow$ & recall.5 $\uparrow$ & recall.10 $\uparrow$ & recall.20 $\uparrow$ \\
\midrule
BM25 - full\_index & $0.420\pm0.013$ & $0.467\pm0.012$ & $0.484\pm0.011$ & $0.360\pm0.013$ & $0.380\pm0.012$ & $0.385\pm0.012$ & $0.121\pm0.003$ & $0.075\pm0.001$ & $0.041\pm0.001$ & $0.605\pm0.016$ & $0.748\pm0.014$ & $0.814\pm0.013$ \\
BM25 - full\_index\_trimmed & $0.410\pm0.013$ & $0.457\pm0.012$ & $0.472\pm0.011$ & $0.351\pm0.013$ & $0.371\pm0.012$ & $0.375\pm0.012$ & $0.118\pm0.003$ & $0.073\pm0.001$ & $0.040\pm0.001$ & $0.591\pm0.016$ & $0.734\pm0.015$ & $0.797\pm0.013$ \\
BM25 - stemming\_only & $0.387\pm0.013$ & $0.434\pm0.012$ & $0.452\pm0.011$ & $0.329\pm0.012$ & $0.349\pm0.012$ & $0.354\pm0.012$ & $0.

In [9]:
result_df = pt.Experiment(
    retrievers, val_qs, val_qrels,
    eval_metrics=eval_measures,
    filter_by_qrels=True,
    verbose=True,
    perquery=True,
    names=names
)

display(format_and_style(result_df, decimals=3, measures_order=eval_measures_str))

latex_code = build_latex_table(result_df, decimals=3, measures_order=eval_measures_str)
print(latex_code)

pt.Experiment: 100%|██████████| 16/16 [12:35<00:00, 47.24s/system]


Unnamed: 0_level_0,ndcg_cut.5,ndcg_cut.10,ndcg_cut.20,RR@5,RR@10,RR@20,P.5,P.10,P.20,recall.5,recall.10,recall.20
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BM25 - full_index,0.427 ± 0.025,0.475 ± 0.023,0.494 ± 0.022,0.363 ± 0.025,0.383 ± 0.024,0.388 ± 0.024,0.124 ± 0.006,0.077 ± 0.003,0.042 ± 0.001,0.619 ± 0.032,0.767 ± 0.028,0.840 ± 0.024
BM25 - full_index_trimmed,0.415 ± 0.026,0.461 ± 0.023,0.479 ± 0.022,0.354 ± 0.025,0.374 ± 0.024,0.378 ± 0.024,0.120 ± 0.006,0.074 ± 0.003,0.040 ± 0.001,0.598 ± 0.032,0.741 ± 0.029,0.809 ± 0.026
BM25 - stemming_only,0.394 ± 0.026,0.442 ± 0.023,0.460 ± 0.022,0.336 ± 0.025,0.356 ± 0.024,0.361 ± 0.024,0.114 ± 0.007,0.072 ± 0.003,0.040 ± 0.001,0.572 ± 0.033,0.719 ± 0.030,0.790 ± 0.027
BM25 - stemming_only_trimmed,0.382 ± 0.026,0.432 ± 0.023,0.447 ± 0.023,0.326 ± 0.025,0.347 ± 0.024,0.352 ± 0.024,0.110 ± 0.007,0.071 ± 0.003,0.038 ± 0.001,0.550 ± 0.033,0.706 ± 0.030,0.764 ± 0.028
BM25 - stopwords_and_stemming,0.533 ± 0.024,0.578 ± 0.021,0.591 ± 0.020,0.460 ± 0.025,0.479 ± 0.024,0.482 ± 0.024,0.151 ± 0.006,0.090 ± 0.002,0.047 ± 0.001,0.756 ± 0.028,0.899 ± 0.020,0.946 ± 0.015
BM25 - stopwords_and_stemming_trimmed,0.530 ± 0.024,0.574 ± 0.021,0.586 ± 0.020,0.457 ± 0.025,0.475 ± 0.024,0.479 ± 0.024,0.151 ± 0.006,0.089 ± 0.002,0.047 ± 0.001,0.754 ± 0.028,0.891 ± 0.021,0.936 ± 0.016
BM25 - stopwords_removed,0.536 ± 0.025,0.586 ± 0.021,0.595 ± 0.020,0.465 ± 0.025,0.486 ± 0.024,0.489 ± 0.024,0.151 ± 0.006,0.091 ± 0.002,0.047 ± 0.001,0.753 ± 0.028,0.906 ± 0.019,0.940 ± 0.016
BM25 - stopwords_removed_trimmed,0.534 ± 0.025,0.583 ± 0.021,0.592 ± 0.020,0.465 ± 0.026,0.485 ± 0.024,0.488 ± 0.024,0.149 ± 0.006,0.089 ± 0.002,0.046 ± 0.001,0.744 ± 0.029,0.892 ± 0.020,0.929 ± 0.017
LM - full_index,0.498 ± 0.025,0.548 ± 0.022,0.564 ± 0.021,0.430 ± 0.026,0.450 ± 0.024,0.455 ± 0.024,0.141 ± 0.006,0.086 ± 0.002,0.046 ± 0.001,0.705 ± 0.030,0.859 ± 0.023,0.921 ± 0.018
LM - full_index_trimmed,0.495 ± 0.025,0.545 ± 0.022,0.560 ± 0.021,0.428 ± 0.026,0.448 ± 0.024,0.453 ± 0.024,0.140 ± 0.006,0.085 ± 0.002,0.046 ± 0.001,0.701 ± 0.030,0.852 ± 0.023,0.912 ± 0.019


\begin{table*}[h]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{l c c c c c c c c c c c c}
\toprule
Name & ndcg\_cut.5 $\uparrow$ & ndcg\_cut.10 $\uparrow$ & ndcg\_cut.20 $\uparrow$ & RR@5 $\uparrow$ & RR@10 $\uparrow$ & RR@20 $\uparrow$ & P.5 $\uparrow$ & P.10 $\uparrow$ & P.20 $\uparrow$ & recall.5 $\uparrow$ & recall.10 $\uparrow$ & recall.20 $\uparrow$ \\
\midrule
BM25 - full\_index & $0.427\pm0.025$ & $0.475\pm0.023$ & $0.494\pm0.022$ & $0.363\pm0.025$ & $0.383\pm0.024$ & $0.388\pm0.024$ & $0.124\pm0.006$ & $0.077\pm0.003$ & $0.042\pm0.001$ & $0.619\pm0.032$ & $0.767\pm0.028$ & $0.840\pm0.024$ \\
BM25 - full\_index\_trimmed & $0.415\pm0.026$ & $0.461\pm0.023$ & $0.479\pm0.022$ & $0.354\pm0.025$ & $0.374\pm0.024$ & $0.378\pm0.024$ & $0.120\pm0.006$ & $0.074\pm0.003$ & $0.040\pm0.001$ & $0.598\pm0.032$ & $0.741\pm0.029$ & $0.809\pm0.026$ \\
BM25 - stemming\_only & $0.394\pm0.026$ & $0.442\pm0.023$ & $0.460\pm0.022$ & $0.336\pm0.025$ & $0.356\pm0.024$ & $0.361\pm0.024$ & $0.

In [10]:
result_df = pt.Experiment(
    retrievers, train_qs, train_qrels,
    eval_metrics=eval_measures + ["mrt"],
    filter_by_qrels=True,
    verbose=True,
    names=names
)

result_df[["name", "mrt"]]

pt.Experiment:   0%|          | 0/16 [00:00<?, ?system/s]

pt.Experiment: 100%|██████████| 16/16 [1:04:20<00:00, 241.28s/system]


Unnamed: 0,name,mrt
0,LM - stemming_only,105.155745
1,LM - stopwords_removed_trimmed,41.739121
2,LM - stopwords_and_stemming_trimmed,38.928932
3,LM - stopwords_and_stemming,29.453719
4,LM - stemming_only_trimmed,69.669955
5,LM - stopwords_removed,29.990319
6,LM - full_index,70.995222
7,LM - full_index_trimmed,70.962768
8,BM25 - stemming_only,81.018268
9,BM25 - stopwords_removed_trimmed,38.031875


In [11]:
result_df = pt.Experiment(
    retrievers, val_qs, val_qrels,
    eval_metrics=eval_measures + ["mrt"],
    filter_by_qrels=True,
    verbose=True,
    names=names
)

result_df[["name", "mrt"]]

pt.Experiment: 100%|██████████| 16/16 [21:06<00:00, 79.17s/system]


Unnamed: 0,name,mrt
0,LM - stemming_only,134.790704
1,LM - stopwords_removed_trimmed,34.622623
2,LM - stopwords_and_stemming_trimmed,46.76521
3,LM - stopwords_and_stemming,54.175446
4,LM - stemming_only_trimmed,107.078885
5,LM - stopwords_removed,52.305111
6,LM - full_index,115.064086
7,LM - full_index_trimmed,110.445214
8,BM25 - stemming_only,127.535834
9,BM25 - stopwords_removed_trimmed,33.287908
