In [1]:
import pandas as pd
import pyterrier as pt
import numpy as np
import string
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from pyterrier.measures import *
import re

In [2]:
# --- Paths --------------------------------------------------------------
BASE_IDX = "indexes/stopwords_removed"
QUERIES = "data/train_queries.csv"
QRELS   = "data/train_qrels.csv"

# --- Queries ------------------------------------------------------------
# Tab-separated file; the header row is replaced by the column names below.
# Every ASCII punctuation character in the query text is turned into a
# space, and qids are cast to str so they compare equal to the qrels' qids.
qs = pd.read_csv(QUERIES, sep="\t", names=["qid", "query"], header=0).assign(
    query=lambda frame: frame["query"].str.replace(
        rf"[{re.escape(string.punctuation)}]", " ", regex=True
    ),
    qid=lambda frame: frame["qid"].astype(str),
)

# --- Relevance judgements -----------------------------------------------
qrels = pd.read_csv(QRELS, sep="\t").assign(
    qid=lambda frame: frame["qid"].astype(str)
)

# --- 80/20 train/validation split over queries (fixed seed) -------------
train_qs, val_qs = train_test_split(
    qs, test_size=0.2, shuffle=True, random_state=42
)

# Each split keeps only the judgements that belong to its own queries.
train_qids = set(train_qs["qid"])
val_qids = set(val_qs["qid"])

train_qrels = qrels.loc[qrels["qid"].isin(train_qids)]
val_qrels = qrels.loc[qrels["qid"].isin(val_qids)]

In [3]:
# Index locations. BASE_IDX is defined alongside the data paths in the
# loading cell; reusing it here avoids a second hard-coded copy of the path.
STEMMED_IDX = "indexes/stopwords_and_stemming"

# BM25 over the stopword-removed index, with explicit k_1 / b controls.
stopwords_idx = pt.IndexFactory.of(BASE_IDX)
bm25 = pt.terrier.Retriever(stopwords_idx, wmodel="BM25", controls={"bm25.k_1": 3.5, "bm25.b": 0.75})

# Dirichlet-smoothed language model over the stopword-removed + stemmed index.
stopwords_stemmed_idx = pt.IndexFactory.of(STEMMED_IDX)
lmd = pt.terrier.Retriever(stopwords_stemmed_idx, wmodel="DirichletLM", controls={"dirichletlm.mu": 100})

Java started (triggered by IndexFactory.of) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [None]:
# RM3 feedback hyper-parameters explored for every retriever (4 x 5 = 20 points).
param_grid = {
    "fb_terms": [10, 20, 40, 80],
    "fb_docs" : [1, 3, 5, 10, 20],
}

# (label, first/second-pass retriever, index that RM3 reads feedback terms from)
rm3_candidates = [
    ("BM25", bm25, stopwords_idx),
    ("LMD", lmd, stopwords_stemmed_idx),
]

for name, retriever, idx in rm3_candidates:
    # retrieve -> rewrite the query with RM3 -> retrieve again
    rm3 = pt.rewrite.RM3(idx)
    rm3_pipe = retriever >> rm3 >> retriever

    result = pt.GridSearch(
        rm3_pipe,
        {rm3: param_grid},
        train_qs,
        train_qrels,
        metric='recip_rank',
        return_type="best_setting",
        verbose=True,
    )

    # result[1] is iterated as (transformer, parameter, value) triples;
    # only the parameter -> value mapping is kept.
    best_setting = dict((setting, val) for _, setting, val in result[1])
    print(f"Best setting for {name}: {best_setting}")

GridScan: 100%|██████████| 20/20 [54:21<00:00, 163.09s/it]


Best recip_rank is 0.134890
Best setting is ['QueryExpansion(/Users/christianjensen/Documents/search-engines/indexes/stopwords_removed/data.properties,20,80,<org.terrier.querying.RM3 at 0x32f1c6cb0 jclass=org/terrier/querying/RM3 jself=<LocalRef obj=0x17ee64912 at 0x38b0d8bb0>>) fb_terms=80', 'QueryExpansion(/Users/christianjensen/Documents/search-engines/indexes/stopwords_removed/data.properties,20,80,<org.terrier.querying.RM3 at 0x32f1c6cb0 jclass=org/terrier/querying/RM3 jself=<LocalRef obj=0x17ee64912 at 0x38b0d8bb0>>) fb_docs=3']
Best setting for BM25: {'fb_terms': 80, 'fb_docs': 3}


GridScan:   0%|          | 0/20 [01:20<?, ?it/s]


KeyboardInterrupt: 

Best setting for BM25: {'fb_terms': 80, 'fb_docs': 3}

Best setting for LMD: {'fb_terms': 10, 'fb_docs': 5}

In [6]:
import gensim.downloader as api
from gensim.models import Word2Vec
from w2v_expander import W2VExpander

# Alternative embedding sources, currently disabled:
# w2v = api.load("word2vec-google-news-300")
# custom_w2v = Word2Vec.load("word2vec_stopwords_removed.model").wv

# Embeddings handed to W2VExpander in the cells below.
GLOVE_MODEL_NAME = 'glove-wiki-gigaword-50'
glove = api.load(GLOVE_MODEL_NAME)

In [7]:
from pyterrier.measures import *
from table_helpers import format_and_style, build_latex_table

# Measures passed to pt.Experiment: nDCG, reciprocal rank, precision and
# recall, each at rank cutoffs 5, 10 and 20 (in that order).
eval_measures = [
    *(f"ndcg_cut.{k}" for k in (5, 10, 20)),
    *(RR@k for k in (5, 10, 20)),
    *(f"P.{k}" for k in (5, 10, 20)),
    *(f"recall.{k}" for k in (5, 10, 20)),
]

# The same measures as plain strings, in the same order; handed to the
# table helpers as `measures_order`.
eval_measures_str = [
    *(f"ndcg_cut.{k}" for k in (5, 10, 20)),
    *(f"RR@{k}" for k in (5, 10, 20)),
    *(f"P.{k}" for k in (5, 10, 20)),
    *(f"recall.{k}" for k in (5, 10, 20)),
]

In [None]:
# for method in ["centroid", "combSUM", "combMNZ", "combMAX"]:
#     w2v_expander = W2VExpander(w2v, method=method, nu=10, n_similar=50)
#     w2v_expander_pipe = w2v_expander >> retriever

#     custom_w2v_expander = W2VExpander(custom_w2v, method=method, nu=10, n_similar=50)
#     custom_w2v_expander_pipe = custom_w2v_expander >> retriever

#     glove_expander = W2VExpander(glove, method=method, nu=10, n_similar=50)
#     glove_expander_pipe = glove_expander >> retriever

#     result_df = pt.Experiment(
#         [w2v_expander_pipe, custom_w2v_expander_pipe, glove_expander_pipe], val_qs, val_qrels,
#         eval_metrics=eval_measures,
#         filter_by_qrels=True,
#         verbose=True,
#         perquery=True,
#         names=["Word2Vec", "Custom Word2Vec", "Glove"]
#     )
#     print(f"Method: {method}")
#     display(format_and_style(result_df, decimals=3))

In [9]:
# A single expander instance is shared by both pipelines; the grid below
# overrides the constructor values used here.
expander = W2VExpander(glove, nu=10, n_similar=10, interpolate_lambda=0)

for name, retriever in [("BM25", bm25), ("LMD", lmd)]:
    # expand the query with embedding neighbours, then retrieve
    pipe = expander >> retriever

    # 4 methods x 2 n_similar x 2 nu x 6 lambdas = 96 grid points per retriever
    param_grid = {
        expander: {
            "method":    ["centroid","combSUM","combMNZ","combMAX"],
            "n_similar": [50, 100],
            "nu":        [10, 25],
            "interpolate_lambda": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
        }
    }

    gs = pt.GridSearch(
        pipe,
        param_grid,
        train_qs,
        train_qrels,
        metric="recip_rank",
        verbose=True,
        return_type="best_setting"
    )
    # gs[1] is iterated as (transformer, parameter, value) triples;
    # keep only parameter -> value.
    best = {k:v for _,k,v in gs[1]}
    # Include the retriever name so the two iterations' results can be told
    # apart in the output (previously both lines were labelled identically).
    print(f"Best expansion setting for {name}:", best)

GridScan: 100%|██████████| 96/96 [2:45:16<00:00, 103.29s/it]  


Best recip_rank is 0.113546
Best setting is ['<__main__.W2VExpander object at 0x340ddbc20> method=centroid', '<__main__.W2VExpander object at 0x340ddbc20> n_similar=50', '<__main__.W2VExpander object at 0x340ddbc20> nu=10', '<__main__.W2VExpander object at 0x340ddbc20> interpolate_lambda=0.0']
Best expansion setting: {'method': 'centroid', 'n_similar': 50, 'nu': 10, 'interpolate_lambda': 0.0}


GridScan: 100%|██████████| 96/96 [2:37:39<00:00, 98.54s/it]   

Best recip_rank is 0.097455
Best setting is ['<__main__.W2VExpander object at 0x340ddbc20> method=combSUM', '<__main__.W2VExpander object at 0x340ddbc20> n_similar=100', '<__main__.W2VExpander object at 0x340ddbc20> nu=10', '<__main__.W2VExpander object at 0x340ddbc20> interpolate_lambda=0.0']
Best expansion setting: {'method': 'combSUM', 'n_similar': 100, 'nu': 10, 'interpolate_lambda': 0.0}





Best expansion setting for BM25: {'method': 'centroid', 'n_similar': 50, 'nu': 10, 'interpolate_lambda': 0.0}

Best expansion setting for LMD: {'method': 'combSUM', 'n_similar': 100, 'nu': 10, 'interpolate_lambda': 0.0}

In [None]:
from llm_expander import Q2EZSExpander

# Hyper-parameters below are the winners reported by the grid searches above.

# --- BM25 variants --------------------------------------------------------
bm25_rm3 = pt.rewrite.RM3(stopwords_idx, fb_terms=80, fb_docs=3)
bm25_rm3_pipe = bm25 >> bm25_rm3 >> bm25

bm25_we = W2VExpander(glove, method="centroid", nu=10, n_similar=50, interpolate_lambda=0)
bm25_we_pipe = bm25_we >> bm25

# One LLM-based expander instance feeds both the BM25 and the LMD pipeline.
q2ezs_expander = Q2EZSExpander("google/flan-t5-small")
bm25_llm_pipe = q2ezs_expander >> bm25

# --- LMD variants ---------------------------------------------------------
lmd_rm3 = pt.rewrite.RM3(stopwords_stemmed_idx, fb_terms=10, fb_docs=5)
lmd_rm3_pipe = lmd >> lmd_rm3 >> lmd

lmd_expander = W2VExpander(glove, method="combSUM", nu=10, n_similar=100, interpolate_lambda=0)
lmd_expander_pipe = lmd_expander >> lmd

lmd_llm_pipe = q2ezs_expander >> lmd

# Display name -> system, in the order the systems are evaluated.
systems = {
    "BM25": bm25,
    "BM25 + RM3": bm25_rm3_pipe,
    "BM25 + WE": bm25_we_pipe,
    "BM25 + LLM": bm25_llm_pipe,
    "LMD": lmd,
    "LMD + RM3": lmd_rm3_pipe,
    "LMD + WE": lmd_expander_pipe,
    "LMD + LLM": lmd_llm_pipe,
}
models = list(systems.values())
names = list(systems.keys())

# Per-query evaluation of all eight systems on the training split.
result_df = pt.Experiment(
    retr_systems=models,
    topics=train_qs,
    qrels=train_qrels,
    eval_metrics=eval_measures,
    names=names,
    perquery=True,
    filter_by_qrels=True,
    verbose=True,
)

# Same formatting options for the notebook table and the LaTeX export.
table_options = dict(decimals=3, measures_order=eval_measures_str)

display(format_and_style(result_df, **table_options))

latex_code = build_latex_table(result_df, **table_options)
print(latex_code)

pt.Experiment: 100%|██████████| 8/8 [1:00:52<00:00, 456.61s/system]


Unnamed: 0_level_0,ndcg_cut.5,ndcg_cut.10,ndcg_cut.20,RR@5,RR@10,RR@20,P.5,P.10,P.20,recall.5,recall.10,recall.20
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BM25,0.531 ± 0.012,0.581 ± 0.011,0.595 ± 0.010,0.463 ± 0.013,0.484 ± 0.012,0.488 ± 0.012,0.148 ± 0.003,0.089 ± 0.001,0.047 ± 0.000,0.742 ± 0.014,0.895 ± 0.010,0.948 ± 0.007
BM25 + LLM,0.531 ± 0.012,0.581 ± 0.011,0.595 ± 0.010,0.462 ± 0.013,0.483 ± 0.012,0.487 ± 0.012,0.149 ± 0.003,0.090 ± 0.001,0.047 ± 0.000,0.745 ± 0.014,0.898 ± 0.010,0.949 ± 0.007
BM25 + RM3,0.523 ± 0.013,0.575 ± 0.011,0.590 ± 0.010,0.457 ± 0.013,0.479 ± 0.012,0.483 ± 0.012,0.145 ± 0.003,0.089 ± 0.001,0.047 ± 0.000,0.727 ± 0.015,0.888 ± 0.010,0.945 ± 0.008
BM25 + WE,0.434 ± 0.013,0.486 ± 0.011,0.506 ± 0.011,0.370 ± 0.013,0.392 ± 0.012,0.398 ± 0.012,0.126 ± 0.003,0.079 ± 0.001,0.043 ± 0.001,0.630 ± 0.016,0.789 ± 0.013,0.866 ± 0.011
LMD,0.533 ± 0.013,0.584 ± 0.011,0.599 ± 0.010,0.465 ± 0.013,0.486 ± 0.012,0.491 ± 0.012,0.148 ± 0.003,0.090 ± 0.001,0.048 ± 0.000,0.740 ± 0.014,0.895 ± 0.010,0.954 ± 0.007
LMD + LLM,0.525 ± 0.013,0.577 ± 0.011,0.593 ± 0.010,0.458 ± 0.013,0.480 ± 0.012,0.485 ± 0.012,0.146 ± 0.003,0.089 ± 0.001,0.047 ± 0.000,0.728 ± 0.015,0.888 ± 0.010,0.948 ± 0.007
LMD + RM3,0.459 ± 0.014,0.499 ± 0.012,0.523 ± 0.012,0.406 ± 0.014,0.423 ± 0.013,0.430 ± 0.013,0.124 ± 0.003,0.074 ± 0.001,0.042 ± 0.001,0.619 ± 0.016,0.743 ± 0.014,0.835 ± 0.012
LMD + WE,0.380 ± 0.013,0.428 ± 0.012,0.452 ± 0.011,0.324 ± 0.012,0.344 ± 0.012,0.351 ± 0.012,0.110 ± 0.003,0.070 ± 0.002,0.040 ± 0.001,0.550 ± 0.016,0.697 ± 0.015,0.792 ± 0.013


\begin{table*}[h]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{l c c c c c c c c c c c c}
\toprule
Name & ndcg\_cut.5 $\uparrow$ & ndcg\_cut.10 $\uparrow$ & ndcg\_cut.20 $\uparrow$ & RR@5 $\uparrow$ & RR@10 $\uparrow$ & RR@20 $\uparrow$ & P.5 $\uparrow$ & P.10 $\uparrow$ & P.20 $\uparrow$ & recall.5 $\uparrow$ & recall.10 $\uparrow$ & recall.20 $\uparrow$ \\
\midrule
BM25 & \nsig{$0.531\pm0.012$} & \nsig{$0.581\pm0.011$} & \nsig{$0.595\pm0.010$} & \nsig{$0.463\pm0.013$} & \nsig{$0.484\pm0.012$} & \nsig{$0.488\pm0.012$} & \nsig{$0.148\pm0.003$} & \nsig{$0.089\pm0.001$} & $0.047\pm0.000$ & \nsig{$0.742\pm0.014$} & \nsig{$0.895\pm0.010$} & $0.948\pm0.007$ \\
BM25 + LLM & \nsig{$0.531\pm0.012$} & \nsig{$0.581\pm0.011$} & \nsig{$0.595\pm0.010$} & \nsig{$0.462\pm0.013$} & \nsig{$0.483\pm0.012$} & \nsig{$0.487\pm0.012$} & \nsig{$\mathbf{0.149\pm0.003}$} & \nsig{$\mathbf{0.090\pm0.001}$} & \nsig{$0.047\pm0.000$} & \nsig{$\mathbf{0.745\pm0.014}$} & \nsig{$\mathbf{0.898\pm0.010}$} & \n

In [16]:
# Per-query evaluation of the same systems on the held-out validation split.
result_df = pt.Experiment(
    retr_systems=models,
    topics=val_qs,
    qrels=val_qrels,
    eval_metrics=eval_measures,
    names=names,
    perquery=True,
    filter_by_qrels=True,
    verbose=True,
)

# Same formatting options for the notebook table and the LaTeX export.
table_options = dict(decimals=3, measures_order=eval_measures_str)

display(format_and_style(result_df, **table_options))

latex_code = build_latex_table(result_df, **table_options)
print(latex_code)

pt.Experiment: 100%|██████████| 8/8 [09:55<00:00, 74.42s/system]


Unnamed: 0_level_0,ndcg_cut.5,ndcg_cut.10,ndcg_cut.20,RR@5,RR@10,RR@20,P.5,P.10,P.20,recall.5,recall.10,recall.20
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BM25,0.536 ± 0.025,0.586 ± 0.021,0.595 ± 0.020,0.465 ± 0.025,0.486 ± 0.024,0.489 ± 0.024,0.151 ± 0.006,0.091 ± 0.002,0.047 ± 0.001,0.753 ± 0.028,0.906 ± 0.019,0.940 ± 0.016
BM25 + LLM,0.541 ± 0.024,0.588 ± 0.021,0.597 ± 0.020,0.468 ± 0.025,0.488 ± 0.024,0.490 ± 0.024,0.152 ± 0.006,0.091 ± 0.002,0.047 ± 0.001,0.762 ± 0.028,0.905 ± 0.019,0.939 ± 0.016
BM25 + RM3,0.523 ± 0.025,0.577 ± 0.021,0.590 ± 0.020,0.455 ± 0.026,0.478 ± 0.024,0.481 ± 0.024,0.146 ± 0.006,0.090 ± 0.002,0.047 ± 0.001,0.731 ± 0.029,0.895 ± 0.020,0.944 ± 0.015
BM25 + WE,0.454 ± 0.025,0.504 ± 0.022,0.521 ± 0.021,0.388 ± 0.025,0.409 ± 0.024,0.414 ± 0.024,0.131 ± 0.006,0.081 ± 0.003,0.044 ± 0.001,0.654 ± 0.031,0.806 ± 0.026,0.874 ± 0.022
LMD,0.521 ± 0.024,0.572 ± 0.021,0.586 ± 0.019,0.448 ± 0.025,0.469 ± 0.024,0.473 ± 0.023,0.149 ± 0.006,0.090 ± 0.002,0.048 ± 0.001,0.745 ± 0.029,0.903 ± 0.020,0.955 ± 0.014
LMD + LLM,0.506 ± 0.024,0.559 ± 0.021,0.573 ± 0.019,0.432 ± 0.025,0.454 ± 0.024,0.458 ± 0.023,0.146 ± 0.006,0.090 ± 0.002,0.047 ± 0.001,0.729 ± 0.029,0.895 ± 0.020,0.947 ± 0.015
LMD + RM3,0.432 ± 0.026,0.483 ± 0.024,0.502 ± 0.022,0.377 ± 0.026,0.398 ± 0.025,0.404 ± 0.025,0.121 ± 0.006,0.076 ± 0.003,0.042 ± 0.001,0.603 ± 0.032,0.758 ± 0.028,0.831 ± 0.025
LMD + WE,0.386 ± 0.025,0.434 ± 0.023,0.458 ± 0.021,0.323 ± 0.024,0.344 ± 0.023,0.351 ± 0.023,0.115 ± 0.007,0.072 ± 0.003,0.041 ± 0.001,0.577 ± 0.033,0.724 ± 0.029,0.818 ± 0.025


\begin{table*}[h]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{l c c c c c c c c c c c c}
\toprule
Name & ndcg\_cut.5 $\uparrow$ & ndcg\_cut.10 $\uparrow$ & ndcg\_cut.20 $\uparrow$ & RR@5 $\uparrow$ & RR@10 $\uparrow$ & RR@20 $\uparrow$ & P.5 $\uparrow$ & P.10 $\uparrow$ & P.20 $\uparrow$ & recall.5 $\uparrow$ & recall.10 $\uparrow$ & recall.20 $\uparrow$ \\
\midrule
BM25 & $0.536\pm0.025$ & \nsig{$0.586\pm0.021$} & \nsig{$0.595\pm0.020$} & \nsig{$0.465\pm0.025$} & \nsig{$0.486\pm0.024$} & \nsig{$0.489\pm0.024$} & \nsig{$0.151\pm0.006$} & \nsig{$\mathbf{0.091\pm0.002}$} & $0.047\pm0.001$ & \nsig{$0.753\pm0.028$} & \nsig{$\mathbf{0.906\pm0.019}$} & $0.940\pm0.016$ \\
BM25 + LLM & \nsig{$\mathbf{0.541\pm0.024}$} & \nsig{$\mathbf{0.588\pm0.021}$} & \nsig{$\mathbf{0.597\pm0.020}$} & \nsig{$\mathbf{0.468\pm0.025}$} & \nsig{$\mathbf{0.488\pm0.024}$} & \nsig{$\mathbf{0.490\pm0.024}$} & \nsig{$\mathbf{0.152\pm0.006}$} & \nsig{$0.091\pm0.002$} & $0.047\pm0.001$ & \nsig{$\mathbf{0.762\

In [21]:
# Aggregate (not per-query) run on the training split with "mrt" appended to
# the measure list, so the result carries an `mrt` column per system.
result_df = pt.Experiment(
    retr_systems=models,
    topics=train_qs,
    qrels=train_qrels,
    eval_metrics=[*eval_measures, "mrt"],
    names=names,
    filter_by_qrels=True,
    verbose=True,
)

# Show only the system name and its mrt value.
result_df.loc[:, ["name", "mrt"]]

pt.Experiment:   0%|          | 0/8 [00:00<?, ?system/s]

pt.Experiment: 100%|██████████| 8/8 [55:39<00:00, 417.49s/system]  


Unnamed: 0,name,mrt
0,BM25,20.709407
1,BM25 + RM3,283.20781
2,BM25 + WE,53.018823
3,BM25 + LLM,110.508779
4,LMD,22.280432
5,LMD + RM3,221.76564
6,LMD + WE,95.17574
7,LMD + LLM,112.908758


In [22]:
# Aggregate (not per-query) run on the validation split with "mrt" appended
# to the measure list, so the result carries an `mrt` column per system.
result_df = pt.Experiment(
    retr_systems=models,
    topics=val_qs,
    qrels=val_qrels,
    eval_metrics=[*eval_measures, "mrt"],
    names=names,
    filter_by_qrels=True,
    verbose=True,
)

# Show only the system name and its mrt value.
result_df.loc[:, ["name", "mrt"]]

pt.Experiment: 100%|██████████| 8/8 [09:26<00:00, 70.78s/system]


Unnamed: 0,name,mrt
0,BM25,23.680542
1,BM25 + RM3,154.069668
2,BM25 + WE,45.007996
3,BM25 + LLM,107.345488
4,LMD,20.507404
5,LMD + RM3,88.76996
6,LMD + WE,76.85377
7,LMD + LLM,102.751935
