In [1]:
import pandas as pd
import pyterrier as pt
import string
import random
from tqdm.auto import tqdm
import os

In [2]:
import re

# Load the tab-separated query file and swap every ASCII punctuation character
# in the query text for a single space.
qs = pd.read_csv('data/unseen_queries.csv', sep='\t').assign(
    text=lambda frame: frame['text'].str.replace(
        rf"[{re.escape(string.punctuation)}]", " ", regex=True
    )
)

In [3]:
def _random_run_name():
    """Generate a name matching [a-z]{3}[0-9]{3}[A-Z]{3}[0-9]{3}."""
    part1 = ''.join(random.choices(string.ascii_lowercase, k=3))
    part2 = ''.join(random.choices(string.digits, k=3))
    part3 = ''.join(random.choices(string.ascii_uppercase, k=3))
    part4 = ''.join(random.choices(string.digits, k=3))
    return f"{part1}{part2}{part3}{part4}"

def generate_run(queries_df: pd.DataFrame, transformer: pt.Transformer, output_dir: str = 'evaluations'):
    """Run every query through ``transformer`` and write the results to a run file.

    Each query is searched on its own via ``transformer.search``. The returned
    (docno, score) pairs are numbered from 1 in the order the transformer
    returns them and written as space-separated, header-less lines
    ``qid Q0 docno rank score tag``, where ``tag`` is a freshly generated
    random run name. The transformer and the run name are printed.

    Args:
        queries_df: Frame with a ``qid`` and a ``text`` column, one row per query.
        transformer: Retrieval pipeline whose ``search`` result has ``docno``
            and ``score`` columns.
        output_dir: Directory the run file is written to; created if missing.

    Returns:
        The generated run name. The file is ``<output_dir>/<run_name>.txt``.
    """
    run_name = _random_run_name()
    print(transformer, run_name)

    records = [
        (qid, 'Q0', docno, rank, score, run_name)
        for qid, query in tqdm(zip(queries_df['qid'], queries_df['text']), total=len(queries_df))
        # keep only docno and score, as plain (docno, score) tuples; ranks start at 1
        for rank, (docno, score) in enumerate(
            transformer
                .search(query, qid=qid)
                [['docno', 'score']]
                .itertuples(index=False, name=None),
            start=1,
        )
    ]
    df = pd.DataFrame(
        records,
        columns=['qid', 'Q0', 'docno', 'rank', 'score', 'tag']
    )

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    df.to_csv(os.path.join(output_dir, f'{run_name}.txt'), sep=' ', index=False, header=False)

    # Returned so callers can build a run-name -> label mapping without
    # copying the name from the printed output.
    return run_name

In [4]:
from w2v_expander import W2VExpander
import gensim.downloader as api
from llm_expander import Q2EZSExpander

# Query expander from the project-local `llm_expander` module, constructed with
# the "google/flan-t5-small" model name.
q2ezs_expander = Q2EZSExpander("google/flan-t5-small")

# Pretrained 'glove-wiki-gigaword-50' vectors fetched through gensim's downloader.
glove = api.load('glove-wiki-gigaword-50')

# Two Terrier indexes. Going by the directory names, one was built with stopword
# removal only and the other with stopword removal plus stemming — TODO confirm.
stopwords_idx = pt.IndexFactory.of("./indexes/stopwords_removed")
stopwords_stemming_idx = pt.IndexFactory.of("./indexes/stopwords_and_stemming")
# BM25 on the stopwords-removed index with k_1=3.5 and b=0.75.
bm25 = pt.terrier.Retriever(stopwords_idx, wmodel="BM25", controls={"bm25.k_1": 3.5, "bm25.b": 0.75})
# DirichletLM on the stopwords+stemming index with mu=100.
lmd = pt.terrier.Retriever(stopwords_stemming_idx, wmodel="DirichletLM", controls={"dirichletlm.mu": 100})

# RM3 query rewriting (fb_docs=3, fb_terms=80) placed between two BM25 passes:
# first-pass retrieval -> rewritten query -> second retrieval.
bm25_rm3 = pt.rewrite.RM3(stopwords_idx, fb_terms=80, fb_docs=3)
bm25_rm3_pipe = bm25 >> bm25_rm3 >> bm25

# Expander from the project-local `w2v_expander` module, applied before BM25.
# NOTE(review): the meaning of method/nu/n_similar/interpolate_lambda is defined
# in that module and is not visible here.
bm25_expander = W2VExpander(glove, method="centroid", nu=10, n_similar=50, interpolate_lambda=0)
bm25_expander_pipe = bm25_expander >> bm25

# LLM-based expansion applied before BM25.
bm25_llm_pipe = q2ezs_expander >> bm25


# Write one run file per system. generate_run prints each pipeline together with
# its random run name; those names are the keys used in `id_mapping`.
for transformer in [bm25, lmd, bm25_rm3_pipe, bm25_expander_pipe, bm25_llm_pipe]:
    generate_run(qs, transformer)

Java started (triggered by IndexFactory.of) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


TerrierRetr(BM25) rnj946RCF446


  0%|          | 0/5000 [00:00<?, ?it/s]

TerrierRetr(DirichletLM) wej489MXG551


  0%|          | 0/5000 [00:00<?, ?it/s]

(TerrierRetr(BM25) >> QueryExpansion(/Users/christianjensen/Documents/search-engines/indexes/stopwords_removed/data.properties,3,80,<org.terrier.querying.RM3 at 0x177ea9720 jclass=org/terrier/querying/RM3 jself=<LocalRef obj=0x3072a2712 at 0x30ee8b4d0>>) >> TerrierRetr(BM25)) ttn137FKS193


  0%|          | 0/5000 [00:00<?, ?it/s]

(<w2v_expander.W2VExpander object at 0x1681b6b40> >> TerrierRetr(BM25)) mpm386BCU810


  0%|          | 0/5000 [00:00<?, ?it/s]

In [16]:
# Run name (as printed by generate_run) -> label shown in the result tables.
id_mapping = dict([
    ("rnj946RCF446", "BM25"),
    ("wej489MXG551", "LMD"),
    ("ttn137FKS193", "BM25+RM3"),
    # ("mpm386BCU810", "BM25+WE"),
    # ("npg886HTA985", "BM25+LLM"),
])

In [5]:
# Run name -> label for the index-preprocessing comparison runs.
# NOTE(review): this rebinds `id_mapping`, replacing the mapping defined in the
# cell above. Run top to bottom, the score-table cells below would pick up THIS
# mapping, whereas their recorded outputs show the labels BM25 / LMD / BM25+RM3
# from the other one. Keep only the mapping that is meant to be reported, or
# give this one its own name.
id_mapping = dict([
    ("cpr507QIU278", "BM25 with stopwords removed"),
    ("vzl832QIM930", "BM25 with stopwords and stemming"),
    ("atb696JXZ142", "LMD with stopwords and stemming"),
])

In [17]:
# Aggregate scores, one row per submitted run.
run_scores = pd.read_csv("data/runs_scores.csv")

# Strip a trailing ".txt" from the run names. The pattern is an explicit,
# anchored regex: with a bare '.txt' the meaning of "." depends on the pandas
# version (the default of `regex` changed in pandas 2.0), and an unanchored
# pattern would also remove matches in the middle of a name.
run_scores['run_name'] = run_scores['run_name'].str.replace(r"\.txt$", "", regex=True)

# Keep only the runs listed in `id_mapping`. `.copy()` makes `my_runs` an
# independent frame, so the relabelling below does not write into a slice of
# `run_scores` (SettingWithCopyWarning).
my_runs = run_scores[run_scores['run_name'].isin(list(id_mapping))].copy()
my_runs['run_name'] = my_runs['run_name'].replace(id_mapping)
my_runs[['run_name', 'MRT']]

Unnamed: 0,run_name,MRT
126,LMD,2.2932
127,BM25,2.2827
128,BM25+RM3,2.4255


In [18]:
from table_helpers import format_and_style, build_latex_table

# Measures shown in the tables, in display order.
eval_measures_str = [
    'nDCG@5', 'nDCG@10', 'nDCG@20',
    'RR@5', 'RR@10', 'RR@20',
    'P@5', 'P@10', 'P@20',
    'R@5', 'R@10', 'R@20'
]

# Per-query scores, one row per (run, query).
run_scores = pd.read_csv("data/runs_scores_perquery.csv")

# Strip a trailing ".txt" from the run names. The pattern is an explicit,
# anchored regex: with a bare '.txt' the meaning of "." depends on the pandas
# version (the default of `regex` changed in pandas 2.0), and an unanchored
# pattern would also remove matches in the middle of a name.
run_scores['run_name'] = run_scores['run_name'].str.replace(r"\.txt$", "", regex=True)

# Keep the runs listed in `id_mapping` and swap the run name for its label.
# Built as a chain (no inplace mutation) so re-running the cell is safe.
mask = run_scores['run_name'].isin(id_mapping)
my_runs = (
    run_scores[mask]
    .assign(name=lambda frame: frame['run_name'].replace(id_mapping))
    .drop(columns='run_name')
)

# Long format: one row per (name, qid, measure).
result_df = my_runs.melt(
    id_vars=["name", "qid"],
    var_name="measure",
    value_name="value"
)

display(format_and_style(result_df, decimals=3, measures_order=eval_measures_str))

latex_code = build_latex_table(result_df, decimals=3, measures_order=eval_measures_str)
print(latex_code)

Unnamed: 0_level_0,nDCG@5,nDCG@10,nDCG@20,RR@5,RR@10,RR@20,P@5,P@10,P@20,R@5,R@10,R@20
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BM25,0.529 ± 0.015,0.575 ± 0.013,0.590 ± 0.012,0.458 ± 0.016,0.477 ± 0.015,0.481 ± 0.015,0.149 ± 0.004,0.089 ± 0.001,0.047 ± 0.000,0.747 ± 0.018,0.886 ± 0.013,0.941 ± 0.010
BM25+RM3,0.523 ± 0.015,0.571 ± 0.013,0.585 ± 0.012,0.451 ± 0.016,0.471 ± 0.015,0.476 ± 0.015,0.148 ± 0.004,0.088 ± 0.001,0.047 ± 0.000,0.741 ± 0.018,0.884 ± 0.013,0.941 ± 0.010
LMD,0.521 ± 0.015,0.570 ± 0.013,0.584 ± 0.012,0.448 ± 0.016,0.468 ± 0.015,0.472 ± 0.015,0.148 ± 0.004,0.089 ± 0.001,0.047 ± 0.000,0.741 ± 0.018,0.891 ± 0.013,0.947 ± 0.009


\begin{table*}[h]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{l c c c c c c c c c c c c}
\toprule
Name & nDCG@5 $\uparrow$ & nDCG@10 $\uparrow$ & nDCG@20 $\uparrow$ & RR@5 $\uparrow$ & RR@10 $\uparrow$ & RR@20 $\uparrow$ & P@5 $\uparrow$ & P@10 $\uparrow$ & P@20 $\uparrow$ & R@5 $\uparrow$ & R@10 $\uparrow$ & R@20 $\uparrow$ \\
\midrule
BM25 & \nsig{$\mathbf{0.529\pm0.015}$} & \nsig{$\mathbf{0.575\pm0.013}$} & \nsig{$\mathbf{0.590\pm0.012}$} & \nsig{$\mathbf{0.458\pm0.016}$} & \nsig{$\mathbf{0.477\pm0.015}$} & \nsig{$\mathbf{0.481\pm0.015}$} & \nsig{$\mathbf{0.149\pm0.004}$} & \nsig{$0.089\pm0.001$} & \nsig{$0.047\pm0.000$} & \nsig{$\mathbf{0.747\pm0.018}$} & \nsig{$0.886\pm0.013$} & \nsig{$0.941\pm0.010$} \\
BM25+RM3 & \nsig{$0.523\pm0.015$} & \nsig{$0.571\pm0.013$} & $0.585\pm0.012$ & $0.451\pm0.016$ & $0.471\pm0.015$ & $0.476\pm0.015$ & \nsig{$0.148\pm0.004$} & \nsig{$0.088\pm0.001$} & \nsig{$0.047\pm0.000$} & \nsig{$0.741\pm0.018$} & \nsig{$0.884\pm0.013$} & \nsig{$0.941

In [19]:
# Attach each query's text to its per-run score rows. RR@20 is copied into an
# "mrr" column, which is the value averaged below.
df = my_runs.merge(qs[['qid','text']], on='qid', how='left')
df['mrr'] = df['RR@20']

# Mean of that value per query, taken over all runs present in `my_runs`.
avg_mrr = df.groupby(['qid','text'], as_index=False).agg(avg_mrr=('mrr', 'mean'))

# The five queries with the lowest and the five with the highest average.
worst_5 = avg_mrr.nsmallest(5, 'avg_mrr').loc[:, ['qid','text','avg_mrr']]
best_5 = avg_mrr.nlargest(5, 'avg_mrr').loc[:, ['qid','text','avg_mrr']]

print(
    "5 Worst Queries (by average MRR across models):",
    worst_5.to_string(index=False),
    "\n5 Best Queries (by average MRR across models):",
    best_5.to_string(index=False),
    sep="\n",
)

5 Worst Queries (by average MRR across models):
 qid                                                  text  avg_mrr
 206                                    how big is ky lake      0.0
 699          how to fight upset stomach from insulin rush      0.0
 906 can you file criminal charges on a stop payment check      0.0
1348                             spongebob characters wiki      0.0
1383                 medical definition of disease process      0.0

5 Best Queries (by average MRR across models):
 qid                                        text  avg_mrr
  13           what is fibro osseous integration      1.0
  57 what season of voice was jozy bernadette on      1.0
  85                        the name cleo   girl      1.0
 103           is aes filing required for canada      1.0
 123   what are the largest areas of land called      1.0


In [20]:
# How many queries have an average of exactly 0, and how many of exactly 1.
count_zero = avg_mrr['avg_mrr'].eq(0).sum()
count_one = avg_mrr['avg_mrr'].eq(1).sum()

print(
    f"Queries with avg_mrr == 0: {count_zero}",
    f"Queries with avg_mrr == 1: {count_one}",
    sep="\n",
)

Queries with avg_mrr == 0: 59
Queries with avg_mrr == 1: 434


In [13]:
# Stack the worst and best query subsets into one frame; the `set` column
# records which subset each row came from.
combined = pd.concat(
    [subset.assign(set=label) for subset, label in ((worst_5, 'worst'), (best_5, 'best'))],
    ignore_index=True,
)

# Open the two Terrier indexes, in the same order as before.
stopwords_idx, stopwords_stemming_idx = (
    pt.IndexFactory.of(index_path)
    for index_path in ("./indexes/stopwords_removed", "./indexes/stopwords_and_stemming")
)

# ── (3) Define a helper to run a single query text through a given index and return the list of matching terms:
def terrier_terms(query_text: str, index) -> str:
    """Return the terms Terrier derives from ``query_text`` for ``index``.

    A Terrier ``Request`` is filled with the raw query and the index, then
    passed through three processes in order: ``TerrierQLParser``,
    ``TerrierQLToMatchingQueryTerms`` and ``ApplyTermPipeline``. The keys of
    the resulting matching query terms are joined with single spaces.
    """
    request = pt.terrier.J.Request()
    request.setOriginalQuery(query_text)
    request.setIndex(index)

    # parse -> translate to matching terms -> apply the index's term pipeline
    java = pt.terrier.J
    stages = (
        java.TerrierQLParser,
        java.TerrierQLToMatchingQueryTerms,
        java.ApplyTermPipeline,
    )
    for stage in stages:
        stage().process(None, request)

    keys = [term.getKey().toString() for term in request.getMatchingQueryTerms()]
    return " ".join(keys)

# ── (4) Iterate over each row in `combined`, run both pipelines, and store results in a list of dicts:
# For every selected query, record the terms left after processing it against
# each index: first the stopword-removal index, then the stopword-removal +
# stemming index.
rows = [
    {
        "set": query.set,
        "qid": query.qid,
        "original_query": query.text,
        "after_stopword_removal": terrier_terms(query.text, stopwords_idx),
        "after_stopword_removal_and_stemming": terrier_terms(query.text, stopwords_stemming_idx),
    }
    for query in combined.itertuples(index=False)
]

# One row per query, with the columns in display order.
result_df = pd.DataFrame(
    rows,
    columns=[
        "set",
        "qid",
        "original_query",
        "after_stopword_removal",
        "after_stopword_removal_and_stemming",
    ],
)

print(result_df.to_string(index=False))

  set  qid                                        original_query                  after_stopword_removal after_stopword_removal_and_stemming
worst  206                                    how big is ky lake                             big ky lake                         big ky lake
worst  699          how to fight upset stomach from insulin rush        fight upset stomach insulin rush    fight upset stomach insulin rush
worst  906 can you file criminal charges on a stop payment check can file criminal charges payment check can file crimin charg payment check
worst 1348                             spongebob characters wiki               spongebob characters wiki              spongebob charact wiki
worst 1383                 medical definition of disease process      medical definition disease process        medic definit diseas process
 best   13                     what is fibro osseous integration               fibro osseous integration                 fibro osseou integr
 best   57   

In [26]:
# Index stored under ./debug_indexes. The directory name suggests it was built
# with block (term position) information — TODO confirm.
index = pt.IndexFactory.of("./debug_indexes/full_index_with_blocks")
# BM25 retriever over that index; no `controls` are passed here, unlike the
# `bm25` retriever configured earlier.
searcher = pt.terrier.Retriever(index, wmodel="BM25")
# Document collection, read as JSON Lines (one JSON object per line).
docs = pd.read_json("data/docs.jsonl", lines=True)

In [44]:
# Alternative probe kept for reference: a `#1(...)` query run through `searcher`
# (the retriever over the debug index) instead of `bm25`.
# qdf = pd.DataFrame([["Q1", "#1(marc chagall)"]], columns=["qid", "query"])
# results = searcher.transform(qdf)

qdf = pd.DataFrame([["Q1", "what is keeper on android"]], columns=["qid", "query"])
results = bm25.transform(qdf)

print(f"Number of results: {len(results)}")

# Print title and body of the first 10 results.
# `docs` is expected to have the columns 'docno', 'title' and 'body'.
top_docnos = results["docno"].head(10)

# Filter `docs` once for all ten docnos instead of scanning the whole frame
# once per result. Only the first row per docno is kept, which is the row the
# per-result lookup would have picked.
top_docs = (
    docs[docs["docno"].isin(top_docnos)]
    .drop_duplicates(subset="docno")
    .set_index("docno")
)

for docno in top_docnos:
    if docno not in top_docs.index:
        continue  # no document stored under this docno
    doc = top_docs.loc[docno]
    title = doc["title"]
    body = doc["body"]
    print(f"Title: {title}\nBody: {body}\n{'-'*40}")

Number of results: 1000
Title: Keeper 5.0 (for Android)
Body: REVIEW0 Comments
SPECSCOMPAREView Gallery
View All 5 Photos in Gallery
MSRP$ 9.97Pros
User-friendly interface.
Easy auto-fill when launching sites within app browser.
Generates strong passwords.
Cons
Doesn't handle complex logins or form fills.
Charges to for cloud backup or to sync across more than one device.
Bottom Line
Keeper for Android is a good way to securely store login information on a single Android device paired with a desktop account.
Other password managers offer more features and a better user experience, for free.
Keeper 5.0 for Android (Free, $9.99 for premium) is a one trick pony, but that trick is decent.
Using military-grade (AES-128) encryption, the free version of Keeper locally stores all your usernames and passwords, and lets you easily copy and paste this data into form fields when you launch websites through a dedicated mobile browser.
Keeper for Android also syncs with a rather limited desktop clie