In [2]:
import pandas as pd
import pyterrier as pt
import string
import random
from tqdm.auto import tqdm
import os
import re

In [3]:
QUERIES = "data/train_queries.csv"
QRELS   = "data/train_qrels.csv"

# Queries: tab-separated. header=0 consumes the file's first row as a header,
# and `names` then replaces those column names with qid / query.
qs = pd.read_csv(QUERIES, sep="\t", names=["qid", "query"], header=0)

# Replace every ASCII punctuation character in the query text with a space,
# and cast qid to str so it has the same type as the qid column of qrels.
qs = qs.assign(
    qid=qs["qid"].astype(str),
    query=qs["query"].str.replace(
        rf"[{re.escape(string.punctuation)}]", " ", regex=True
    ),
)

# Relevance judgements: tab-separated, column names come from the file header.
# qid is cast to str here as well.
qrels = pd.read_csv(QRELS, sep="\t").astype({"qid": str})

In [22]:
from pyterrier.measures import *

# Open the two on-disk indexes used by the systems below.
stopwords_idx = pt.IndexFactory.of("./indexes/stopwords_removed")
stopwords_stemming_idx = pt.IndexFactory.of("./indexes/stopwords_and_stemming")

# System 1: BM25 (k_1=3.5, b=0.75) on the first index.
bm25 = pt.terrier.Retriever(
    stopwords_idx,
    wmodel="BM25",
    controls={"bm25.k_1": 3.5, "bm25.b": 0.75},
)

# System 3: BM25 -> RM3 query rewriting (3 feedback docs, 80 feedback terms)
# -> BM25 again on the rewritten query.
bm25_rm3 = pt.rewrite.RM3(stopwords_idx, fb_terms=80, fb_docs=3)
bm25_rm3_pipe = bm25 >> bm25_rm3 >> bm25

# System 2: Dirichlet-smoothed language model (mu=100) on the second index.
lmd = pt.terrier.Retriever(
    stopwords_stemming_idx,
    wmodel="DirichletLM",
    controls={"dirichletlm.mu": 100},
)

# Evaluate all three systems with RR@20, keeping one result row per query
# (perquery=True) so later cells can aggregate by qid.
# NOTE: the recorded run of this cell took roughly 22 minutes.
result_df = pt.Experiment(
    [bm25, lmd, bm25_rm3_pipe],
    qs,
    qrels,
    eval_metrics=[RR@20],
    names=["BM25", "LMD", "BM25 + RM3"],
    perquery=True,
    filter_by_qrels=True,
    verbose=True,
)

pt.Experiment: 100%|██████████| 3/3 [21:58<00:00, 439.47s/system]


In [23]:
# 1. Attach the query text to each per-query result row and expose the
#    measured value under the name 'mrr'.
df = result_df.merge(qs[['qid','query']], on='qid', how='left')
df = df.assign(mrr=df['value'])

# Average the per-query value over all rows sharing a (qid, query) pair,
# i.e. across the evaluated systems.
avg_mrr = df.groupby(['qid','query'], as_index=False).agg(avg_mrr=('mrr', 'mean'))

report_cols = ['qid','query','avg_mrr']

# 2. The 5 queries with the lowest average
worst_5 = avg_mrr.nsmallest(5, 'avg_mrr')[report_cols]

# 3. The 5 queries with the highest average
best_5 = avg_mrr.nlargest(5, 'avg_mrr')[report_cols]

# Print both tables without the DataFrame index.
for heading, table in (
    ("5 Worst Queries (by average MRR across models):", worst_5),
    ("\n5 Best Queries (by average MRR across models):", best_5),
):
    print(heading)
    print(table.to_string(index=False))

5 Worst Queries (by average MRR across models):
  qid                             query  avg_mrr
10048 where did the koa trees originate      0.0
10141                  gum decay causes      0.0
10257             when does time switch      0.0
10460       which bond forms a molecule      0.0
 1047                 fading definition      0.0

5 Best Queries (by average MRR across models):
  qid                                                      query  avg_mrr
10049 art marc chagall s works reflected his heritage  which was      1.0
10082                what has a cell body  dendrites and an axon      1.0
10084                   eastern michigan university tuition cost      1.0
10094                                  what is keeper on android      1.0
 1011                  what materials are used to make an iphone      1.0


In [25]:
per_query_avg = avg_mrr['avg_mrr']

# Number of queries whose average is exactly 0
count_zero = per_query_avg.eq(0).sum()

# Number of queries whose average is exactly 1
count_one = per_query_avg.eq(1).sum()

print(f"Queries with avg_mrr == 0: {count_zero}")
print(f"Queries with avg_mrr == 1: {count_one}")

Queries with avg_mrr == 0: 117
Queries with avg_mrr == 1: 936


In [12]:
# Load the document collection; the file holds one JSON object per line.
docs = pd.read_json(
    path_or_buf="data/docs.jsonl",
    lines=True,
)

In [21]:
# Print the judged documents for one query; change qid to inspect another.
qid = "10125"

# 1. Keep only the qrels rows for that query
matches = qrels.loc[qrels["qid"] == qid]

# 2. Collect the judged doc IDs. relevance_scores is the "relevance" column
#    when qrels has one, otherwise None; nothing in this cell reads it.
relevant_docids = matches["docno"].tolist()
relevance_scores = matches.get("relevance")

# 3. Look each doc ID up in the collection and print its first matching row.
#    Doc IDs that are absent from docs are skipped without any message.
for docno in relevant_docids:
    hits = docs.loc[docs["docno"] == docno]
    if hits.empty:
        continue
    row = hits.iloc[0]
    title, url, body = row["title"], row["url"], row["body"]
    print(f"Title: {title}\nURL: {url}\nBody: {body}\n{'-'*40}")

Title: Types Of Swiss Chard: Tips For Choosing The Best Swiss Chard Variety
URL: http://www.gardeningknowhow.com/edible/vegetables/swiss-chard/types-of-swiss-chard.htm
Body: Types Of Swiss Chard: Tips For Choosing The Best Swiss Chard Variety
Printer Friendly Version
Image by David Fisher
By Bonnie L. Grant, Certified Urban Agriculturist
Chard is a cool season leafy green vegetable.
The plant is related to beets but doesn’t produce the globular edible root.
Chard plants come in many varieties and colors.
The brightly colored ribs of the celery-like stems belong to the well-known Swiss chard plant family.
The choices keep coming with a rainbow of types of Swiss chard.
This nutritious plant is easy to grow and can be harvested several times in spring.
Swiss Chard Plant Family
The “Swiss” descriptor was added to the chard name to differentiate it from French chardon.
Chard has a milder flavor than spinach and very similar green leaves.
The leaves are born on top of long stems that may ran