In [1]:
import os, time
import pandas as pd
import shutil
import string

from tqdm.auto import tqdm
import pyterrier as pt
if not pt.started(): pt.init()

  if not pt.started(): pt.init()
Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  if not pt.started(): pt.init()


In [2]:
BASE_IDX = "indexes"

# load queries (strip punctuation) and qrels
qs = pd.read_csv("data/train_queries.csv", sep="\t", names=["qid", "query"], header=0)
qs['query'] = qs['query'].str.translate(str.maketrans('', '', string.punctuation))
qrels = pd.read_csv("data/train_qrels.csv", sep="\t")
docs = pd.read_json("data/docs.jsonl", lines=True)

In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

docs['body_word_count'] = docs['body'].apply(lambda x: len(word_tokenize(x)))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/christianjensen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
NO_STEM = pt.TerrierStemmer.none

configs = {
    "full_index": {
        "stopwords": None,
        "stemmer"  : NO_STEM,
    },
    "stopwords_removed": {
        "stemmer": NO_STEM,
    },
    "stemming_only": {
        "stopwords": None,
    },
    "stopwords_and_stemming": {
        # defaults
    },
}



N_BUILDS = 5
build_times = {name: [] for name in configs}
build_times.update({name + "_trimmed": [] for name in configs})

lower = docs['body_word_count'].quantile(0.01)
upper = docs['body_word_count'].quantile(0.99)

total_runs = N_BUILDS * 2 * len(configs)

with tqdm(total=total_runs, desc="Total Builds") as pbar:
    for run in range(N_BUILDS):
        for trimmed in (False, True):
            for name, opts in configs.items():
                run_name = f"{name}_trimmed" if trimmed else f"{name}_full"
                abs_idx = os.path.abspath(os.path.join(BASE_IDX, run_name))
                
                # wipe & rebuild
                if os.path.isdir(abs_idx): shutil.rmtree(abs_idx)
                os.makedirs(abs_idx, exist_ok=True)

                idxer = pt.IterDictIndexer(abs_idx, text_attrs=["body", "title"], fields=True, **opts)
                t0 = time.perf_counter()
                if trimmed:
                    idx_ref = idxer.index(docs[
                        (docs['body_word_count'] > lower) &
                        (docs['body_word_count'] < upper)
                    ].to_dict(orient='records'))
                else:
                    idx_ref = idxer.index(docs.to_dict(orient='records'))
                
                build_times[run_name].append(time.perf_counter() - t0)
                pbar.update(1)
# quick summary
bt = pd.DataFrame(build_times)
print("Build times summary (s):")
print(bt.describe().T[["mean","std","min","max"]])

Total Builds:   0%|          | 0/40 [00:00<?, ?it/s]

07:50:20.838 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 302 empty documents
07:53:44.009 [ForkJoinPool-2-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 303 empty documents
07:57:26.171 [ForkJoinPool-3-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 302 empty documents
08:00:42.451 [ForkJoinPool-4-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 303 empty documents
08:03:53.619 [ForkJoinPool-5-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
08:06:48.980 [ForkJoinPool-6-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
08:10:02.209 [ForkJoinPool-7-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
08:12:51.021 [ForkJoinPool-8-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
08:16:27.668 [ForkJoinPool-9-worker-1] WARN org.terrier.structures.indexing.Indexer -- I

ValueError: All arrays must be of the same length

In [23]:
bt = pd.DataFrame(build_times)
print("Build times summary (s):")
print(bt.describe().T[["mean","std","min","max"]])

Build times summary (s):
                                      mean        std         min         max
full_index                      224.361024  17.764385  210.837642  255.471691
full_index_trimmed              200.103897  12.415554  187.870443  219.609771
stopwords_removed               206.471247  14.796586  194.001272  231.351704
stopwords_removed_trimmed       188.053850  11.411245  175.334452  198.935515
stemming_only                   219.873281   8.411347  209.852369  232.689631
stemming_only_trimmed           210.452104  20.494514  192.553874  240.537750
stopwords_and_stemming          197.518740  10.622081  187.514076  215.243656
stopwords_and_stemming_trimmed  187.346538  20.582123  168.839451  217.388120


In [26]:
import numpy as np
import scipy.stats as stats

for name, times in build_times.items():
    arr = np.array(times)
    n = len(arr)
    mean = arr.mean()
    sem  = stats.sem(arr)
    t_crit = stats.t.ppf(0.975, df=n-1)
    h = t_crit * sem
    ci = (mean - h, mean + h)
    print(f"{name}: mean={mean:.1f}s, 95% CI=({ci[0]:.1f}, {ci[1]:.1f})")

full_index: mean=224.4s, 95% CI=(202.3, 246.4)
full_index_trimmed: mean=200.1s, 95% CI=(184.7, 215.5)
stopwords_removed: mean=206.5s, 95% CI=(188.1, 224.8)
stopwords_removed_trimmed: mean=188.1s, 95% CI=(173.9, 202.2)
stemming_only: mean=219.9s, 95% CI=(209.4, 230.3)
stemming_only_trimmed: mean=210.5s, 95% CI=(185.0, 235.9)
stopwords_and_stemming: mean=197.5s, 95% CI=(184.3, 210.7)
stopwords_and_stemming_trimmed: mean=187.3s, 95% CI=(161.8, 212.9)


In [34]:
import itertools
from statsmodels.stats.multitest import multipletests

def cohens_d(x, y):
    """Compute Cohen's d for two independent samples."""
    x, y = np.array(x), np.array(y)
    n1, n2 = len(x), len(y)
    s1, s2 = x.var(ddof=1), y.var(ddof=1)
    s_pooled = np.sqrt(((n1-1)*s1 + (n2-1)*s2) / (n1+n2-2))
    return (x.mean() - y.mean()) / s_pooled

# 1) Pairwise Welch tests
pairs = list(itertools.combinations(build_times.keys(), 2))
pvals = [stats.ttest_ind(build_times[a], build_times[b], equal_var=False)[1]
         for a, b in pairs]

# 2) Holm–Bonferroni correction
reject_holm, pvals_holm, _, _ = multipletests(pvals, alpha=0.05, method='holm')

# 3) Compute Cohen's d for each pair and report
for (a, b), p_raw, p_holm, sig in zip(pairs, pvals, pvals_holm, reject_holm):
    d = cohens_d(build_times[a], build_times[b])
    status = "✓" if sig else "✗"
    print(f"{a:30s} vs {b:30s} → p={p_raw:.3f}, holm p={p_holm:.3f} {status}, Cohen’s d={d:.2f}")

full_index                     vs full_index_trimmed             → p=0.040, holm p=0.843 ✗, Cohen’s d=1.58
full_index                     vs stopwords_removed              → p=0.123, holm p=1.000 ✗, Cohen’s d=1.09
full_index                     vs stopwords_removed_trimmed      → p=0.007, holm p=0.180 ✗, Cohen’s d=2.43
full_index                     vs stemming_only                  → p=0.629, holm p=1.000 ✗, Cohen’s d=0.32
full_index                     vs stemming_only_trimmed          → p=0.285, holm p=1.000 ✗, Cohen’s d=0.73
full_index                     vs stopwords_and_stemming         → p=0.025, holm p=0.546 ✗, Cohen’s d=1.83
full_index                     vs stopwords_and_stemming_trimmed → p=0.016, holm p=0.409 ✗, Cohen’s d=1.93
full_index_trimmed             vs stopwords_removed              → p=0.483, holm p=1.000 ✗, Cohen’s d=-0.47
full_index_trimmed             vs stopwords_removed_trimmed      → p=0.149, holm p=1.000 ✗, Cohen’s d=1.01
full_index_trimmed             vs st

In [41]:
import numpy as np
from scipy import stats
from scipy.stats import shapiro

# collect lists
non_trimmed = np.concatenate([
    build_times['full_index'],
    build_times['stopwords_removed'],
    build_times['stemming_only'],
    build_times['stopwords_and_stemming']
])
trimmed = np.concatenate([
    build_times['full_index_trimmed'],
    build_times['stopwords_removed_trimmed'],
    build_times['stemming_only_trimmed'],
    build_times['stopwords_and_stemming_trimmed']
])


for name, arr in [('not_trimmed', non_trimmed), ('trimmed', trimmed)]:
    stat, p = shapiro(arr)
    print(f"{name}: Shapiro-Wilk p = {p:.3f}")

# Welch’s independent t-test
tstat, pval = stats.ttest_ind(non_trimmed, trimmed, equal_var=False)

d = cohens_d(non_trimmed, trimmed)
print(f"\nAll non-trimmed vs trimmed → Welch p={pval:.4f}, d={d:.2f}")

not_trimmed: Shapiro-Wilk p = 0.286
trimmed: Shapiro-Wilk p = 0.441

All non-trimmed vs trimmed → Welch p=0.0074, d=0.90


In [43]:
full = np.concatenate([build_times['full_index'], build_times['full_index_trimmed']])
stopwords = np.concatenate([build_times['stopwords_removed'], build_times['stopwords_removed_trimmed']])
stemming = np.concatenate([build_times['stemming_only'], build_times['stemming_only_trimmed']])

for name, arr in [('full', full), ('stopwords', stopwords), ('stemming', stemming)]:
    stat, p = shapiro(arr)
    print(f"{name}: Shapiro-Wilk p = {p:.3f}")

print()

# Compare full vs stopwords, and full vs stemming
comparisons = [('full', full), ('stopwords', stopwords), ('stemming', stemming)]
for name1, data1 in comparisons:
    if name1 == 'full':
        for name2, data2 in comparisons[1:]:
            tstat, pval = stats.ttest_ind(data1, data2, equal_var=False)
            d = cohens_d(data1, data2)
            print(f"{name1:10s} vs {name2:10s} → Welch p={pval:.4f}, Cohen's d={d:.2f}")

full: Shapiro-Wilk p = 0.236
stopwords: Shapiro-Wilk p = 0.350
stemming: Shapiro-Wilk p = 0.765

full       vs stopwords  → Welch p=0.0744, Cohen's d=0.85
full       vs stemming   → Welch p=0.7132, Cohen's d=-0.17


In [76]:
query_times = {}

# Get all index folders in BASE_IDX
index_folders = [d for d in os.listdir(BASE_IDX) if os.path.isdir(os.path.join(BASE_IDX, d))]

for name in index_folders:
    abs_idx = os.path.abspath(os.path.join(BASE_IDX, name))

    idx_ref = abs_idx  # BatchRetrieve will accept the path
    retriever = pt.terrier.Retriever(idx_ref, wmodel="BM25")

    times = []
    for q in tqdm(qs['query'], desc=f"[QUERY] {name}"):
        iter = pd.DataFrame([["1", q]], columns=["qid", "query"]).itertuples()
        for row in iter:
            t0 = time.perf_counter()
            _ = retriever._retrieve_one(row)
            times.append((time.perf_counter()-t0)*1000)
    query_times[name] = times

# summary
qt = pd.DataFrame(query_times)
print("\nQuery times summary (s):")
print(qt.describe().T[["mean","std","min","max"]])

[QUERY] stemming_only:   0%|          | 0/10000 [00:00<?, ?it/s]

[QUERY] stopwords_removed_trimmed:   0%|          | 0/10000 [00:00<?, ?it/s]



[QUERY] stopwords_and_stemming_trimmed:   0%|          | 0/10000 [00:00<?, ?it/s]



[QUERY] stemming_only_trimmed:   0%|          | 0/10000 [00:00<?, ?it/s]



[QUERY] stopwords_and_stemming:   0%|          | 0/10000 [00:00<?, ?it/s]

[QUERY] stopwords_removed:   0%|          | 0/10000 [00:00<?, ?it/s]



[QUERY] full_index:   0%|          | 0/10000 [00:00<?, ?it/s]



[QUERY] full_index_trimmed:   0%|          | 0/10000 [00:00<?, ?it/s]


Query times summary (s):
                                     mean        std       min         max
stemming_only                   49.270138  24.455601  1.024041  224.537333
stopwords_removed_trimmed       15.782448   6.716416  0.139959  114.413834
stopwords_and_stemming_trimmed  17.776235   7.497144  0.202209  129.318084
stemming_only_trimmed           55.996675  28.652783  1.058083  261.894292
stopwords_and_stemming          17.170530   6.967495  0.206625  102.602834
stopwords_removed               16.200586   6.862061  0.175834  141.042083
full_index                      55.257526  29.110741  0.211250  242.525833
full_index_trimmed              53.403363  32.110103  0.667125  945.019667


In [83]:
qt = pd.DataFrame(query_times)
print("\nQuery times summary (s):")
print(qt.describe().T[["mean", "std","min","max"]])


Query times summary (s):
                                     mean        std       min         max
stemming_only                   49.270138  24.455601  1.024041  224.537333
stopwords_removed_trimmed       15.782448   6.716416  0.139959  114.413834
stopwords_and_stemming_trimmed  17.776235   7.497144  0.202209  129.318084
stemming_only_trimmed           55.996675  28.652783  1.058083  261.894292
stopwords_and_stemming          17.170530   6.967495  0.206625  102.602834
stopwords_removed               16.200586   6.862061  0.175834  141.042083
full_index                      55.257526  29.110741  0.211250  242.525833
full_index_trimmed              53.403363  32.110103  0.667125  945.019667


In [77]:
import numpy as np
import scipy.stats as stats

for name in sorted(query_times.keys()):
    times = query_times[name]
    arr = np.array(times)
    n = len(arr)
    mean = arr.mean()
    sem  = stats.sem(arr)
    t_crit = stats.t.ppf(0.975, df=n-1)
    h = t_crit * sem
    ci = (mean - h, mean + h)
    print(f"{name}: mean={mean:.1f}ms, 95% CI=({ci[0]:.1f}, {ci[1]:.1f})")

full_index: mean=55.3ms, 95% CI=(54.7, 55.8)
full_index_trimmed: mean=53.4ms, 95% CI=(52.8, 54.0)
stemming_only: mean=49.3ms, 95% CI=(48.8, 49.7)
stemming_only_trimmed: mean=56.0ms, 95% CI=(55.4, 56.6)
stopwords_and_stemming: mean=17.2ms, 95% CI=(17.0, 17.3)
stopwords_and_stemming_trimmed: mean=17.8ms, 95% CI=(17.6, 17.9)
stopwords_removed: mean=16.2ms, 95% CI=(16.1, 16.3)
stopwords_removed_trimmed: mean=15.8ms, 95% CI=(15.7, 15.9)


In [84]:
# 1) Pairwise Welch tests
pairs = list(itertools.combinations(query_times.keys(), 2))
pvals = [stats.ttest_ind(query_times[a], query_times[b], equal_var=False)[1]
         for a, b in pairs]

# 2) Holm–Bonferroni correction
reject_holm, pvals_holm, _, _ = multipletests(pvals, alpha=0.05, method='holm')

# 3) Compute Cohen's d for each pair and report
for (a, b), p_raw, p_holm, sig in zip(pairs, pvals, pvals_holm, reject_holm):
    d = cohens_d(query_times[a], query_times[b])
    status = "✓" if sig else "✗"
    print(f"{a:30s} vs {b:30s} → p={p_raw:.3f}, holm p={p_holm:.3f} {status}, Cohen's d={d:.2f}")

stemming_only                  vs stopwords_removed_trimmed      → p=0.000, holm p=0.000 ✓, Cohen's d=1.87
stemming_only                  vs stopwords_and_stemming_trimmed → p=0.000, holm p=0.000 ✓, Cohen's d=1.74
stemming_only                  vs stemming_only_trimmed          → p=0.000, holm p=0.000 ✓, Cohen's d=-0.25
stemming_only                  vs stopwords_and_stemming         → p=0.000, holm p=0.000 ✓, Cohen's d=1.79
stemming_only                  vs stopwords_removed              → p=0.000, holm p=0.000 ✓, Cohen's d=1.84
stemming_only                  vs full_index                     → p=0.000, holm p=0.000 ✓, Cohen's d=-0.22
stemming_only                  vs full_index_trimmed             → p=0.000, holm p=0.000 ✓, Cohen's d=-0.14
stopwords_removed_trimmed      vs stopwords_and_stemming_trimmed → p=0.000, holm p=0.000 ✓, Cohen's d=-0.28
stopwords_removed_trimmed      vs stemming_only_trimmed          → p=0.000, holm p=0.000 ✓, Cohen's d=-1.93
stopwords_removed_trimmed      v

In [None]:
import numpy as np
from scipy.stats import mannwhitneyu

# collect lists
non_trimmed = np.concatenate([
    query_times['full_index'],
    query_times['stopwords_removed'],
    query_times['stemming_only'],
    query_times['stopwords_and_stemming']
])
trimmed = np.concatenate([
    query_times['full_index_trimmed'],
    query_times['stopwords_removed_trimmed'],
    query_times['stemming_only_trimmed'],
    query_times['stopwords_and_stemming_trimmed']
])


stat, p = mannwhitneyu(non_trimmed, trimmed, alternative='two-sided')
print(f"Mann-Whitney U p = {p:.3e}")

Mann-Whitney U p = 3.218e-03
