In [13]:
import os, time
import pandas as pd
import string
import pyterrier as pt
from tqdm.auto import tqdm
import shutil

In [14]:
# Root directory under which every index variant is written.
BASE_IDX = "indexes"

# Queries: tab-separated; the file's own header row is replaced by the
# explicit column names. Punctuation is deleted from the query text and
# qid is cast to str.
qs = (
    pd.read_csv("data/train_queries.csv", sep="\t", names=["qid", "query"], header=0)
    .assign(
        query=lambda frame: frame["query"].str.translate(
            str.maketrans('', '', string.punctuation)
        ),
        qid=lambda frame: frame["qid"].astype(str),
    )
)

# Relevance judgements: tab-separated, qid cast to str like the queries.
qrels = (
    pd.read_csv("data/train_qrels.csv", sep="\t")
    .assign(qid=lambda frame: frame["qid"].astype(str))
)

# Documents: one JSON object per line. The "text" column is the
# concatenation of title and body, separated by a single space.
docs = (
    pd.read_json("data/docs.jsonl", lines=True)
    .assign(text=lambda frame: frame["title"] + " " + frame["body"])
)

In [6]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK 'punkt' package (NLTK reports it as up-to-date when it is
# already installed).
nltk.download('punkt')

# Number of tokens word_tokenize produces for each document body.
docs['body_word_count'] = docs['body'].map(
    lambda body_text: len(word_tokenize(body_text))
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/christianjensen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
NO_STEM = pt.TerrierStemmer.none

# Index variants to benchmark. Each value is the set of keyword arguments
# forwarded to pt.IterDictIndexer; any option left out of a variant keeps the
# indexer's default for that option.
configs = {
    "full_index": {
        "stopwords": None,
        "stemmer"  : NO_STEM,
    },
    "stopwords_removed": {
        "stemmer": NO_STEM,
    },
    "stemming_only": {
        "stopwords": None,
    },
    "stopwords_and_stemming": {
        # defaults
    },
}

# Number of times every variant is rebuilt; each build adds one timing sample.
N_BUILDS = 1

# One list of wall-clock build times (seconds) per variant, for both the full
# collection and the "_trimmed" collection.
build_times = {name: [] for name in configs}
build_times.update({name + "_trimmed": [] for name in configs})

# The trimmed collection keeps only documents whose body length lies strictly
# between the 1st and 99th percentile of body_word_count.
lower = docs['body_word_count'].quantile(0.01)
upper = docs['body_word_count'].quantile(0.99)
within_bounds = (docs['body_word_count'] > lower) & (docs['body_word_count'] < upper)

total_runs = N_BUILDS * 2 * len(configs)

with tqdm(total=total_runs, desc="Total Builds") as pbar:
    for _ in range(N_BUILDS):
        for trimmed in (False, True):
            for name, opts in configs.items():
                run_name = f"{name}_trimmed" if trimmed else f"{name}"
                abs_idx = os.path.abspath(os.path.join(BASE_IDX, run_name))

                # Wipe any previous build so every run starts from an empty directory.
                if os.path.isdir(abs_idx):
                    shutil.rmtree(abs_idx)
                os.makedirs(abs_idx, exist_ok=True)

                idxer = pt.IterDictIndexer(abs_idx, text_attrs=("text",), fields=False, **opts)

                # Materialise the records before starting the clock so that the
                # DataFrame -> dict conversion is not counted as build time.
                source = docs[within_bounds] if trimmed else docs
                to_index = source.to_dict(orient='records')

                t0 = time.perf_counter()
                # The records have to be passed in explicitly; index() has
                # nothing to index otherwise.
                idx_ref = idxer.index(to_index)
                build_times[run_name].append(time.perf_counter() - t0)
                pbar.update(1)

# quick summary
bt = pd.DataFrame(build_times)
print("Build times summary (s):")
print(bt.describe().T[["mean","std","min","max"]])

Total Builds:   0%|          | 0/8 [00:00<?, ?it/s]

21:56:33.015 [ForkJoinPool-2-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 302 empty documents
21:59:23.251 [ForkJoinPool-3-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 303 empty documents
22:02:49.907 [ForkJoinPool-4-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 302 empty documents
22:05:59.350 [ForkJoinPool-5-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 303 empty documents
22:08:49.895 [ForkJoinPool-6-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
22:12:22.479 [ForkJoinPool-7-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
22:15:47.420 [ForkJoinPool-8-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
22:19:44.668 [ForkJoinPool-9-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
Build times summary (s):
                                      mean  std         min    

In [23]:
# One column per index variant, one row per recorded build; print the
# mean / std / min / max of the build times with variants as rows.
bt = pd.DataFrame(build_times)
print("Build times summary (s):")
print(bt.describe().loc[["mean","std","min","max"]].T)

Build times summary (s):
                                      mean        std         min         max
full_index                      224.361024  17.764385  210.837642  255.471691
full_index_trimmed              200.103897  12.415554  187.870443  219.609771
stopwords_removed               206.471247  14.796586  194.001272  231.351704
stopwords_removed_trimmed       188.053850  11.411245  175.334452  198.935515
stemming_only                   219.873281   8.411347  209.852369  232.689631
stemming_only_trimmed           210.452104  20.494514  192.553874  240.537750
stopwords_and_stemming          197.518740  10.622081  187.514076  215.243656
stopwords_and_stemming_trimmed  187.346538  20.582123  168.839451  217.388120


In [None]:
# Every sub-directory of BASE_IDX is treated as one index to inspect.
index_folders = [
    entry for entry in os.listdir(BASE_IDX)
    if os.path.isdir(os.path.join(BASE_IDX, entry))
]

records = []
for name in index_folders:
    abs_idx = os.path.abspath(os.path.join(BASE_IDX, name))
    if not os.path.isdir(abs_idx):
        # Directory is gone; nothing to load for it.
        continue

    # Open the index and read its collection-level statistics.
    stats = pt.IndexFactory.of(abs_idx).getCollectionStatistics()

    # Size on disk: sum of the sizes of all files below the index directory.
    size_bytes = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(abs_idx)
        for filename in filenames
    )

    records.append({
        "Index": name,
        "Docs Indexed": stats.getNumberOfDocuments(),
        "Unique Terms": stats.getNumberOfUniqueTerms(),
        "Total Terms": stats.getNumberOfTokens(),
        "Avg Doc Length": stats.getAverageDocumentLength(),
        "Size (MB)": size_bytes / (1024 ** 2),  # bytes -> MiB
    })

# One row per index, keyed by the index folder name.
df = pd.DataFrame(records).set_index("Index")
df

Unnamed: 0_level_0,Docs Indexed,Unique Terms,Total Terms,Avg Doc Length,Size (MB)
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
stemming_only,200000,2654799,375471589,1877.357945,515.809729
stopwords_removed_trimmed,195953,2629000,194362712,991.88434,481.956937
stopwords_and_stemming_trimmed,195953,2403925,194362712,991.88434,426.835865
stemming_only_trimmed,195953,2404075,326621929,1666.838114,471.947236
stopwords_and_stemming,200000,2654647,222689568,1113.44784,465.550773
stopwords_removed,200000,2912126,222689568,1113.44784,526.370728
full_index,200000,2912731,375471589,1877.357945,579.105363
full_index_trimmed,195953,2629597,326621929,1666.838114,529.467052


In [23]:
# 1. pair each "*_trimmed" index with its untrimmed counterpart; the base rows
#    are re-labelled with the trimmed names so both frames align row by row
is_trimmed_row = df.index.str.endswith("_trimmed")
trimmed = df[is_trimmed_row].copy()
trimmed['base'] = trimmed.index.str.replace(r'_trimmed$', '', regex=True)
base = df.loc[trimmed['base']].copy()
base.index = trimmed.index

# 2. restrict both sides to the metrics being compared
metrics = ["Docs Indexed", "Unique Terms", "Total Terms", "Size (MB)"]
trim_m, base_m = trimmed[metrics], base[metrics]

# 3. absolute difference (trimmed minus base)
abs_diff = trim_m.sub(base_m).add_suffix(" Δ")

# 4. relative change with respect to the base index
#    NOTE: the values are fractions (-0.02 means -2%) even though the column
#    labels say "%Δ".
rel_change = abs_diff.values / base_m.values
rel = pd.DataFrame(
    rel_change,
    index=abs_diff.index,
    columns=[f"{c} %Δ" for c in metrics],
)

# 5. average relative change over all trimmed indexes
mean_rel = rel.mean().rename("Mean %Δ")

# 6. side-by-side table: trimmed values, base values, differences, relative changes
result = pd.concat([trim_m, base_m.add_prefix("Base "), abs_diff, rel], axis=1)

print("\nMean relative changes:")
print(mean_rel)


Mean relative changes:
Docs Indexed %Δ   -0.020235
Unique Terms %Δ   -0.095829
Total Terms %Δ    -0.128653
Size (MB) %Δ      -0.084572
Name: Mean %Δ, dtype: float64


In [7]:
from avg_query_process_time import process_queries

# Maps index folder name -> second value returned by process_queries.
query_times = {}

# Every sub-directory of BASE_IDX is treated as one index to time.
index_folders = [
    entry for entry in os.listdir(BASE_IDX)
    if os.path.isdir(os.path.join(BASE_IDX, entry))
]

# A plain loop (rather than a comprehension) keeps the results gathered so far
# in query_times if a run is interrupted part-way through.
for name in index_folders:
    idx_ref = os.path.abspath(os.path.join(BASE_IDX, name))
    # process_queries returns a pair; only the second element is recorded.
    processing_time, postings_time = process_queries(qs, idx_ref)
    query_times[name] = postings_time

query_times

100%|██████████| 4434/4434 [1:03:06<00:00,  1.17it/s]
100%|██████████| 4434/4434 [07:40<00:00,  9.63it/s]
100%|██████████| 4434/4434 [08:24<00:00,  8.78it/s]
100%|██████████| 4434/4434 [50:01<00:00,  1.48it/s]  
100%|██████████| 4434/4434 [07:45<00:00,  9.53it/s]
100%|██████████| 4434/4434 [06:08<00:00, 12.04it/s]
100%|██████████| 4434/4434 [54:15<00:00,  1.36it/s]  
100%|██████████| 4434/4434 [44:38<00:00,  1.66it/s]  


In [18]:
# Separate location for a one-off index built with blocks=True, with no
# stopword list and no stemming.
DEBUG_IDX = os.path.abspath(os.path.join("debug_indexes", "full_index_with_blocks"))

# Remove any earlier build, then recreate the (empty) target directory.
if os.path.isdir(DEBUG_IDX):
    shutil.rmtree(DEBUG_IDX)
os.makedirs(DEBUG_IDX, exist_ok=True)

# Convert the documents to records up front so the conversion is not timed.
to_index = docs.to_dict(orient='records')
block_indexer = pt.IterDictIndexer(DEBUG_IDX, blocks=True, stopwords=None, stemmer=NO_STEM)

print(f"Building block index in {DEBUG_IDX}...")
t0 = time.perf_counter()
block_indexer.index(to_index)
elapsed = time.perf_counter() - t0
print(f"Done in {elapsed:.2f} seconds.")

Building block index in /Users/christianjensen/Documents/search-engines/debug_indexes/full_index_with_blocks...
09:56:39.860 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 302 empty documents
Done in 486.65 seconds.
