In [2]:
import os, time
import pandas as pd
import pyterrier as pt
if not pt.started(): pt.init()
import shutil
import json
from tqdm.auto import tqdm

  if not pt.started(): pt.init()
Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  if not pt.started(): pt.init()
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DOCS = "data/docs.jsonl"
QUERIES = "data/train_queries.csv"
QRELS   = "data/train_qrels.csv"
BASE_IDX = "indexes"

In [4]:
qs = pd.read_csv(QUERIES, sep="\t", names=["qid", "query"]) # Change text column to query
qrels = pd.read_csv(QRELS, sep="\t")

# Change text 

def iter_dicts():
    for line in open(DOCS, encoding="utf8"):
        j = json.loads(line)
        d = pd.Series(j)
        yield {"docno": d.docno, "text": f"{d.title}\n{d.body}"}

In [5]:
import string
# Strip out all punctuation
qs['query'] = qs['query'] \
    .str.translate(str.maketrans('', '', string.punctuation))

In [6]:
NO_STEM = pt.TerrierStemmer.none

configs = {
    # (1) full index: keep every token, no stemming
    "full_index": {
        "stopwords": None,      # disable stop-word removal
        "stemmer"  : NO_STEM,   # disable stemming
    },

    # (2) stopwords removed only: remove stop-words, no stemming
    "stopwords_removed": {
        # default stopword list is used when you omit stopwords,
        # so here we only turn off stemming:
        "stemmer": NO_STEM,
    },

    # (3) stemming only: apply Porter stemmer, keep stop-words
    "stemming_only": {
        "stopwords": None,
    },
    # (4) stopwords removed + stemming: the Terrier defaults
    "stopwords_and_stemming": {
        # no args needed — by default Terrier will remove stop-words
        # and apply Porter stemming
    },
}
total_docs = sum(1 for _ in open(DOCS, encoding="utf8"))

results = []
for name, opts in configs.items():
    # build the path and force it to be absolute
    idx_dir      = os.path.join(BASE_IDX, name)
    abs_idx_dir  = os.path.abspath(idx_dir)

    # wipe any old index and re-create
    if os.path.isdir(abs_idx_dir):
        shutil.rmtree(abs_idx_dir)
    os.makedirs(abs_idx_dir, exist_ok=True)

    # optional sanity check
    assert os.access(abs_idx_dir, os.W_OK), f"{abs_idx_dir} is not writable!"

    # build the Terrier index
    indexer   = pt.IterDictIndexer(
        abs_idx_dir,
        text_attrs=('text',),   # tell it which dict key holds your document text
        **opts
    )
    wrapped = tqdm(iter_dicts(),
                   total=total_docs,
                   unit="docs",
                   desc=f"Indexing {name}")

    t0 = time.time()
    index_ref = indexer.index(wrapped)
    build_time = time.time() - t0

    # collect your stats…
    idx         = pt.IndexFactory.of(index_ref)
    num_docs    = idx.getCollectionStatistics().getNumberOfDocuments()
    vocab_size  = idx.getCollectionStatistics().getNumberOfUniqueTerms()
    total_terms = idx.getCollectionStatistics().getNumberOfTokens()
    size_bytes  = sum(
        os.path.getsize(os.path.join(dp, f))
        for dp,_,fs in os.walk(abs_idx_dir) for f in fs
    )

    bm25   = pt.BatchRetrieve(index_ref, wmodel="BM25")
    times  = []
    for idx in qs['query'].sample(100, random_state=0).index:
        single = qs.loc[[idx]]
        t1 = time.time()
        _  = bm25.transform(single)
        times.append(time.time() - t1)
    avg_qtime = sum(times) / len(times)

    results.append({
        "index"       : name,
        "docs"        : num_docs,
        "unique_terms": vocab_size,
        "total_terms" : total_terms,
        "size_MB"     : size_bytes/1e6,
        "build_sec"   : round(build_time,1),
        "avg_q_sec"   : round(avg_qtime,4),
    })

# summary table
df = pd.DataFrame(results).set_index("index")
df.head()

Indexing full_index:  15%|█▌        | 30176/200000 [00:34<02:54, 971.66docs/s] 



Indexing full_index: 100%|██████████| 200000/200000 [02:42<00:00, 1231.53docs/s]


16:38:24.876 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 302 empty documents


  bm25   = pt.BatchRetrieve(index_ref, wmodel="BM25")
Indexing stopwords_removed:  15%|█▌        | 30176/200000 [00:34<03:03, 926.91docs/s] 



Indexing stopwords_removed: 100%|██████████| 200000/200000 [02:47<00:00, 1192.92docs/s]


16:41:44.283 [ForkJoinPool-2-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 303 empty documents


  bm25   = pt.BatchRetrieve(index_ref, wmodel="BM25")
Indexing stemming_only:  15%|█▌        | 30176/200000 [00:35<03:06, 910.09docs/s] 



Indexing stemming_only: 100%|██████████| 200000/200000 [02:58<00:00, 1122.04docs/s]


16:45:08.290 [ForkJoinPool-3-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 302 empty documents


  bm25   = pt.BatchRetrieve(index_ref, wmodel="BM25")
Indexing stopwords_and_stemming:  15%|█▌        | 30176/200000 [00:35<02:58, 951.06docs/s] 



Indexing stopwords_and_stemming: 100%|██████████| 200000/200000 [02:54<00:00, 1147.45docs/s]


16:48:29.284 [ForkJoinPool-4-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 303 empty documents


  bm25   = pt.BatchRetrieve(index_ref, wmodel="BM25")


Unnamed: 0_level_0,docs,unique_terms,total_terms,size_MB,build_sec,avg_q_sec
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
full_index,200000,2912731,375471589,607.235985,190.7,0.0417
stopwords_removed,200000,2912126,222689568,551.939713,195.2,0.0168
stemming_only,200000,2654799,375471589,540.865702,202.3,0.043
stopwords_and_stemming,200000,2654647,222689568,488.165367,196.7,0.018
