In [1]:
import pyterrier as pt
import tiktoken

dataset = pt.datasets.get_dataset("msmarco_passage")
print(dataset.get_corpus())

['/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection-gpt3.5.tsv', '/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection-sbert.tsv', '/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection.tsv']


In [11]:
def printTopFrequentTerms(index_ref):
    lexicon = index_ref.getLexicon()
    term_freqs = []
    iterator = lexicon.iterator()
    while iterator.hasNext():
        entry = iterator.next()
        term_freqs.append((entry.getKey(), entry.getValue().getFrequency()))
    top_terms = sorted(term_freqs, key=lambda x: x[1], reverse=True)[:50]
    for term, freq in top_terms:
        print(f"{term}: {freq}")

In [None]:
iter_indexer = pt.IterDictIndexer("./wordindex", meta={'docno': 20, 'text': 4096})
indexref = iter_indexer.index(dataset.get_corpus_iter())

In [26]:
INDEX_DIR = '/Users/manitk/Desktop/GIR/Pyterrier/combined/wordindex/data.properties'
indexref = pt.IndexFactory.of(INDEX_DIR)
print(indexref.getCollectionStatistics())
printTopFrequentTerms(indexref)

Number of documents: 8841823
Number of terms: 1170682
Number of postings: 215238456
Number of fields: 0
Number of tokens: 288759529
Field names: []
Positions:   false

1: 2354388
can: 2275758
2: 1854040
will: 1318185
3: 1286521
year: 1132892
time: 1018130
mai: 956589
state: 849629
4: 827548
dai: 793715
new: 792746
name: 780596
5: 757367
first: 706850
includ: 703524
like: 660114
caus: 651638
need: 598907
number: 593676
who: 585304
peopl: 576899
call: 572439
work: 571835
mean: 563717
onli: 551496
type: 551279
water: 549419
cost: 547188
take: 546892
10: 524506
averag: 520107
system: 512566
form: 505577
000: 489118
unit: 486448
part: 468220
bodi: 467482
6: 466800
us: 465184
help: 454774
servic: 449359
see: 446033
cell: 444133
blood: 442774
hour: 441033
area: 433671
just: 424045
gener: 421644
person: 415976


In [23]:
from pyterrier.measures import *

WORD_INDEX_DIR = '/Users/manitk/Desktop/GIR/Pyterrier/combined/wordindex/data.properties'
word_index_ref = pt.IndexFactory.of(WORD_INDEX_DIR)

print(word_index_ref.getCollectionStatistics())

bm25 = pt.terrier.Retriever(word_index_ref, wmodel="BM25")
tfidf = pt.terrier.Retriever(word_index_ref, wmodel="TF_IDF")
rm3 = tfidf >> pt.rewrite.RM3(word_index_ref) >> tfidf

results = pt.Experiment([bm25, tfidf, rm3], dataset.get_topics('test-2020'), dataset.get_qrels('test-2020'), eval_metrics=[nDCG@10, AP(rel=2), AP(rel=3), 'map'],names=["bm25","tfidf","rm3"])
print (results[["nDCG@10", "AP(rel=2)", "AP(rel=3)", "map"]])

Number of documents: 8841823
Number of terms: 1170682
Number of postings: 215238456
Number of fields: 0
Number of tokens: 288759529
Field names: []
Positions:   false

17:07:01.695 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.
    nDCG@10  AP(rel=2)  AP(rel=3)       map
0  0.493627   0.292988   0.287098  0.358724
1  0.492575   0.292548   0.285249  0.358072
2  0.509225   0.316460   0.305664  0.400533


GPT Pre-processing

In [None]:
import csv

def tokenize_text(input_file, output_file, model="gpt-3.5-turbo"):
    enc = tiktoken.encoding_for_model(model)
    delimiters = {",", ";", ".", "(", ")", "{", "}", "$", "%", "!", "?", "'", "\""}

    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')

        for row in reader:
            if len(row) < 2:
                continue  # Skip malformed lines

            doc_id, text = row[0], row[1].lower()
            tokenized_text = enc.encode(text)
            tokenized_str = " ".join(
                token for t in tokenized_text 
                if (token := enc.decode_single_token_bytes(t).decode("utf-8", errors="ignore")) not in delimiters
            )

            tokenized_str = tokenized_str.replace("'s", "")
            writer.writerow([doc_id, tokenized_str])


input_file = "/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection.tsv"
output_file = "/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection-gpt3.5.tsv"
tokenize_text(input_file, output_file)

GPT Indexing

In [None]:
dataset = "/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection-gpt3.5.tsv" 

def msmarco_generate(file):
    with pt.io.autoopen(file, 'rt') as corpusfile:
        for l in corpusfile:
            docno, passage = l.split("\t")
            yield {'docno' : docno, 'text' : passage}

iter_indexer = pt.IterDictIndexer("./gpt3.5_index", meta={'docno': 20, 'text': 4096})
indexref = iter_indexer.index(msmarco_generate(dataset))

In [None]:
INDEX_DIR = '/Users/manitk/Desktop/GIR/Pyterrier/combined/gpt3.5_index/data.properties'
indexref = pt.IndexFactory.of(INDEX_DIR)
print(indexref.getCollectionStatistics())
printTopFrequentTerms(indexref)

Number of documents: 8841823
Number of terms: 22887
Number of postings: 242159486
Number of fields: 0
Number of tokens: 327156239
Field names: []
Positions:   false

1: 2701885
can: 2327496
2: 2246091
3: 1650529
will: 1377989
4: 1181627
5: 1152220
year: 1137277
time: 1032526
mai: 977397
0: 941649
201: 882254
state: 859577
new: 838625
dai: 816441
6: 793353
name: 788967
first: 709681
includ: 702955
like: 672250
7: 669892
caus: 639821
al: 630672
8: 609805
work: 602136
need: 601552
number: 595085
who: 590327
water: 588445
call: 581941
peopl: 579235
mean: 565803
10: 564405
cost: 561528
type: 555653
onli: 552150
er: 550440
take: 548450
ic: 538051
form: 532631
ag: 521044
th: 519342
averag: 519092
system: 515962
000: 510843
ing: 509574
us: 497358
unit: 485987
part: 483942
bodi: 474181


Query Indexing

In [None]:
dataset2 = pt.datasets.get_dataset("msmarco_passage")
queries = dataset.get_topics('test-2020')
queries.to_csv('msmarco_passage_test2020_queries.tsv', sep='\t', index=False)

In [33]:
input_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2020_queries.tsv"
output_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2020_queries-gpt3.5.tsv"
tokenize_text(input_file, output_file)

In [35]:
import pandas as pd

query_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2020_queries-gpt3.5.tsv"
queries = pd.read_csv(query_file, sep='\t', names=["qid", "query"], dtype={"qid": str, "query": str})

dataset = pt.datasets.get_dataset("msmarco_passage")

index_dir = "./gpt3.5_index"
index_ref = pt.IndexFactory.of(index_dir)

bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")
tfidf = pt.terrier.Retriever(index_ref, wmodel="TF_IDF")
rm3 = tfidf >> pt.rewrite.RM3(index_ref) >> tfidf

results = pt.Experiment(
    [bm25, tfidf, rm3],
    queries,
    dataset.get_qrels("test-2020"),
    eval_metrics=[nDCG@10, AP(rel=2), AP(rel=3), 'map'],
    names=["bm25","tfidf","rm3"]
)

print (results[["nDCG@10", "AP(rel=2)", "AP(rel=3)", "map"]])

20:16:36.138 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 2 GiB of memory would be required.
    nDCG@10  AP(rel=2)  AP(rel=3)       map
0  0.385532   0.218319   0.222926  0.262193
1  0.383712   0.218054   0.222480  0.262095
2  0.378872   0.226950   0.214434  0.279654


SBERT Tokeniser

In [6]:
from sentence_transformers import SentenceTransformer, models
import csv

def sbert_tokenize_text(input_file, output_file, model_name='all-MiniLM-L6-v2'):
    tokenizer = SentenceTransformer(model_name).tokenizer
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')

        for row in reader:
            if len(row) < 2:
                continue

            doc_id, text = row[0], row[1].lower()
            tokens = tokenizer.tokenize(text)
            tokenized_str = ' '.join(tokens).replace("##", "")  # Clean WordPiece tokens
            writer.writerow([doc_id, tokenized_str])

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
input_file = "/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection.tsv"
output_file = "/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection-sbert.tsv"
sbert_tokenize_text(input_file, output_file)

In [None]:
def msmarco_generate_debug(file):
    with pt.io.autoopen(file, 'rt') as corpusfile:
        for idx, l in enumerate(corpusfile):
            try:
                docno, passage = l.strip().split("\t")
                yield {'docno': docno, 'text': passage}
                if idx % 100000 == 0:
                    print(f"Processed {idx} lines...")
            except Exception as e:
                print(f"Error at line {idx}: {e}")

iter_indexer = pt.IterDictIndexer("./sbert_index", meta={'docno': 20, 'text': 4096})
indexref = iter_indexer.index(msmarco_generate_debug(output_file))

In [7]:
query_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2020_queries.tsv"
output_query_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2020_queries-sbert.tsv"
sbert_tokenize_text(query_file, output_query_file)

import pandas as pd
queries = pd.read_csv(output_query_file, sep='\t', names=["qid", "query"], dtype={"qid": str, "query": str})

In [9]:
from pyterrier.measures import *

index_ref = pt.IndexFactory.of("./sbert_index")

bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")
tfidf = pt.terrier.Retriever(index_ref, wmodel="TF_IDF")
rm3 = tfidf >> pt.rewrite.RM3(index_ref) >> tfidf

results = pt.Experiment(
    [bm25, tfidf, rm3],
    queries,
    pt.datasets.get_dataset("msmarco_passage").get_qrels("test-2020"),
    eval_metrics=[nDCG@10, AP(rel=2), AP(rel=3), 'map'],
    names=["bm25","tfidf","rm3"]
)

print(results[["nDCG@10", "AP(rel=2)", "AP(rel=3)", "map"]])

19:00:01.819 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.
    nDCG@10  AP(rel=2)  AP(rel=3)       map
0  0.423232   0.248772   0.257547  0.301504
1  0.423862   0.248465   0.257057  0.301395
2  0.412610   0.254249   0.245134  0.315919


In [12]:
INDEX_DIR = '/Users/manitk/Desktop/GIR/Pyterrier/combined/sbert_index/data.properties'
indexref = pt.IndexFactory.of(INDEX_DIR)
print(indexref.getCollectionStatistics())
printTopFrequentTerms(indexref)

Number of documents: 8841823
Number of terms: 17342
Number of postings: 241019754
Number of fields: 0
Number of tokens: 327396708
Field names: []
Positions:   false

1: 2524226
can: 2339173
2: 2097508
3: 1467168
will: 1322182
year: 1137122
time: 1049965
4: 1007399
mai: 966819
5: 924514
state: 851971
dai: 817762
new: 803596
name: 801659
er: 761159
first: 708062
includ: 702137
like: 667246
6: 653340
ing: 650554
caus: 650279
work: 632310
need: 600000
number: 594171
water: 591679
who: 590100
call: 579297
peopl: 576599
type: 565722
mean: 564591
10: 561739
take: 558961
cost: 555901
7: 552380
onli: 551157
0: 550760
8: 548994
form: 533125
averag: 519558
system: 515412
000: 499228
unit: 487837
part: 483970
ag: 482865
us: 475225
bodi: 474189
help: 461917
blood: 459969
see: 458336
cell: 456710
