In [1]:
import pyterrier as pt
import tiktoken

dataset = pt.datasets.get_dataset("msmarco_passage")
print(dataset.get_corpus())

['/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection-gpt3.5.tsv', '/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection-sbert.tsv', '/Users/manitk/.pyterrier/corpora/msmarco_passage/corpus/collection.tsv']


In [2]:
def printTopFrequentTerms(index_ref):
    lexicon = index_ref.getLexicon()
    term_freqs = []
    iterator = lexicon.iterator()
    while iterator.hasNext():
        entry = iterator.next()
        term_freqs.append((entry.getKey(), entry.getValue().getFrequency()))
    top_terms = sorted(term_freqs, key=lambda x: x[1], reverse=True)[:50]
    for term, freq in top_terms:
        print(f"{term}: {freq}")

In [3]:
INDEX_DIR = '/Users/manitk/Desktop/GIR/Pyterrier/combined/wordindex/data.properties'
indexref = pt.IndexFactory.of(INDEX_DIR)
print(indexref.getCollectionStatistics())
printTopFrequentTerms(indexref)

Java started (triggered by IndexFactory.of) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


19:45:49.993 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.
Number of documents: 8841823
Number of terms: 1170682
Number of postings: 215238456
Number of fields: 0
Number of tokens: 288759529
Field names: []
Positions:   false

1: 2354388
can: 2275758
2: 1854040
will: 1318185
3: 1286521
year: 1132892
time: 1018130
mai: 956589
state: 849629
4: 827548
dai: 793715
new: 792746
name: 780596
5: 757367
first: 706850
includ: 703524
like: 660114
caus: 651638
need: 598907
number: 593676
who: 585304
peopl: 576899
call: 572439
work: 571835
mean: 563717
onli: 551496
type: 551279
water: 549419
cost: 547188
take: 546892
10: 524506
averag: 520107
system: 512566
form: 505577
000: 489118
unit: 486448
part: 468220
bodi: 467482
6: 466800
us: 465184
help: 454774
servic: 449359
see: 446033
cell: 444133
blood: 442774
hour: 441033


In [6]:
from pyterrier.measures import *

WORD_INDEX_DIR = '/Users/manitk/Desktop/GIR/Pyterrier/combined/wordindex/data.properties'
word_index_ref = pt.IndexFactory.of(WORD_INDEX_DIR)

print(word_index_ref.getCollectionStatistics())

bm25 = pt.terrier.Retriever(word_index_ref, wmodel="BM25", num_results=100)
tfidf = pt.terrier.Retriever(word_index_ref, wmodel="TF_IDF", num_results=100)
rm3 = tfidf >> pt.rewrite.RM3(word_index_ref, fb_terms=10, fb_docs=10) >> tfidf

results = pt.Experiment([bm25, tfidf, rm3], dataset.get_topics('test-2019'), dataset.get_qrels('test-2019'), eval_metrics=[nDCG@10, AP(rel=2), AP(rel=3), 'map'],names=["bm25","tfidf","rm3"])
print (results[["nDCG@10", "AP(rel=2)", "AP(rel=3)", "map"]])

19:53:59.611 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.
Number of documents: 8841823
Number of terms: 1170682
Number of postings: 215238456
Number of fields: 0
Number of tokens: 288759529
Field names: []
Positions:   false

    nDCG@10  AP(rel=2)  AP(rel=3)       map
0  0.479540   0.232201   0.163083  0.290681
1  0.478310   0.232189   0.163141  0.290303
2  0.524437   0.252482   0.194832  0.310570


In [7]:
INDEX_DIR = '/Users/manitk/Desktop/GIR/Pyterrier/combined/gpt3.5_index/data.properties'
indexref = pt.IndexFactory.of(INDEX_DIR)
print(indexref.getCollectionStatistics())

# printTopFrequentTerms(indexref)

19:54:56.415 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 2 GiB of memory would be required.
Number of documents: 8841823
Number of terms: 22887
Number of postings: 242159486
Number of fields: 0
Number of tokens: 327156239
Field names: []
Positions:   false



In [8]:
import csv

def tokenize_text(input_file, output_file, model="gpt-3.5-turbo"):
    enc = tiktoken.encoding_for_model(model)
    delimiters = {",", ";", ".", "(", ")", "{", "}", "$", "%", "!", "?", "'", "\""}

    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')

        for row in reader:
            if len(row) < 2:
                continue  # Skip malformed lines

            doc_id, text = row[0], row[1].lower()
            tokenized_text = enc.encode(text)
            tokenized_str = " ".join(
                token for t in tokenized_text 
                if (token := enc.decode_single_token_bytes(t).decode("utf-8", errors="ignore")) not in delimiters
            )

            tokenized_str = tokenized_str.replace("'s", "")
            writer.writerow([doc_id, tokenized_str])


queries = dataset.get_topics('test-2019')
queries.to_csv('msmarco_passage_test2019_queries.tsv', sep='\t', index=False)

input_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2019_queries.tsv"
output_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2019_queries-gpt3.5.tsv"
tokenize_text(input_file, output_file)

In [17]:
import pandas as pd
query_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2019_queries-gpt3.5.tsv"
queries = pd.read_csv(query_file, sep='\t', names=["qid", "query"], dtype={"qid": str, "query": str})

index_dir = "./gpt3.5_index"
index_ref = pt.IndexFactory.of(index_dir)

bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25", num_results=100)
tfidf = pt.terrier.Retriever(index_ref, wmodel="TF_IDF", num_results=100)
rm3 = tfidf >> pt.rewrite.RM3(index_ref, fb_terms=10, fb_docs=10) >> tfidf

results = pt.Experiment(
    [bm25, tfidf, rm3],
    queries,
    dataset.get_qrels("test-2019"),
    eval_metrics=[nDCG@10, AP(rel=2), AP(rel=3), 'map'],
    names=["bm25","tfidf","rm3"]
)

print (results[["nDCG@10", "AP(rel=2)", "AP(rel=3)", "map"]])

20:06:44.204 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 2 GiB of memory would be required.
    nDCG@10  AP(rel=2)  AP(rel=3)       map
0  0.420154   0.187534   0.147814  0.242088
1  0.421618   0.187597   0.148047  0.241656
2  0.473803   0.209243   0.165079  0.268330


In [None]:
from sentence_transformers import SentenceTransformer, models
import csv

def sbert_tokenize_text(input_file, output_file, model_name='all-MiniLM-L6-v2'):
    tokenizer = SentenceTransformer(model_name).tokenizer
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')

        for row in reader:
            if len(row) < 2:
                continue

            doc_id, text = row[0], row[1].lower()
            tokens = tokenizer.tokenize(text)
            tokenized_str = ' '.join(tokens).replace("##", "")  # Clean WordPiece tokens
            writer.writerow([doc_id, tokenized_str])

query_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2019_queries.tsv"
output_query_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2019_queries-sbert.tsv"
sbert_tokenize_text(query_file, output_query_file)



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from pyterrier.measures import *
import pandas as pd

output_query_file = "/Users/manitk/Desktop/GIR/Pyterrier/combined/msmarco_passage_test2019_queries-sbert.tsv"
queries = pd.read_csv(output_query_file, sep='\t', names=["qid", "query"], dtype={"qid": str, "query": str})

index_ref = pt.IndexFactory.of("./sbert_index")

bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25", num_results=100)
tfidf = pt.terrier.Retriever(index_ref, wmodel="TF_IDF", num_results=100)
rm3 = tfidf >> pt.rewrite.RM3(index_ref, fb_terms=10, fb_docs=10) >> tfidf

results = pt.Experiment(
    [bm25, tfidf, rm3],
    queries,
    pt.datasets.get_dataset("msmarco_passage").get_qrels("test-2019"),
    eval_metrics=[nDCG@10, AP(rel=2), AP(rel=3), 'map'],
    names=["bm25","tfidf","rm3"]
)

print(results[["nDCG@10", "AP(rel=2)", "AP(rel=3)", "map"]])

20:07:33.597 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.
    nDCG@10  AP(rel=2)  AP(rel=3)       map
0  0.413136   0.193162   0.142155  0.243673
1  0.413821   0.193531   0.142645  0.243769
2  0.432539   0.211567   0.150619  0.264245


In [19]:
INDEX_DIR = '/Users/manitk/Desktop/GIR/Pyterrier/combined/sbert_index/data.properties'
indexref = pt.IndexFactory.of(INDEX_DIR)
print(indexref.getCollectionStatistics())

20:15:39.035 [main] WARN org.terrier.structures.BaseCompressingMetaIndex -- Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.
Number of documents: 8841823
Number of terms: 17342
Number of postings: 241019754
Number of fields: 0
Number of tokens: 327396708
Field names: []
Positions:   false

