In [1]:
#!pip install -q python-terrier

In [2]:
import pyterrier as pt
import pandas as pd

# note that pt.started() and pt.init() are deprecated

if not pt.java.started():
    pt.java.init()

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.10 (build: craigm 2024-08-22 17:33), helper_version=0.0.8]


In [3]:
# Load the pickled inverted index
df = pd.read_csv("collection/vaswani/vaswani.tsv", sep="\t")

# assign columns
df.columns = ["docno", "text"]

# Convert columns to strings
df["docno"] = df["docno"].astype(str)
df["text"] = df["text"].astype(str)

# Display the DataFrame to verify
print(df.head())

  docno                                               text
0     1  an electronic analogue computer for solving sy...
1     2  electronic coordinate transformer  circuit det...
2     3  the british computer society  report of a conf...
3     4  millimicrosecond digital computer logic  a sys...
4     5  binary circuits count backwards or forwards  a...


In [5]:
indexer = pt.DFIndexer("./index_3docs", overwrite=True)
# indexer = pt.IterDictIndexer("index_3docs")
indexref = indexer.index(df["text"], df["docno"])
indexref.toString()

18:58:05.388 [main] ERROR org.terrier.structures.Index -- Cannot create new index: path C:\Users\jacop\Documents\GitHub\IR_project\.\var\./index_3docs does not exist, or cannot be written to


  indexer = pt.DFIndexer(r"./index_3docs", overwrite=True)


JavaException: JVM exception occurred: Cannot create new index: path C:\Users\jacop\Documents\GitHub\IR_project\.\var\./index_3docs does not exist, or cannot be written to java.lang.IllegalArgumentException

In [6]:
index = pt.IndexFactory.of(indexref)

# lets see what Python type index is.
print(type(index))

<class 'jnius.reflect.org.terrier.structures.Index'>


In [7]:
print(index.getCollectionStatistics())

Number of documents: 11428
Number of terms: 7756
Number of postings: 224561
Number of fields: 0
Number of tokens: 271568
Field names: []
Positions:   false



In [10]:
print(index.getInvertedIndex())

<org.terrier.structures.PostingIndex at 0x7aabd524a570 jclass=org/terrier/structures/PostingIndex jself=<LocalRef obj=0x5a529369ec40 at 0x7aabd4957170>>


In [9]:
for kv in index.getLexicon():
    print(f"{kv.getKey()} {type(kv.getKey())} {kv.getValue()} {type(kv.getValue())}")

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
graphit <class 'str'> term4483 Nt=3 TF=3 maxTF=1 @{0 123974 4} <class 'jnius.reflect.org.terrier.structures.LexiconEntry'>
grapho <class 'str'> term3938 Nt=1 TF=1 maxTF=1 @{0 123983 0} <class 'jnius.reflect.org.terrier.structures.LexiconEntry'>
grasp <class 'str'> term6065 Nt=1 TF=1 maxTF=1 @{0 123986 0} <class 'jnius.reflect.org.terrier.structures.LexiconEntry'>
grate <class 'str'> term599 Nt=14 TF=27 maxTF=6 @{0 123989 2} <class 'jnius.reflect.org.terrier.structures.LexiconEntry'>
graupel <class 'str'> term4748 Nt=1 TF=1 maxTF=1 @{0 124021 1} <class 'jnius.reflect.org.terrier.structures.LexiconEntry'>
gravit <class 'str'> term1547 Nt=40 TF=52 maxTF=3 @{0 124024 3} <class 'jnius.reflect.org.terrier.structures.LexiconEntry'>
graviti <class 'str'> term2286 Nt=9 TF=9 maxTF=1 @{0 124100 3} <class 'jnius.reflect.org.terrier.structures.LexiconEntry'>
graybeal <class 'str'> term5582 Nt=1 TF=1 maxTF=1 @{0 124121 3} <class 'jnius.

In [16]:
pointer = index.getLexicon()["car"]
print(pointer.toString())

for posting in index.getInvertedIndex().getPostings(pointer):
    print(f"{posting.toString()} doclen={posting.getDocumentLength()}")

term4448 Nt=2 TF=2 maxTF=1 @{0 33137 7}
ID(3590) TF(1) doclen=15
ID(7683) TF(1) doclen=41


In [17]:
def get_document_direct(index, docno=None, docid=None):
    if docid is None and docno is None:
        raise ValueError("Must specify docno or docid")
    if docno is not None:
        docid = index.getMetaIndex().getDocument("docno", docno)
    else:
        docno = index.getMetaIndex().getItem("docno", docid)

    rtr = f"Docno {docno} (docid {docid})\n"
    pointer = index.getDocumentIndex().getDocumentEntry(docid)
    for p in index.getDirectIndex().getPostings(pointer):
        termid = p.getId()
        term = index.getLexicon()[termid].getKey()
        rtr += f"\t{term} {p.getFrequency()}\n"
    return rtr

In [19]:
print(get_document_direct(index, docid=3590))

Docno 3591 (docid 3590)
	detail 1
	electr 1
	practic 1
	set 1
	power 2
	work 1
	suppli 2
	batteri 1
	mobil 1
	suppress 1
	portabl 1
	car 1
	petrol 1



In [20]:
print(get_document_direct(index, docno="3591"))

Docno 3591 (docid 3590)
	detail 1
	electr 1
	practic 1
	set 1
	power 2
	work 1
	suppli 2
	batteri 1
	mobil 1
	suppress 1
	portabl 1
	car 1
	petrol 1



In [26]:
br = pt.terrier.Retriever(index, wmodel="Tf")
br.search("car")

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,3590,3591,0,1.0,car
1,1,7683,7684,1,1.0,car


In [30]:
queries = pd.DataFrame([["q1", "car"], ["q2", "red car"], ["q3", "red"]], columns=["qid", "query"])
br.transform(queries) # br(queries) does the same

Unnamed: 0,qid,docid,docno,rank,score,query
0,q1,3590,3591,0,1.0,car
1,q1,7683,7684,1,1.0,car
2,q2,1278,1279,0,2.0,red car
3,q2,1538,1539,1,2.0,red car
4,q2,3235,3236,2,2.0,red car
5,q2,81,82,3,1.0,red car
6,q2,382,383,4,1.0,red car
7,q2,880,881,5,1.0,red car
8,q2,1698,1699,6,1.0,red car
9,q2,2888,2889,7,1.0,red car


In [31]:
tfidf = pt.terrier.Retriever(index, wmodel="TF_IDF")
tfidf(queries)

Unnamed: 0,qid,docid,docno,rank,score,query
0,q1,3590,3591,0,8.017043,car
1,q1,7683,7684,1,5.249789,car
2,q2,3590,3591,0,8.017043,red car
3,q2,1278,1279,1,7.562049,red car
4,q2,1538,1539,2,7.026548,red car
5,q2,81,82,3,6.567583,red car
6,q2,6816,6817,4,5.915065,red car
7,q2,2888,2889,5,5.58229,red car
8,q2,4292,4293,6,5.58229,red car
9,q2,9514,9515,7,5.479532,red car


In [34]:
bm25 = pt.terrier.Retriever(index, wmodel="BM25")
pippo = pt.terrier.Retriever(index, wmodel="BM25")
bm25(queries)

Unnamed: 0,qid,docid,docno,rank,score,query
0,q1,3590,3591,0,14.318269,car
1,q1,7683,7684,1,9.376012,car
2,q2,3590,3591,0,14.318269,red car
3,q2,1278,1279,1,13.795804,red car
4,q2,1538,1539,2,12.818864,red car
5,q2,81,82,3,11.981552,red car
6,q2,6816,6817,4,10.791132,red car
7,q2,2888,2889,5,10.184035,red car
8,q2,4292,4293,6,10.184035,red car
9,q2,9514,9515,7,9.996569,red car


In [35]:
print(isinstance(bm25, pt.Transformer))
print(isinstance(pippo, pt.Transformer))

True
True


In [37]:
pipeline = br >> tfidf >> bm25
pipeline(queries)

Unnamed: 0,qid,docid,docno,rank,score,query
0,q1,3590,3591,0,14.318269,car
1,q1,7683,7684,1,9.376012,car
2,q2,3590,3591,0,14.318269,red car
3,q2,1278,1279,1,13.795804,red car
4,q2,1538,1539,2,12.818864,red car
5,q2,81,82,3,11.981552,red car
6,q2,6816,6817,4,10.791132,red car
7,q2,2888,2889,5,10.184035,red car
8,q2,4292,4293,6,10.184035,red car
9,q2,9514,9515,7,9.996569,red car


In [69]:
# prompt: read the qrels from collection\vaswani\vaswani-qrels.txt
qrels = pd.read_csv("collection/vaswani/vaswani-qrels.txt", sep="\t", names=["qid", "docno", "relevance", "iteration"], dtype={"qid": str, "docno": str})
qrels.drop(columns=["iteration"], inplace=True)
print(qrels.head())

  qid docno  relevance
0   1  1238          1
1   1  1501          1
2   1  4461          1
3   1  4568          1
4   1  5471          1


In [70]:
queries_vaswani = pd.read_csv("collection/vaswani/vaswani-queries.tsv", sep="\t", names=["qid", "query"], dtype={"qid": str, "docno": str})
print(queries_vaswani.head())

  qid                                              query
0   1  MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...
1   2  MATHEMATICAL ANALYSIS AND DESIGN DETAILS OF WA...
2   3  USE OF DIGITAL COMPUTERS IN THE DESIGN OF BAND...
3   4  SYSTEMS OF DATA CODING FOR INFORMATION TRANSFER\n
4   5  USE OF PROGRAMS IN ENGINEERING TESTING OF COMP...


In [71]:
def get_res_with_labels(ranker, df):
  #get the results for the query or queries
  results = ranker( df )
  # left outer join with the qrels
  with_labels = results.merge(qrels, on=["qid", "docno"], how="left").fillna(0)
  return with_labels

In [72]:
#lets get the Tf results for the first query
get_res_with_labels(br, queries_vaswani.head(1))

Unnamed: 0,qid,docid,docno,rank,score,query,relevance
0,1,7232,7233,0,9.0,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
1,1,3691,3692,1,8.0,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
2,1,5500,5501,2,7.0,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,1.0
3,1,8906,8907,3,7.0,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
4,1,3332,3333,4,6.0,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
...,...,...,...,...,...,...,...
995,1,2270,2271,995,1.0,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
996,1,2272,2273,996,1.0,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
997,1,2285,2286,997,1.0,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
998,1,2287,2288,998,1.0,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0


In [73]:
#lets get the TFIDF results for the first query
get_res_with_labels(tfidf, queries_vaswani.head(1))

Unnamed: 0,qid,docid,docno,rank,score,query,relevance
0,1,8170,8171,0,13.745939,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,1.0
1,1,9879,9880,1,12.352502,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
2,1,5500,5501,2,12.178013,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,1.0
3,1,1500,1501,3,10.993367,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,1.0
4,1,9857,9858,4,10.271403,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,1.0
...,...,...,...,...,...,...,...
995,1,9549,9550,995,2.645998,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
996,1,11138,11139,996,2.645998,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
997,1,11297,11298,997,2.645998,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0
998,1,3293,3294,998,2.644967,MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS ...,0.0


In [74]:
newQueryDf = pd.DataFrame([
  ["1", "measurement^0.1 of dielectric constant of liquids by "+
    "the use of microwave techniques^2"]
  ], columns=["qid", "query"])
get_res_with_labels(br, newQueryDf)

Unnamed: 0,qid,docid,docno,rank,score,query,relevance
0,1,3691,3692,0,4.00,measurement^0.1 of dielectric constant of liqu...,0.0
1,1,7232,7233,1,3.60,measurement^0.1 of dielectric constant of liqu...,0.0
2,1,8906,8907,2,3.20,measurement^0.1 of dielectric constant of liqu...,0.0
3,1,5500,5501,3,3.05,measurement^0.1 of dielectric constant of liqu...,1.0
4,1,4569,4570,4,3.00,measurement^0.1 of dielectric constant of liqu...,0.0
...,...,...,...,...,...,...,...
995,1,4549,4550,995,0.50,measurement^0.1 of dielectric constant of liqu...,0.0
996,1,4572,4573,996,0.50,measurement^0.1 of dielectric constant of liqu...,0.0
997,1,4611,4612,997,0.50,measurement^0.1 of dielectric constant of liqu...,0.0
998,1,4639,4640,998,0.50,measurement^0.1 of dielectric constant of liqu...,0.0
