# **Install the framework and the needed packages**

In [1]:
#install the Pyterrier framework
!pip install python-terrier
# install the nltk modules
!pip install nltk
import pandas as pd
import pyterrier as pt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import os
pd.set_option('display.max_colwidth', 150) # to optimize the view of the pd data.

Collecting python-terrier
  Downloading python-terrier-0.10.1.tar.gz (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matchpy (from python-terrier)
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecated (from python-terrier)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting chest (fr

In [2]:
# additional terrier package for PRF.
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven   #used for Java projects to manage project dependencies and build processes
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

Cloning into 'terrier-prf'...
remote: Enumerating objects: 196, done.[K
remote: Counting objects: 100% (196/196), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 196 (delta 52), reused 173 (delta 36), pack-reused 0[K
Receiving objects: 100% (196/196), 28.00 KiB | 4.00 MiB/s, done.
Resolving deltas: 100% (52/52), done.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libaopalliance-java libapache-pom-java libatinject-jsr330-api-java libcdi-api-java
  libcommons-cli-java libcommons-io-java libcommons-lang3-java libcommons-parent-java
  libgeronimo-annotation-1.3-spec-java libgeronimo-interceptor-3.0-spec-java libguava-java
  libguice-java libhawtjni-runtime-java libjansi-java libjansi-native-java libjsr305-java
  libmaven-parent-java libmaven-resolver-java libmaven-shared-utils-java libmaven3-core-java
  libplexus-cipher-java libplexus-classworlds-java libpl

### **Preprocessing the data set**

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
# Initialize Porter stemmer
stemmer = PorterStemmer()

In [5]:
def Stem_text(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(stemmed_tokens)

def clean(text):
   text = re.sub(r"[\.\,\#_\|\:\?\?\/\=\@]", " ", text) # remove special characters
   text = re.sub(r'\t', ' ', text) # remove tabs
   text = re.sub(r'\n', ' ', text) # remove line jump
   text = re.sub(r"\s+", " ", text) # remove extra white space
   text = text.strip()
   return text

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words] #Lower is used to normalize al the words make them in lower case
    return ' '.join(filtered_tokens)

# process the query also as for documents
def preprocess(sentence):
  sentence = clean(sentence)
  sentence = remove_stopwords(sentence)
  sentence = Stem_text(sentence)
  return sentence


# **Load The Dataset for the Indexer**

In [7]:
def load_cisi_dataset(documents_path,queries_path,qrels_path):

    documents_df = read_documents(documents_path)
    queries_df = read_queries(queries_path)
    qrels_df = read_qrels(qrels_path)
    return documents_df, queries_df, qrels_df

# Read documents from CISI.ALL file
def read_documents(documents_path):
    with open(documents_path, 'r') as file:
        lines = file.readlines()
    documents = []
    current_document = None
    for line in lines:
        if line.startswith('.I'):
            if current_document is not None:
                current_document['Text'] = current_document['Text'].split('\t')[0].strip()  # Remove anything after the first tab
                documents.append(current_document)
            current_document = {'ID': line.strip().split()[1], 'Text': ''}
        elif line.startswith('.T'):
            continue
        elif line.startswith('.A') or line.startswith('.B') or line.startswith('.W') or line.startswith('.X'):
            continue
        else:
            current_document['Text'] += line.strip() + ' '

    # Append the last document
    if current_document is not None:
        current_document['Text'] = current_document['Text'].split('\t')[0].strip()  # Remove anything after the first tab
        documents.append(current_document)
    documents_df = pd.DataFrame(documents)
    return documents_df

# Read queries from CISI.QRY file
def read_queries(queries_path):
    with open(queries_path, 'r') as file:
        lines = file.readlines()
    query_texts = []
    query_ids = []
    current_query_id = None
    current_query_text = []
    for line in lines:
        if line.startswith('.I'):
            if current_query_id is not None:
                query_texts.append(' '.join(current_query_text))
                current_query_text = []
            current_query_id = line.strip().split()[1]
            query_ids.append(current_query_id)
        elif line.startswith('.W'):
            continue
        elif line.startswith('.X'):
            break
        else:
            current_query_text.append(line.strip())
    # Append the last query
    query_texts.append(' '.join(current_query_text))
    queries_df = pd.DataFrame({
        'qid': query_ids,
        'raw_query': query_texts})
    return queries_df

# Read qrels from CISI.REL file
def read_qrels(qrels_path):
    qrels_df = pd.read_csv(qrels_path, sep='\s+', names=['qid','Q0','docno','label'])
    return qrels_df

In [8]:
documents_path = "/content/CISI.ALL"
queries_path = "/content/CISI.QRY"
qrels_path = "/content/CISI.REL"
documents_df, queries_df, qrels_df = load_cisi_dataset(documents_path,queries_path,qrels_path)
documents_df['Text'][0]

"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. 1"

In [9]:
documents_df

Unnamed: 0,ID,Text
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi..."
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar..."
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz..."
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'..."
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl..."
...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex..."
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b..."
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude..."
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ..."


In [12]:
documents_df["docno"]=documents_df["ID"].astype(str)
documents_df

Unnamed: 0,ID,Text,docno
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi...",1
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar...",2
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz...",3
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'...",4
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl...",5
...,...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex...",1456
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b...",1457
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude...",1458
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ...",1459


In [14]:
documents_df['processed_text'] = documents_df['Text'].apply(preprocess)
documents_df

Unnamed: 0,ID,Text,docno,processed_text
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi...",1,18 edit dewey decim classif comaromi j p present studi histori dewey decim classif first edit ddc publish 1876 eighteenth edit 1971 futur edit con...
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar...",2,use made technic librari slater report analysi 6300 act use 104 technic librari unit kingdom librari use one aspect wider pattern inform use infor...
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz...",3,two kind power essay bibliograph control wilson p relationship organ control write organ control knowledg inform inevit enter stori write contain ...
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'...",4,system analysi univers librari ; final report research project buckland k establish nine new univers 1960 's provok highli stimul re-examin natur ...
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl...",5,librari manag game report research project brophi p although use game profession educ becom widespread last decad method use number field mani hun...
...,...,...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex...",1456,world dynam forrest j w last sever decad interest econom develop popul growth world environ expand rapidli world-wid stress increas mani individu ...
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b...",1457,world trend librari educ bramley g one signific aspect evolut librarianship twentieth centuri emerg librari school potent factor shape new philoso...
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude...",1458,legal restrict exploit patent monopoli econom analysi baxter w patent law confer patente power exclud other make use sell invent further constitut...
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ...",1459,languag thought poluskin v book consid basic aspect complex problem - histor social essenc languag thought interact histor evolut essenc linguist ...


In [10]:
queries_df

Unnamed: 0,qid,raw_query
0,1,What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from app...
1,2,"How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information req..."
2,3,What is information science? Give definitions where possible.
3,4,Image recognition and any other methods of automatically transforming printed text into computer-ready form.
4,5,What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retriev...
...,...,...
107,108,".T A Program for Machine-Mediated Searching .A Toliver, D. A technique of online instruction and assistance to bibliographic data base searchers c..."
108,109,".T Author Cocitation: A Literature Measure of Intellectual Structure .A White, H.D. Griffith, B.C. It is shown that the mapping of a particular a..."
109,110,".T Progress in Documentation. Word Processing: An Introduction and Appraisal .A Whitehead, J. The ""Office of the Future,"" ""Office Technology,"" ""W..."
110,111,".T Document Clustering Using an Inverted File Approach .A Willett, P. An automated document clustering procedure is described which does not requi..."


In [15]:
queries_df["qid"]=queries_df["qid"].astype(str)
queries_df["query"]=queries_df["raw_query"].apply(preprocess)
queries_df

Unnamed: 0,qid,raw_query,query
0,1,What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from app...,problem concern make descript titl difficulti involv automat retriev articl approxim titl usual relev content articl titl
1,2,"How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information req...",actual pertin data oppos refer entir articl retriev automat respons inform request
2,3,What is information science? Give definitions where possible.,inform scienc give definit possibl
3,4,Image recognition and any other methods of automatically transforming printed text into computer-ready form.,imag recognit method automat transform print text computer-readi form
4,5,What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retriev...,special train ordinari research businessmen need proper inform manag unobstruct use inform retriev system problem like encount
...,...,...,...
107,108,".T A Program for Machine-Mediated Searching .A Toliver, D. A technique of online instruction and assistance to bibliographic data base searchers c...",program machine-medi search toliv techniqu onlin instruct assist bibliograph data base searcher call individu instruct data access ( iida ) develo...
108,109,".T Author Cocitation: A Literature Measure of Intellectual Structure .A White, H.D. Griffith, B.C. It is shown that the mapping of a particular a...",author cocit literatur measur intellectu structur white h griffith b c shown map particular area scienc case inform scienc done use author unit an...
109,110,".T Progress in Documentation. Word Processing: An Introduction and Appraisal .A Whitehead, J. The ""Office of the Future,"" ""Office Technology,"" ""W...",progress document word process introduct apprais whitehead j `` offic futur `` `` offic technolog `` `` word process `` `` electron mail `` `` ele...
110,111,".T Document Clustering Using an Inverted File Approach .A Willett, P. An automated document clustering procedure is described which does not requi...",document cluster use invert file approach willett p autom document cluster procedur describ requir use inter-docu similar matrix independ order do...


In [11]:
qrels_df

Unnamed: 0,qid,Q0,docno,label
0,1,28,0,0.0
1,1,35,0,0.0
2,1,38,0,0.0
3,1,42,0,0.0
4,1,43,0,0.0
...,...,...,...,...
3109,111,422,0,0.0
3110,111,448,0,0.0
3111,111,485,0,0.0
3112,111,503,0,0.0


# **Indexing The df**

In [16]:
if not pt.started():
  # To specify that we start PyTerrier with PRF enabled
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

terrier-assemblies 5.9 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [17]:
indexer = pt.DFIndexer("./DatasetIndex", overwrite=True)
# index the text, record the docnos as metadata
index_ref = indexer.index(documents_df["processed_text"], documents_df["docno"])
print(index_ref.toString())

./DatasetIndex/data.properties


In [18]:
index = pt.IndexFactory.of(index_ref)

In [23]:
# Define The Retrieval Model
TF_IDF = pt.BatchRetrieve(index, wmodel="TF_IDF",num_results=10)
query = "Program for Machine-Mediated Searching"
query = preprocess(query)

results = TF_IDF.search(query)
results

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,691,692,0,7.848232,program machine-medi search
1,1,418,419,1,7.035445,program machine-medi search
2,1,808,809,2,6.792702,program machine-medi search
3,1,509,510,3,6.082083,program machine-medi search
4,1,1371,1372,4,6.073753,program machine-medi search
5,1,476,477,5,5.902577,program machine-medi search
6,1,1279,1280,6,5.844263,program machine-medi search
7,1,726,727,7,5.666645,program machine-medi search
8,1,636,637,8,5.648133,program machine-medi search
9,1,578,579,9,5.286324,program machine-medi search


In [24]:
documents_df[['Text']][documents_df['docno'].isin(results['docno'].loc[0:4].tolist())]

Unnamed: 0,Text
418,"A Statistical Approach to Mechanized Encoding and Searching of Literary Information Luhn, H.P. Written communication of ideas is carried out on th..."
509,"Retrieval Efficiency from Titles and the Cost of Indexing Tell, Bjorn V. By the means of the flexible machine search system three experiments have..."
691,"Design and Operation of a Computer Search Center for Chemical Information Williams, M.E. Schipma, P.B. The objective of the Computer Search Center..."
808,"A Single Computer-Based System for Both Current Awareness and Retrospective Search: Operating Experience with ASSASSIN Clough, C. R. Bramwell, K. ..."
1371,"A Comparison of Manual and Machine Literature Searches Bivans, M.M. The NOAA/ERL library in Boulder, Colorado, performed a sample of six literatur..."


In [28]:
for query in queries_df["raw_query"][:10]:
    query = preprocess(query)
    print(query,'\n')
    results = TF_IDF.search(query)
    print(results,'\n')

problem concern make descript titl difficulti involv automat retriev articl approxim titl usual relev content articl titl 

  qid  docid docno  rank     score  \
0   1    428   429     0  6.907603   
1   1    721   722     1  6.212413   
2   1   1298  1299     2  5.823173   
3   1    758   759     3  5.730376   
4   1     64    65     4  5.709225   
5   1     75    76     5  5.526522   
6   1    602   603     6  5.025624   
7   1    927   928     7  5.017109   
8   1   1420  1421     8  4.991572   
9   1   1089  1090     9  4.953308   

                                                                                                                       query  
0  problem concern make descript titl difficulti involv automat retriev articl approxim titl usual relev content articl titl  
1  problem concern make descript titl difficulti involv automat retriev articl approxim titl usual relev content articl titl  
2  problem concern make descript titl difficulti involv automat retriev arti

In [40]:
rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)

#output of the TF_IDF will be fed into the RM3 expander for query expansion.
rm3_qe = TF_IDF >> rm3_expander
expanded_query = rm3_qe.search(query).iloc[0]["query"]

In [41]:
# Just print the expanded query with term scores
for s in expanded_query.split()[1:]:
  print(s)

print("\n" + query)

mathemat^0.167163387
theori^0.237806186
automat^0.029767158
retriev^0.100000009
inform^0.100000009
rate^0.017308220
group^0.100000009
probabl^0.037285324
sign^0.017617296
model^0.027381260
distribut^0.022766966
abstract^0.125596002
distort^0.017308220

use abstract mathemat inform retriev e g group theori


In [42]:
# After that we will search using the expanded query
expanded_query_formatted = ' '.join(expanded_query.split()[1:])

results_wqe = TF_IDF.search(expanded_query_formatted)

print("   Before Expansion    After Expansion")
print(pd.concat([results[['docid','score']][0:5].add_suffix('_1'),
            results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))

# the top 5 retrieved tweets
documents_df[['Text']][documents_df['docno'].isin(results_wqe['docno'].loc[0:5].tolist())]

   Before Expansion    After Expansion
   docid_1    score_1  docid_2   score_2
0     1384  10.609905     1384  8.178853
1      535  10.314389      535  7.845765
2      444   9.220041     1243  7.570685
3     1243   8.453261     1172  7.259526
4      828   8.415359     1410  7.078108


Unnamed: 0,Text
535,"Information Theory and Reliable Communication Gallagher, R.G. This book is designed primarily for use as a first-year graduate text in information..."
573,"Information Concepts and Their Utility Artandi, Susan The concept of information is examined within the framework of the Mathematical Theory of Co..."
1172,"A Probability Distribution in Information Flow Systems Kazachkov, L. S. Khursin, L. A. The paper treats the probability distribution in informatio..."
1243,"Rate Distortion Theory A Mathematical Basis for Data Compression Berger, T. The branch of information theory devoted to situations in which the en..."
1384,"Structural Models: an introduction to the theory of directed graphs Harary, F. The purpose of this book is to present an introduction to a body o..."
1410,"Probability Theory Ventsel, E.W. This book is a textbook intended for those generally familiar with mathematics who are interested in the technica..."


In [None]:
run_it = 1
while run_it:
    Query_input = input("\n Enter a query to search for: \n HINT: Enter 0 to exit")
    if Query_input == '0':
        run_it = 0
        print("See you soon . . .")
        break
    Query_input = preprocess(Query_input)
    print('\n',Query_input,'\n')
    results = TF_IDF.search(Query_input)
    print(results,'\n')
    expanded_query = rm3_qe.search(Query_input).iloc[0]["query"]
    for s in expanded_query.split()[1:]:
        print(s)
        print("\n" + query)

    expanded_query_formatted = ' '.join(expanded_query.split()[1:])

    results_wqe = TF_IDF.search(expanded_query_formatted)

    print("   Before Expansion    After Expansion")
    print(pd.concat([results[['docid','score']][0:5].add_suffix('_1'),
    results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))

    # the top 5 retrieved tweets
    print(documents_df[['Text']][documents_df['docno'].isin(results_wqe['docno'].loc[0:5].tolist())])

