In [None]:
#install the Pyterrier framework
!pip install python-terrier
# install the nltk modules
!pip install nltk



# **Imports**


In [None]:
import pandas as pd
import pyterrier as pt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import os
pd.set_option('display.max_colwidth', 150)

In [None]:
# Need to install additional terrier package for PRF. It will take around 1 min
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven   #used for Java projects to manage project dependencies and build processes
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

fatal: destination path 'terrier-prf' already exists and is not an empty directory.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
maven is already the newest version (3.6.3-5).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
/content/terrier-prf
[[1;34mINFO[m] Scanning for projects...
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m----------------------< [0;36morg.terrier:terrier-prf[0;1m >-----------------------[m
[[1;34mINFO[m] [1mBuilding terrier-prf 0.2-SNAPSHOT[m
[[1;34mINFO[m] [1m--------------------------------[ jar ]---------------------------------[m
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mmaven-resources-plugin:2.6:resources[m [1m(default-resources)[m @ [36mterrier-prf[0;1m ---[m
[[1;34mINFO[m] Using 'UTF-8' encoding to copy filtered resources.
[[1;34mINFO[m] skip non existing resourceDirectory /content/terrier-prf/src/main/resources
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mma

# **Preprocessing the data set**
Tokenization, Removing stop words and Cleaning

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Initialize Porter stemmer
stemmer = PorterStemmer()

In [None]:
def Stem_text(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    # print (tokens)
    return ' '.join(stemmed_tokens)

def clean(text):
   text = re.sub(r"[\.\,\#_\|\:\?\?\/\=\@]", " ", text) # remove special characters
   text = re.sub(r'\t', ' ', text) # remove tabs
   text = re.sub(r'\n', ' ', text) # remove line jump
   text = re.sub(r"\s+", " ", text) # remove extra white space
   text = text.strip()
   return text

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words] #Lower is used to normalize al the words make them in lower case
    # print('Tokens are:',tokens,'\n')
    return ' '.join(filtered_tokens)

#we need to process the query also as we did for documents
def preprocess(sentence):
  sentence = clean(sentence)
  sentence = remove_stopwords(sentence)
  sentence = Stem_text(sentence)
  return sentence

# Load Data for the Indexer

In [None]:
import zipfile
zip_file_name = 'cisi.zip'
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall('cisi_dataset')
!ls cisi_dataset

CISI.ALL  CISI.QRY  CISI.REL


In [None]:
def load_cisi_dataset(data_dir):
    documents_path = os.path.join(data_dir, 'CISI.ALL')
    queries_path = os.path.join(data_dir, 'CISI.QRY')
    qrels_path = os.path.join(data_dir, 'CISI.REL')

    documents_df = read_documents(documents_path)
    queries_df = read_queries(queries_path)
    qrels_df = read_qrels(qrels_path)
    return documents_df, queries_df, qrels_df

# Read documents from CISI.ALL file
def read_documents(documents_path):
    with open(documents_path, 'r') as file:
        lines = file.readlines()
    documents = []
    current_document = None
    for line in lines:
        if line.startswith('.I'):
            if current_document is not None:
                current_document['Text'] = current_document['Text'].split('\t')[0].strip()  # Remove anything after the first tab
                documents.append(current_document)
            current_document = {'ID': line.strip().split()[1], 'Text': ''}
        elif line.startswith('.T'):
            continue
        elif line.startswith('.A') or line.startswith('.B') or line.startswith('.W') or line.startswith('.X'):
            continue
        else:
            current_document['Text'] += line.strip() + ' '

    # Append the last document
    if current_document is not None:
        current_document['Text'] = current_document['Text'].split('\t')[0].strip()  # Remove anything after the first tab
        documents.append(current_document)
    documents_df = pd.DataFrame(documents)
    return documents_df

# Read queries from CISI.QRY file
def read_queries(queries_path):
    with open(queries_path, 'r') as file:
        lines = file.readlines()
    query_texts = []
    query_ids = []
    current_query_id = None
    current_query_text = []
    for line in lines:
        if line.startswith('.I'):
            if current_query_id is not None:
                query_texts.append(' '.join(current_query_text))
                current_query_text = []
            current_query_id = line.strip().split()[1]
            query_ids.append(current_query_id)
        elif line.startswith('.W'):
            continue
        elif line.startswith('.X'):
            break
        else:
            current_query_text.append(line.strip())
    # Append the last query
    query_texts.append(' '.join(current_query_text))
    queries_df = pd.DataFrame({
        'qid': query_ids,
        'raw_query': query_texts})
    return queries_df

# Read qrels from CISI.REL file
def read_qrels(qrels_path):
    qrels_df = pd.read_csv(qrels_path, sep='\s+', names=['qid','Q0','docno','label'])
    return qrels_df

In [None]:
data_dir = '/content/cisi_dataset'
documents_df, queries_df, qrels_df = load_cisi_dataset(data_dir)
documents_df['Text'][0]

"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. 1"

In [None]:
documents_df

Unnamed: 0,ID,Text
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi..."
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar..."
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz..."
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'..."
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl..."
...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex..."
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b..."
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude..."
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ..."


In [None]:
queries_df

Unnamed: 0,qid,raw_query
0,1,What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from app...
1,2,"How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information req..."
2,3,What is information science? Give definitions where possible.
3,4,Image recognition and any other methods of automatically transforming printed text into computer-ready form.
4,5,What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retriev...
...,...,...
107,108,".T A Program for Machine-Mediated Searching .A Toliver, D. A technique of online instruction and assistance to bibliographic data base searchers c..."
108,109,".T Author Cocitation: A Literature Measure of Intellectual Structure .A White, H.D. Griffith, B.C. It is shown that the mapping of a particular a..."
109,110,".T Progress in Documentation. Word Processing: An Introduction and Appraisal .A Whitehead, J. The ""Office of the Future,"" ""Office Technology,"" ""W..."
110,111,".T Document Clustering Using an Inverted File Approach .A Willett, P. An automated document clustering procedure is described which does not requi..."


In [None]:
qrels_df

Unnamed: 0,qid,Q0,docno,label
0,1,28,0,0.0
1,1,35,0,0.0
2,1,38,0,0.0
3,1,42,0,0.0
4,1,43,0,0.0
...,...,...,...,...
3109,111,422,0,0.0
3110,111,448,0,0.0
3111,111,485,0,0.0
3112,111,503,0,0.0


In [None]:
#the docno will be our ID
documents_df["docno"]=documents_df["ID"].astype(str)
documents_df

Unnamed: 0,ID,Text,docno
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi...",1
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar...",2
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz...",3
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'...",4
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl...",5
...,...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex...",1456
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b...",1457
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude...",1458
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ...",1459


In [None]:
queries_df["qid"]=queries_df["qid"].astype(str)
queries_df

Unnamed: 0,qid,raw_query
0,1,What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from app...
1,2,"How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information req..."
2,3,What is information science? Give definitions where possible.
3,4,Image recognition and any other methods of automatically transforming printed text into computer-ready form.
4,5,What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retriev...
...,...,...
107,108,".T A Program for Machine-Mediated Searching .A Toliver, D. A technique of online instruction and assistance to bibliographic data base searchers c..."
108,109,".T Author Cocitation: A Literature Measure of Intellectual Structure .A White, H.D. Griffith, B.C. It is shown that the mapping of a particular a..."
109,110,".T Progress in Documentation. Word Processing: An Introduction and Appraisal .A Whitehead, J. The ""Office of the Future,"" ""Office Technology,"" ""W..."
110,111,".T Document Clustering Using an Inverted File Approach .A Willett, P. An automated document clustering procedure is described which does not requi..."


In [None]:
documents_df['processed_text'] = documents_df['Text'].apply(preprocess)
documents_df

Unnamed: 0,ID,Text,docno,processed_text
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi...",1,18 edit dewey decim classif comaromi j p present studi histori dewey decim classif first edit ddc publish 1876 eighteenth edit 1971 futur edit con...
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar...",2,use made technic librari slater report analysi 6300 act use 104 technic librari unit kingdom librari use one aspect wider pattern inform use infor...
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz...",3,two kind power essay bibliograph control wilson p relationship organ control write organ control knowledg inform inevit enter stori write contain ...
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'...",4,system analysi univers librari ; final report research project buckland k establish nine new univers 1960 's provok highli stimul re-examin natur ...
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl...",5,librari manag game report research project brophi p although use game profession educ becom widespread last decad method use number field mani hun...
...,...,...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex...",1456,world dynam forrest j w last sever decad interest econom develop popul growth world environ expand rapidli world-wid stress increas mani individu ...
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b...",1457,world trend librari educ bramley g one signific aspect evolut librarianship twentieth centuri emerg librari school potent factor shape new philoso...
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude...",1458,legal restrict exploit patent monopoli econom analysi baxter w patent law confer patente power exclud other make use sell invent further constitut...
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ...",1459,languag thought poluskin v book consid basic aspect complex problem - histor social essenc languag thought interact histor evolut essenc linguist ...


In [None]:
queries_df["query"]=queries_df["raw_query"].apply(preprocess)
queries_df

Unnamed: 0,qid,raw_query,query
0,1,What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from app...,problem concern make descript titl difficulti involv automat retriev articl approxim titl usual relev content articl titl
1,2,"How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information req...",actual pertin data oppos refer entir articl retriev automat respons inform request
2,3,What is information science? Give definitions where possible.,inform scienc give definit possibl
3,4,Image recognition and any other methods of automatically transforming printed text into computer-ready form.,imag recognit method automat transform print text computer-readi form
4,5,What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retriev...,special train ordinari research businessmen need proper inform manag unobstruct use inform retriev system problem like encount
...,...,...,...
107,108,".T A Program for Machine-Mediated Searching .A Toliver, D. A technique of online instruction and assistance to bibliographic data base searchers c...",program machine-medi search toliv techniqu onlin instruct assist bibliograph data base searcher call individu instruct data access ( iida ) develo...
108,109,".T Author Cocitation: A Literature Measure of Intellectual Structure .A White, H.D. Griffith, B.C. It is shown that the mapping of a particular a...",author cocit literatur measur intellectu structur white h griffith b c shown map particular area scienc case inform scienc done use author unit an...
109,110,".T Progress in Documentation. Word Processing: An Introduction and Appraisal .A Whitehead, J. The ""Office of the Future,"" ""Office Technology,"" ""W...",progress document word process introduct apprais whitehead j `` offic futur `` `` offic technolog `` `` word process `` `` electron mail `` `` ele...
110,111,".T Document Clustering Using an Inverted File Approach .A Willett, P. An automated document clustering procedure is described which does not requi...",document cluster use invert file approach willett p autom document cluster procedur describ requir use inter-docu similar matrix independ order do...


# Indexing:

In [None]:
if not pt.started():
  # In this lab, we need to specify that we start PyTerrier with PRF enabled
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

terrier-assemblies 5.9 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [None]:
indexer = pt.DFIndexer("./DatasetIndex", overwrite=True)
# index the text, record the docnos as metadata
index_ref = indexer.index(documents_df["processed_text"], documents_df["docno"])
print(index_ref.toString())

./DatasetIndex/data.properties


In [None]:
index = pt.IndexFactory.of(index_ref)

In [None]:
#we need to process the query also as we did for documents
def preprocess(sentence):
  sentence = clean(sentence)
  sentence = remove_stopwords(sentence)
  sentence = Stem_text(sentence)
  return sentence

In [None]:
query="ronaldo is the best"
query = preprocess(query)
query

'ronaldo best'

In [None]:
splited_query = query.split()
len(splited_query)

2

In [None]:
# Identify the documents that have the query

# Function(1) : Split the documents into tokens

def split(text):

  docs = {}

  i = 0

  for doc in text:
    docs[i] = doc.split()

    i = i + 1

  splited_docs_val = docs.values()

  return splited_docs_val


# Function(2) : return the the docs that have the same token with query

def num(spli):

 lis = []

 for i in range(len(splited_query)):

   doc_num = 0

   for j in spli:

     doc_num = doc_num + 1

     for k in range(len(j)):

       if splited_query[i] == j[k]:
         lis.append(doc_num)

 return lis


# Function(3) : the all function

def docs_IDs(text):
  split_docs = split(text)

  nums = num(split_docs)

  return nums


def Retrieve_docs(dfs , text):
  split_doc = split(text)

  numo = num(split_doc)

  for d in numo:
    # Printing value of column 'A' at index 2
    print(f"Document Number {d}: \n {dfs['processed_text'].iloc[d]}")



splited_docs_values = docs_IDs(documents_df['processed_text'])

print(splited_docs_values)

[4, 35, 93, 112, 142, 213, 252, 282, 323, 390, 500, 515, 548, 590, 592, 596, 610, 611, 625, 634, 636, 643, 712, 712, 742, 771, 845, 905, 946, 963, 973, 1016, 1065, 1150, 1186, 1212, 1230, 1245, 1246, 1248, 1277, 1321, 1322, 1401, 1445, 1450]


In [None]:
# Rank the retrievd documents based ranking algorithm (TF-IDF)
tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"})

In [None]:
results=tfidf_retr.search(query)
results

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,711,712,0,4.006465,ronaldo best
1,1,770,771,1,3.40766,ronaldo best
2,1,1321,1322,2,3.38329,ronaldo best
3,1,1245,1246,3,3.289201,ronaldo best
4,1,741,742,4,3.266491,ronaldo best
5,1,34,35,5,3.200203,ronaldo best
6,1,111,112,6,3.136553,ronaldo best
7,1,633,634,7,3.016557,ronaldo best
8,1,3,4,8,2.941534,ronaldo best
9,1,1400,1401,9,2.941534,ronaldo best


In [None]:
import pandas as pd
import pyterrier as pt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import os
pd.set_option('display.max_colwidth', 150)

In [None]:
# Need to install additional terrier package for PRF. It will take around 1 min
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven   #used for Java projects to manage project dependencies and build processes
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

fatal: destination path 'terrier-prf' already exists and is not an empty directory.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
maven is already the newest version (3.6.3-5).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
/content/terrier-prf
[[1;34mINFO[m] Scanning for projects...
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m----------------------< [0;36morg.terrier:terrier-prf[0;1m >-----------------------[m
[[1;34mINFO[m] [1mBuilding terrier-prf 0.2-SNAPSHOT[m
[[1;34mINFO[m] [1m--------------------------------[ jar ]---------------------------------[m
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mmaven-resources-plugin:2.6:resources[m [1m(default-resources)[m @ [36mterrier-prf[0;1m ---[m
[[1;34mINFO[m] Using 'UTF-8' encoding to copy filtered resources.
[[1;34mINFO[m] skip non existing resourceDirectory /content/terrier-prf/src/main/resources
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mma

In [None]:
# Define our retrieval model
bm25 = pt.BatchRetrieve(index, wmodel="BM25",num_results=10)

result = bm25.search(query)
result

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,711,712,0,7.190449,ronaldo best
1,1,770,771,1,6.115766,ronaldo best
2,1,1321,1322,2,6.07203,ronaldo best
3,1,1245,1246,3,5.903167,ronaldo best
4,1,741,742,4,5.862408,ronaldo best
5,1,34,35,5,5.743442,ronaldo best
6,1,111,112,6,5.629208,ronaldo best
7,1,633,634,7,5.41385,ronaldo best
8,1,3,4,8,5.279205,ronaldo best
9,1,1400,1401,9,5.279205,ronaldo best


In [None]:
documents_df[['Text']][documents_df['docno'].isin(results['docno'].loc[0:4].tolist())]

Unnamed: 0,Text
711,"Technical-Abstracting Fundamentals. II. Writing Principles and Practices Weil, B.H. Zarember, I. Owen, H. Abstracts can serve their purpose best o..."
741,"Searching the Chemical Abstracts Condensates Data Base via Two On-Line Systems Prewitt, Barbara G. A comparison of the most used features of Syste..."
770,"Survey of Information Needs of Physicists and Chemists Aims, A. Scientific research is expensive and the practical application of its results is e..."
1245,"Reader Instruction in Colleges and Universities Mews, H. DEFINITION: What is 'reader instruction'? Another currently used term is 'library instr..."
1321,"Selecting Materials Broadus, R.N. In reality, the building and shaping of the collection is the heart of librarianship, involving the essential ph..."


In [None]:
# "rewrite" function from PyTerrier will be used to expand queries specifying RM3 as the model
# fb_docs ==> no. expansion documents
# fb_terms ==> no. expansion terms
rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)

#output of the BM25 will be fed into the RM3 expander for query expansion.
rm3_qe = bm25 >> rm3_expander
expanded_query = rm3_qe.search(query).iloc[0]["query"]

expanded_query

'applypipeline:off instruct^0.052078977 reader^0.049909022 best^0.383974701 academ^0.033395279 profess^0.026626848 question^0.026911018 abstract^0.044410627 abil^0.026039489 purpo^0.027547427 util^0.029106591'

In [None]:
# Just print the expanded query with term scores
for s in expanded_query.split()[1:]:
  print(s)

print("\n" + query)

instruct^0.052078977
reader^0.049909022
best^0.383974701
academ^0.033395279
profess^0.026626848
question^0.026911018
abstract^0.044410627
abil^0.026039489
purpo^0.027547427
util^0.029106591

ronaldo best


In [None]:
# After that you can search using the expanded query
expanded_query_formatted = ' '.join(expanded_query.split()[1:])

results_wqe = bm25.search(expanded_query_formatted)

print("   Before Expansion    After Expansion")
print(pd.concat([results[['docid','score']][0:5].add_suffix('_1'),
            results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))

#Let's check the tweets text for the top 5 retrieved tweets
documents_df[['Text']][documents_df['docno'].isin(results_wqe['docno'].loc[0:5].tolist())]

   Before Expansion    After Expansion
   docid_1   score_1  docid_2   score_2
0      711  4.006465      711  9.616630
1      770  3.407660     1245  9.434091
2     1321  3.383290       34  7.121713
3     1245  3.289201      741  6.561419
4      741  3.266491     1321  6.533215


Unnamed: 0,Text
3,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'..."
34,"Comparisons of Four Types of Lexical Indicators of Content Rath, G.J. Resnick, A. Savage, T.R. An experiment was conducted to determine which of f..."
711,"Technical-Abstracting Fundamentals. II. Writing Principles and Practices Weil, B.H. Zarember, I. Owen, H. Abstracts can serve their purpose best o..."
741,"Searching the Chemical Abstracts Condensates Data Base via Two On-Line Systems Prewitt, Barbara G. A comparison of the most used features of Syste..."
1245,"Reader Instruction in Colleges and Universities Mews, H. DEFINITION: What is 'reader instruction'? Another currently used term is 'library instr..."
1321,"Selecting Materials Broadus, R.N. In reality, the building and shaping of the collection is the heart of librarianship, involving the essential ph..."


In [None]:
!pip install flask_ngrok

Collecting flask_ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask_ngrok
Successfully installed flask_ngrok-0.0.25


In [None]:
df2 = documents_df.head(50)

df2 = df2.to_dict()

df2

{'ID': {0: '1',
  1: '2',
  2: '3',
  3: '4',
  4: '5',
  5: '6',
  6: '7',
  7: '8',
  8: '9',
  9: '10',
  10: '11',
  11: '12',
  12: '13',
  13: '14',
  14: '15',
  15: '16',
  16: '17',
  17: '18',
  18: '19',
  19: '20',
  20: '21',
  21: '22',
  22: '23',
  23: '24',
  24: '25',
  25: '26',
  26: '27',
  27: '28',
  28: '29',
  29: '30',
  30: '31',
  31: '32',
  32: '33',
  33: '34',
  34: '35',
  35: '36',
  36: '37',
  37: '38',
  38: '39',
  39: '40',
  40: '41',
  41: '42',
  42: '43',
  43: '44',
  44: '45',
  45: '46',
  46: '47',
  47: '48',
  48: '49',
  49: '50'},
 'Text': {0: "18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dew

In [None]:
def sui(df2 , que):
 i = 0

 quer = preprocess(que)

 docs_id = []

 for key, value in df2.items():
   if key == 'processed_text':
         val = value.values()
         for doc in val:
           terms = doc.split()
           for term in terms:
             if term == quer and i not in docs_id:
               docs_id.append(f'''Document number {i} -----> \n{documents_df["Text"][i]}''')
           i = i + 1
 return docs_id

In [None]:
query2 = "book"

x = sui(df2 , query2)
x

['Document number 6 -----> \nAcademic Library Buildings A Guide to Architectural Issues and Solutions Ellsworth, R.E. This book attempts to present representative examples of successful architectural solutions to the important problems librarians and architects face in planning new college and university library buildings or in remodeling and enlarging existing structures.  It does not attempt to make case study evaluations, as was done by Ellsworth Mason for Brown and Yale.  Nor does it present examples of unsuccessful solutions except to show how to avoid mistakes, and in these cases the libraries will not be identified. 7',
 'Document number 7 -----> \nThe Academic Library Essays in Honor of Guy R. Lyle Farber, E.I. As important for staff members\' individual development as was the apprenticeship in administration, perhaps the most significant attitude one acquired while working for Guy was engendered by his insistence that librarians must be interested in and knowledgeable about th

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Assuming preprocess, bm25, rm3_qe, and documents_df are defined elsewhere

def process_query(query):
    query = preprocess(query)
    results = bm25.search(query)
    expanded_query = rm3_qe.search(query).iloc[0]["query"]
    expanded_query_formatted = ' '.join(expanded_query.split()[1:])
    results_wqe = bm25.search(expanded_query_formatted)
    return results_wqe

def process_results(results_wqe, documents_df):
    result_merged = pd.merge(results_wqe, documents_df, on="docno", how="left")
    result_merged = result_merged.sort_values(by="score", ascending=False)
    return result_merged

def classify_reviews(result_merged):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    inputs = tokenizer(result_merged["processed_text"].tolist(), padding=True, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    scores = outputs.logits[:, 1]
    reviews_result_v2 = pd.DataFrame({
        'preprocessed_title': result_merged['processed_text'],
        'score': scores.detach().numpy()
    }).sort_values(by="score", ascending=False)
    return reviews_result_v2

def merge_final_results(reviews_result_v2, documents_df):
    final_results = pd.merge(reviews_result_v2, documents_df, left_on="preprocessed_title", right_on="processed_text", how="left")
    final_results_text = final_results['Text']
    final_results_text_df = final_results_text.to_frame()
    return final_results_text_df

# Usage example:
query = "book"
results_wqe = process_query(query)
result_merged = process_results(results_wqe, documents_df)
reviews_result_v2 = classify_reviews(result_merged)
final_results_text_df = merge_final_results(reviews_result_v2, documents_df)

# Print the final results directly
print(final_results_text_df)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                                                                                                                                    Text
0  Book Selection of Censorship Fiske, M. The key question was whether restrictions are being imposed on librarians, or whether they are imposing res...
1  Rare Book Librarianship Cave, R. Although there is an extensive and enjoyable literature on the subject of rare books, most of it is concerned wit...
2  Man's Aggression Montagu, M.F.A. The purpose of this book is to inquire into the validity of the views on human nature expressed in the widely rea...
3  Buyers and Borrowers Mann, P.H. This is the second book based on studies into social aspects of book reading. The present book is largely a report...
4  Book Catalogs Tauber, M.F. In the intervening years since the appearance of the first collection of papers concerning book catalogs (Kingery, Robe...
5  How Biomedical Investigators Use Library Books Raisig, L. Miles Smith, Meredith

In [None]:
from google.colab.output import eval_js
print (eval_js("google.colab.kernel.proxyPort(5000)"))

https://i8qq74vmsc-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [None]:
from flask import Flask, request
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)

@app.route("/")
def home():
    return """
    <style>
        body {
            background-color: #e6f3ff; /* Light blue background */
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
            display: flex;
            flex-direction: column;
            align-items: center;
            justify-content: flex-start;
            min-height: 100vh; /* Full viewport height */
            padding-top: 20px; /* Space at the top */
        }

        .search-container {
            display: flex;
            align-items: center;
            justify-content: center;
            margin-bottom: 20px; /* Space between search bar and content */
        }

        input[type="text"] {
            padding: 10px;
            border: 1px solid #ccc;
            border-radius: 20px;
            margin-right: 10px;
            width: 200px;
            box-sizing: border-box;
        }

        button {
            padding: 10px 20px;
            background-color: #fff;
            color: #007bff; /* Blue text */
            border: none;
            border-radius: 20px;
            cursor: pointer;
            transition: background-color 0.3s, color 0.3s; /* Smooth transition */
        }

        button:hover {
            background-color: #0056b3; /* Darker blue on hover */
            color: white; /* White text on hover */
        }
    </style>

    <div class="search-container">
        <input type="text" id="searchInput" placeholder="Enter your query...">
        <button onclick="search()">Search</button>
    </div>

    <div id="searchResult"></div>

    <script>
        function search() {
            var searchTerm = document.getElementById("searchInput").value;
            fetch('/search', {
                method: 'POST',
                body: JSON.stringify({ query: searchTerm }),
                headers:{
                    'Content-Type': 'application/json'
                }
            })
            .then(response => response.json())
            .then(data => {
                console.log("Received data:", data); // Debug: Check if data is received
                var resultDiv = document.getElementById("searchResult");
                resultDiv.innerHTML = "<h2>Relevant Documents IDs:</h2>";
                if (data.results.length === 0) {
                    resultDiv.innerHTML += "<p>No documents found</p>";
                } else {
                    data.results.forEach(doc => {
                        console.log("Displaying document:", doc); // Debug: Check if document is displayed
                        resultDiv.innerHTML += "<p>" + doc + "</p>";
                    });
                }
            })
            .catch(error => {
                console.error('Error occurred during fetch:', error); // Debug: Log fetch errors
            });
        }
    </script>
    """

@app.route("/search", methods=['POST'])
def search():
    query = request.json['query']
    print("Received query:", query)  # Debug: Check if Flask receives the query
    results = sui(df2, query)
    print("Search results:", results)  # Debug: Check if sui function returns results
    return {'results': results}

app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
Exception in thread Thread-17:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 203, in _new_conn
    sock = connection.create_connection(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 791, in urlopen
    response = self._make_request(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 497, in _make_request
    conn.request(
  File "/usr/local/lib/python3.10/dist-packages/urllib3

Received query: qqq
Search results: []


In [None]:
vaswani_dataset = pt.datasets.get_dataset("vaswani")
dff = vaswani_dataset.get_topics()

dff['docno'] = dff.index
# Rename column 'A' to 'X'
dff = dff.rename(columns={'query': 'Text'})
qrels = vaswani_dataset.get_qrels()
qrels['docno']=qrels['docno'].astype(str)
dff

Downloading vaswani topics to /root/.pyterrier/corpora/vaswani/query-text.trec


query-text.trec:   0%|          | 0.00/3.05k [00:00<?, ?iB/s]

Downloading vaswani qrels to /root/.pyterrier/corpora/vaswani/qrels


qrels:   0%|          | 0.00/6.63k [00:00<?, ?iB/s]

Unnamed: 0,qid,Text,docno
0,1,measurement of dielectric constant of liquids by the use of microwave techniques,0
1,2,mathematical analysis and design details of waveguide fed microwave radiations,1
2,3,use of digital computers in the design of band pass filters having given phase and attenuation characteristics,2
3,4,systems of data coding for information transfer,3
4,5,use of programs in engineering testing of computers,4
...,...,...,...
88,89,tunnel diode construction and its electrical characteristics explained,88
89,90,electronic density of states at the surface of a semiconductor compared with that at depth,89
90,91,resistivity of metallic thin films related to surface roughness,90
91,92,the phenomenon of radiation caused by charged particles moving in varying electric and magnetic fields,91


In [None]:
indexref2 = vaswani_dataset.get_index()
index2 = pt.IndexFactory.of(indexref2)

print(index2.getCollectionStatistics().toString())

Downloading vaswani index to /root/.pyterrier/corpora/vaswani/index


data.direct.bf:   0%|          | 0.00/388k [00:00<?, ?iB/s]

data.document.fsarrayfile:   0%|          | 0.00/234k [00:00<?, ?iB/s]

data.inverted.bf:   0%|          | 0.00/362k [00:00<?, ?iB/s]

data.lexicon.fsomapfile:   0%|          | 0.00/682k [00:00<?, ?iB/s]

data.lexicon.fsomaphash:   0%|          | 0.00/777 [00:00<?, ?iB/s]

data.lexicon.fsomapid:   0%|          | 0.00/30.3k [00:00<?, ?iB/s]

data.meta-0.fsomapfile:   0%|          | 0.00/725k [00:00<?, ?iB/s]

data.meta.idx:   0%|          | 0.00/89.3k [00:00<?, ?iB/s]

data.meta.zdata:   0%|          | 0.00/224k [00:00<?, ?iB/s]

data.properties:   0%|          | 0.00/4.29k [00:00<?, ?iB/s]

md5sums:   0%|          | 0.00/619 [00:00<?, ?iB/s]

Number of documents: 11429
Number of terms: 7756
Number of postings: 224573
Number of fields: 1
Number of tokens: 271581
Field names: [text]
Positions:   false



In [None]:
retr = pt.BatchRetrieve(index2, controls = {"wmodel": "TF_IDF"})

res = retr.search("mathematical")
res

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,4746,4747,0,5.168347,mathematical
1,1,7399,7400,1,5.036916,mathematical
2,1,5629,5630,2,4.912003,mathematical
3,1,7997,7998,3,4.912003,mathematical
4,1,4546,4547,4,4.679886,mathematical
...,...,...,...,...,...,...
147,1,3484,3485,147,1.828498,mathematical
148,1,7283,7284,148,1.747822,mathematical
149,1,6714,6715,149,1.702745,mathematical
150,1,8622,8623,150,1.606095,mathematical


In [None]:
eval = pt.Evaluate(res,qrels)
eval

{'map': 4.7960250544348844e-06, 'ndcg': 0.00022891881462746983}