# Installs

In [1]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

!pip install rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting setuptools
  Downloading setuptools-67.4.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the followin

# Prepare CISI Dataset

## Download dataset

In [2]:
import requests

CISI_FILE = 'cisi.tar.gz'
CISI_PATH = './CISI'

URL = "http://ir.dcs.gla.ac.uk/resources/test_collections/cisi/cisi.tar.gz"

# Bound the wait so a stalled server cannot hang the notebook, and fail loudly
# on an HTTP error instead of saving an error page as the archive.
response = requests.get(URL, timeout=60)
response.raise_for_status()

# The context manager closes the file, so every byte is flushed to disk
# before the archive is opened for extraction.
with open(CISI_FILE, "wb") as archive_file:
    archive_file.write(response.content)

2385920

## Untar Files

In [3]:
import gzip
import io
from tqdm import tqdm

import tarfile

# The archive was fetched over plain HTTP, so treat it as untrusted: when the
# running Python provides tarfile extraction filters, use the "data" filter,
# which refuses members that would be written outside CISI_PATH.
with tarfile.open(CISI_FILE) as archive:
  if hasattr(tarfile, "data_filter"):
    archive.extractall(CISI_PATH, filter="data")
  else:
    # Older interpreters do not accept the ``filter`` argument.
    archive.extractall(CISI_PATH)

# Load CISI dataset

In [4]:
import os

# Loading code from: https://www.kaggle.com/code/rid17pawar/sentence-bert

def _read_tagged_lines(file_path):
    """Return the lines of ``file_path`` with continuation lines folded in.

    A line starting with "." opens a new record; every other line is appended
    to the current record, separated by a single space.
    """
    records = []
    with open(file_path) as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if raw_line.startswith(".") or not records:
                records.append(text)
            else:
                records[-1] += " " + text
    return records


def load_data(path):
    """Load the CISI documents, queries and relevance judgements.

    Parameters
    ----------
    path : str
        Directory containing the files ``CISI.ALL``, ``CISI.QRY`` and
        ``CISI.REL``.

    Returns
    -------
    tuple
        ``(doc_set, qry_set, rel_set)`` where ``doc_set`` maps an int document
        id to its text, ``qry_set`` maps an int query id to its text and
        ``rel_set`` maps an int query id to a list of int document ids.

    Notes
    -----
    Prints the size of each dictionary and, when id "1" exists, its entry.
    """
    # ---- Documents: CISI.ALL -------------------------------------------------
    doc_set = {}
    doc_id = ""
    doc_fields = []

    for record in _read_tagged_lines(os.path.join(path, 'CISI.ALL')):
        if record.startswith(".I"):
            if doc_id:
                # The previous document had no ".X" terminator; store it now
                # instead of letting its text bleed into the next document.
                doc_set[doc_id] = "".join(f + " " for f in doc_fields).lstrip(" ")
            doc_id = record[2:].strip()
            doc_fields = []
        elif record.startswith(".X"):
            # ".X" closes the current document; its own content is not kept.
            if doc_id:
                doc_set[doc_id] = "".join(f + " " for f in doc_fields).lstrip(" ")
            doc_id = ""
            doc_fields = []
        else:
            # Any other field: drop the 3-character tag prefix, keep the text.
            doc_fields.append(record.strip()[3:])

    if doc_id:
        # Last document of the file without a closing ".X".
        doc_set[doc_id] = "".join(f + " " for f in doc_fields).lstrip(" ")

    print(f"Number of documents = {len(doc_set)}")
    if "1" in doc_set:
        print(doc_set["1"])

    # ---- Queries: CISI.QRY ---------------------------------------------------
    qry_set = {}
    qry_id = ""

    for record in _read_tagged_lines(os.path.join(path, 'CISI.QRY')):
        if record.startswith(".I"):
            qry_id = record[2:].strip()
        elif record.startswith(".W") and qry_id:
            # Only the ".W" field is kept as the query text.
            qry_set[qry_id] = record.strip()[3:]
            qry_id = ""

    print(f"\n\nNumber of queries = {len(qry_set)}")
    if "1" in qry_set:
        print(qry_set["1"])

    # ---- Relevance judgements: CISI.REL --------------------------------------
    rel_set = {}
    with open(os.path.join(path, 'CISI.REL')) as handle:
        for raw_line in handle:
            columns = raw_line.split()
            if len(columns) < 2:
                # Blank or truncated line: nothing to map (and an empty id
                # would make the int() conversion below fail).
                continue
            qry_id, doc_id = columns[0], columns[1]
            rel_set.setdefault(qry_id, []).append(doc_id)

    print(f"\n\nNumber of mappings = {len(rel_set)}")
    if "1" in rel_set:
        print(rel_set["1"])

    # Ids are parsed as strings; expose them as ints to callers.
    doc_set = {int(key): doc for (key, doc) in doc_set.items()}
    qry_set = {int(key): qry for (key, qry) in qry_set.items()}
    rel_set = {int(qid): list(map(int, did_lst)) for (qid, did_lst) in rel_set.items()}

    return doc_set, qry_set, rel_set

In [5]:
# Parse the extracted collection into three dictionaries keyed by int ids:
# document texts, query texts, and the relevant document ids of each query.
doc_set, query_set, rel_set = load_data(CISI_PATH)

Number of documents = 1460
18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. 


Number of queries = 112
What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?


Number of mappings = 76
['28', '35', '38', '42', '43', '52', '65', '76', '86', '150', '189', '192', '193', '1

# Text Preprocessing

Lemmatize the documents and queries and remove stop words.

In [6]:
import spacy
from tqdm import tqdm

# Lemmatize the documents (doc_set) and the queries (query_set) and remove
# stop words, using one shared helper so both are normalized identically.
nlp = spacy.load("en_core_web_sm")


def lemmatize_texts(texts_by_id, desc):
  """Lemmatize every text of a dictionary and drop its stop words.

    Parameters
    ----------
    texts_by_id : dict
        Maps an id to a raw text.
    desc        : str
        Label shown on the progress bar.

    Returns
    -------
    dict
        Maps each id to the space-joined lemmas of its non-stop-word tokens.
    """
  lemmatized = {}
  # nlp.pipe yields the processed texts in input order, so zipping with the
  # keys pairs every processed text with its own id.
  processed = nlp.pipe(texts_by_id.values(), batch_size=32, n_process=3, disable=["parser", "ner"])
  for doc, key in tqdm(zip(processed, texts_by_id.keys()),
                       total=len(texts_by_id),
                       desc=desc):
    lemmatized[key] = ' '.join([tok.lemma_ for tok in doc if not tok.is_stop])
  return lemmatized


doc_set_lemma = lemmatize_texts(doc_set, desc="doc_set lemmatization")
query_set_lemma = lemmatize_texts(query_set, desc="query_set lemmatization")

# Every input text must have produced exactly one lemmatized entry.
assert len(doc_set_lemma) == len(doc_set)
assert len(query_set_lemma) == len(query_set)

doc_set lemmatization: 100%|██████████| 1460/1460 [00:29<00:00, 50.23it/s]
query_set lemmatization: 100%|██████████| 112/112 [00:01<00:00, 81.12it/s]


# Create bm25 corpus

In [7]:
from rank_bm25 import BM25Okapi

# Corpus order follows doc_set_lemma, so position i of the BM25 index belongs
# to the i-th key of doc_set_lemma.
corpus = list(doc_set_lemma.values())

# split() without an argument collapses runs of whitespace; split(" ") would
# emit empty-string tokens for consecutive spaces and index them as terms.
tokenized_corpus = [doc.split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# Metric Functions

Functions extracted from this [Kaggle notebook](https://www.kaggle.com/code/rid17pawar/universal-sentence-encoder).

In [8]:
def recall_k(ground_truth, predictions, k):
  """Mean recall@k over the queries of ``ground_truth``.

    Parameters
    ----------
    ground_truth : dict
        Maps a query id to the collection of its relevant document ids.
    predictions  : dict
        Maps a query id to its ranked list of document ids (best first).
    k            : int
        Number of top-ranked documents taken into account.

    Returns
    -------
    float
        Average over all queries of |relevant ∩ top-k| / |relevant|, rounded
        to 3 decimals. Queries without relevant documents contribute 0, and
        an empty ``ground_truth`` yields 0.0.
    """
  if not ground_truth:
    return 0.0

  total_recall = 0.0
  for query_id, relevant_docs in ground_truth.items():
    truth_set = set(relevant_docs)
    if not truth_set:
      # Recall is undefined without relevant documents; count it as 0.
      continue
    pred_set = set(predictions[query_id][:k])
    # No per-query rounding: only the final average is rounded, so rounding
    # errors do not accumulate across queries.
    total_recall += len(truth_set & pred_set) / len(truth_set)

  return round(total_recall / len(ground_truth), 3)

In [9]:
def precision_k(ground_truth, predictions, k):
  """Mean precision@k over the queries of ``ground_truth``.

    Parameters
    ----------
    ground_truth : dict
        Maps a query id to the collection of its relevant document ids.
    predictions  : dict
        Maps a query id to its ranked list of document ids (best first).
    k            : int
        Number of top-ranked documents taken into account.

    Returns
    -------
    float
        Average over all queries of |relevant ∩ top-k| / |top-k|, rounded to
        3 decimals. Queries with no predictions contribute 0, and an empty
        ``ground_truth`` yields 0.0.
    """
  if not ground_truth:
    return 0.0

  total_precision = 0.0
  for query_id, relevant_docs in ground_truth.items():
    pred_set = set(predictions[query_id][:k])
    if not pred_set:
      # Nothing retrieved for this query: precision counts as 0.
      continue
    # No per-query rounding: only the final average is rounded.
    total_precision += len(set(relevant_docs) & pred_set) / len(pred_set)

  return round(total_precision / len(ground_truth), 3)

In [10]:
def get_first_relevent_docid(predictions, truth):
    """Return the 1-based rank of the first predicted id found in ``truth``.

    ``predictions`` is scanned in order; -1 is returned when none of its
    elements appears in ``truth``.
    """
    for rank, candidate in enumerate(predictions, start=1):
        if candidate in truth:
            return rank
    return -1

In [11]:
def mrr(doc_scores, rel_set):
    """Mean Reciprocal Rank of the first relevant document of each query.

    Parameters
    ----------
    doc_scores : dict
        Maps a query id to its ranked list of document ids (best first).
    rel_set    : dict
        Maps a query id to the collection of its relevant document ids.

    Returns
    -------
    float
        Mean over the queries of ``rel_set`` of 1 / rank, rounded to 3
        decimals, where rank is the 1-based position of the first relevant
        document. A query whose ranking contains no relevant document is
        given the rank ``len(ranking) + 1``. An empty ``rel_set`` yields 0.0.
    """
    if not rel_set:
        return 0.0

    cumulative_reciprocal = 0.0
    for query_id, relevant_docs in rel_set.items():
        ranking = doc_scores[query_id]
        relevant = set(relevant_docs)
        # The fallback rank is derived from this query's own ranking; looking
        # it up under a fixed key such as '1' fails when ids are ints.
        first_rank = next(
            (rank for rank, doc_id in enumerate(ranking, start=1) if doc_id in relevant),
            len(ranking) + 1,
        )
        cumulative_reciprocal += 1 / first_rank

    return round(cumulative_reciprocal / len(rel_set), 3)

In [12]:
def map_k(rel_set, doc_scores, K):
    """Mean Average Precision at K over the queries of ``rel_set``.

    Parameters
    ----------
    rel_set    : dict
        Maps a query id to the list of its relevant document ids.
    doc_scores : dict
        Maps a query id to its ranked list of document ids (best first).
    K          : int
        Number of top-ranked documents taken into account.

    Returns
    -------
    float
        Mean of the per-query average precision, rounded to 3 decimals. The
        average precision of a query is the sum of precision@k over the
        relevant positions k <= K divided by the number of relevant documents
        of the query. Queries without relevant documents contribute 0, and an
        empty ``rel_set`` yields 0.0.
    """
    if not rel_set:
        return 0.0

    avg_precision = []
    for query_id, relevant_docs in rel_set.items():
        truth_set = set(relevant_docs)
        seen = set()   # distinct documents ranked so far
        hits = 0       # distinct relevant documents ranked so far
        precision_relevance_summation = 0.0

        # Slicing (instead of indexing positions 0..K-1) tolerates rankings
        # shorter than K; max() keeps a negative K from slicing from the end.
        for doc_id in doc_scores[query_id][:max(K, 0)]:
            is_relevant = doc_id in truth_set
            if doc_id not in seen:
                seen.add(doc_id)
                hits += is_relevant
            if is_relevant:
                # precision@k, left unrounded so errors do not accumulate.
                precision_relevance_summation += hits / len(seen)

        if relevant_docs:
            avg_precision.append(precision_relevance_summation / len(relevant_docs))
        else:
            avg_precision.append(0.0)

    return round(sum(avg_precision) / len(rel_set), 3)

# Evaluate

## Generate scores

In [13]:
query_doc_scores = {}
query_doc_sorted = {}

# bm25 returns one score per corpus position, and the corpus was built from
# doc_set_lemma.values(); pairing the scores with the keys of doc_set_lemma
# maps each score to its real document id instead of assuming ids 1..N.
corpus_doc_ids = list(doc_set_lemma)

for query_id, query_txt in query_set_lemma.items():
  # split() collapses runs of whitespace; split(" ") would turn consecutive
  # spaces into empty-string query terms.
  tokenized_query = query_txt.split()
  doc_scores = bm25.get_scores(tokenized_query)

  # Sort once, best score first; ties keep corpus order (the sort is stable).
  ranked = sorted(zip(corpus_doc_ids, doc_scores), key=lambda pair: pair[1], reverse=True)

  query_doc_sorted[query_id] = [doc_id for doc_id, _ in ranked]
  query_doc_scores[query_id] = dict(ranked)

## Metrics

In [14]:
# Recall over the top-10 ranked documents, averaged over the judged queries.
recall_at_10 = recall_k(ground_truth=rel_set, predictions=query_doc_sorted, k=10)
print(f"Recall@10 = {recall_at_10}")

Recall@10 = 0.103


In [15]:
# Precision over the top-10 ranked documents, averaged over the judged queries.
precision_at_10 = precision_k(ground_truth=rel_set, predictions=query_doc_sorted, k=10)
print(f"Precision@10 = {precision_at_10}")

Precision@10 = 0.261


In [16]:
# Mean reciprocal rank of the first relevant document of each judged query.
mrr_result = mrr(doc_scores=query_doc_sorted, rel_set=rel_set)
print(f"Mean Reciprocal Rank (MRR): {mrr_result}")

Mean Reciprocal Rank (MRR): 0.541


In [17]:
# Mean average precision restricted to the top-10 ranked documents.
map_10 = map_k(rel_set=rel_set, doc_scores=query_doc_sorted, K=10)
print(f"MAP@10 (MAP)= {map_10}")

MAP@10 (MAP)= 0.056


# Search System

In [18]:
def make_search(query_txt, topk):
  """Returns a sorted list of the topk ids of relevant documents given
  a query text.

    Parameters
    ----------
    query_txt : str
        A raw (not yet lemmatized) query text
    topk      : int
        total number of relevant documents to return

    Returns
    -------
    list
        ids of the topk best-scoring documents for the query text, best first
    """

  # The index was built from lemmatized, stop-word-free text, so the query
  # must go through the same normalization for its terms to match.
  query_doc = nlp(query_txt, disable=["parser", "ner"])
  query_lemma = ' '.join([tok.lemma_ for tok in query_doc if not tok.is_stop])

  # split() collapses runs of whitespace instead of producing empty tokens.
  tokenized_query = query_lemma.split()
  doc_scores = bm25.get_scores(tokenized_query)

  # Scores are positional; the corpus follows the order of doc_set_lemma, so
  # its keys give the real document id of every position.
  ranked = sorted(zip(list(doc_set_lemma), doc_scores), key=lambda pair: pair[1], reverse=True)
  query_doc_sorted = [doc_id for doc_id, _ in ranked]

  return query_doc_sorted[:topk]

In [19]:
#@title String fields

# Search inputs: `query` is the free-text query and `topk` the number of hits
# to show; both are read by on_button_clicked when the button is pressed.
query = 'What problems and concerns are there in making up descriptive titles?' #@param {type:"string"}
topk = 5 #@param [5, 10, 20, 30]

In [20]:
import ipywidgets as widgets
from IPython.display import display
button = widgets.Button(description="Search")
output = widgets.Output()

def on_button_clicked(b):
  """Run the search for the current `query`/`topk` values and show the hits.

  `b` is the clicked button instance passed by ipywidgets; it is not used.
  """
  # Drop the hits of the previous click so results do not pile up.
  output.clear_output()
  # Display everything within the output widget.
  with output:
    search_result = make_search(query, topk)
    for doc_id in search_result:
      print('id: {0}'.format(doc_id))
      print(doc_set[doc_id])
      print('-'*100)

button.on_click(on_button_clicked)
display(button, output)

Button(description='Search', style=ButtonStyle())

Output()