In [15]:
# Install necessary libraries (run these commands in a Colab cell)
!pip install rank_bm25 scikit-learn nltk




In [23]:
# Import the Natural Language Toolkit (NLTK) library and its word-level tokenizer.
import nltk
from nltk.tokenize import word_tokenize
from IPython.display import display

# Download the 'punkt_tab' resource: the Punkt sentence-tokenizer model data
# stored in a tab-separated (non-pickle) format.
# Recent NLTK releases (3.9 and later) load this resource instead of the older
# pickled 'punkt' models, so word_tokenize() raises a LookupError without it.
# The download is skipped automatically when the package is already up to date.
nltk.download('punkt_tab')

def exercise1_tokenization(sentence="Hello you are my sunshine", tokenizer=None, read_input=None):
    """Demonstrate word tokenization with and without lowercasing.

    Prints the tokens of ``sentence`` as-is and lowercased, then prompts for a
    sentence and prints its lowercased tokens.

    Args:
        sentence: Example sentence to tokenize before prompting the user.
        tokenizer: Callable mapping a string to a list of tokens. Defaults to
            NLTK's ``word_tokenize`` (looked up when the function is called).
        read_input: Callable taking a prompt and returning the user's text.
            Defaults to the built-in ``input``.

    Returns:
        None. All results are printed.
    """
    # Resolve defaults lazily so the function can be exercised without NLTK
    # or an interactive prompt (e.g. in a test).
    if tokenizer is None:
        tokenizer = word_tokenize
    if read_input is None:
        read_input = input

    print("Explore Tokenization")
    print("\nOriginal Sentence:", sentence)

    tokens = tokenizer(sentence)
    print("Tokens (without lowercasing):", tokens)

    tokens_lower = tokenizer(sentence.lower())
    print("Tokens (with lowercasing):", tokens_lower)

    user_sentence = read_input("\nEnter a sentence to tokenize: ")
    user_tokens = tokenizer(user_sentence.lower())
    print("Tokens (user input, lowercased):", user_tokens)

# Run the exercise; execution pauses until a sentence is typed at the prompt.
exercise1_tokenization()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jasmine.frantz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Explore Tokenization

Original Sentence: Hello you are my sunshine
Tokens (without lowercasing): ['Hello', 'you', 'are', 'my', 'sunshine']
Token (with loweringcainsg): ['hello', 'you', 'are', 'my', 'sunshine']



Enter a sentence to tokenize:  All I wanna say is blah blah


Token (user input token): ['all', 'i', 'wan', 'na', 'say', 'is', 'blah', 'blah']


In [None]:
# Import the word_tokenize function from NLTK which is used to split text into individual words.
from nltk.tokenize import word_tokenize
# Import BM25Okapi from rank_bm25, which implements the BM25 ranking algorithm.
# BM25 is a popular information retrieval model that scores documents based on query terms.
from rank_bm25 import BM25Okapi
# Import the nltk library itself, which is used for various natural language processing tasks.
import nltk

# Make sure the tokenizer data required by word_tokenize() is present.
nltk.download('punkt_tab')

# Small in-memory corpus that the BM25 model ranks against.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Never jump over the lazy dog quickly.",
    "A quick movement of the enemy will jeopardize six gunboats.",
    "All questions asked by five watched experts amaze the judge.",
    "The five boxing wizards jump quickly.",
]

# Lowercase, then split each document into word tokens.
tokenized_docs = [word_tokenize(text.lower()) for text in documents]

# -----------------------------------------------
# Initialize the BM25 Model
# -----------------------------------------------
# BM25Okapi is built from the tokenized documents; queries are later scored
# against these tokens to get one relevance score per document.
bm25 = BM25Okapi(tokenized_docs)

def exercise2_bm25(corpus=None, model=None, tokenizer=None, read_input=None):
    """Interactively score every document against user queries with BM25.

    Lists the corpus, then repeatedly prompts for a query and prints one BM25
    score per document (in corpus order) until the user types ``exit``.
    Leading/trailing whitespace around the query is ignored, and blank
    queries are rejected instead of being scored.

    Args:
        corpus: Sequence of document strings to display next to the scores.
            Defaults to the notebook-level ``documents`` list.
        model: Object exposing ``get_scores(tokens)`` that returns one score
            per document. Defaults to the notebook-level ``bm25`` model.
        tokenizer: Callable mapping a string to a list of tokens. Defaults to
            NLTK's ``word_tokenize``.
        read_input: Callable taking a prompt and returning the user's text.
            Defaults to the built-in ``input``.

    Returns:
        None. All results are printed.
    """
    # Defaults are resolved at call time so the notebook globals are only
    # needed when the caller does not supply replacements.
    corpus = documents if corpus is None else corpus
    model = bm25 if model is None else model
    tokenizer = word_tokenize if tokenizer is None else tokenizer
    read_input = input if read_input is None else read_input

    print("Exercise 2: BM25 Retrieval Exploration")
    print("Document Body")
    for position, doc in enumerate(corpus, start=1):
        print(f" {position}. {doc}")

    while True:
        raw_query = read_input("\nEnter a query for BM25 retrieval (or type 'exit' to quit): ")
        # Strip so that " exit" or "exit " still terminates the loop.
        query = raw_query.strip()
        if query.lower() == "exit":
            break
        if not query:
            # A query with no tokens would only produce an all-zero table.
            print("Please enter a non-empty query.")
            continue

        # Queries are lowercased to match how the corpus was tokenized.
        tokenized_query = tokenizer(query.lower())
        scores = model.get_scores(tokenized_query)

        print("\nBM25 Scores for your query:")
        for doc, score in zip(corpus, scores):
            print(f"Score: {score:.2f} | Document: {doc}")

# Run the exercise only when this code executes as the top-level program.
# Inside a Jupyter/Colab kernel __name__ is also "__main__", so running the
# cell starts the exercise; importing this code from another module skips it.
if __name__ == "__main__":
    exercise2_bm25()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jasmine.frantz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Exercise 2: BM25 Retrieval Exploration
Document Body
 1. The quick brown fox jumps over the lazy dog.
 2. Never jump over the lazy dog quickly.
 3. A quick movement of the enemy will jeopardize six gunboats.
 4. All questions asked by five watched experts amaze the judge.
 5. The five boxing wizards jump quickly.



Enter a query for BM25 retrieval (or type 'exit' to quit):  1. The quick brown fox jumps over the lazy dog.  2. Never jump over the lazy dog quickly.  3. A quick movement of the enemy will jeopardize six gunboats.  4. All questions asked by five watched experts amaze the judge.  5. The five boxing wizards jump quickly.



BM25 Scores for your query:
Score: 8.14 | Document: The quick brown fox jumps over the lazy dog.
Score: 6.85 | Document: Never jump over the lazy dog quickly.
Score: 10.58 | Document: A quick movement of the enemy will jeopardize six gunboats.
Score: 10.58 | Document: All questions asked by five watched experts amaze the judge.
Score: 6.94 | Document: The five boxing wizards jump quickly.



Enter a query for BM25 retrieval (or type 'exit' to quit):  Bye bye bye love,bye bye happiness.



BM25 Scores for your query:
Score: 0.17 | Document: The quick brown fox jumps over the lazy dog.
Score: 0.19 | Document: Never jump over the lazy dog quickly.
Score: 0.16 | Document: A quick movement of the enemy will jeopardize six gunboats.
Score: 0.16 | Document: All questions asked by five watched experts amaze the judge.
Score: 0.20 | Document: The five boxing wizards jump quickly.



Enter a query for BM25 retrieval (or type 'exit' to quit):  "Natogo, why are you here"



BM25 Scores for your query:
Score: 0.00 | Document: The quick brown fox jumps over the lazy dog.
Score: 0.00 | Document: Never jump over the lazy dog quickly.
Score: 0.00 | Document: A quick movement of the enemy will jeopardize six gunboats.
Score: 0.00 | Document: All questions asked by five watched experts amaze the judge.
Score: 0.00 | Document: The five boxing wizards jump quickly.
