In [5]:
# Install the required libraries (run this in a notebook cell such as Colab or Jupyter; the leading "!" runs the line as a shell command)
!pip install rank_bm25 scikit-learn nltk



In [6]:
# Import the word_tokenize function from NLTK for breaking text into tokens (words).
from nltk.tokenize import word_tokenize
# Import BM25Okapi from the rank_bm25 package to implement the BM25 ranking algorithm,
# which is widely used for information retrieval to score document relevance based on query terms.
from rank_bm25 import BM25Okapi
# Import TfidfVectorizer from scikit-learn to convert text documents into TF-IDF feature vectors.
from sklearn.feature_extraction.text import TfidfVectorizer
# Import cosine_similarity from scikit-learn to measure similarity between the query vector and document vectors.
from sklearn.metrics.pairwise import cosine_similarity
# Import the nltk library itself to manage downloading necessary data models.
import nltk

# -----------------------------------------------
# Download Required NLTK Data
# -----------------------------------------------
# word_tokenize needs NLTK's Punkt tokenizer data to split text into tokens.
# Two resources are fetched: 'punkt' and 'punkt_tab'.
# NOTE(review): 'punkt_tab' is presumably the variant that newer NLTK releases look up -- confirm against the installed version.
# nltk.download reports "already up-to-date" when a package is present, so re-running this cell is harmless.
nltk.download('punkt')
nltk.download('punkt_tab')

# -----------------------------------------------
# Define the Original Document Corpus
# -----------------------------------------------
# Seed corpus for the exercise. In real applications, these documents might come from external files or databases.
# exercise3_modify_corpus reads this list and builds an extended copy; the list itself is not modified.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Never jump over the lazy dog quickly.",
    "A quick movement of the enemy will jeopardize six gunboats.",
    "All questions asked by five watched experts amaze the judge.",
    "The five boxing wizards jump quickly."
]

def _print_numbered(docs):
    """Print each document on its own line, numbered starting at 1."""
    for number, doc in enumerate(docs, start=1):
        print(f"  {number}. {doc}")


def _print_scores(docs, scores):
    """Print one "Score | Document" line per document, in corpus order."""
    for doc, score in zip(docs, scores):
        print(f"Score: {score:.2f}  |  Document: {doc}")


def _is_yes(answer):
    """Return True for an affirmative answer, ignoring case and surrounding whitespace."""
    return answer.strip().lower() in {"yes", "y"}


def _collect_new_documents():
    """Interactively read extra documents from the user and return them as a list.

    The user is asked repeatedly whether to add a document; any answer other
    than "yes"/"y" ends the loop. Blank (whitespace-only) documents are
    rejected so they do not distort corpus statistics.
    """
    new_docs = []
    answer = input("\nWould you like to add a new document? (yes/no): ")
    while _is_yes(answer):
        text = input("Enter the new document text: ")
        if text.strip():
            new_docs.append(text)
        else:
            # An empty document would still count towards corpus size and
            # average document length, so it is skipped rather than stored.
            print("  (Empty document ignored.)")
        answer = input("Would you like to add another document? (yes/no): ")
    return new_docs


def exercise3_modify_corpus(query="quick jump"):
    """
    Exercise 3: Modify the Document Corpus

    This interactive exercise demonstrates:
      1. How to display and update a corpus of documents by allowing the user to add new documents.
      2. How to preprocess the updated corpus (lowercase conversion and tokenization).
      3. How to perform BM25-based document retrieval for a query.
      4. How to perform TF-IDF-based retrieval using cosine similarity with the same query.

    The exercise emphasizes the importance of text preprocessing and the effect it has on retrieval models.

    Args:
        query: The search query scored against every document by both
            models. Defaults to "quick jump", the query used by the exercise.

    Returns:
        None. All results are printed to standard output.
    """
    print("Exercise 3: Modify the Document Corpus")

    # -----------------------------------------------
    # Display the Original Corpus
    # -----------------------------------------------
    print("Original Corpus:")
    _print_numbered(documents)

    # -----------------------------------------------
    # Extend the Corpus with User-Supplied Documents
    # -----------------------------------------------
    # Concatenation builds a new list, so the module-level corpus is left
    # untouched and the exercise can be re-run from the same starting point.
    updated_documents = documents + _collect_new_documents()
    print("\nUpdated Corpus:")
    _print_numbered(updated_documents)

    # -----------------------------------------------
    # BM25 Retrieval on the Updated Corpus
    # -----------------------------------------------
    # BM25Okapi works on pre-tokenized text, so documents and query go through
    # the same preprocessing: lowercase, then word_tokenize.
    tokenized_docs = [word_tokenize(doc.lower()) for doc in updated_documents]
    bm25 = BM25Okapi(tokenized_docs)
    tokenized_query = word_tokenize(query.lower())
    # One score per document; higher means more relevant to the query.
    bm25_scores = bm25.get_scores(tokenized_query)

    print(f"\nBM25 Scores for query '{query}':")
    _print_scores(updated_documents, bm25_scores)

    # -----------------------------------------------
    # TF-IDF Retrieval using Cosine Similarity
    # -----------------------------------------------
    # The vectorizer receives the raw strings and applies its own
    # preprocessing; stop_words='english' drops common English words.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(updated_documents)
    # The query must be projected with the already-fitted vocabulary, hence
    # transform() rather than fit_transform().
    query_vector = vectorizer.transform([query])
    # cosine_similarity returns a 1 x n_documents matrix; flatten it to a
    # simple sequence of per-document scores.
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    print(f"\nTF-IDF (Cosine Similarity) Scores for query '{query}':")
    _print_scores(updated_documents, cosine_similarities)

# Run the exercise only when this file is executed directly, not when it is imported as a module.
# When the code is run as a notebook cell this condition also holds, so executing the cell
# starts the interactive exercise immediately.
if __name__ == "__main__":
    exercise3_modify_corpus()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jasmine.frantz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jasmine.frantz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Exercise 3: Modify the Document Corpus
Original Corpus:
  1. The quick brown fox jumps over the lazy dog.
  2. Never jump over the lazy dog quickly.
  3. A quick movement of the enemy will jeopardize six gunboats.
  4. All questions asked by five watched experts amaze the judge.
  5. The five boxing wizards jump quickly.



Would you like to add a new document? (yes/no):  yes
Enter the new document text:  Where is the fox that jumped over the lazy pup
Would you like to add another document? (yes/no):  yes
Enter the new document text:  The five boxing idiots jump slowly.
Would you like to add another document? (yes/no):  no



Updated Corpus:
  1. The quick brown fox jumps over the lazy dog.
  2. Never jump over the lazy dog quickly.
  3. A quick movement of the enemy will jeopardize six gunboats.
  4. All questions asked by five watched experts amaze the judge.
  5. The five boxing wizards jump quickly.
  6. Where is the fox that jumped over the lazy pup
  7. The five boxing idiots jump slowly.

BM25 Scores for query 'quick jump':
Score: 0.76  |  Document: The quick brown fox jumps over the lazy dog.
Score: 0.27  |  Document: Never jump over the lazy dog quickly.
Score: 0.72  |  Document: A quick movement of the enemy will jeopardize six gunboats.
Score: 0.00  |  Document: All questions asked by five watched experts amaze the judge.
Score: 0.28  |  Document: The five boxing wizards jump quickly.
Score: 0.00  |  Document: Where is the fox that jumped over the lazy pup
Score: 0.28  |  Document: The five boxing idiots jump slowly.

TF-IDF (Cosine Similarity) Scores for query 'quick jump':
Score: 0.30  |  Docu