In [1]:
# Install the required libraries (run this command in a Colab/Jupyter notebook cell)
!pip install rank_bm25 scikit-learn nltk




In [None]:
# Import word_tokenize from NLTK to break sentences into individual words (tokens).
from nltk.tokenize import word_tokenize
# Import BM25Okapi from rank_bm25 to implement the BM25 ranking algorithm.
# BM25 is widely used in information retrieval to score documents based on query term frequencies,
# document length, and other factors.
from rank_bm25 import BM25Okapi
# Import TfidfVectorizer from scikit-learn to transform text documents into TF-IDF vectors.
# TF-IDF weighs the importance of words in a document relative to the corpus.
from sklearn.feature_extraction.text import TfidfVectorizer
# Import cosine_similarity from scikit-learn to compute the similarity between TF-IDF vectors.
# Cosine similarity measures the cosine of the angle between two vectors, indicating their similarity.
from sklearn.metrics.pairwise import cosine_similarity
# Import nltk for managing language data resources (like tokenizer models).
import nltk

# ---------------------------------------------------------
# Download the necessary NLTK data.
# ---------------------------------------------------------
# word_tokenize depends on the Punkt tokenizer models. Older NLTK releases load them from the
# 'punkt' package, while newer releases look for 'punkt_tab' instead and raise a LookupError
# when it is missing. Requesting both keeps the script working across NLTK versions; a package
# that the installed NLTK cannot find is only reported by the downloader, it does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')

# ---------------------------------------------------------
# Define the Pre-defined Document Corpus.
# ---------------------------------------------------------
# This list of documents acts as our dataset for retrieval tasks.
# In real-world applications, these could be articles, web pages, or other text sources.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Never jump over the lazy dog quickly.",
    "A quick movement of the enemy will jeopardize six gunboats.",
    "All questions asked by five watched experts amaze the judge.",
    "The five boxing wizards jump quickly.",
]

def exercise6_advanced():
    """
    Exercise 6: Extend the System (Advanced)

    Interactive demonstration of a small retrieval system over the module-level
    ``documents`` corpus. The function:
      1. Builds a TF-IDF representation of the corpus with a custom stop word list
         and prints the resulting vocabulary.
      2. Reads a query from the user and expands it with synonyms from a small,
         hard-coded dictionary (exact, whole-word matches only).
      3. Asks the user for a ranking method ('bm25' or 'tfidf') and for the number
         of top documents to display.
      4. Scores the documents with the chosen method and prints the best matches.

    Notes:
      - The custom stop words are applied to the TF-IDF vectorizer only; the BM25
        model is built on the lowercased, tokenized corpus without stop word removal.
      - A result count that is not a positive integer falls back to 3.

    The function takes no arguments and returns None; all input and output go
    through ``input()`` and ``print()``.
    """
    print("Exercise 6: Extend the System (Advanced)")

    # -----------------------------
    # Step 1: Use Custom Stop Words.
    # -----------------------------
    # Words listed here are dropped from the TF-IDF vocabulary so that the vectors
    # focus on more informative terms.
    custom_stop_words = ['the', 'over', 'a', 'an']
    print("\nUsing custom stop words:", custom_stop_words)

    # Fit the vectorizer on the corpus: each row of the matrix is a document and
    # each column holds the TF-IDF weight of one vocabulary term.
    vectorizer = TfidfVectorizer(stop_words=custom_stop_words)
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Show the vocabulary so the effect of the stop word list is visible.
    feature_names = vectorizer.get_feature_names_out()
    print("\nTF-IDF Feature Names (with custom stop words removed):")
    print(feature_names)

    # -----------------------------
    # Step 2: Implement a Simple Query Expansion Mechanism.
    # -----------------------------
    # Maps a query word to the related terms that are appended after it.
    synonym_dict = {
        "quick": ["fast", "rapid"],
        "jump": ["leap", "hop"]
    }

    def expand_query(query):
        """
        Expand the input query by adding synonyms from the synonym_dict.

        Each whitespace-separated word is kept, and if that exact word is a key of
        synonym_dict its synonyms are inserted right after it. Expansion can help
        retrieve documents that use a synonym of a query term (better recall).

        Parameters:
          query (str): The original user query.

        Returns:
          str: The expanded query as a single space-separated string.
        """
        expanded_words = []
        for word in query.split():
            expanded_words.append(word)
            # .get() with an empty default covers words that have no synonyms.
            expanded_words.extend(synonym_dict.get(word, []))
        return ' '.join(expanded_words)

    def show_top(scores, heading):
        """Print `heading`, then the `top_n` highest-scoring documents with their scores."""
        # sorted() is stable, so documents with equal scores keep corpus order.
        ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
        print(heading)
        for idx in ranked:
            print(f"Score: {scores[idx]:.2f}  |  Document: {documents[idx]}")

    # -----------------------------
    # Step 3: Read and Expand the Query.
    # -----------------------------
    # Lowercase the query so it matches the lowercased documents and dictionary keys.
    query = input("\nEnter a query for advanced retrieval: ").lower()
    expanded_query = expand_query(query)
    # Show the expanded query to the user for transparency.
    print("Expanded Query:", expanded_query)

    # -----------------------------
    # Step 4: Let the User Choose the Ranking Method and Display Top Results.
    # -----------------------------
    method = input("\nChoose ranking method ('bm25' or 'tfidf'): ").strip().lower()

    raw_top_n = input("How many top documents would you like to see? ")
    try:
        top_n = int(raw_top_n)
    except ValueError:
        top_n = 0  # Not a number: handled by the range check below.
    if top_n < 1:
        # Zero or negative counts would otherwise reach the slice in show_top:
        # 0 shows nothing and e.g. -1 silently drops only the last-ranked document.
        print("Invalid number. Defaulting to top 3 documents.")
        top_n = 3

    # Only the scores for the selected method are computed.
    if method == 'bm25':
        # BM25 works on token lists: lowercase and tokenize the corpus, then score
        # the tokenized expanded query against every document.
        tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]
        bm25 = BM25Okapi(tokenized_docs)
        bm25_scores = bm25.get_scores(word_tokenize(expanded_query))
        show_top(bm25_scores, "\nTop Documents using BM25:")
    elif method == 'tfidf':
        # Project the expanded query into the fitted TF-IDF space and compare it
        # with each document vector by cosine similarity.
        query_vector = vectorizer.transform([expanded_query])
        cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
        show_top(cosine_similarities, "\nTop Documents using TF-IDF (Cosine Similarity):")
    else:
        # If the user did not choose a valid ranking method, inform them.
        print("Invalid ranking method selected. Please choose either 'bm25' or 'tfidf'.")

# ---------------------------------------------------------
# Execute the Advanced Exercise if this script is run directly.
# ---------------------------------------------------------
# The guard keeps the interactive prompts in exercise6_advanced() from running when this
# file is imported as a module; the exercise starts only when the file is the main program.
if __name__ == "__main__":
    exercise6_advanced()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jasmine.frantz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Exercise 6: Extend the System (Advanced)

Using custom stop words: ['the', 'over', 'a', 'an']

TF-IDF Feature Names (with custom stop words removed):
['all' 'amaze' 'asked' 'boxing' 'brown' 'by' 'dog' 'enemy' 'experts'
 'five' 'fox' 'gunboats' 'jeopardize' 'judge' 'jump' 'jumps' 'lazy'
 'movement' 'never' 'of' 'questions' 'quick' 'quickly' 'six' 'watched'
 'will' 'wizards']
