# Keyword Search

In [1]:
# Built-in library
import re
import json
from typing import Any, Optional, TypeAlias, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas display limits: render long/wide frames and long cell text without truncation
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Suppress every warning category for the rest of the session
warnings.filterwarnings(action="ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
from rank_bm25 import BM25Okapi, BM25L


# Three toy sentences that serve as the document collection
corpus: list[str] = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?",
]

# Naive tokenization: cut every sentence on single spaces, so punctuation
# stays glued to the neighbouring word and casing is preserved.
tokenized_corpus: list[list[str]] = list(
    map(lambda sentence: sentence.split(" "), corpus)
)
tokenized_corpus

[['Hello', 'there', 'good', 'man!'],
 ['It', 'is', 'quite', 'windy', 'in', 'London'],
 ['How', 'is', 'the', 'weather', 'today?']]

In [3]:
# Index the whitespace-tokenized documents with the Okapi variant of BM25
bm25 = BM25Okapi(corpus=tokenized_corpus)
bm25

<rank_bm25.BM25Okapi at 0x7f8a3a42f220>

### Note

- The query must be tokenized and preprocessed with exactly the same steps as the documents; otherwise query tokens and document tokens cannot match, and the comparison is no longer apples-to-apples.

In [4]:
# Tokenize the query the same way as the documents (split on single spaces)
query: str = "windy London"
tokenized_query: list[str] = query.split(" ")

# Score the query against the indexed documents.
# `np.float64` is used because the `np.float_` alias was removed in NumPy 2.0;
# evaluating the old annotation raises AttributeError there.
doc_scores: npt.NDArray[np.float64] = bm25.get_scores(tokenized_query)
doc_scores

array([0.        , 0.93729472, 0.        ])

In [5]:
# Instead of inspecting the raw scores, `get_top_n` can be used to retrieve
# the `n` best-matching entries of `documents` for the tokenized query directly.
bm25.get_top_n(query=tokenized_query, documents=corpus, n=1)

['It is quite windy in London']

<hr><br>

## Using spaCy and Another Variant of BM25 (BM25L)

In [6]:
import spacy

from spacy.tokens.doc import Doc
from spacy.lang.en import English


# Load the spaCy model.
# The components listed in `disable` are switched off at load time; the cells
# below only ever call `nlp.tokenizer`.
nlp: English = spacy.load(
    "en_core_web_sm",
    disable=["tagger", "lemmatizer", "parser", "ner", "attribute_ruler"],
)

# Process a text by calling the tokenizer directly rather than the pipeline
text: str = "This is a sentence."
doc: Doc = nlp.tokenizer(text)

# Only the tokenizer is applied: show the raw text of each token
print([x.text for x in doc])

2023-12-08 04:33:18.320321: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Re-tokenize the corpus with spaCy's tokenizer, keeping the raw text of each token
tokenized_corpus: list[list[str]] = [
    [tok.text for tok in spacy_doc] for spacy_doc in map(nlp.tokenizer, corpus)
]
tokenized_corpus

[['Hello', 'there', 'good', 'man', '!'],
 ['It', 'is', 'quite', 'windy', 'in', 'London'],
 ['How', 'is', 'the', 'weather', 'today', '?']]

In [8]:
class Tokenizer:
    """Lower-casing tokenizer for a corpus of raw text documents.

    Tokenization is delegated to ``nlp.tokenizer``, where ``nlp`` is a
    module-level object that must exist before any tokenizing method is
    called. The text of every token is lower-cased.

    Parameters
    ----------
    corpus : list[str]
        The raw documents. The list is stored by reference (not copied) and is
        expected to be re-iterable.
    """

    def __init__(self, corpus: list[str]) -> None:
        self.corpus = corpus
        # Memo for `tok_corpus`: (snapshot of the corpus, its tokens) or None.
        self._tok_cache: Optional[tuple[tuple[str, ...], list[list[str]]]] = None

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(num_corpus={len(self.corpus)})"

    def tokenize_corpus(self) -> list[list[str]]:
        """Tokenize the entire corpus.

        Always recomputes; use the ``tok_corpus`` property for a memoised view.

        Returns
        -------
        list[list[str]]
            One list of lower-cased tokens per document, in corpus order.
        """
        return [self._tokenized_document(document=text) for text in self.corpus]

    def tokenized_doc(self, document: str) -> list[str]:
        """Tokenize a single document or query the same way as the corpus.

        Parameters
        ----------
        document : str
            Raw text to tokenize.

        Returns
        -------
        list[str]
            The lower-cased tokens of ``document``.
        """
        return self._tokenized_document(document=document)

    @staticmethod
    def _tokenized_document(document: str) -> list[str]:
        """Helper that tokenizes one text with the global ``nlp.tokenizer``."""
        return [token.text.lower() for token in nlp.tokenizer(document)]

    @property
    def tok_corpus(self) -> list[list[str]]:
        """Tokenized corpus, computed once and reused on later accesses.

        The result is re-computed only when the contents of ``self.corpus``
        differ from the snapshot taken at the previous computation. Fresh
        copies of the token lists are returned, so callers may mutate the
        result without corrupting the memo.
        """
        snapshot = tuple(self.corpus)
        # getattr keeps instances created without __init__ working.
        cache = getattr(self, "_tok_cache", None)
        if cache is None or cache[0] != snapshot:
            cache = (snapshot, self.tokenize_corpus())
            self._tok_cache = cache
        return [list(tokens) for tokens in cache[1]]

In [9]:
# Wrap the current corpus in the lower-casing `Tokenizer`;
# the bare name on the last line displays its repr.
tokenizer: Tokenizer = Tokenizer(corpus=corpus)
tokenizer

Tokenizer(num_corpus=3)

In [10]:
# Tokenize every document in the corpus; each token's text is lower-cased.
# This rebinds `tokenized_corpus`, replacing the earlier tokenization.
tokenized_corpus: list[list[str]] = tokenizer.tokenize_corpus()
tokenized_corpus

[['hello', 'there', 'good', 'man', '!'],
 ['it', 'is', 'quite', 'windy', 'in', 'london'],
 ['how', 'is', 'the', 'weather', 'today', '?']]

In [11]:
# Prepare the query with the same tokenizer that processed the corpus,
# so that casing and token boundaries line up with the indexed documents.
query: str = "windy London"
tokenized_query: list[str] = tokenizer.tokenized_doc(document=query)

# Build the BM25L index over the lower-cased, spaCy-tokenized corpus
bm25l = BM25L(corpus=tokenized_corpus)

tokenized_query

['windy', 'london']

In [12]:
# Score the tokenized query against the BM25L index.
# `np.float64` is used because the `np.float_` alias was removed in NumPy 2.0;
# evaluating the old annotation raises AttributeError there.
doc_scores: npt.NDArray[np.float64] = bm25l.get_scores(query=tokenized_query)
doc_scores

array([0.        , 2.41704352, 0.        ])

In [13]:
# Retrieve the single best match; passing `documents=corpus` returns the
# original sentence rather than its token list.
bm25l.get_top_n(query=tokenized_query, documents=corpus, n=1)

['It is quite windy in London']

In [14]:
# A larger corpus for probing keyword search.
# The sentence tagged "A" is the reference; the two tagged "similar to A"
# describe a comparable scene using different vocabulary.
corpus: list[str] = [
    "The quick brown fox jumps over the lazy dog.",
    "The enigmatic sphinx posed an unanswerable riddle to the trembling travelers.",
    "The bustling city lights glimmered through the rain-streaked windowpane.",
    "The concert was a resounding success, leaving the audience cheering for more.",  # A
    "The enigmatic symbols on the ancient scroll remained indecipherable for centuries.",
    "The music show was a smashing hit, leaving the crowd ecstatic and clamoring for an encore.",  # similar to A
    "The chirping of crickets and the gentle rustling of leaves filled the night air.",
    "With a heavy heart, he bid farewell to his beloved companion.",
    "The intricate clockwork mechanism whirred and buzzed, a marvel of human ingenuity.",
    "A wave of cheers and enthusiastic shouts erupted from the captivated crowd, urging the artists to return.",  # similar to A
]

In [15]:
# Rebuild the tokenizer for the new corpus and tokenize it
# (both `tokenizer` and `tokenized_corpus` are rebound here).
tokenizer: Tokenizer = Tokenizer(corpus=corpus)
tokenized_corpus: list[list[str]] = tokenizer.tokenize_corpus()
# Spot-check: show the tokens of the last two documents
print(tokenized_corpus[-2:])

In [16]:
# Tokenize the query with the tokenizer built for the new corpus
query: str = "Amazing concert"
tokenized_query: list[str] = tokenizer.tokenized_doc(document=query)

# Re-index: BM25Okapi over the tokenized ten-sentence corpus
# (rebinds `bm25`, replacing the earlier index)
bm25 = BM25Okapi(corpus=tokenized_corpus)

tokenized_query

['amazing', 'concert']

In [17]:
# Ask for the top 3 matches; `documents=tokenized_corpus` means the hits come
# back as token lists rather than the original sentences.
# Author's observation: it only accurately matches 2 out of the 3 correct
# documents (the sentence tagged "A" and the two tagged "similar to A").
# NOTE(review): the cell output is not shown here; of the query tokens, only
# "concert" occurs in the corpus, and only in sentence A - confirm how the
# remaining slots are filled.
results: list[list[str]] = bm25.get_top_n(
    query=tokenized_query, documents=tokenized_corpus, n=3
)

print(results)