# Introduction To Search

Using:
- **Count**
- **Term Frequency**
- **Dampened Term Frequency**
- **Term Frequency Inverse Document Frequency**

<br>

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.26.0

numpy    : 1.26.4
pandas   : 2.2.2
polars   : 1.0.0
torch    : 2.2.2
lightning: 2.3.2

conda environment: ai_search



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Named styles (hex colours) registered with the Rich console created below.
custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
# Console instance that resolves the style names defined in `custom_theme`.
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
# Placeholder token for out-of-vocabulary words (not used further in this cell).
unk_token: str = "ukn"

# Toy corpus: a single, already tokenized sentence.
text: list[str] = ["i", "am", "very", "happy", "today"]
# sorted() returns a list, so the unique words end up in alphabetical order.
vocab_set: list[str] = sorted(set(text))
# Map every unique word to its position in the sorted list.
vocab: dict[str, int] = {word: idx for idx, word in enumerate(vocab_set, start=0)}

vocab

{'am': 0, 'happy': 1, 'i': 2, 'today': 3, 'very': 4}

In [4]:
# Split on common punctuation marks, on "--", or on any whitespace character.
# The capturing group makes re.split keep the separators in its result.
pattern: str = r'([,.?_!"()\']|--|\s)'
result: list[str] = re.split(pattern=pattern, string=" ".join(text))
# Discard empty strings and whitespace-only separators.
result = [x for x in result if x.strip()]
result

['i', 'am', 'very', 'happy', 'today']

In [5]:
import string


def tokenize(doc: str | list[str], drop_punct: bool = True) -> list[str]:
    """
    Tokenize the input document and optionally remove punctuation.

    Args:
        doc (str | list[str]): The input document as a string or list of strings.
        drop_punct (bool, optional): Whether to remove punctuation. Defaults to True.

    Returns:
        list[str]: The tokenized document as a list of strings.
    """
    # Pattern for separating tokens
    pattern: str = r'([,.?_!"()\':]|\s)'

    if isinstance(doc, str):
        tok_doc: list[str] = re.split(pattern=pattern, string=doc.lower())

    if isinstance(doc, list):
        doc = [word.lower() for word in doc]
        tok_doc = re.split(pattern=pattern, string=" ".join(doc))

    # Remove whitespaces and empty strings
    tok_doc = [word for word in tok_doc if word.strip()]

    if drop_punct:
        tok_doc = [word for word in tok_doc if word not in string.punctuation]

    return tok_doc


def flatten_documents(docs: list[list[str]]) -> list[str]:
    """
    Flatten a list of documents into a single list of lowercased words.

    Args:
        docs (list[list[str]]): Documents, each given as a list of strings.

    Returns:
        list[str]: Every string of every document, lowercased, in input order.

    Raises:
        AssertionError: If any element of `docs` is not a list.
    """
    assert all(isinstance(row, list) for row in docs), "Not all elements are lists"

    words: list[str] = []
    for row in docs:
        words.extend(word.lower() for word in row)
    return words


def generate_vocab(docs: list[list[str]], drop_punct: bool = True) -> dict[str, int]:
    """
    Generate a vocabulary dictionary from the given documents.

    The words are lowercased, tokenized, de-duplicated and sorted
    alphabetically; the unknown token ``"ukn"`` is always appended last.
    Prints the resulting vocabulary size.

    Args:
        docs (list[list[str]]): The input documents as a list of lists of strings.
        drop_punct (bool, optional): Whether to drop punctuation during tokenization. Defaults to True.

    Returns:
        dict[str, int]: A dictionary mapping words to their indices in the vocabulary.
            Indices are contiguous, from 0 to ``len(vocab) - 1``.
    """

    unk_tok: str = "ukn"
    flattened_doc: list[str] = flatten_documents(docs)
    unique_tokens: set[str] = set(tokenize(flattened_doc, drop_punct=drop_punct))
    # If the corpus itself contains the unknown token, drop it here so that
    # re-adding it below cannot leave a gap (and an out-of-range index).
    unique_tokens.discard(unk_tok)

    vocab: dict[str, int] = {
        word: idx for idx, word in enumerate(sorted(unique_tokens), start=0)
    }
    # Add unknown token
    vocab[unk_tok] = len(vocab)
    print(f"Vocab size: {len(vocab)}")
    return vocab


def encode(doc: list[list[str]], vocab: dict[str, int]) -> np.ndarray:
    """
    Encode a document as a binary term-occurrence vector.

    This particular implementation checks for the occurrence of a term: a
    column is set to 1 if the term appears at least once, regardless of how
    often. Tokens that are not in `vocab` are ignored.

    Args:
        doc (list[list[str]]): The document as a list of lists of strings; each
            inner list is tokenized with `tokenize`.
        vocab (dict[str, int]): Mapping from word to column index.

    Returns:
        np.ndarray: Integer array of shape (1, len(vocab)) holding 0/1 flags.
    """
    arr: np.ndarray = np.zeros((1, len(vocab)), dtype=int)
    for row in doc:
        for word in tokenize(row):
            # Membership is checked first, so a direct lookup is safe.
            if word in vocab:
                arr[0, vocab[word]] = 1
    return arr


def encode_n_create_df(doc: list[list[str]], vocab: dict[str, int]) -> pl.DataFrame:
    """
    Encode `doc` against `vocab` and wrap the result in a Polars DataFrame.

    Args:
        doc (list[list[str]]): The document to encode.
        vocab (dict[str, int]): Mapping from word to column index.

    Returns:
        pl.DataFrame: The encoded vector, with columns named after the
            vocabulary keys in dictionary order.
    """
    encoded = encode(doc=doc, vocab=vocab)
    frame: pl.DataFrame = pl.DataFrame(encoded)
    frame.columns = [*vocab]
    return frame

In [6]:
# Build a vocabulary from a one-sentence corpus (punctuation is dropped).
docs: list[list[str]] = [["Hey! Tell me something about neidu."]]
vocab: dict[str, int] = generate_vocab(docs=docs, drop_punct=True)
print(f"{vocab = }")

# The query repeats "tell" in two casings; the encoding is binary, so the
# "tell" column is simply set to 1.
encode(doc=[["tell Tell"]], vocab=vocab)

Vocab size: 7
vocab = {'about': 0, 'hey': 1, 'me': 2, 'neidu': 3, 'something': 4, 'tell': 5, 'ukn': 6}


array([[0, 0, 0, 0, 0, 1, 0]])

In [7]:
encode_n_create_df(doc=[["about Tell"]], vocab=vocab)

# vocab

about,hey,me,neidu,something,tell,ukn
i64,i64,i64,i64,i64,i64,i64
1,0,0,0,0,1,0


### Comment

#### [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity#:~:text=Cosine%20similarity%20is%20the%20cosine,but%20only%20on%20their%20angle.)

- Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product of the vectors divided by the product of their lengths.
- It follows that the cosine similarity does not depend on the magnitudes of the vectors, but only on their angle.

$$\text{cosSimilarity} = \cos(\theta) = \frac{A \cdot B}{\|A\| \, \|B\|}$$

- $||A||$ is the length (a.k.a. the norm) of vector $A$, and $||B||$ is the length of vector $B$.

In [8]:
from numpy.linalg import norm


def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> float:
    """
    Compute the cosine similarity between two 1-D vectors.

    Args:
        vector1 (np.ndarray): First vector.
        vector2 (np.ndarray): Second vector, same length as `vector1`.

    Returns:
        float: The dot product divided by the product of the two norms.
            Returns 0.0 when either vector has zero length, since the angle
            is undefined there and the division would otherwise yield NaN.
    """
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        return 0.0
    return float(np.dot(vector1, vector2) / denominator)


def check_equality(vector_1: np.ndarray, vector_2: np.ndarray) -> bool:
    """
    Report whether two arrays have the same shape and the same elements.

    Prints the outcome as ``result = <bool>`` and also returns it.

    Args:
        vector_1 (np.ndarray): First array.
        vector_2 (np.ndarray): Second array.

    Returns:
        bool: True if the arrays are equal, False otherwise.
    """
    result: bool = np.array_equal(a1=vector_1, a2=vector_2)
    print(f"{result = }")
    return result

In [9]:
cosine_similarity(vector1=np.array([1, 2, 3]), vector2=np.array([4, 5, 6]))

0.9746318461970762

In [10]:
# Document 1: a lunch order in which "apple juice" appears once.
doc1: list[str] = [
    (
        "Lynn: ham and cheese sandwich, chocolate cookie, ice water. "
        "Brian: turkey avocado sandwich, plain potato chips, apple juice "
        "Mohammed: grilled chicken salad, fruit cup, lemonade "
    )
]

# Document 2: a product description that mentions "apple juice" three times.
doc2: list[str] = [
    (
        "Orchard Farms apple juice is premium, organic apple juice made from the "
        "freshest apples, never from concentrate. Its juice has received the "
        "regional award for best apple juice three years in a row. "
    )
]


# The vocabulary is built over both documents together.
docs: list[list[str]] = [doc1, doc2]
vocab: dict[str, int] = generate_vocab(docs=docs)

# Encode the query and each document as binary occurrence vectors over the
# shared vocabulary (each has shape (1, len(vocab))).
query: list[list[str]] = [["apple juice"]]
query_vector: np.ndarray = encode(doc=query, vocab=vocab)
doc1_vector: np.ndarray = encode(doc=[doc1], vocab=vocab)
doc2_vector: np.ndarray = encode(doc=[doc2], vocab=vocab)

print(query_vector)

Vocab size: 49
[[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [11]:
console.print(list(vocab.keys()))

In [12]:
# squeeze() turns the (1, n_terms) encodings into 1-D vectors for the dot product.
# Both documents contain both query terms, and the binary encoding ignores how
# often a term occurs, so the scores differ only through the document norms.
doc1_score = cosine_similarity(query_vector.squeeze(), doc1_vector.squeeze())
doc2_score = cosine_similarity(query_vector.squeeze(), doc2_vector.squeeze())

doc1_score, doc2_score

(0.2886751345948129, 0.2773500981126146)

In [13]:
encode_n_create_df(doc=[doc1], vocab=vocab)
# encode(doc=[doc1], vocab=vocab)

a,and,apple,apples,avocado,award,best,brian,cheese,chicken,chips,chocolate,concentrate,cookie,cup,farms,for,freshest,from,fruit,grilled,ham,has,ice,in,is,its,juice,lemonade,lynn,made,mohammed,never,orchard,organic,plain,potato,premium,received,regional,row,salad,sandwich,the,three,turkey,water,years,ukn
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,1,1,0,1,0,0,1,1,1,1,1,0,1,1,0,0,0,0,1,1,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0


In [14]:
encode_n_create_df(doc=[doc2], vocab=vocab)

a,and,apple,apples,avocado,award,best,brian,cheese,chicken,chips,chocolate,concentrate,cookie,cup,farms,for,freshest,from,fruit,grilled,ham,has,ice,in,is,its,juice,lemonade,lynn,made,mohammed,never,orchard,organic,plain,potato,premium,received,regional,row,salad,sandwich,the,three,turkey,water,years,ukn
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
1,0,1,1,0,1,1,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,1,1,1,0,0,1,0,1,1,1,0,0,1,1,1,1,0,0,1,1,0,0,1,0


In [15]:
check_equality(doc1_vector.squeeze(), doc1_vector.squeeze())
check_equality(doc1_vector.squeeze(), doc2_vector.squeeze())

result = True
result = False


False

<hr>

### [Term Frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)

- Term frequency, tf(t,d), is the relative frequency of term t within document d.
- where $f_{t,d}$ is the number of times term t appears in document d and $\sum_{t' \in d} f_{t',d}$ is the total number of terms in document d.

$$tf(t,d) = \frac{f_{t,d}}{\sum_{t' \in d} f_{t',d}}$$

#### Dampen the Term Frequency

- Reduce weight of common words within documents.
- i.e. Lower importance of frequent terms per document.

$$log_{tf} = 1 + log(freq_{count} + 1)$$

- Here, 1 is added to both the logarithm and the frequency count to avoid taking the log of zero and to dampen the effect of term frequency.

In [16]:
# Three small documents used for the term-frequency examples that follow.
doc1: list[str] = [
    (
        "Thank you Jesus for the gift of today. I am grateful. "
        "It is a definitely a beautiful day."
    )
]
doc2: list[str] = [
    (
        "Consistency, the bedrock of progress and reliability, fosters trust, "
        "builds habits, and drives success across all endeavors, from personal "
        "growth to professional achievements, enabling steady improvement and "
        "creating a foundation for excellence in life's myriad pursuits."
    )
]
# The third document repeats a few words many times ("the", "cat", ...).
doc3: list[str] = [
    (
        "the cat sat on the mat the dog sat on the log "
        "and the cat chased the mouse and the dog chased the cat"
    )
]
# Each document becomes a list of lowercase tokens with punctuation removed.
corpus: list[list[str]] = [tokenize(doc1), tokenize(doc2), tokenize(doc3)]
vocab: dict[str, int] = generate_vocab(docs=corpus)
print(f"{vocab = }\n")
print(f"{corpus = }\n")

Vocab size: 57
vocab = {'a': 0, 'achievements': 1, 'across': 2, 'all': 3, 'am': 4, 'and': 5, 'beautiful': 6, 'bedrock': 7, 'builds': 8, 'cat': 9, 'chased': 10, 'consistency': 11, 'creating': 12, 'day': 13, 'definitely': 14, 'dog': 15, 'drives': 16, 'enabling': 17, 'endeavors': 18, 'excellence': 19, 'for': 20, 'fosters': 21, 'foundation': 22, 'from': 23, 'gift': 24, 'grateful': 25, 'growth': 26, 'habits': 27, 'i': 28, 'improvement': 29, 'in': 30, 'is': 31, 'it': 32, 'jesus': 33, 'life': 34, 'log': 35, 'mat': 36, 'mouse': 37, 'myriad': 38, 'of': 39, 'on': 40, 'personal': 41, 'professional': 42, 'progress': 43, 'pursuits': 44, 'reliability': 45, 's': 46, 'sat': 47, 'steady': 48, 'success': 49, 'thank': 50, 'the': 51, 'to': 52, 'today': 53, 'trust': 54, 'you': 55, 'ukn': 56}

corpus = [['thank', 'you', 'jesus', 'for', 'the', 'gift', 'of', 'today', 'i', 'am', 'grateful', 'it', 'is', 'a', 'definitely', 'a', 'beautiful', 'day'], ['consistency', 'the', 'bedrock', 'of', 'progress', 'and', 'reli

In [17]:
# Raw term counts, shape (n_docs, n_terms).
freq_count: np.ndarray = np.zeros((len(corpus), len(vocab)), dtype=np.int32)

# Count how often each vocabulary word occurs in each document.
for idx, doc in enumerate(corpus, start=0):
    for word in doc:
        # Direct lookup: every corpus token is in `vocab`, which was built from `corpus`.
        word_idx = vocab[word]
        freq_count[idx, word_idx] += 1


# Relative term frequency: divide each row by its document's total token count,
# so every row sums to 1.
term_freq: np.ndarray = freq_count / np.sum(freq_count, axis=1, keepdims=True)
term_freq

array([[0.11111111, 0.        , 0.        , 0.        , 0.05555556,
        0.        , 0.05555556, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.05555556, 0.05555556,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.05555556, 0.        , 0.        , 0.        , 0.05555556,
        0.05555556, 0.        , 0.        , 0.05555556, 0.        ,
        0.        , 0.05555556, 0.05555556, 0.05555556, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.05555556,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.05555556, 0.05555556, 0.        , 0.05555556, 0.        ,
        0.05555556, 0.        ],
       [0.02702703, 0.02702703, 0.02702703, 0.02702703, 0.        ,
        0.08108108, 0.        , 0.02702703, 0.02702703, 0.        ,
        0.        , 0.02702703, 0.02702703, 0.        , 0.        ,
        0.     

In [18]:
# Dampening: log(1 + tf) is applied to the relative term frequencies.
log_term_freq: np.ndarray = np.log1p(term_freq)
# Re-normalize so that each document's dampened frequencies sum to 1 again.
dampened_tf: np.ndarray = log_term_freq / np.sum(log_term_freq, axis=1, keepdims=True)
dampened_tf

array([[0.10857028, 0.        , 0.        , 0.        , 0.05571436,
        0.        , 0.05571436, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.05571436, 0.05571436,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.05571436, 0.        , 0.        , 0.        , 0.05571436,
        0.05571436, 0.        , 0.        , 0.05571436, 0.        ,
        0.        , 0.05571436, 0.05571436, 0.05571436, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.05571436,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.05571436, 0.05571436, 0.        , 0.05571436, 0.        ,
        0.05571436, 0.        ],
       [0.02708311, 0.02708311, 0.02708311, 0.02708311, 0.        ,
        0.07917434, 0.        , 0.02708311, 0.02708311, 0.        ,
        0.        , 0.02708311, 0.02708311, 0.        , 0.        ,
        0.     

In [19]:
freq_count

array([[2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0],
       [1, 1, 1, 1, 0, 3, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0,
        0, 0, 0, 2, 0, 0, 0, 8, 0, 0, 0, 0, 0]], dtype=int32)

In [20]:
from collections import Counter


counter: dict = Counter(flatten_documents(corpus))
counter

Counter({'the': 10,
         'and': 5,
         'a': 3,
         'cat': 3,
         'for': 2,
         'of': 2,
         'sat': 2,
         'on': 2,
         'dog': 2,
         'chased': 2,
         'thank': 1,
         'you': 1,
         'jesus': 1,
         'gift': 1,
         'today': 1,
         'i': 1,
         'am': 1,
         'grateful': 1,
         'it': 1,
         'is': 1,
         'definitely': 1,
         'beautiful': 1,
         'day': 1,
         'consistency': 1,
         'bedrock': 1,
         'progress': 1,
         'reliability': 1,
         'fosters': 1,
         'trust': 1,
         'builds': 1,
         'habits': 1,
         'drives': 1,
         'success': 1,
         'across': 1,
         'all': 1,
         'endeavors': 1,
         'from': 1,
         'personal': 1,
         'growth': 1,
         'to': 1,
         'professional': 1,
         'achievements': 1,
         'enabling': 1,
         'steady': 1,
         'improvement': 1,
         'creating': 1,
      

In [21]:
freq_df: pl.DataFrame = pl.DataFrame(freq_count)
freq_df.columns = [*vocab.keys()]
freq_df

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you,ukn
i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
2,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0
1,1,1,1,0,3,0,1,1,0,0,1,1,0,0,0,1,1,1,1,1,1,1,1,0,0,1,1,0,1,1,0,0,0,1,0,0,0,1,1,0,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0
0,0,0,0,0,2,0,0,0,3,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,2,0,0,0,0,0,0,2,0,0,0,8,0,0,0,0,0


In [22]:
dampened_tf_df: pl.DataFrame = pl.DataFrame(dampened_tf)
dampened_tf_df.columns = [*vocab.keys()]
dampened_tf_df

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you,ukn
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.10857,0.0,0.0,0.0,0.055714,0.0,0.055714,0.0,0.0,0.0,0.0,0.0,0.0,0.055714,0.055714,0.0,0.0,0.0,0.0,0.0,0.055714,0.0,0.0,0.0,0.055714,0.055714,0.0,0.0,0.055714,0.0,0.0,0.055714,0.055714,0.055714,0.0,0.0,0.0,0.0,0.0,0.055714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055714,0.055714,0.0,0.055714,0.0,0.055714,0.0
0.027083,0.027083,0.027083,0.027083,0.0,0.079174,0.0,0.027083,0.027083,0.0,0.0,0.027083,0.027083,0.0,0.0,0.0,0.027083,0.027083,0.027083,0.027083,0.027083,0.027083,0.027083,0.027083,0.0,0.0,0.027083,0.027083,0.0,0.027083,0.027083,0.0,0.0,0.0,0.027083,0.0,0.0,0.0,0.027083,0.027083,0.0,0.027083,0.027083,0.027083,0.027083,0.027083,0.027083,0.0,0.027083,0.027083,0.0,0.027083,0.027083,0.0,0.027083,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.086239,0.0,0.0,0.0,0.126902,0.086239,0.0,0.0,0.0,0.0,0.086239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043982,0.043982,0.043982,0.0,0.0,0.086239,0.0,0.0,0.0,0.0,0.0,0.0,0.086239,0.0,0.0,0.0,0.309954,0.0,0.0,0.0,0.0,0.0


### Inverse Document Frequency

- The inverse document frequency is a `measure of how much information the word provides`, i.e., how common or rare it is across all documents.
- It is the logarithmically scaled inverse fraction of the documents that contain the word (obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient)

$$ n_{t} = |d \in D: t \in d| $$
$$ idf_{(t,D)} = log (\frac{N}{n_{t} + 1}) + 1 $$

- where $N$ is the total number of documents in the corpus, $t \in d$ means that term $t$ occurs in document $d$, $d \in D$ is a document in the corpus $D$, and 1 is added to the denominator to avoid division-by-zero errors.
- $n_{t}$ is the number of documents that contain the term $t$.

- If the term is not in the corpus, then $n_{t} = 0$ and the unsmoothed ratio $\frac{N}{n_{t}}$ leads to a division-by-zero.
  - It is therefore common to adjust the numerator and denominator by adding a smoothing term to avoid this.

In [23]:
N: int = len(corpus)  # number of documents
S_F: int = 1  # smoothing factor

# Number of documents containing a term: a term occurs in a document exactly
# when its term frequency is non-zero. Result has shape (1, n_terms).
doc_freq: np.ndarray = term_freq != 0
doc_freq = doc_freq.sum(axis=0, keepdims=True)

# Smoothed IDF, computed as log(1 + N / (doc_freq + S_F)); the smoothing factor
# keeps the denominator non-zero for terms that occur in no document.
idf: np.ndarray = np.log1p((N) / (doc_freq + S_F))
idf

array([[0.69314718, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.69314718, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.69314718, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.69314718,
        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.91629073, 0.55961579, 0.91629073, 0.91629073, 0.91629073,
        0.91629073, 1.38629436]])

In [24]:
idf_df: pl.DataFrame = pl.DataFrame(idf)
idf_df.columns = [*vocab.keys()]
idf_df

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you,ukn
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.693147,0.916291,0.916291,0.916291,0.916291,0.693147,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.693147,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.693147,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.916291,0.559616,0.916291,0.916291,0.916291,0.916291,1.386294


In [25]:
# Weight the dampened term frequencies by the IDF; the (1, n_terms) IDF row
# broadcasts across all documents.
tf_idf: np.ndarray = dampened_tf * idf
# Normalize so that each document's TF-IDF weights sum to 1.
tf_idf = tf_idf / tf_idf.sum(axis=-1, keepdims=True)

# One row per document, one column per vocabulary word.
tf_idf_df: pl.DataFrame = pl.DataFrame(tf_idf)
tf_idf_df.columns = [*vocab.keys()]
tf_idf_df

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you,ukn
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.088815,0.0,0.0,0.0,0.060249,0.0,0.060249,0.0,0.0,0.0,0.0,0.0,0.0,0.060249,0.060249,0.0,0.0,0.0,0.0,0.0,0.045577,0.0,0.0,0.0,0.060249,0.060249,0.0,0.0,0.060249,0.0,0.0,0.060249,0.060249,0.060249,0.0,0.0,0.0,0.0,0.0,0.045577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060249,0.036796,0.0,0.060249,0.0,0.060249,0.0
0.021557,0.028497,0.028497,0.028497,0.0,0.063019,0.0,0.028497,0.028497,0.0,0.0,0.028497,0.028497,0.0,0.0,0.0,0.028497,0.028497,0.028497,0.028497,0.021557,0.028497,0.028497,0.028497,0.0,0.0,0.028497,0.028497,0.0,0.028497,0.028497,0.0,0.0,0.0,0.028497,0.0,0.0,0.0,0.028497,0.021557,0.0,0.028497,0.028497,0.028497,0.028497,0.028497,0.028497,0.0,0.028497,0.028497,0.0,0.017404,0.028497,0.0,0.028497,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.076004,0.0,0.0,0.0,0.147844,0.100472,0.0,0.0,0.0,0.0,0.100472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051241,0.051241,0.051241,0.0,0.0,0.100472,0.0,0.0,0.0,0.0,0.0,0.0,0.100472,0.0,0.0,0.0,0.220542,0.0,0.0,0.0,0.0,0.0


In [26]:
tf_idf_df.with_columns(total=pl.sum_horizontal("*"))

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you,ukn,total
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.088815,0.0,0.0,0.0,0.060249,0.0,0.060249,0.0,0.0,0.0,0.0,0.0,0.0,0.060249,0.060249,0.0,0.0,0.0,0.0,0.0,0.045577,0.0,0.0,0.0,0.060249,0.060249,0.0,0.0,0.060249,0.0,0.0,0.060249,0.060249,0.060249,0.0,0.0,0.0,0.0,0.0,0.045577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060249,0.036796,0.0,0.060249,0.0,0.060249,0.0,1.0
0.021557,0.028497,0.028497,0.028497,0.0,0.063019,0.0,0.028497,0.028497,0.0,0.0,0.028497,0.028497,0.0,0.0,0.0,0.028497,0.028497,0.028497,0.028497,0.021557,0.028497,0.028497,0.028497,0.0,0.0,0.028497,0.028497,0.0,0.028497,0.028497,0.0,0.0,0.0,0.028497,0.0,0.0,0.0,0.028497,0.021557,0.0,0.028497,0.028497,0.028497,0.028497,0.028497,0.028497,0.0,0.028497,0.028497,0.0,0.017404,0.028497,0.0,0.028497,0.0,0.0,1.0
0.0,0.0,0.0,0.0,0.0,0.076004,0.0,0.0,0.0,0.147844,0.100472,0.0,0.0,0.0,0.0,0.100472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051241,0.051241,0.051241,0.0,0.0,0.100472,0.0,0.0,0.0,0.0,0.0,0.0,0.100472,0.0,0.0,0.0,0.220542,0.0,0.0,0.0,0.0,0.0,1.0


In [27]:
def to_lower(text: str | list[str]) -> str | list[str]:
    """
    Convert input text to lowercase.

    Args:
        text (str | list[str]): Input text or list of strings to convert.

    Returns:
        str | list[str]: Lowercase version of input text or list of lowercase strings.

    Raises:
        ValueError: If input is not a string or list of strings.
    """
    if isinstance(text, str):
        return text.lower()
    elif isinstance(text, list):
        return [t.lower() if isinstance(t, str) else t for t in text]
    else:
        raise TypeError(f"Expected str or list of str, got {type(text).__name__}")


to_lower("Hello World from Python")

'hello world from python'

<hr>

### Putting It Together

In [28]:
class TFIDF:
    """
    A class for calculating TF-IDF (Term Frequency-Inverse Document Frequency) scores
    for a corpus of documents.

    Words that are missing from the vocabulary are counted under the unknown
    token ``"ukn"``. Documents without any tokens produce all-zero rows.
    """

    # Vocabulary key under which out-of-vocabulary words are counted.
    _UNK_TOKEN: str = "ukn"

    def __init__(self, tok_corpus: list[list[str]], vocab: dict[str, int]) -> None:
        """
        Initialize the TFIDF class with a corpus and vocabulary.

        Args:
            tok_corpus (list[list[str]]): A list of documents, where each document is a list of tokenized words.
            vocab (dict[str, int]): A dictionary mapping words to their indices in the vocabulary.
        """
        self.corpus: list[list[str]] = tok_corpus
        self.vocab: dict[str, int] = vocab

    def _count_terms(self) -> np.ndarray:
        """
        Count how often each vocabulary term occurs in each document.

        Returns:
            np.ndarray: Integer count matrix with shape (n_docs, n_terms).

        Raises:
            KeyError: If a word is not in the vocabulary and the vocabulary has
                no unknown-token entry to fall back on.
        """
        # Looked up once; only required when an unknown word is actually seen.
        unk_idx: int | None = self.vocab.get(self._UNK_TOKEN)

        # (n_docs, n_terms)
        freq_count: np.ndarray = np.zeros(
            (len(self.corpus), len(self.vocab)), dtype=np.int32
        )
        for doc_idx, doc in enumerate(self.corpus):
            for word in doc:
                word_idx = self.vocab.get(word, unk_idx)
                if word_idx is None:
                    raise KeyError(self._UNK_TOKEN)
                freq_count[doc_idx, word_idx] += 1

        return freq_count

    @staticmethod
    def _normalize_rows(matrix: np.ndarray) -> np.ndarray:
        """Divide each row by its sum; rows that sum to zero stay all-zero."""
        row_totals: np.ndarray = matrix.sum(axis=1, keepdims=True)
        return np.divide(
            matrix,
            row_totals,
            out=np.zeros(matrix.shape, dtype=float),
            where=row_totals != 0,
        )

    @classmethod
    def _tf_from_counts(cls, freq_count: np.ndarray, dampen: bool) -> np.ndarray:
        """Turn a count matrix into row-normalized (optionally dampened) term frequencies."""
        if dampen:
            # Aggressively dampen the term frequencies.
            weights: np.ndarray = np.log1p(np.log1p(freq_count))
        else:
            weights = freq_count
        return cls._normalize_rows(weights)

    @staticmethod
    def _idf_from_counts(freq_count: np.ndarray) -> np.ndarray:
        """Compute the smoothed IDF row, shape (1, n_terms), from a count matrix."""
        n_docs: int = freq_count.shape[0]  # number of documents
        smoothing: int = 1  # smoothing factor

        # Number of documents containing a term
        doc_freq: np.ndarray = (freq_count > 0).sum(axis=0, keepdims=True)
        return np.log1p(n_docs / (doc_freq + smoothing))

    def _calculate_tf(self, dampen: bool = True) -> np.ndarray:
        """
        Calculate the term frequency (TF) for each term in each document.

        Args:
            dampen (bool): Whether to apply logarithmic dampening to the term frequencies.
            Defaults to True.

        Returns:
            np.ndarray: The term frequency matrix with shape (n_docs, n_terms).
        """
        return self._tf_from_counts(self._count_terms(), dampen)

    def _calculate_idf(self) -> np.ndarray:
        """
        Calculate the inverse document frequency (IDF) for each term.

        Returns:
            np.ndarray: The IDF values with shape (1, n_terms), computed as
                ``log(1 + N / (doc_freq + 1))``.
        """
        return self._idf_from_counts(self._count_terms())

    def calculate_tfidf(self, normalize: bool = True, dampen: bool = True) -> np.ndarray:
        """
        Calculate the TF-IDF scores for the corpus.

        Args:
            normalize (bool): Whether to rescale each document's scores so that
                they sum to 1. Defaults to True.
            dampen (bool): Whether to apply logarithmic dampening to the term
                frequencies. Defaults to True.

        Returns:
            np.ndarray: The TF-IDF matrix with shape (n_docs, n_terms).
        """
        # Count once and derive both TF and IDF from the same matrix.
        freq_count: np.ndarray = self._count_terms()
        tf_idf: np.ndarray = self._tf_from_counts(
            freq_count, dampen
        ) * self._idf_from_counts(freq_count)
        if normalize:
            tf_idf = self._normalize_rows(tf_idf)

        return tf_idf


def convert_array_to_df(array: np.ndarray, columns: list[str]) -> pl.DataFrame:
    """
    Wrap a NumPy array in a Polars DataFrame with the given column names.

    Args:
        array (np.ndarray): The array to convert.
        columns (list[str]): Names to assign to the DataFrame's columns.

    Returns:
        pl.DataFrame: The array's data as a DataFrame whose columns are `columns`.
    """
    frame: pl.DataFrame = pl.DataFrame(array)
    # Column names are assigned after construction, replacing the default names.
    frame.columns = columns
    return frame

In [29]:
# Recompute TF-IDF with the class. It dampens the raw counts with a double
# log1p, so the values differ from the manually computed `tf_idf_df`.
vectorizer: TFIDF = TFIDF(tok_corpus=corpus, vocab=vocab)
tf_idf: np.ndarray = vectorizer.calculate_tfidf(normalize=True)

# Reuse the helper defined alongside the class instead of repeating its body.
tf_idf_df_2: pl.DataFrame = convert_array_to_df(array=tf_idf, columns=[*vocab.keys()])
tf_idf_df_2

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you,ukn
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.06578,0.0,0.0,0.0,0.061772,0.0,0.061772,0.0,0.0,0.0,0.0,0.0,0.0,0.061772,0.061772,0.0,0.0,0.0,0.0,0.0,0.046729,0.0,0.0,0.0,0.061772,0.061772,0.0,0.0,0.061772,0.0,0.0,0.061772,0.061772,0.061772,0.0,0.0,0.0,0.0,0.0,0.046729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061772,0.037727,0.0,0.061772,0.0,0.061772,0.0
0.022165,0.0293,0.0293,0.0293,0.0,0.036608,0.0,0.0293,0.0293,0.0,0.0,0.0293,0.0293,0.0,0.0,0.0,0.0293,0.0293,0.0293,0.0293,0.022165,0.0293,0.0293,0.0293,0.0,0.0,0.0293,0.0293,0.0,0.0293,0.0293,0.0,0.0,0.0,0.0293,0.0,0.0,0.0,0.0293,0.022165,0.0,0.0293,0.0293,0.0293,0.0293,0.0293,0.0293,0.0,0.0293,0.0293,0.0,0.017895,0.0293,0.0,0.0293,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.08388,0.0,0.0,0.0,0.130099,0.110883,0.0,0.0,0.0,0.0,0.110883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078769,0.078769,0.078769,0.0,0.0,0.110883,0.0,0.0,0.0,0.0,0.0,0.0,0.110883,0.0,0.0,0.0,0.106182,0.0,0.0,0.0,0.0,0.0


### Applying `Dot Product` And `Term Count` Vectors To Search

In [30]:
def term_count(content: str, term: str) -> float:
    """
    Return how many tokens of `content` equal the lower-cased `term`.

    Args:
        content (str): The text to tokenize and search in.
        term (str): The term to look for; it is lower-cased before matching.

    Returns:
        float: The number of matching tokens, returned as a float.
    """
    tokens: list[str] = tokenize(content)
    needle: str = term.lower()
    n_matches: int = sum(1 for tok in tokens if tok == needle)

    return float(n_matches)

In [31]:
text: str = "Hello world, hi everyone in the world"
query: str = "hi world"

# Tokenize the query and give every query token a weight of 1.
tok_query: list[str] = tokenize(query)
query_vector: list[int] = [1] * len(tok_query)
# `term_count` returns a float, hence the float annotation.
count_hello: float = term_count(content=text, term="hello")

print(f"{tok_query = }")
print(f"{query_vector = }")
print(f"{count_hello = }")

tok_query = ['hi', 'world']
query_vector = [1, 1]
count_hello = 1.0


In [32]:
# Count every query token in `text`: "Hello world, hi everyone in the world"
[f"{tok}: {term_count(content=text, term=tok)}" for tok in tok_query]

['hi: 1.0', 'world: 2.0']

In [33]:
doc1: str = """In light of the big reveal in her interview, the interesting
          thing is that the person in the wrong probably made a good
          decision in the end."""
doc2: str = """My favorite book is the cat in the hat, which is about a crazy
          cat in a hat who breaks into a house and creates the craziest
          afternoon for two kids."""
doc3: str = """My careless neighbors apparently let a stray cat stay in their
          garage unsupervised, which resulted in my favorite hat that I
          let them borrow being ruined."""

query: str = "the cat in the hat"


# Query side: one unit weight per query token (repeated tokens are kept).
tok_query: list[str] = tokenize(query)
query_vector: list[int] = [1] * len(tok_query)
print(f"{tok_query = }")
print(f"{query_vector = }")

# Document side: raw count of every query token in each document.
doc_vectors: list[list[float]] = [
    [term_count(content=doc, term=tok) for tok in tok_query]
    for doc in (doc1, doc2, doc3)
]
print(f"{doc_vectors = }")

# Score = dot product of the query vector with each document's count vector.
doc_scores: list[float] = [np.dot(query_vector, vec) for vec in doc_vectors]
print()
print(f"{doc_scores = }")

tok_query = ['the', 'cat', 'in', 'the', 'hat']
query_vector = [1, 1, 1, 1, 1]
doc_vectors = [[5.0, 0.0, 4.0, 5.0, 0.0], [3.0, 2.0, 2.0, 3.0, 2.0], [0.0, 1.0, 2.0, 0.0, 1.0]]

doc_scores = [14.0, 12.0, 4.0]


#### Comment

- The current ranking scores documents by raw keyword counts, so the top-ranked document is not necessarily the most relevant one.
- A document that repeats common words (like "the" and "in") many times can outrank a document that contains every query term, including the rarer, more specific ones ("cat" and "hat").

<br>

### Replace `Term Count` With `Term Frequency`

In [34]:
def calculate_tf(
    tok_corpus: list[list[str]],
    vocab: dict[str, int],
    dampen: bool = True,
    unk_token: str = "ukn",
) -> np.ndarray:
    """
    Calculate the term frequency (TF) for each term in the corpus.

    Every row is normalized to sum to 1. A document without any tokens yields
    an all-zero row instead of NaNs.

    Args:
        tok_corpus (list[list[str]]): A list of tokenized documents.
        vocab (dict[str, int]): A dictionary mapping terms to their indices.
        dampen (bool, optional): Whether to apply logarithmic dampening. Defaults to True.
        unk_token (str, optional): Vocabulary entry that counts words missing
            from `vocab`. Defaults to "ukn".

    Returns:
        np.ndarray: The term frequency matrix with shape (n_docs, n_terms).

    Raises:
        KeyError: If a word is not in `vocab` and `vocab` has no `unk_token` entry.
    """
    # (n_docs, n_terms)
    freq_count: np.ndarray = np.zeros((len(tok_corpus), len(vocab)), dtype=np.int32)

    # Resolve the unknown-token index once; it is only required when an
    # out-of-vocabulary word is actually encountered.
    unk_idx: int | None = vocab.get(unk_token)

    for idx, doc in enumerate(tok_corpus, start=0):
        for word in doc:
            word_idx: int | None = vocab.get(word, unk_idx)
            if word_idx is None:
                raise KeyError(
                    f"{word!r} is not in vocab and vocab has no {unk_token!r} entry"
                )
            freq_count[idx, word_idx] += 1

    # Aggressive dampening: log1p applied twice to the raw counts.
    weights: np.ndarray = np.log1p(np.log1p(freq_count)) if dampen else freq_count

    row_totals: np.ndarray = np.sum(weights, axis=1, keepdims=True)
    # Weights are non-negative, so a zero total means an empty document;
    # dividing by 1 keeps that row at zero instead of producing NaN.
    safe_totals: np.ndarray = np.where(row_totals == 0, 1, row_totals)
    tf: np.ndarray = weights / safe_totals

    return tf


def calculate_idf(tok_corpus: list[list[str]], tf: np.ndarray) -> np.ndarray:
    """
    Compute a smoothed inverse document frequency (IDF) for every term.

    The value for a term is `log1p(N / (df + 1))`, where `N` is the number of
    documents and `df` is the number of rows of `tf` holding a positive entry
    for that term.

    Args:
        tok_corpus (list[list[str]]): Tokenized documents; only their count is used.
        tf (np.ndarray): Term frequency matrix, one row per document.

    Returns:
        np.ndarray: IDF values; axis 0 is reduced and kept as a length-1 dimension.
    """
    smoothing: int = 1
    n_docs: int = len(tok_corpus)

    # A term occurs in a document when its TF entry is strictly positive.
    docs_with_term: np.ndarray = np.sum(tf > 0, axis=0, keepdims=True)

    return np.log1p(n_docs / (docs_with_term + smoothing))


def calculate_tfidf(
    tf: np.ndarray, idf: np.ndarray, normalize: bool = True
) -> np.ndarray:
    """
    Calculate the TF-IDF scores for the corpus.

    Args:
        tf (np.ndarray): Term frequency matrix, one row per document.
        idf (np.ndarray): Inverse document frequency values, broadcast against `tf`.
        normalize (bool, optional): Whether to rescale every row so that it
            sums to 1. Defaults to True.

    Returns:
        np.ndarray: The TF-IDF matrix. When `normalize` is True, rows whose
        scores sum to zero are returned unscaled instead of becoming NaN.
    """
    tf_idf: np.ndarray = tf * idf
    if normalize:
        row_totals: np.ndarray = tf_idf.sum(axis=-1, keepdims=True)
        # Dividing by 1 leaves a zero-sum row (e.g. an empty document) untouched
        # and avoids a 0 / 0 division.
        safe_totals: np.ndarray = np.where(row_totals == 0, 1, row_totals)
        tf_idf = tf_idf / safe_totals

    return tf_idf

In [35]:
# Two tiny documents, each wrapped in its own single-element list.
docs: list[list[str]] = [
    ["my president is black"],
    ["i love jesus"],
]

# Tokenize every document, then build the vocabulary from the same corpus.
tok_corpus: list[list[str]] = [tokenize(doc=doc) for doc in docs]
print(f"{tok_corpus = }")
vocab: dict[str, int] = generate_vocab(docs=docs)
print(f"{vocab = }")

# Dampened term frequency; every token occurs once here, so each row is uniform.
tf: np.ndarray = calculate_tf(tok_corpus=tok_corpus, vocab=vocab, dampen=True)
tf.tolist()

tok_corpus = [['my', 'president', 'is', 'black'], ['i', 'love', 'jesus']]
Vocab size: 8
vocab = {'black': 0, 'i': 1, 'is': 2, 'jesus': 3, 'love': 4, 'my': 5, 'president': 6, 'ukn': 7}


[[0.25, 0.0, 0.25, 0.0, 0.0, 0.25, 0.25, 0.0],
 [0.0,
  0.3333333333333333,
  0.0,
  0.3333333333333333,
  0.3333333333333333,
  0.0,
  0.0,
  0.0]]

In [36]:
# Show the TF matrix as one {term: value} dictionary per row.
convert_array_to_df(array=tf, columns=list(vocab.keys())).to_dicts()

[{'black': 0.25,
  'i': 0.0,
  'is': 0.25,
  'jesus': 0.0,
  'love': 0.0,
  'my': 0.25,
  'president': 0.25,
  'ukn': 0.0},
 {'black': 0.0,
  'i': 0.3333333333333333,
  'is': 0.0,
  'jesus': 0.3333333333333333,
  'love': 0.3333333333333333,
  'my': 0.0,
  'president': 0.0,
  'ukn': 0.0}]

In [37]:
def calculate_custom_tf(
    tok_corpus: list[list[str]], vocab: dict[str, int], dampen: bool = True
) -> list[dict[str, float]]:
    """
    Compute term frequencies and return them as one dictionary per document.

    Args:
        tok_corpus (list[list[str]]): A list of tokenized documents.
        vocab (dict[str, int]): A dictionary mapping terms to their indices.
        dampen (bool, optional): Whether to apply logarithmic dampening. Defaults to True.

    Returns:
        list[dict[str, float]]: One mapping per row of the TF matrix, keyed by
        the vocabulary terms.
    """
    tf_matrix: np.ndarray = calculate_tf(tok_corpus, vocab, dampen)
    term_names: list[str] = list(vocab.keys())
    tf_frame = convert_array_to_df(tf_matrix, columns=term_names)
    return tf_frame.to_dicts()

In [38]:
# A single document in which "is", "a" and "man" each occur twice.
docs: list[list[str]] = [
    ["my president is a black man. he is also a kenyan man"],
]

# Tokenize the document, then build the vocabulary from the same corpus.
tok_corpus: list[list[str]] = [tokenize(doc=doc) for doc in docs]
print(f"{tok_corpus = }")
vocab: dict[str, int] = generate_vocab(docs=docs)
print(f"{vocab = }")

# `dampen` is left at its default (True), so the repeated terms are dampened.
tf: np.ndarray = calculate_tf(tok_corpus=tok_corpus, vocab=vocab)
print(f"{tf.tolist() = }")

tok_corpus = [['my', 'president', 'is', 'a', 'black', 'man', 'he', 'is', 'also', 'a', 'kenyan', 'man']]
Vocab size: 10
vocab = {'a': 0, 'also': 1, 'black': 2, 'he': 3, 'is': 4, 'kenyan': 5, 'man': 6, 'my': 7, 'president': 8, 'ukn': 9}
tf.tolist() = [[0.13769762363055363, 0.09781785485138984, 0.09781785485138984, 0.09781785485138984, 0.13769762363055363, 0.09781785485138984, 0.13769762363055363, 0.09781785485138984, 0.09781785485138984, 0.0]]


In [39]:
# Without dampening: counts divided by the row total
res: list[dict[str, float]] = calculate_custom_tf(
    tok_corpus=tok_corpus, vocab=vocab, dampen=False
)
console.print(f"[Without dampening]: \n{res}", style="error")

# With dampening: log1p applied twice to the counts before dividing by the row total
res: list[dict[str, float]] = calculate_custom_tf(
    tok_corpus=tok_corpus, vocab=vocab, dampen=True
)
console.print(f"[With dampening]: \n{res}", style="info")

In [40]:
doc1: str = """In light of the big reveal in her interview, the interesting
          thing is that the person in the wrong probably made a good
          decision in the end."""
doc2: str = """My favorite book is the cat in the hat, which is about a crazy
          cat in a hat who breaks into a house and creates the craziest
          afternoon for two kids."""
doc3: str = """My careless neighbors apparently let a stray cat stay in their
          garage unsupervised, which resulted in my favorite hat that I
          let them borrow being ruined."""
# Each document is wrapped in its own single-element list.
docs: list[list[str]] = [[doc1], [doc2], [doc3]]

query: str = "the cat in the hat"


tok_query: list[str] = tokenize(query)
# Query vector: a weight of 1 for every query token
query_vector: list[int] = [1 for _ in tok_query]
console.print(f"{tok_query = }")
print(f"{query_vector = }")

tok_corpus: list[list[str]] = [tokenize(doc) for doc in docs]
print(f"{tok_corpus = }")
vocab: dict[str, int] = generate_vocab(docs=docs)
print(f"{vocab = }")
# Dampened term frequencies, one {term: tf} dictionary per document.
tf_list: list[dict[str, float]] = calculate_custom_tf(
    tok_corpus=tok_corpus, vocab=vocab, dampen=True
)
print(f"{tf_list = }")

# Term frequency of each query token per document; a token that is missing
# from the row falls back to the `unk_token` column.
tf_vectors: list[list[float]] = [
    [row.get(tok, row[unk_token]) for tok in tok_query] for row in tf_list
]
console.print(f"{tf_vectors = }")

# Score = dot product of the query vector with each document's TF vector.
doc_scores: list[float] = [np.dot(query_vector, tf_vector) for tf_vector in tf_vectors]
print()
console.print(f"{doc_scores = }")

query_vector = [1, 1, 1, 1, 1]
tok_corpus = [['in', 'light', 'of', 'the', 'big', 'reveal', 'in', 'her', 'interview', 'the', 'interesting', 'thing', 'is', 'that', 'the', 'person', 'in', 'the', 'wrong', 'probably', 'made', 'a', 'good', 'decision', 'in', 'the', 'end'], ['my', 'favorite', 'book', 'is', 'the', 'cat', 'in', 'the', 'hat', 'which', 'is', 'about', 'a', 'crazy', 'cat', 'in', 'a', 'hat', 'who', 'breaks', 'into', 'a', 'house', 'and', 'creates', 'the', 'craziest', 'afternoon', 'for', 'two', 'kids'], ['my', 'careless', 'neighbors', 'apparently', 'let', 'a', 'stray', 'cat', 'stay', 'in', 'their', 'garage', 'unsupervised', 'which', 'resulted', 'in', 'my', 'favorite', 'hat', 'that', 'i', 'let', 'them', 'borrow', 'being', 'ruined']]
Vocab size: 55
vocab = {'a': 0, 'about': 1, 'afternoon': 2, 'and': 3, 'apparently': 4, 'being': 5, 'big': 6, 'book': 7, 'borrow': 8, 'breaks': 9, 'careless': 10, 'cat': 11, 'craziest': 12, 'crazy': 13, 'creates': 14, 'decision': 15, 'end': 16, 'favorite': 17




<br><br>

#### Comment

- The new way of weighting terms (dampened tf) reduces the ranking problem caused by `"the"` and `"in"` by giving less weight to words that are repeated many times within a single document.
- This puts the most relevant document (`doc2`) on top, but the improvement is not enough to fully offset the influence of the common words: `doc1` still receives a relatively high score.


### Applying TF-IDF To Search

In [41]:
def calculate_custom_tf_idf(
    tf: np.ndarray,
    idf: np.ndarray,
    normalize: bool = True,
    columns: list[str] | None = None,
) -> list[dict[str, float]]:
    """
    Calculate custom TF-IDF scores for given term frequency and inverse document frequency arrays.

    Args:
        tf (np.ndarray): Term frequency array.
        idf (np.ndarray): Inverse document frequency array.
        normalize (bool, optional): Whether to normalize the TF-IDF scores. Defaults to True.
        columns (list[str] | None, optional): Term names used as dictionary keys,
            in the same order as the columns of `tf`. When None, the keys of the
            notebook-level `vocab` are used. Defaults to None.

    Returns:
        list[dict[str, float]]: One dictionary per row of the TF-IDF matrix,
        mapping each term to its TF-IDF score.
    """
    if columns is None:
        # Backward-compatible fallback: earlier callers rely on the
        # notebook-level `vocab`. Pass `columns` explicitly to avoid depending
        # on whichever `vocab` happens to be defined when the cell runs.
        columns = list(vocab.keys())

    tf_idf: np.ndarray = calculate_tfidf(tf, idf, normalize)
    tf_idf_list: list[dict[str, float]] = convert_array_to_df(
        array=tf_idf, columns=list(columns)
    ).to_dicts()
    return tf_idf_list

In [42]:
query: str = "the cat in the hat"


tok_query: list[str] = tokenize(query)
# Query vector: a weight of 1 for every query token
query_vector: list[int] = [1 for _ in tok_query]
console.print(f"{tok_query = }")
print(f"{query_vector = }")

# `docs` is the three-document corpus defined in the dampened-TF search cell.
tok_corpus: list[list[str]] = [tokenize(doc) for doc in docs]
print(f"{tok_corpus = }")
vocab: dict[str, int] = generate_vocab(docs=docs)
print(f"{vocab = }")
tf: np.ndarray = calculate_tf(tok_corpus=tok_corpus, vocab=vocab, dampen=True)
idf: np.ndarray = calculate_idf(tok_corpus=tok_corpus, tf=tf)

tf_idf_list: list[dict[str, float]] = calculate_custom_tf_idf(
    tf=tf, idf=idf, normalize=False
)
print(f"{tf_idf_list = }")

# TF-IDF weight of each query token per document. The rows must come from
# `tf_idf_list`; iterating over `tf_list` would silently score with plain TF.
tf_idf_vectors: list[list[float]] = [
    [row.get(tok, row[unk_token]) for tok in tok_query] for row in tf_idf_list
]
console.print(f"{tf_idf_vectors = }")

# Score = dot product of the query vector with each document's TF-IDF vector.
doc_scores: list[float] = [np.dot(query_vector, t_vec) for t_vec in tf_idf_vectors]
print()
console.print(f"{doc_scores = }")

query_vector = [1, 1, 1, 1, 1]
tok_corpus = [['in', 'light', 'of', 'the', 'big', 'reveal', 'in', 'her', 'interview', 'the', 'interesting', 'thing', 'is', 'that', 'the', 'person', 'in', 'the', 'wrong', 'probably', 'made', 'a', 'good', 'decision', 'in', 'the', 'end'], ['my', 'favorite', 'book', 'is', 'the', 'cat', 'in', 'the', 'hat', 'which', 'is', 'about', 'a', 'crazy', 'cat', 'in', 'a', 'hat', 'who', 'breaks', 'into', 'a', 'house', 'and', 'creates', 'the', 'craziest', 'afternoon', 'for', 'two', 'kids'], ['my', 'careless', 'neighbors', 'apparently', 'let', 'a', 'stray', 'cat', 'stay', 'in', 'their', 'garage', 'unsupervised', 'which', 'resulted', 'in', 'my', 'favorite', 'hat', 'that', 'i', 'let', 'them', 'borrow', 'being', 'ruined']]
Vocab size: 55
vocab = {'a': 0, 'about': 1, 'afternoon': 2, 'and': 3, 'apparently': 4, 'being': 5, 'big': 6, 'book': 7, 'borrow': 8, 'breaks': 9, 'careless': 10, 'cat': 11, 'craziest': 12, 'crazy': 13, 'creates': 14, 'decision': 15, 'end': 16, 'favorite': 17




### Comment

**NOTE**

- In a real-life scenario with thousands of documents, TF-IDF will generally produce better results than term frequency alone, because the IDF factor down-weights terms that appear in most documents.