In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.26.0

numpy    : 1.26.4
pandas   : 2.2.2
polars   : 1.0.0
torch    : 2.2.2
lightning: 2.3.2

conda environment: ai_search



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
text: list[str] = ["i", "am", "very", "happy", "today"]
vocab_set: set = sorted(set(text))
vocab: dict[str, int] = {word: idx for idx, word in enumerate(vocab_set, start=0)}

vocab

{'am': 0, 'happy': 1, 'i': 2, 'today': 3, 'very': 4}

In [4]:
pattern: str = r'([,.?_!"()\']|--|\s)'
result: list[str] = re.split(pattern=pattern, string=" ".join(text))
result = [x for x in result if x.strip()]
result

['i', 'am', 'very', 'happy', 'today']

In [5]:
import string


def tokenize(doc: list[str], drop_punct: bool = True) -> list[str]:
    # Pattern for separating tokens
    pattern: str = r'([,.?_!"()\':]|\s)'
    # Lowercase all words
    doc = [word.lower() for word in doc]
    tok_doc: list[str] = re.split(pattern=pattern, string=" ".join(doc))
    # Remove whitespaces and empty strings
    tok_doc = [word for word in tok_doc if word.strip()]
    if drop_punct:
        tok_doc = [word for word in tok_doc if word not in string.punctuation]
        print(f"Punctuation removed: {len(tok_doc)}")
    return tok_doc


def flatten_documents(docs: list[list[str]]) -> list[str]:

    assert all(
        [True if isinstance(row, list) else False for row in docs]
    ), "Not all elements are lists"

    flattened_doc: list[str] = [word.lower() for row in docs for word in row]
    return flattened_doc


def generate_vocab(doc: list[list[str]], drop_punct: bool = True) -> dict[str, int]:

    flattened_doc: list[str] = flatten_documents(doc)
    tok_doc: list[str] = tokenize(flattened_doc)
    tok_doc = sorted(set(tok_doc))

    vocab: dict[str, int] = {word: idx for idx, word in enumerate(tok_doc, start=0)}
    print(f"Vocab size: {len(vocab)}")
    return vocab


def encode(doc: list[list[str]], vocab: dict[str, int]) -> list[int]:
    """This particular implementation checks for the occurrence of a term."""
    arr: np.ndarray = np.zeros((1, len(vocab)), dtype=int)
    for row in doc:
        for word in tokenize(row):
            if word in vocab:
                arr[0, vocab[word]] = 1  # change!
    return arr


def encode_n_create_df(doc: list[list[str]], vocab: dict[str, int]) -> pl.DataFrame:
    df: pl.DataFrame = pl.DataFrame(encode(doc=doc, vocab=vocab))
    df.columns = list(vocab.keys())

    return df

In [6]:
doc: list[list[str]] = [["Hey! Tell me something about neidu."]]
vocab: dict[str, int] = generate_vocab(doc=doc, drop_punct=True)
print(f"{vocab = }")

encode(doc=[["tell Tell"]], vocab=vocab)

Punctuation removed: 6
Vocab size: 6
vocab = {'about': 0, 'hey': 1, 'me': 2, 'neidu': 3, 'something': 4, 'tell': 5}
Punctuation removed: 2


array([[0, 0, 0, 0, 0, 1]])

In [7]:
encode_n_create_df(doc=[["about Tell"]], vocab=vocab)

# vocab

Punctuation removed: 2


about,hey,me,neidu,something,tell
i64,i64,i64,i64,i64,i64
1,0,0,0,0,1


### Comment

#### [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity#:~:text=Cosine%20similarity%20is%20the%20cosine,but%20only%20on%20their%20angle.)

- Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product of the vectors divided by the product of their lengths.
- It follows that the cosine similarity does not depend on the magnitudes of the vectors, but only on their angle.

$$cosSimilarity = cos(\theta) = \frac{A . B}{||A||.||B||}$$

In [8]:
from numpy.linalg import norm


def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> float:
    return np.dot(vector1, vector2) / (norm(vector1) * norm(vector2))


def check_equality(vector_1: np.ndarray, vector_2: np.ndarray) -> bool:
    result: bool = np.array_equal(vector_1, vector_2)
    print(f"{result = }")
    return result

In [9]:
cosine_similarity(vector1=np.array([1, 2, 3]), vector2=np.array([4, 5, 6]))

0.9746318461970762

In [10]:
doc1: list[str] = [
    (
        "Lynn: ham and cheese sandwich, chocolate cookie, ice water. "
        "Brian: turkey avocado sandwich, plain potato chips, apple juice "
        "Mohammed: grilled chicken salad, fruit cup, lemonade "
    )
]

doc2: list[str] = [
    (
        "Orchard Farms apple juice is premium, organic apple juice made from the "
        "freshest apples, never from concentrate. Its juice has received the "
        "regional award for best apple juice three years in a row. "
    )
]


doc: list[list[str]] = [doc1, doc2]
vocab: dict[str, int] = generate_vocab(doc=doc)

query: list[list[str]] = [[" apple juice"]]
query_vector: np.ndarray = encode(doc=query, vocab=vocab)
doc1_vector: np.ndarray = encode(doc=[doc1], vocab=vocab)
doc2_vector: np.ndarray = encode(doc=[doc2], vocab=vocab)

print(query_vector)

Punctuation removed: 58
Vocab size: 48
Punctuation removed: 2
Punctuation removed: 25
Punctuation removed: 33
[[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0]]


In [11]:
console.print(list(vocab.keys()))

In [12]:
doc1_score = cosine_similarity(query_vector.squeeze(), doc1_vector.squeeze())
doc2_score = cosine_similarity(query_vector.squeeze(), doc2_vector.squeeze())

doc1_score, doc2_score

(0.2886751345948129, 0.2773500981126146)

In [13]:
encode_n_create_df(doc=[doc1], vocab=vocab)
# encode(doc=[doc1], vocab=vocab)

Punctuation removed: 25


a,and,apple,apples,avocado,award,best,brian,cheese,chicken,chips,chocolate,concentrate,cookie,cup,farms,for,freshest,from,fruit,grilled,ham,has,ice,in,is,its,juice,lemonade,lynn,made,mohammed,never,orchard,organic,plain,potato,premium,received,regional,row,salad,sandwich,the,three,turkey,water,years
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,1,1,0,1,0,0,1,1,1,1,1,0,1,1,0,0,0,0,1,1,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0


In [14]:
encode_n_create_df(doc=[doc2], vocab=vocab)

Punctuation removed: 33


a,and,apple,apples,avocado,award,best,brian,cheese,chicken,chips,chocolate,concentrate,cookie,cup,farms,for,freshest,from,fruit,grilled,ham,has,ice,in,is,its,juice,lemonade,lynn,made,mohammed,never,orchard,organic,plain,potato,premium,received,regional,row,salad,sandwich,the,three,turkey,water,years
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
1,0,1,1,0,1,1,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,1,1,1,0,0,1,0,1,1,1,0,0,1,1,1,1,0,0,1,1,0,0,1


In [15]:
check_equality(doc1_vector.squeeze(), doc1_vector.squeeze())
check_equality(doc1_vector.squeeze(), doc2_vector.squeeze())

result = True
result = False


False

<hr>

### [Term Frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)

- Term frequency, tf(t,d), is the relative frequency of term t within document d.
- where $f_{t,d}$ is the number of times term t appears in document d and $\sum_{t' \in d} f_{t',d}$ is the total number of terms in document d.

$$tf(t,d) = \frac{f_{t,d}}{\sum_{t' \in d} f_{t',d}}$$

In [16]:
doc1: list[str] = [
    (
        "Thank you Jesus for the gift of today. I am grateful. "
        "It is a definitely a beautiful day."
    )
]
doc2: list[str] = [
    (
        "Consistency, the bedrock of progress and reliability, fosters trust, "
        "builds habits, and drives success across all endeavors, from personal "
        "growth to professional achievements, enabling steady improvement and "
        "creating a foundation for excellence in life's myriad pursuits."
    )
]
doc3: list[str] = [
    (
        "the cat sat on the mat the dog sat on the log "
        "and the cat chased the mouse and the dog chased the cat"
    )
]
corpus: list[list[str]] = [tokenize(doc1), tokenize(doc2), tokenize(doc3)]
vocab: dict[str, int] = generate_vocab(doc=corpus)
print(f"{vocab = }\n")
print(f"{corpus = }\n")

Punctuation removed: 18
Punctuation removed: 37
Punctuation removed: 24
Punctuation removed: 79
Vocab size: 56
vocab = {'a': 0, 'achievements': 1, 'across': 2, 'all': 3, 'am': 4, 'and': 5, 'beautiful': 6, 'bedrock': 7, 'builds': 8, 'cat': 9, 'chased': 10, 'consistency': 11, 'creating': 12, 'day': 13, 'definitely': 14, 'dog': 15, 'drives': 16, 'enabling': 17, 'endeavors': 18, 'excellence': 19, 'for': 20, 'fosters': 21, 'foundation': 22, 'from': 23, 'gift': 24, 'grateful': 25, 'growth': 26, 'habits': 27, 'i': 28, 'improvement': 29, 'in': 30, 'is': 31, 'it': 32, 'jesus': 33, 'life': 34, 'log': 35, 'mat': 36, 'mouse': 37, 'myriad': 38, 'of': 39, 'on': 40, 'personal': 41, 'professional': 42, 'progress': 43, 'pursuits': 44, 'reliability': 45, 's': 46, 'sat': 47, 'steady': 48, 'success': 49, 'thank': 50, 'the': 51, 'to': 52, 'today': 53, 'trust': 54, 'you': 55}

corpus = [['thank', 'you', 'jesus', 'for', 'the', 'gift', 'of', 'today', 'i', 'am', 'grateful', 'it', 'is', 'a', 'definitely', 'a', 

In [17]:
# (n_docs, n_terms)
freq_count: np.ndarray = np.zeros((len(corpus), len(vocab)), dtype=np.int32)

for idx, doc in enumerate(corpus, start=0):
    for word in doc:
        word_idx = vocab[word]
        freq_count[idx, word_idx] += 1


term_freq: np.ndarray = freq_count / np.sum(freq_count, axis=1, keepdims=True)
term_freq

array([[0.11111111, 0.        , 0.        , 0.        , 0.05555556,
        0.        , 0.05555556, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.05555556, 0.05555556,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.05555556, 0.        , 0.        , 0.        , 0.05555556,
        0.05555556, 0.        , 0.        , 0.05555556, 0.        ,
        0.        , 0.05555556, 0.05555556, 0.05555556, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.05555556,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.05555556, 0.05555556, 0.        , 0.05555556, 0.        ,
        0.05555556],
       [0.02702703, 0.02702703, 0.02702703, 0.02702703, 0.        ,
        0.08108108, 0.        , 0.02702703, 0.02702703, 0.        ,
        0.        , 0.02702703, 0.02702703, 0.        , 0.        ,
        0.        , 0.02702

In [18]:
freq_count

array([[2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1],
       [1, 1, 1, 1, 0, 3, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0,
        0, 0, 0, 2, 0, 0, 0, 8, 0, 0, 0, 0]], dtype=int32)

In [19]:
from collections import Counter


counter: dict = Counter(flatten_documents(corpus))
counter

Counter({'the': 10,
         'and': 5,
         'a': 3,
         'cat': 3,
         'for': 2,
         'of': 2,
         'sat': 2,
         'on': 2,
         'dog': 2,
         'chased': 2,
         'thank': 1,
         'you': 1,
         'jesus': 1,
         'gift': 1,
         'today': 1,
         'i': 1,
         'am': 1,
         'grateful': 1,
         'it': 1,
         'is': 1,
         'definitely': 1,
         'beautiful': 1,
         'day': 1,
         'consistency': 1,
         'bedrock': 1,
         'progress': 1,
         'reliability': 1,
         'fosters': 1,
         'trust': 1,
         'builds': 1,
         'habits': 1,
         'drives': 1,
         'success': 1,
         'across': 1,
         'all': 1,
         'endeavors': 1,
         'from': 1,
         'personal': 1,
         'growth': 1,
         'to': 1,
         'professional': 1,
         'achievements': 1,
         'enabling': 1,
         'steady': 1,
         'improvement': 1,
         'creating': 1,
      

In [20]:
freq_df: pl.DataFrame = pl.DataFrame(freq_count)
freq_df.columns = [*vocab.keys()]
freq_df

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you
i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
2,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1
1,1,1,1,0,3,0,1,1,0,0,1,1,0,0,0,1,1,1,1,1,1,1,1,0,0,1,1,0,1,1,0,0,0,1,0,0,0,1,1,0,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0
0,0,0,0,0,2,0,0,0,3,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,2,0,0,0,0,0,0,2,0,0,0,8,0,0,0,0


In [21]:
term_freq_df: pl.DataFrame = pl.DataFrame(term_freq)
term_freq_df.columns = [*vocab.keys()]
term_freq_df

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.111111,0.0,0.0,0.0,0.055556,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.055556,0.055556,0.0,0.0,0.055556,0.0,0.0,0.055556,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.055556,0.0,0.055556,0.0,0.055556
0.027027,0.027027,0.027027,0.027027,0.0,0.081081,0.0,0.027027,0.027027,0.0,0.0,0.027027,0.027027,0.0,0.0,0.0,0.027027,0.027027,0.027027,0.027027,0.027027,0.027027,0.027027,0.027027,0.0,0.0,0.027027,0.027027,0.0,0.027027,0.027027,0.0,0.0,0.0,0.027027,0.0,0.0,0.0,0.027027,0.027027,0.0,0.027027,0.027027,0.027027,0.027027,0.027027,0.027027,0.0,0.027027,0.027027,0.0,0.027027,0.027027,0.0,0.027027,0.0
0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.125,0.083333,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.041667,0.041667,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0


### Inverse Document Frequency

- The inverse document frequency is a `measure of how much information the word provides`, i.e., how common or rare it is across all documents.
- It is the logarithmically scaled inverse fraction of the documents that contain the word (obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient)

$$ n_{t} = |d \in D: t \in d| $$
$$ idf_{(t,D)} = log (\frac{N}{n_{t} + 1}) + 1 $$

- where $N$ is the total number of documents in the corpus, $t \in d$ is the number of terms in a document, $d \in D$ is a document in the corpus $D$ and 1 is added to the denominator to avoid division-by-zero errors.
- If the term is not in the corpus, this will lead to a division-by-zero.
  - It is therefore common to adjust the numerator and denominator by adding a smoothing term to avoid this.

In [22]:
(term_freq != 0)[:, 7].sum()

N: int = len(corpus)  # number of documents
S_F: int = 1  # smoothing factor

# Number of documents containing a term
doc_freq: np.ndarray = term_freq != 0
doc_freq = doc_freq.sum(axis=0, keepdims=True)

idf: np.ndarray = np.log1p(N + 1 / doc_freq + 1)
idf

array([[1.70474809, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
        1.70474809, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
        1.79175947, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
        1.79175947, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
        1.70474809, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
        1.79175947, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
        1.79175947, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
        1.79175947, 1.79175947, 1.79175947, 1.79175947, 1.70474809,
        1.79175947, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
        1.79175947, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
        1.79175947, 1.67397643, 1.79175947, 1.79175947, 1.79175947,
        1.79175947]])

In [23]:
idf_df: pl.DataFrame = pl.DataFrame(idf)
idf_df.columns = [*vocab.keys()]
idf_df

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.704748,1.791759,1.791759,1.791759,1.791759,1.704748,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.704748,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.704748,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.673976,1.791759,1.791759,1.791759,1.791759


In [24]:
tf_idf: np.ndarray = term_freq * idf
# Normalize
tf_idf = tf_idf / tf_idf.sum(axis=-1, keepdims=True)

tf_idf_df: pl.DataFrame = pl.DataFrame(tf_idf)
tf_idf_df.columns = [*vocab.keys()]
tf_idf_df

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.107265,0.0,0.0,0.0,0.05637,0.0,0.05637,0.0,0.0,0.0,0.0,0.0,0.0,0.05637,0.05637,0.0,0.0,0.0,0.0,0.0,0.053632,0.0,0.0,0.0,0.05637,0.05637,0.0,0.0,0.05637,0.0,0.0,0.05637,0.05637,0.05637,0.0,0.0,0.0,0.0,0.0,0.053632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05637,0.052664,0.0,0.05637,0.0,0.05637
0.025965,0.02729,0.02729,0.02729,0.0,0.077895,0.0,0.02729,0.02729,0.0,0.0,0.02729,0.02729,0.0,0.0,0.0,0.02729,0.02729,0.02729,0.02729,0.025965,0.02729,0.02729,0.02729,0.0,0.0,0.02729,0.02729,0.0,0.02729,0.02729,0.0,0.0,0.0,0.02729,0.0,0.0,0.0,0.02729,0.025965,0.0,0.02729,0.02729,0.02729,0.02729,0.02729,0.02729,0.0,0.02729,0.02729,0.0,0.025496,0.02729,0.0,0.02729,0.0
0.0,0.0,0.0,0.0,0.0,0.0814,0.0,0.0,0.0,0.128331,0.085554,0.0,0.0,0.0,0.0,0.085554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042777,0.042777,0.042777,0.0,0.0,0.085554,0.0,0.0,0.0,0.0,0.0,0.0,0.085554,0.0,0.0,0.0,0.319721,0.0,0.0,0.0,0.0


In [25]:
tf_idf_df.with_columns(total=pl.sum_horizontal("*"))

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you,total
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.107265,0.0,0.0,0.0,0.05637,0.0,0.05637,0.0,0.0,0.0,0.0,0.0,0.0,0.05637,0.05637,0.0,0.0,0.0,0.0,0.0,0.053632,0.0,0.0,0.0,0.05637,0.05637,0.0,0.0,0.05637,0.0,0.0,0.05637,0.05637,0.05637,0.0,0.0,0.0,0.0,0.0,0.053632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05637,0.052664,0.0,0.05637,0.0,0.05637,1.0
0.025965,0.02729,0.02729,0.02729,0.0,0.077895,0.0,0.02729,0.02729,0.0,0.0,0.02729,0.02729,0.0,0.0,0.0,0.02729,0.02729,0.02729,0.02729,0.025965,0.02729,0.02729,0.02729,0.0,0.0,0.02729,0.02729,0.0,0.02729,0.02729,0.0,0.0,0.0,0.02729,0.0,0.0,0.0,0.02729,0.025965,0.0,0.02729,0.02729,0.02729,0.02729,0.02729,0.02729,0.0,0.02729,0.02729,0.0,0.025496,0.02729,0.0,0.02729,0.0,1.0
0.0,0.0,0.0,0.0,0.0,0.0814,0.0,0.0,0.0,0.128331,0.085554,0.0,0.0,0.0,0.0,0.085554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042777,0.042777,0.042777,0.0,0.0,0.085554,0.0,0.0,0.0,0.0,0.0,0.0,0.085554,0.0,0.0,0.0,0.319721,0.0,0.0,0.0,0.0,1.0


In [26]:
def to_lower(text: str | list[str]) -> str | list[str]:
    if isinstance(text, str):
        return text.lower()
    elif isinstance(text, list):
        return [to_lower(t) for t in text]
    else:
        raise ValueError("Invalid input type")


to_lower("Hello World from Python")

'hello world from python'

### Putting It Together

In [30]:
class TFIDF:
    def __init__(self, corpus: list[str], vocab: dict[str, int]) -> None:
        self.corpus = corpus
        self.vocab = vocab

    def _calculate_tf(self) -> np.ndarray:
        # (n_docs, n_terms)
        freq_count: np.ndarray = np.zeros(
            (len(self.corpus), len(self.vocab)), dtype=np.int32
        )

        for idx, doc in enumerate(self.corpus, start=0):
            for word in doc:
                word_idx = vocab[word]
                freq_count[idx, word_idx] += 1

        tf: np.ndarray = freq_count / np.sum(freq_count, axis=1, keepdims=True)
        return tf

    def _calculate_idf(self) -> np.ndarray:
        N: int = len(self.corpus)  # number of documents
        S_F: int = 1  # smoothing factor
        tf: np.ndarray = self._calculate_tf()

        # Number of documents containing a term
        doc_freq: np.ndarray = tf > 0
        doc_freq = doc_freq.sum(axis=0, keepdims=True)
        idf: np.ndarray = np.log1p(N + S_F / doc_freq + S_F)

        return idf

    def calculate_tfidf(self, normalize: bool = True) -> np.ndarray:
        tf: np.ndarray = self._calculate_tf()
        idf: np.ndarray = self._calculate_idf()
        tf_idf: np.ndarray = tf * idf
        if normalize:
            tf_idf = tf_idf / tf_idf.sum(axis=-1, keepdims=True)

        return tf_idf

In [31]:
vectorizer: TFIDF = TFIDF(corpus=corpus, vocab=vocab)
tf_idf: np.ndarray = vectorizer.calculate_tfidf(normalize=True)

tf_idf_df_2: pl.DataFrame = pl.DataFrame(tf_idf)
tf_idf_df_2.columns = [*vocab.keys()]
tf_idf_df_2

a,achievements,across,all,am,and,beautiful,bedrock,builds,cat,chased,consistency,creating,day,definitely,dog,drives,enabling,endeavors,excellence,for,fosters,foundation,from,gift,grateful,growth,habits,i,improvement,in,is,it,jesus,life,log,mat,mouse,myriad,of,on,personal,professional,progress,pursuits,reliability,s,sat,steady,success,thank,the,to,today,trust,you
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.107265,0.0,0.0,0.0,0.05637,0.0,0.05637,0.0,0.0,0.0,0.0,0.0,0.0,0.05637,0.05637,0.0,0.0,0.0,0.0,0.0,0.053632,0.0,0.0,0.0,0.05637,0.05637,0.0,0.0,0.05637,0.0,0.0,0.05637,0.05637,0.05637,0.0,0.0,0.0,0.0,0.0,0.053632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05637,0.052664,0.0,0.05637,0.0,0.05637
0.025965,0.02729,0.02729,0.02729,0.0,0.077895,0.0,0.02729,0.02729,0.0,0.0,0.02729,0.02729,0.0,0.0,0.0,0.02729,0.02729,0.02729,0.02729,0.025965,0.02729,0.02729,0.02729,0.0,0.0,0.02729,0.02729,0.0,0.02729,0.02729,0.0,0.0,0.0,0.02729,0.0,0.0,0.0,0.02729,0.025965,0.0,0.02729,0.02729,0.02729,0.02729,0.02729,0.02729,0.0,0.02729,0.02729,0.0,0.025496,0.02729,0.0,0.02729,0.0
0.0,0.0,0.0,0.0,0.0,0.0814,0.0,0.0,0.0,0.128331,0.085554,0.0,0.0,0.0,0.0,0.085554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042777,0.042777,0.042777,0.0,0.0,0.085554,0.0,0.0,0.0,0.0,0.0,0.0,0.085554,0.0,0.0,0.0,0.319721,0.0,0.0,0.0,0.0
