In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.26.0

numpy    : 1.26.4
pandas   : 2.2.2
polars   : 1.0.0
torch    : 2.2.2
lightning: 2.3.2

conda environment: ai_search



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Rich theme mapping the style names "info", "warning" and "error" to hex colours.
custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
# Shared console object; later cells pretty-print through `console.print`.
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings: raise the display limits for rows, columns and column width.
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# NOTE(review): this silences *every* warning, including NumPy RuntimeWarnings
# (e.g. invalid value / division by zero). Consider a narrower filter.
warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
# Toy corpus: map every unique word to an integer index.
text: list[str] = ["i", "am", "very", "happy", "today"]
# sorted() returns a list (not a set); sorting makes the index assignment
# deterministic regardless of set iteration order.
vocab_set: list[str] = sorted(set(text))
vocab: dict[str, int] = {word: idx for idx, word in enumerate(vocab_set)}

vocab

{'am': 0, 'happy': 1, 'i': 2, 'today': 3, 'very': 4}

In [4]:
# Delimiters: a single punctuation mark, a double dash, or one whitespace character.
# The capturing group makes re.split() keep the delimiters in its output, so the
# whitespace-only and empty pieces are filtered out afterwards.
pattern: str = r'([,.?_!"()\']|--|\s)'
result: list[str] = [
    piece for piece in re.split(pattern, " ".join(text)) if piece.strip()
]
result

['i', 'am', 'very', 'happy', 'today']

In [7]:
import string


def tokenize(doc: list[str]) -> list[str]:
    """Split a list of strings into lower-cased word and punctuation tokens.

    The strings are lower-cased, joined with single spaces and split on
    punctuation marks and whitespace. Because the delimiter pattern is a
    capturing group, punctuation marks are kept as tokens of their own;
    whitespace-only and empty pieces are discarded.

    Parameters
    ----------
    doc : list[str]
        The strings to tokenize.

    Returns
    -------
    list[str]
        The tokens in their order of appearance.
    """
    lowered_text: str = " ".join(entry.lower() for entry in doc)
    pieces: list[str] = re.split(r'([,.?_!"()\':]|\s)', lowered_text)
    # `strip()` is falsy for both "" and whitespace-only pieces.
    return [piece for piece in pieces if piece.strip()]


def generate_vocab(doc: list[list[str]], drop_punct: bool = True) -> dict[str, int]:
    """Build a word -> index vocabulary from a corpus.

    Every string in every row is tokenized (and lower-cased) by `tokenize`.
    The unique tokens are sorted and numbered from 0 in that order. The
    vocabulary size is printed as a side effect.

    Parameters
    ----------
    doc : list[list[str]]
        The corpus; each row is a list of strings.
    drop_punct : bool, default=True
        If True, tokens made up solely of `string.punctuation` characters
        are left out of the vocabulary.

    Returns
    -------
    dict[str, int]
        Mapping from token to its index in the sorted unique token list.

    Raises
    ------
    AssertionError
        If any element of `doc` is not a list.
    """
    assert all(isinstance(row, list) for row in doc), "Not all elements are lists"

    # `tokenize` already lower-cases, so no extra lower-casing is needed here.
    flattened_doc: list[str] = [word for row in doc for word in row]
    tok_doc: list[str] = tokenize(flattened_doc)
    if drop_punct:
        # Check character by character: `word in string.punctuation` is a
        # substring test, which lets tokens such as "--" slip through.
        punct_chars: frozenset[str] = frozenset(string.punctuation)
        tok_doc = [
            word for word in tok_doc if not all(ch in punct_chars for ch in word)
        ]
    tok_doc = sorted(set(tok_doc))

    vocab: dict[str, int] = {word: idx for idx, word in enumerate(tok_doc)}
    print(f"Vocab size: {len(vocab)}")
    return vocab


def encode(doc: list[list[str]], vocab: dict[str, int]) -> np.ndarray:
    """Encode a document as a binary bag-of-words vector.

    Every row of `doc` is tokenized with `tokenize`. For each token found
    in `vocab` the matching position is set to 1; repeated tokens do not
    increase the value, and tokens missing from `vocab` are ignored.

    Parameters
    ----------
    doc : list[list[str]]
        The document; each row is a list of strings.
    vocab : dict[str, int]
        Mapping from token to column index.

    Returns
    -------
    np.ndarray
        Integer array of shape (1, len(vocab)) holding 0/1 indicators.
    """
    arr: np.ndarray = np.zeros((1, len(vocab)), dtype=int)
    for row in doc:
        for word in tokenize(row):
            idx: Optional[int] = vocab.get(word)
            if idx is not None:
                # Presence only: all rows are merged into the single output row.
                arr[0, idx] = 1
    return arr


def encode_n_create_df(doc: list[list[str]], vocab: dict[str, int]) -> pl.DataFrame:
    """Encode `doc` with `encode` and return the result as a Polars DataFrame.

    Parameters
    ----------
    doc : list[list[str]]
        The document; each row is a list of strings.
    vocab : dict[str, int]
        Mapping from token to column index.

    Returns
    -------
    pl.DataFrame
        Frame built from the encoded array, with the vocabulary words
        (in dict order) as column names.
    """
    encoded = encode(doc=doc, vocab=vocab)
    frame: pl.DataFrame = pl.DataFrame(encoded)
    # Iterating a dict yields its keys, in insertion order.
    frame.columns = list(vocab)
    return frame

In [8]:
# Build a vocabulary from a one-sentence corpus, dropping punctuation tokens.
doc: list[list[str]] = [["Hey! Tell me something about neidu."]]
vocab: dict[str, int] = generate_vocab(doc=doc, drop_punct=True)
print(f"{vocab = }")

# Tokens are lower-cased and the encoding is binary, so "tell Tell" sets the
# single "tell" position to 1 (not 2).
encode(doc=[["tell Tell"]], vocab=vocab)

Vocab size: 6
vocab = {'about': 0, 'hey': 1, 'me': 2, 'neidu': 3, 'something': 4, 'tell': 5}


array([[0, 0, 0, 0, 0, 1]])

In [9]:
# Same binary encoding, displayed as a one-row frame with one column per vocab word.
encode_n_create_df(doc=[["about Tell"]], vocab=vocab)

about,hey,me,neidu,something,tell
i64,i64,i64,i64,i64,i64
1,0,0,0,0,1


### Comment

#### [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity#:~:text=Cosine%20similarity%20is%20the%20cosine,but%20only%20on%20their%20angle.)

- Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product of the vectors divided by the product of their lengths.
- It follows that the cosine similarity does not depend on the magnitudes of the vectors, but only on their angle.

$$\text{cosSimilarity}(A, B) = \cos(\theta) = \frac{A \cdot B}{\|A\| \, \|B\|}$$

In [10]:
from numpy.linalg import norm


def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    Computed as the dot product of the vectors divided by the product of
    their norms. Intended for 1-D vectors of equal length.

    Parameters
    ----------
    vector1 : np.ndarray
        First vector.
    vector2 : np.ndarray
        Second vector.

    Returns
    -------
    float
        The cosine similarity, or 0.0 if either vector has zero norm.
    """
    denominator = norm(vector1) * norm(vector2)
    # An all-zero vector (e.g. a query with no in-vocabulary word) has no
    # direction; return 0.0 instead of letting 0/0 produce NaN.
    if denominator == 0:
        return 0.0
    return np.dot(vector1, vector2) / denominator


def check_equality(vector_1: np.ndarray, vector_2: np.ndarray) -> bool:
    """Report whether two arrays are equal according to `np.array_equal`.

    The outcome is printed as well as returned.

    Parameters
    ----------
    vector_1 : np.ndarray
        First array.
    vector_2 : np.ndarray
        Second array.

    Returns
    -------
    bool
        The value returned by `np.array_equal` for the two arrays.
    """
    # The local must stay named `result`: the f-string below prints its name.
    result: bool = np.array_equal(a1=vector_1, a2=vector_2)
    print(f"{result = }")
    return result

In [11]:
# Sanity check on two small integer vectors.
cosine_similarity(vector1=np.array([1, 2, 3]), vector2=np.array([4, 5, 6]))

0.9746318461970762

In [16]:
# Document 1: a list of lunch orders; "apple juice" is mentioned once.
doc1: list[str] = [
    (
        "Lynn: ham and cheese sandwich, chocolate cookie, ice water. "
        "Brian: turkey avocado sandwich, plain potato chips, apple juice "
        "Mohammed: grilled chicken salad, fruit cup, lemonade "
    )
]

# Document 2: a product description that mentions "apple juice" several times.
doc2: list[str] = [
    (
        "Orchard Farms apple juice is premium, organic apple juice made from the "
        "freshest apples, never from concentrate. Its juice has received the "
        "regional award for best apple juice three years in a row. "
    )
]


# Build one shared vocabulary over both documents (punctuation dropped by default).
doc: list[list[str]] = [doc1, doc2]
vocab: dict[str, int] = generate_vocab(doc=doc)

# Encode the query and each document against that shared vocabulary so the
# resulting vectors have the same length and column meaning.
query: list[list[str]] = [[" apple juice"]]
query_vector: np.ndarray = encode(doc=query, vocab=vocab)
doc1_vector: np.ndarray = encode(doc=[doc1], vocab=vocab)
doc2_vector: np.ndarray = encode(doc=[doc2], vocab=vocab)

print(query_vector)

Vocab size: 48
[[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0]]


In [17]:
# Pretty-print the vocabulary words (dict order, i.e. sorted token order).
console.print(list(vocab.keys()))

In [18]:
# encode() returns arrays of shape (1, len(vocab)); squeeze them to 1-D first.
# Note: the vectors are binary presence indicators, so how often a document
# repeats "apple juice" does not affect its score.
doc1_score = cosine_similarity(query_vector.squeeze(), doc1_vector.squeeze())
doc2_score = cosine_similarity(query_vector.squeeze(), doc2_vector.squeeze())

doc1_score, doc2_score

(0.2886751345948129, 0.2773500981126146)

In [19]:
# Show the raw text of doc1 for reference.
doc1

['Lynn: ham and cheese sandwich, chocolate cookie, ice water. Brian: turkey avocado sandwich, plain potato chips, apple juice Mohammed: grilled chicken salad, fruit cup, lemonade ']

In [20]:
# Binary encoding of doc1 with the vocabulary words as column headers.
encode_n_create_df(doc=[doc1], vocab=vocab)

a,and,apple,apples,avocado,award,best,brian,cheese,chicken,chips,chocolate,concentrate,cookie,cup,farms,for,freshest,from,fruit,grilled,ham,has,ice,in,is,its,juice,lemonade,lynn,made,mohammed,never,orchard,organic,plain,potato,premium,received,regional,row,salad,sandwich,the,three,turkey,water,years
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,1,1,0,1,0,0,1,1,1,1,1,0,1,1,0,0,0,0,1,1,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0


In [24]:
# Binary encoding of doc2 with the vocabulary words as column headers.
encode_n_create_df(doc=[doc2], vocab=vocab)

a,and,apple,apples,avocado,award,best,brian,cheese,chicken,chips,chocolate,concentrate,cookie,cup,farms,for,freshest,from,fruit,grilled,ham,has,ice,in,is,its,juice,lemonade,lynn,made,mohammed,never,orchard,organic,plain,potato,premium,received,regional,row,salad,sandwich,the,three,turkey,water,years
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
1,0,1,1,0,1,1,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,1,0,1,1,1,1,0,0,1,0,1,1,1,0,0,1,1,1,1,0,0,1,1,0,0,1


In [21]:
# A vector compared with itself, then the doc1 and doc2 encodings compared.
check_equality(doc1_vector.squeeze(), doc1_vector.squeeze())
check_equality(doc1_vector.squeeze(), doc2_vector.squeeze())

result = True
result = False


False