In [189]:
import asyncio
import datetime
import json
import os

import lancedb
import tiktoken
import torch
import torch.nn.functional as F
from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger
from openai import OpenAI, AsyncOpenAI
from sentence_transformers import SentenceTransformer
from tenacity import (
    RetryCallState,
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

# google.colab can only be imported inside Google Colab; everywhere else fall
# back to None so later cells can check `userdata` before using it.
try:
    from google.colab import userdata  # type: ignore
except ImportError:
    userdata = None

# Load variables from a .env file into the process environment;
# override=True lets .env values replace variables that are already set.
load_dotenv(override=True)

True

In [96]:
# Load the "train" split of the TMDB 5000 movies dataset.
ds = load_dataset("AiresPucrs/tmdb-5000-movies", split="train")
ds  # bare expression: display the dataset summary

README.md:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

(…)-00000-of-00001-6db04ab1c75d6817.parquet:   0%|          | 0.00/13.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4803 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'budget', 'genres', 'homepage', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count', 'cast', 'crew'],
    num_rows: 4803
})

In [None]:
def preprocess(example: dict) -> dict:
    """Decode one raw TMDB row into plain Python values.

    The JSON-encoded list columns are reduced to lists of names (cast and crew
    keep one additional field each), ``release_date`` is parsed into a
    ``datetime.date`` (a falsy value becomes 1970-01-01) and a
    ``release_year`` entry is added.

    Args:
        example: A single dataset row; it is modified in place.

    Returns:
        The same ``example`` dict, after modification.
    """

    def _names(column: str) -> list:
        # Decode the JSON array stored in `column` and keep each entry's name.
        return [entry["name"] for entry in json.loads(example[column])]

    def _people(column: str, extra: str) -> list:
        # Decode a cast/crew JSON array, keeping "name" plus one extra field.
        return [
            {"name": person["name"], extra: person[extra]}
            for person in json.loads(example[column])
        ]

    for column in (
        "genres",
        "keywords",
        "production_companies",
        "production_countries",
    ):
        example[column] = _names(column)

    # Falsy release dates (None or "") fall back to a 1970-01-01 placeholder.
    raw_date = example["release_date"] or "1970-01-01"
    released = datetime.datetime.strptime(raw_date, "%Y-%m-%d").date()
    example["release_date"] = released
    example["release_year"] = released.year

    example["spoken_languages"] = _names("spoken_languages")
    example["cast"] = _people("cast", "character")
    example["crew"] = _people("crew", "job")
    return example


In [105]:
# Apply `preprocess` to every row using 4 worker processes and drop the
# "homepage" column.
# NOTE(review): this rebinds `ds`, so the cell does not look safe to run twice:
# `preprocess` calls json.loads on columns it has already turned into lists.
# Re-run the load_dataset cell first if this step needs repeating.
ds = ds.map(preprocess, remove_columns=["homepage"], num_proc=4)

Map (num_proc=4):   0%|          | 0/4803 [00:00<?, ? examples/s]

In [109]:
# Shuffle with a fixed seed so the sample is reproducible, then keep the
# first 1000 shuffled rows as a smaller working subset.
ds = ds.shuffle(seed=42)
ds_sample = ds.select(range(1000))

In [111]:
# Inspect the first record of the sampled subset.
ds_sample[0]

{'id': 2649,
 'budget': 50000000,
 'genres': ['Drama', 'Thriller', 'Mystery'],
 'keywords': ['brother brother relationship',
  'birthday',
  'danger of life',
  'birthday party',
  'surprising'],
 'original_language': 'en',
 'original_title': 'The Game',
 'overview': 'In honor of his birthday, San Francisco banker Nicholas Van Orton, a financial genius and a coldhearted loner, receives an unusual present from his younger brother, Conrad -- a gift certificate to play a unique kind of game. In nearly a nanosecond, Nicholas finds himself consumed by a dangerous set of ever-changing rules, unable to distinguish where the charade ends and reality begins.',
 'popularity': 62.925175,
 'production_companies': ['PolyGram Filmed Entertainment',
  'A&B Producoes',
  'Popaganda Films'],
 'production_countries': ['United States of America'],
 'release_date': datetime.date(1997, 9, 12),
 'revenue': 109423648,
 'runtime': 129.0,
 'spoken_languages': ['广州话 / 廣州話', 'Deutsch', 'English'],
 'status': 'Re

# RAG

## Embeddings

In [126]:
# Toy retrieval example: one user query and four candidate descriptions.
# `query` is a one-element list rather than a bare string; other cells read
# the text as query[0].
query = ["I want to watch an exciting superhero movie"]
movies = [
    "A movie about a group of friends who go on a road trip",
    "A romantic comedy about a couple who meet at a wedding",
    "An autobiography of George Washington, the first president of the United States",
    "Spider-Man is fighting against the Green Goblin in another universe",
]

### sentence-transformers


In [117]:
# Load a pretrained sentence-transformers checkpoint; device="cpu" requests
# CPU execution.
model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", device="cpu"
)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

#### Compute embeddings

In [137]:
# Encode the query and the candidate texts, then show both array shapes and
# the first five components of one vector from each.
query_embedding = model.encode(query)
movie_embeddings = model.encode(movies)
print(query_embedding.shape, movie_embeddings.shape)
print(f"Query: {query_embedding[0][:5]}")
print(f"Movie: {movie_embeddings[0][:5]}")

(1, 384) (4, 384)
Query: [-0.40846434 -0.24543124 -0.6005591  -0.11258559  0.19218627]
Movie: [ 0.30984214 -0.12824544 -0.33090717 -0.2385715   0.33415732]


#### Computing similarity + retrieving top k

In [132]:
# Compute cosine similarities between the query and every movie description.
# The result is 2-D: row 0 holds the scores for the single query, one column
# per movie.
similarities = model.similarity(query_embedding, movie_embeddings)
similarities

tensor([[ 0.2611,  0.1753, -0.0321,  0.3556]])

In [144]:
# ANSI escape sequences used to colour the scores.
YELLOW = "\033[33m"
END = "\033[0m"
# Print the query, then every candidate (in list order) next to its score.
print(query[0])
scores = similarities[0]
for position, description in enumerate(movies):
    score = scores[position]
    print(f" - {description: <30}: {YELLOW}{score:.4f}{END}")

I want to watch an exciting superhero movie
 - A movie about a group of friends who go on a road trip: [33m0.2611[0m
 - A romantic comedy about a couple who meet at a wedding: [33m0.1753[0m
 - An autobiography of George Washington, the first president of the United States: [33m-0.0321[0m
 - Spider-Man is fighting against the Green Goblin in another universe: [33m0.3556[0m


In [141]:
# Top-4 scores for the query together with the indices of the matching movies.
torch.topk(similarities[0], k=4)

torch.return_types.topk(
values=tensor([ 0.3556,  0.2611,  0.1753, -0.0321]),
indices=tensor([3, 0, 1, 2]))

In [145]:
# Print the candidates ranked from most to least similar to the query.
print(query[0])
ranking = torch.topk(similarities[0], k=4)
for rank in ranking.indices:
    print(f" - {movies[rank]: <30}: {YELLOW}{similarities[0][rank]:.4f}{END}")

I want to watch an exciting superhero movie
 - Spider-Man is fighting against the Green Goblin in another universe: [33m0.3556[0m
 - A movie about a group of friends who go on a road trip: [33m0.2611[0m
 - A romantic comedy about a couple who meet at a wedding: [33m0.1753[0m
 - An autobiography of George Washington, the first president of the United States: [33m-0.0321[0m


### OpenAI

[Documentation](https://platform.openai.com/docs/guides/embeddings?lang=python)

In [186]:
# Resolve the OpenAI API key: environment variable first, then Colab secrets.
api_key = os.environ.get("OPENAI_API_KEY")
if userdata and not api_key:
    # Only reachable when the google.colab import succeeded.
    api_key = userdata.get("OPENAI_API_KEY")

if api_key:
    client = AsyncOpenAI(api_key=api_key)
else:
    raise ValueError("OPENAI_API_KEY environment variable is not set")

#### Compute embeddings

In [187]:
# Request an embedding for the query text from OpenAI, then show the vector
# length and its first five components.
res = await client.embeddings.create(input=query[0], model="text-embedding-3-small")
embedding = res.data[0].embedding
print(len(embedding))
print(embedding[:5])

1536
[-0.010025657713413239, 0.007190898526459932, -0.08766952902078629, 0.010150458663702011, -0.025221630930900574]


In [169]:
# Token usage reported in the `res` embedding response.
print(f"Total tokens: {res.usage.total_tokens}")

Total tokens: 8


In [193]:
# Convert to torch.Tensor for cosine similarity
query_embedding = torch.Tensor(
    (await client.embeddings.create(input=query[0], model="text-embedding-3-small"))
    .data[0]
    .embedding
).unsqueeze(0)
_movie_embeddings = await asyncio.gather(
    *[
        client.embeddings.create(input=movie, model="text-embedding-3-small")
        for movie in movies
    ]
)
movie_embeddings = torch.Tensor(
    [embedding.data[0].embedding for embedding in _movie_embeddings]
)

In [191]:
# `movie_embeddings` is already a tensor, so display it directly instead of
# re-wrapping it with the legacy torch.Tensor constructor.
movie_embeddings

tensor([[ 0.0125,  0.0218, -0.0479,  ...,  0.0026, -0.0386,  0.0079],
        [-0.0325,  0.0204, -0.0673,  ...,  0.0057, -0.0081, -0.0111],
        [ 0.0083, -0.0117,  0.0179,  ...,  0.0175, -0.0041, -0.0108],
        [-0.0362, -0.0462,  0.0150,  ..., -0.0224,  0.0127,  0.0131]])

#### Computing similarity + retrieving top k

In [195]:
# Cosine similarity taken along dim=1. The query tensor has a single row
# (it was unsqueezed), which is broadcast against each movie row, so the
# result is 1-D with one score per movie.
similarities = F.cosine_similarity(
    query_embedding, movie_embeddings, dim=1
)
similarities

tensor([0.2881, 0.2119, 0.0502, 0.3344])

In [198]:
# ANSI escape sequences used to colour the scores.
YELLOW = "\033[33m"
END = "\033[0m"
# Print the candidates ranked from most to least similar to the query.
print(query[0])
top_hits = similarities.topk(k=4)
for hit in top_hits.indices:
    print(f" - {movies[hit]: <30}: {YELLOW}{similarities[hit]:.4f}{END}")

I want to watch an exciting superhero movie
 - Spider-Man is fighting against the Green Goblin in another universe: [33m0.3344[0m
 - A movie about a group of friends who go on a road trip: [33m0.2881[0m
 - A romantic comedy about a couple who meet at a wedding: [33m0.2119[0m
 - An autobiography of George Washington, the first president of the United States: [33m0.0502[0m


#### Calculate tokens and price

[OpenAI pricing](https://platform.openai.com/docs/pricing)

In [None]:
# Look up the tokenizer tiktoken associates with the embedding model.
enc = tiktoken.encoding_for_model("text-embedding-3-small")
enc  # display the encoding; the output recorded for this cell shows its repr

<Encoding 'cl100k_base'>

In [170]:
# Tokenize the query locally with tiktoken; print the token count and the
# token ids.
encoded = enc.encode(query[0])
print(f"Total tokens: {len(encoded)}")
print(encoded)

Total tokens: 8
[40, 1390, 311, 3821, 459, 13548, 46244, 5818]


In [None]:
# Price per 1M tokens for each supported embedding model, keyed by model name.
# NOTE(review): presumably USD, taken from the OpenAI pricing page linked in
# this section — confirm the figures are still current.
model_to_price = {
    "text-embedding-3-small": 0.02,
    "text-embedding-3-large": 0.13,
}

In [None]:
def get_token_count_and_price(
    texts: list[str], model: str = "text-embedding-3-small"
) -> tuple[int, float]:
    """Count the tokens in ``texts`` and estimate the cost of embedding them.

    Args:
        texts: Strings to tokenize.
        model: Embedding model name; must be a key of ``model_to_price``.

    Returns:
        ``(token_count, price)``, where ``price`` is the token count scaled by
        the model's per-1M-token rate from ``model_to_price``.

    Raises:
        ValueError: If ``model`` has no entry in ``model_to_price``.
    """
    if model not in model_to_price:
        raise ValueError(f"Model {model} not supported")

    encoder = tiktoken.encoding_for_model(model)
    token_count = 0
    for tokens in encoder.encode_batch(texts):
        token_count += len(tokens)

    # The rate table is expressed per one million tokens.
    rate = model_to_price[model]
    return token_count, (token_count / 1_000_000) * rate

In [177]:
# Estimate tokens and cost for the four movie descriptions repeated 10 times
# (40 texts).
get_token_count_and_price(movies * 10, model="text-embedding-3-small")

(480, 9.600000000000001e-06)

#### Handling rate limits

In [None]:
def log_backoff_attempt(retry_state: RetryCallState) -> None:
    """
    Logs a message before a retry attempt, detailing the attempt number,
    the exception, and the wait time.

    Used as the ``before_sleep`` callback of the ``@retry`` decorator.

    Args:
        retry_state: Retry state object passed to the callback.
    """
    attempt_num = retry_state.attempt_number

    # Describe the failure. When there is no outcome or no exception, report
    # plain "N/A" rather than formatting a placeholder as "str: N/A" or
    # "NoneType: None".
    outcome = retry_state.outcome
    exception = outcome.exception() if outcome else None
    if exception is None:
        reason = "N/A"
    else:
        reason = f"{exception.__class__.__name__}: {exception}"

    wait_time = retry_state.next_action.sleep if retry_state.next_action else 0.0

    # Wrapped callables such as functools.partial have no __name__; fall back
    # to their repr instead of raising inside the logging hook.
    if retry_state.fn:
        func_name = getattr(retry_state.fn, "__name__", repr(retry_state.fn))
    else:
        func_name = "N/A"

    logger.info(
        f"Backing off for function '{func_name}': "
        f"Attempt {attempt_num} failed due to '{reason}'. "
        f"Waiting {wait_time:.2f} seconds before next attempt."
    )


@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    before_sleep=log_backoff_attempt,
)
async def embedding_with_backoff(**kwargs):
    """Create embeddings, retrying failed requests with random exponential backoff.

    Waits a randomised, exponentially growing interval (between 1 and 60
    seconds) between attempts, gives up after 6 attempts, and logs each
    backoff through ``log_backoff_attempt``.

    Args:
        **kwargs: Forwarded unchanged to ``client.embeddings.create``.

    Returns:
        The response returned by ``client.embeddings.create``.
    """
    # The request must be awaited here, inside the decorated coroutine.
    # Returning the un-awaited coroutine would count as an immediate success,
    # and any API error would be raised only when the caller awaits it —
    # outside the retry loop, so no retry would ever happen.
    return await client.embeddings.create(**kwargs)

# Setting up a vector database

In [None]:
# Connect to a LanceDB database stored at this relative path.
db = lancedb.connect("../data/tmdb-5000-movies-sentence-transformers")