In [2]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading tiktoken-0.9.0-cp310-cp310-macosx_11_0_arm64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl (284 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.6/284.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: regex, tiktoken
  Attempting uninstall: regex
    Found existing installation: regex 2021.11.10
    Uninstalling regex-2021.11.10:
      Successfully uninstalled regex-2021.11.10
Successfully inst

In [7]:
import pandas as pd
import tiktoken
import math

# Initialize tokenizer
enc = tiktoken.get_encoding("cl100k_base")


def count_tokens_for_row(row):
    """
    Concatenate the five fields, converting everything to str:
      - title       : TEXT
      - description : TEXT
      - features    : TEXT_ARRAY
      - store       : TEXT
      - details     : TEXT (JSON string)
    """
    # Safely extract and stringify each field
    title = str(row.get("title", "")) if pd.notna(row.get("title", "")) else ""
    description = (
        str(row.get("description", "")) if pd.notna(row.get("description", "")) else ""
    )
    # Features is a list; convert each element to str and join
    features_list = row.get("features", [])
    features = " ".join(str(f) for f in features_list if pd.notna(f))
    store = str(row.get("store", "")) if pd.notna(row.get("store", "")) else ""
    details = str(row.get("details", "")) if pd.notna(row.get("details", "")) else ""

    # Build the full text and split on spaces
    parts = [title, description, features, store, details]
    # Keep only non-empty strings
    text = " ".join(p for p in parts if p)

    # Return the number of tokens
    return len(enc.encode(text))


# Example usage:
df = pd.read_csv("./data/products.csv", compression="zip")
df["token_count"] = df.apply(count_tokens_for_row, axis=1)
total_tokens = df["token_count"].sum()
print(f"Total tokens: {total_tokens}")

Total tokens: 118299744


In [8]:
price_per_1k = 0.00002  # text-embedding-3-small rate
estimated_cost = (total_tokens / 1000) * price_per_1k
print(f"Estimated embedding cost: ${estimated_cost:.2f}")

Estimated embedding cost: $2.37
