# Estimate cost to embed U.S. Code

In [1]:
import attrs
from decimal import Decimal
from pathlib import Path
import xml.etree.ElementTree as ET

from tabulate import tabulate
import tiktoken

In [2]:
# Price of text-embedding-ada-002 as listed on https://openai.com/pricing as
# of this writing: "$0.0004 / 1K tokens". Stored here in US dollars per token.
TOKEN_COST = Decimal('0.0004') / Decimal(1000)

In [3]:
# Tokenizer that tiktoken returns for the embedding model priced above.
# count_tokens uses it to measure the length of a text in tokens.
encoding = tiktoken.encoding_for_model('text-embedding-ada-002')

In [4]:
def count_tokens(text):
    """Return the number of tokens ``text`` encodes to with ``encoding``."""
    token_ids = encoding.encode(text)
    return len(token_ids)

In [5]:
def drop_attributes(element_text):  # See usc_manual.ipynb.
    """Return the XML in ``element_text`` with all tag attributes removed.

    The text is parsed, the attributes of every element in the tree (the root
    included) are cleared, and the tree is serialized back to a string.
    """
    root = ET.fromstring(element_text)
    for node in root.iter():
        node.attrib.clear()
    stripped_text = ET.tostring(root, encoding='unicode')
    return stripped_text

In [6]:
@attrs.frozen
class CostTableRow:  # TODO: Maybe use a pandas or Polars data frame instead.
    """One row of the table of token counts and embedding costs."""

    title: str = attrs.field()
    """The USC title this row describes, named by its XML file's basename."""

    tokens: int = attrs.field()
    """Token count of the full text of the USC title."""

    cost: Decimal = attrs.field()
    """US-dollar cost to embed the full text of the USC title."""

    clean_tokens: int = attrs.field()
    """Token count of the USC title once tag attributes are removed."""

    clean_cost: Decimal = attrs.field()
    """US-dollar cost to embed the USC title once tag attributes are removed."""

    @classmethod
    def compute(cls, path):
        """Build the ``CostTableRow`` for the USC title stored at ``path``.

        ``path`` may be a string or a ``Path``. The file is read as UTF-8, and
        its tokens are counted both as-is and with tag attributes dropped.
        """
        xml_path = Path(path)  # Accept plain strings as well as Path objects.
        raw_xml = xml_path.read_text(encoding='utf-8')

        raw_count = count_tokens(raw_xml)
        stripped_count = count_tokens(drop_attributes(raw_xml))

        return cls(
            title=xml_path.stem,
            tokens=raw_count,
            cost=raw_count * TOKEN_COST,
            clean_tokens=stripped_count,
            clean_cost=stripped_count * TOKEN_COST,
        )

In [7]:
# Try this out on just one of the files first, as a quick check of
# CostTableRow.compute before running it over the whole directory.
CostTableRow.compute('../../data/xml_uscAll@118-3not328/usc01.xml')

CostTableRow(title='usc01', tokens=86448, cost=Decimal('0.0345792'), clean_tokens=65046, clean_cost=Decimal('0.0260184'))

In [8]:
def compute_totals(rows):
    """Make a "TOTALS" row with the sums of all four numbers.

    ``rows`` is an iterable of ``CostTableRow`` objects. It is copied into a
    list first because it is summed four times: a one-shot iterable such as a
    generator would otherwise be exhausted by the first sum, leaving the other
    three totals at 0.

    Returns a ``CostTableRow`` titled ``'TOTALS'`` whose ``tokens``, ``cost``,
    ``clean_tokens`` and ``clean_cost`` are the column sums over ``rows``.
    """
    rows = list(rows)  # Materialize once so every sum below sees every row.
    return CostTableRow(
        title='TOTALS',
        tokens=sum(row.tokens for row in rows),
        cost=sum(row.cost for row in rows),
        clean_tokens=sum(row.clean_tokens for row in rows),
        clean_cost=sum(row.clean_cost for row in rows),
    )

In [9]:
def build_table_rows(dir):
    """Return the cost table for every ``*.xml`` file directly inside ``dir``.

    The result is a list of tuples, one per file, ordered by title, followed
    by a final tuple holding the totals.
    """
    title_rows = [
        CostTableRow.compute(xml_path) for xml_path in Path(dir).glob('*.xml')
    ]
    title_rows.sort(key=lambda row: row.title)
    all_rows = title_rows + [compute_totals(title_rows)]
    return list(map(attrs.astuple, all_rows))

In [10]:
# Column labels, in the same order as the fields of CostTableRow.
headers = ['Title', 'Tokens', 'Cost ($)', 'Clean Tokens', 'Clean Cost ($)']
# One tuple per XML file in the data directory, followed by the TOTALS row.
table_rows = build_table_rows('../../data/xml_uscAll@118-3not328/')
# Render as an HTML table; as the last expression, it is the cell's output.
# NOTE(review): the displayed costs look rounded to about six significant
# digits (usc02 shows 1.11494 for 2787354 tokens) -- presumably tabulate's
# default number formatting; the values in table_rows are unrounded.
tabulate(table_rows, headers=headers, tablefmt='html')

Title,Tokens,Cost ($),Clean Tokens,Clean Cost ($)
usc01,86448,0.0345792,65046,0.0260184
usc02,2787354,1.11494,1926874,0.77075
usc03,241628,0.0966512,183674,0.0734696
usc04,88632,0.0354528,60806,0.0243224
usc05,5752131,2.30085,3975940,1.59038
usc05A,402069,0.160828,338226,0.13529
usc06,1868609,0.747444,1270617,0.508247
usc07,8890968,3.55639,6136383,2.45455
usc08,2211879,0.884752,1634466,0.653786
usc09,35461,0.0141844,23886,0.0095544


By this rough estimate, embedding the whole U.S. Code, even with tag attributes removed, would cost about $60, would require at least [17947 requests](https://www.wolframalpha.com/input?i=147002851.0+%2F+8191) (at up to 8191 tokens per request), and would produce embeddings that take about [105 MiB](https://www.wolframalpha.com/input?i=%28147002851+%2F+8191%29+*+4+*+1536+bytes+in+MiB) to store (assuming one 1536-dimensional embedding of 4-byte values per request), if they are stored as compactly as NumPy represents them in memory. (Uncompressed JSON would be significantly bigger.)