# Estimate cost to embed U.S. Code

In [1]:
from decimal import Decimal
from pathlib import Path
import polars as pl
import xml.etree.ElementTree as ET

import tiktoken

In [2]:
# As of this writing, https://openai.com/pricing lists the cost for
# text-embedding-ada-002 as "$0.0004 / 1K tokens".
TOKEN_COST = Decimal('0.0004') / 1000

In [3]:
data_dir = Path('../../data/')  # FIXME: See data_dir fixme in usc.ipynb.

In [4]:
encoding = tiktoken.encoding_for_model('text-embedding-ada-002')

In [5]:
def count_tokens(text):
    return len(encoding.encode(text))

In [6]:
def drop_attributes(element_text):  # See usc_manual.ipynb.
    tree = ET.fromstring(element_text)
    for element in tree.iter():
        element.attrib.clear()
    return ET.tostring(tree, encoding='unicode')

In [7]:
def compute_totals(rows):
    """Make a "TOTALS" row with the sums of all four numbers."""
    return CostTableRow(
        title='TOTALS',
        tokens=sum(row.tokens for row in rows),
        cost=sum(row.cost for row in rows),
        clean_tokens=sum(row.clean_tokens for row in rows),
        clean_cost=sum(row.clean_cost for row in rows),
    )

In [8]:
# FIXME: Remove or completely rewrite this, when switching to Polars.
def build_table_rows(dir):
    unsorted = (CostTableRow.compute(path) for path in Path(dir).glob('*.xml'))
    without_totals = sorted(unsorted, key=lambda row: row.title)
    with_totals = [*without_totals, compute_totals(without_totals)]
    return [attrs.astuple(row) for row in with_totals]

In [9]:
token_table_path = data_dir / 'usc_token_counts.csv'
try:
    df = pl.read_csv(token_table_path)
except OSError:
    df = pl.DataFrame(
        {
            'Title': path.stem,
            'Tokens': count_tokens(full_text := path.read_text(encoding='utf-8')),
            'Clean Tokens': count_tokens(drop_attributes(full_text)),
        }
        for path in (data_dir / 'xml_uscAll@118-3not328/').glob('usc0?.xml')
        # FIXME: After debugging, change pattern to '*.xml' to use all the files. ^
    ).sort('Title')

    df.write_csv(token_table_path)
    assert pl.read_csv(token_table_path).frame_equal(df)

In [10]:
df = df.with_columns(
    pl.col('Tokens').apply(TOKEN_COST.__mul__).alias('Cost ($)'),
    pl.col('Clean Tokens').apply(TOKEN_COST.__mul__).alias('Clean Cost ($)'),
).select('Title', 'Tokens', 'Cost ($)', 'Clean Tokens', 'Clean Cost ($)')

In [11]:
# FIXME: Remove or completely rewrite this, when switching to Polars.

# headers = ['Title', 'Tokens', 'Cost ($)', 'Clean Tokens', 'Clean Cost ($)']
# table_rows = build_table_rows('../../data/xml_uscAll@118-3not328/')
# tabulate(table_rows, headers=headers, tablefmt='html')

In [17]:
df.to_pandas().sum(numeric_only=False)

ModuleNotFoundError: No module named 'pyarrow'

By this rough estimate, embedding the whole U.S. Code, even with tag attributes removed, would cost about $60, require making at least [17947 requests](https://www.wolframalpha.com/input?i=147002851.0+%2F+8191), and would take about [105 MiB](https://www.wolframalpha.com/input?i=%28147002851+%2F+8191%29+*+4+*+1536+bytes+in+MiB) to store, if stored as compactly as NumPy represents them in memory. (Uncompressed JSON would be significantly bigger.)

In [13]:
# unclean title 42 ratio
33311058/207668909

0.16040464680247346

In [14]:
# clean title 42 ratio
23884845/147002851

0.16247878757126963