# Estimate cost to embed U.S. Code

In [1]:
from decimal import Decimal
from pathlib import Path
import polars as pl
import xml.etree.ElementTree as ET

import tiktoken

In [2]:
# Price of a single token, in dollars, for text-embedding-ada-002.
# When this was written, https://openai.com/pricing quoted the model at
# "$0.0004 / 1K tokens", hence the division by 1000.
TOKEN_COST = Decimal('0.0004') / Decimal(1000)

In [3]:
# Directory that holds the input XML and the cached token-count table. The
# path is relative, so it resolves against the kernel's working directory.
data_dir = Path('../../data/')  # FIXME: See data_dir fixme in usc.ipynb.

In [4]:
# Encoding that tiktoken associates with the text-embedding-ada-002 model;
# count_tokens uses it to measure text length in tokens.
encoding = tiktoken.encoding_for_model('text-embedding-ada-002')

In [5]:
def count_tokens(text):
    """Return how many tokens the module-level ``encoding`` produces for ``text``."""
    token_ids = encoding.encode(text)
    return len(token_ids)

In [6]:
def drop_attributes(element_text):  # See usc_manual.ipynb.
    """Re-serialize the XML in ``element_text`` with every attribute removed.

    The text is parsed as a single XML document, the attribute mapping of the
    root and of each descendant element is emptied, and the resulting tree is
    returned as a ``str``.
    """
    root = ET.fromstring(element_text)

    for node in root.iter():
        # Element.iter() yields the root itself first, then all descendants.
        node.attrib.clear()

    stripped_xml = ET.tostring(root, encoding='unicode')
    return stripped_xml

In [7]:
def with_cost_columns(df_without_costs):
    """Return the frame with a dollar-cost column beside each token-count column.

    ``df_without_costs`` must provide 'Title', 'Tokens' and 'Clean Tokens'.
    Each count is multiplied by ``TOKEN_COST``, and the result keeps only the
    five columns 'Title', 'Tokens', 'Cost ($)', 'Clean Tokens' and
    'Clean Cost ($)', in that order.
    """
    # Bound method: called with a token count, it returns TOKEN_COST * count.
    to_dollars = TOKEN_COST.__mul__

    cost = pl.col('Tokens').apply(to_dollars).alias('Cost ($)')
    clean_cost = pl.col('Clean Tokens').apply(to_dollars).alias('Clean Cost ($)')

    df_with_costs = df_without_costs.with_columns(cost, clean_cost)
    return df_with_costs.select(
        'Title', 'Tokens', 'Cost ($)', 'Clean Tokens', 'Clean Cost ($)'
    )

In [8]:
# Per-title token counts are cached as CSV so later runs can skip re-tokenizing.
token_table_path = data_dir / 'usc_token_counts.csv'
try:
    # Fast path: reuse the table written by an earlier run.
    df = pl.read_csv(token_table_path)
except OSError:
    # The cache could not be read: tokenize each title's XML, once as-is and
    # once with tag attributes stripped.
    # FIXME: After debugging, change pattern to '*.xml' to use all the files.
    xml_paths = (data_dir / 'xml_uscAll@118-3not328/').glob('usc0?.xml')
    rows = (
        {
            'Title': xml_path.stem,
            'Tokens': count_tokens(full_text := xml_path.read_text(encoding='utf-8')),
            'Clean Tokens': count_tokens(drop_attributes(full_text)),
        }
        for xml_path in xml_paths
    )
    df = pl.DataFrame(rows).sort('Title')

    # Save the table, then read it back to check that the CSV round-trips.
    df.write_csv(token_table_path)
    assert pl.read_csv(token_table_path).frame_equal(df)

In [9]:
# Start from the per-title token counts only. Dropping any existing TOTALS row
# and cost columns first keeps this cell idempotent: re-running it used to
# append a second TOTALS row whose sums already included the previous one.
token_counts = (
    df.filter(pl.col('Title') != 'TOTALS')
      .select('Title', 'Tokens', 'Clean Tokens')
)

# Column sums as a one-row frame, labelled "TOTALS" in the Title column.
totals = token_counts.sum().with_columns(pl.Series("Title", ["TOTALS"]))

# Append the totals row, then derive the dollar-cost columns for every row.
df = with_cost_columns(pl.concat([token_counts, totals]))
df

Title,Tokens,Cost ($),Clean Tokens,Clean Cost ($)
str,i64,decimal[7],i64,decimal[7]
"""usc01""",86448,0.0345792,65046,0.0260184
"""usc02""",2787354,1.1149416,1926874,0.7707496
"""usc03""",241628,0.0966512,183674,0.0734696
"""usc04""",88632,0.0354528,60806,0.0243224
"""usc05""",5752131,2.3008524,3975940,1.590376
"""usc06""",1868609,0.7474436,1270617,0.5082468
"""usc07""",8890968,3.5563872,6136383,2.4545532
"""usc08""",2211879,0.8847516,1634466,0.6537864
"""usc09""",35461,0.0141844,23886,0.0095544
"""TOTALS""",21963110,8.785244,15277692,6.1110768


By this rough estimate, embedding the whole U.S. Code, even with tag attributes removed, would cost about $60, would require at least [17947 requests](https://www.wolframalpha.com/input?i=147002851.0+%2F+8191), and would take about [105 MiB](https://www.wolframalpha.com/input?i=%28147002851+%2F+8191%29+*+4+*+1536+bytes+in+MiB) to store the resulting embeddings, if they are stored as compactly as NumPy represents them in memory. (Uncompressed JSON would be significantly bigger.) These figures are based on a whole-Code total of 147,002,851 clean tokens, not on the Titles 1–9 subset shown in the table above.

In [10]:
# Title 42 ratio with tag attributes kept ("unclean" text).
# Presumably Title 42's token count over the whole-Code token count; both
# numbers are hard-coded here rather than recomputed -- TODO confirm source.
33_311_058 / 207_668_909

0.16040464680247346

In [11]:
# Title 42 ratio with tag attributes removed ("clean" text).
# The denominator is the whole-Code clean-token total used in the estimate
# above; the numerator is presumably Title 42's clean count -- TODO confirm.
23_884_845 / 147_002_851

0.16247878757126963