# Estimate cost to embed U.S. Code

In [1]:
from decimal import Decimal
from pathlib import Path

from IPython.display import display
import polars as pl

from embed.demos import usc

In [2]:
# Price of embedding a single token, in U.S. dollars.
# As of this writing, https://openai.com/pricing lists the cost for
# text-embedding-ada-002 as "$0.0001 / 1K tokens", so the listed price is
# divided by 1000. Decimal keeps the result exact (no binary-float rounding).
TOKEN_COST = Decimal('0.0001') / Decimal(1000)

In [3]:
# Relative path, resolved against the kernel's current working directory, that
# is handed to usc.full_tabulate_token_counts in the next cell.
# NOTE(review): presumably this is where the U.S. Code data lives -- confirm
# against embed.demos.usc.
data_dir = Path('../../data/')

In [4]:
# Build the per-title token/cost table from data_dir and the per-token price.
# As displayed below, the result has the columns Title, Tokens, Cost ($),
# Clean Tokens and Clean Cost ($), and includes a 'TOTALS' row that later
# cells filter on.
# NOTE(review): how data_dir is read (and whether results are cached) is not
# visible in this notebook -- see embed.demos.usc.
df = usc.full_tabulate_token_counts(data_dir, TOKEN_COST)

In [5]:
# Temporarily raise polars' row-display limit to 500 so the table is shown in
# full instead of being elided; the Config context manager scopes the setting
# to this block.
with pl.Config() as table_cfg:
    table_cfg.set_tbl_rows(500)
    display(df)

Title,Tokens,Cost ($),Clean Tokens,Clean Cost ($)
str,i64,decimal[7],i64,decimal[7]
"""usc01""",86448,0.0086448,49186,0.0049186
"""usc02""",2787354,0.2787354,1429091,0.1429091
"""usc03""",241628,0.0241628,143233,0.0143233
"""usc04""",88632,0.0088632,45539,0.0045539
"""usc05""",5752131,0.5752131,3022284,0.3022284
"""usc05A""",402069,0.0402069,292921,0.0292921
"""usc06""",1868609,0.1868609,923624,0.0923624
"""usc07""",8890968,0.8890968,4600229,0.4600229
"""usc08""",2211879,0.2211879,1266676,0.1266676
"""usc09""",35461,0.0035461,17127,0.0017127


By that rough estimate, embedding the whole U.S. Code, even with tag attributes
removed, would cost about $11, require making at least [13 575
requests](https://www.wolframalpha.com/input?i=111189563.0+%2F+8191), and take
about [80
MiB](https://www.wolframalpha.com/input?i=%28111189563+%2F+8191%29+*+4+*+1536+bytes+in+MiB)
to store the embeddings, if they are stored as compactly as NumPy represents
them in memory. (Uncompressed JSON would be significantly bigger.)

## How many of the tokens are Title 42?

In [6]:
# Keep only the Title 42 row and the grand-total row, and show just the two
# token-count columns next to the title.
is_title42_or_totals = pl.col('Title').is_in(['usc42', 'TOTALS'])
df.filter(is_title42_or_totals).select('Title', 'Tokens', 'Clean Tokens')

Title,Tokens,Clean Tokens
str,i64,i64
"""usc42""",33311058,18184537
"""TOTALS""",207668909,111189563


In [7]:
# Single-row frames for Title 42 and for the grand total.
usc42_row = df.filter(pl.col('Title') == 'usc42')
totals_row = df.filter(pl.col('Title') == 'TOTALS')

# Scalar values from the 'Tokens' column (first, and only expected, row).
usc42_tokens = usc42_row['Tokens'][0]
total_tokens = totals_row['Tokens'][0]

# Scalar values from the 'Clean Tokens' column.
usc42_clean_tokens = usc42_row['Clean Tokens'][0]
total_clean_tokens = totals_row['Clean Tokens'][0]

In [8]:
# Fraction of all tokens that Title 42 accounts for, using the 'Tokens'
# (uncleaned) counts; shown to three decimal places.
unclean_ratio = usc42_tokens / total_tokens
print(f'{usc42_tokens} / {total_tokens} = {unclean_ratio:.3F}')

33311058 / 207668909 = 0.160


In [9]:
# Fraction of all tokens that Title 42 accounts for, using the 'Clean Tokens'
# counts; shown to three decimal places.
clean_ratio = usc42_clean_tokens / total_clean_tokens
print(f'{usc42_clean_tokens} / {total_clean_tokens} = {clean_ratio:.3F}')

18184537 / 111189563 = 0.164
