# Estimate cost to embed U.S. Code

In [1]:
from decimal import Decimal
from pathlib import Path

from IPython.display import display
import polars as pl

from embed.demos import usc

In [2]:
# Price of text-embedding-ada-002 embeddings. At the time of writing,
# https://openai.com/pricing quotes it as "$0.0001 / 1K tokens", so the quoted
# figure is divided by 1000 to get the cost in dollars of a single token.
# Decimal keeps the arithmetic exact rather than binary floating point.
TOKEN_COST = Decimal('0.0001') / Decimal(1000)

In [3]:
# Relative path to the data directory handed to
# usc.full_tabulate_token_counts below. It is resolved against the kernel's
# working directory, so it assumes the notebook is run from its own folder.
# NOTE(review): presumably this holds the U.S. Code title files -- confirm.
data_dir = Path('../../data/')

In [4]:
# Build the token-count table using the per-token price. Judging by the
# displayed output, the result is a polars DataFrame with columns 'Title',
# 'Tokens', 'Cost ($)', 'Clean Tokens' and 'Clean Cost ($)', with one row per
# title plus a 'TOTALS' row.
# NOTE(review): how the helper reads data_dir is not visible here -- see
# embed.demos.usc for details.
df = usc.full_tabulate_token_counts(data_dir, TOKEN_COST)

In [5]:
# Temporarily raise polars' table row limit so the whole table is rendered
# instead of being truncated; the setting only applies inside this `with`
# block. 500 is presumably just comfortably larger than the number of rows.
with pl.Config(tbl_rows=500):
    display(df)

Title,Tokens,Cost ($),Clean Tokens,Clean Cost ($)
str,i64,decimal[7],i64,decimal[7]
"""usc01""",85612,0.0085612,49319,0.0049319
"""usc02""",2767913,0.2767913,1437089,0.1437089
"""usc03""",254388,0.0254388,150486,0.0150486
"""usc04""",88628,0.0088628,45535,0.0045535
"""usc05""",5797942,0.5797942,3013402,0.3013402
"""usc05A""",402768,0.0402768,293862,0.0293862
"""usc06""",1920290,0.192029,924886,0.0924886
"""usc07""",8966523,0.8966523,4632796,0.4632796
"""usc08""",2153419,0.2153419,1267539,0.1267539
"""usc09""",35457,0.0035457,17123,0.0017123


By that rough estimate, embedding the whole U.S. Code, even with tag attributes
removed, would cost about $11, would require making at least [13 661
requests](https://www.wolframalpha.com/input?i=111893880.0+%2F+8191) (at up to
8191 tokens per request), and would take about [80
MiB](https://www.wolframalpha.com/input?i=%28111893880+%2F+8191%29+*+4+*+1536+bytes+in+MiB)
to store (one 1536-component embedding at 4 bytes per component for each
request), if the embeddings are stored as compactly as NumPy represents them in
memory. (Uncompressed JSON would be significantly bigger.)

## How many of the tokens are Title 42?

In [6]:
# Narrow the table to the Title 42 row and the grand-total row, keeping only
# the raw and cleaned token counts so the two rows can be compared directly.
df.filter(
    pl.col('Title').is_in(['usc42', 'TOTALS'])
).select('Title', 'Tokens', 'Clean Tokens')

Title,Tokens,Clean Tokens
str,i64,i64
"""usc42""",33833321,18403173
"""TOTALS""",208867263,111893880


In [7]:
# Isolate the Title 42 row and the grand-total row of the table.
usc42_row = df.filter(pl.col('Title') == 'usc42')
totals_row = df.filter(pl.col('Title') == 'TOTALS')

# Raw token counts: the first (expected to be the only) value of each row.
usc42_tokens = usc42_row['Tokens'][0]
total_tokens = totals_row['Tokens'][0]

# "Clean" token counts (per the prose above, with tag attributes removed).
usc42_clean_tokens = usc42_row['Clean Tokens'][0]
total_clean_tokens = totals_row['Clean Tokens'][0]

In [8]:
# Share of all raw (uncleaned) tokens that belong to Title 42, to 3 decimals.
print(f'{usc42_tokens} / {total_tokens} = {usc42_tokens / total_tokens:.3F}')

33833321 / 208867263 = 0.162


In [9]:
# Share of all cleaned tokens that belong to Title 42, to 3 decimals.
clean_ratio_text = f'{(usc42_clean_tokens / total_clean_tokens):.3F}'
print(f'{usc42_clean_tokens} / {total_clean_tokens} = ' + clean_ratio_text)

18403173 / 111893880 = 0.164
