# Estimate cost to embed U.S. Code

In [1]:
from decimal import Decimal
from pathlib import Path

from lxml import etree as ET
import polars as pl

from embed import count_tokens

In [2]:
# Price of embedding a single token, in dollars. When this was written,
# https://openai.com/pricing quoted text-embedding-ada-002 at
# "$0.0004 / 1K tokens", so the per-token price is that figure over 1000.
# Decimal keeps the arithmetic exact (no binary floating point rounding).
TOKEN_COST = Decimal('0.0004') / Decimal(1000)

In [3]:
# Directory that holds the data files used below (the XML files that are read
# and the cached token-count CSV). The path is relative, so it is resolved
# against the current working directory.
data_dir = Path('../../data/')  # FIXME: See data_dir fixme in usc.ipynb.

In [4]:
def drop_attributes(element_text):
    """
    Return the XML in ``element_text`` reserialized without any attributes.

    The text is parsed, the attributes of every node reached by iterating
    over the tree are cleared, and the tree is serialized back to a ``str``.
    See usc_manual.ipynb.
    """
    # Parse from bytes rather than str (presumably because lxml rejects str
    # input that carries an XML encoding declaration -- confirm if changing).
    root = ET.fromstring(element_text.encode('utf-8'))

    for node in root.iter():
        node.attrib.clear()

    # encoding='unicode' makes tostring return str instead of bytes.
    return ET.tostring(root, encoding='unicode')

In [5]:
def with_cost_columns(df_without_costs):
    """
    Return a copy of the table with a dollar-cost column per tokens column.

    ``df_without_costs`` must have 'Title', 'Tokens', and 'Clean Tokens'
    columns. Each cost is the corresponding token count multiplied by
    TOKEN_COST. The result contains exactly the columns 'Title', 'Tokens',
    'Cost ($)', 'Clean Tokens', and 'Clean Cost ($)', in that order.
    """
    def cost_of(tokens_name, cost_name):
        # Expression for one cost column, computed from one tokens column.
        return pl.col(tokens_name).apply(TOKEN_COST.__mul__).alias(cost_name)

    with_costs = df_without_costs.with_columns(
        cost_of('Tokens', 'Cost ($)'),
        cost_of('Clean Tokens', 'Clean Cost ($)'),
    )

    # Put each cost column right after the tokens column it was derived from.
    return with_costs.select(
        'Title', 'Tokens', 'Cost ($)', 'Clean Tokens', 'Clean Cost ($)',
    )

In [6]:
# Use the saved table of token counts if it can be read. Otherwise, count the
# tokens in each XML file (which is the slow part) and save the table.
token_table_path = data_dir / 'usc_token_counts.csv'
try:
    df = pl.read_csv(token_table_path)
except OSError:
    # Locate the XML files to process. Stop with a clear message if there are
    # none, rather than building and saving an empty table.
    xml_dir = data_dir / 'xml_uscAll@118-3not328/'
    paths = sorted(xml_dir.glob('*.xml'))
    assert paths, 'No XML files found.'

    def token_count_rows():
        """Yield one row per file: counts with and without tag attributes."""
        for path in paths:
            full_text = path.read_text(encoding='utf-8')
            yield {
                'Title': path.stem,
                'Tokens': count_tokens(full_text),
                'Clean Tokens': count_tokens(drop_attributes(full_text)),
            }

    # The rows are produced lazily, so only one file's text is needed at once.
    df = pl.DataFrame(token_count_rows())

    # Write the table out, then read it back to verify that it round-trips.
    df.write_csv(token_table_path)
    assert pl.read_csv(token_table_path).frame_equal(df)

In [7]:
# Add a totals row at the bottom, and cost columns for each tokens column.
totals = df.sum().with_columns(pl.Series("Title", ["TOTALS"]))
df = pl.concat([df, totals])
df = with_cost_columns(df)

In [8]:
# Show the table with the displayed-row limit raised to 500 for the duration
# of the with-block, so that rows are not elided from the rendered output.
with pl.Config(tbl_rows=500):
    display(df)

Title,Tokens,Cost ($),Clean Tokens,Clean Cost ($)
str,i64,decimal[7],i64,decimal[7]
"""usc01""",86448,0.0345792,49186,0.0196744
"""usc02""",2787354,1.1149416,1429091,0.5716364
"""usc03""",241628,0.0966512,143233,0.0572932
"""usc04""",88632,0.0354528,45539,0.0182156
"""usc05""",5752131,2.3008524,3022284,1.2089136
"""usc05A""",402069,0.1608276,292921,0.1171684
"""usc06""",1868609,0.7474436,923624,0.3694496
"""usc07""",8890968,3.5563872,4600229,1.8400916
"""usc08""",2211879,0.8847516,1266676,0.5066704
"""usc09""",35461,0.0141844,17127,0.0068508


By that rough estimate, embedding the whole U.S. Code, even with tag attributes
removed, would cost about $45, would require making at least [13 575
requests](https://www.wolframalpha.com/input?i=111189563.0+%2F+8191), and would
take about [80
MiB](https://www.wolframalpha.com/input?i=%28111189563+%2F+8191%29+*+4+*+1536+bytes+in+MiB)
to store the embeddings, if they are stored as compactly as NumPy represents
them in memory. (Uncompressed JSON would be significantly bigger.)

## How many of the tokens are Title 42?

In [9]:
# Narrow the table to the token-count columns, then keep just the Title 42
# row and the totals row for a side-by-side comparison.
(
    df.select('Title', 'Tokens', 'Clean Tokens')
      .filter(pl.col('Title').is_in(['usc42', 'TOTALS']))
)

Title,Tokens,Clean Tokens
str,i64,i64
"""usc42""",33311058,18184537
"""TOTALS""",207668909,111189563


In [10]:
# Pick out the two rows of interest: Title 42 and the totals row.
usc42_row = df.filter(pl.col('Title') == 'usc42')
totals_row = df.filter(pl.col('Title') == 'TOTALS')

# From each of those rows, pull the token counts out as plain scalars: first
# the count with tag attributes, then the count with them removed.
usc42_tokens, usc42_clean_tokens = (
    usc42_row.get_column(name)[0] for name in ('Tokens', 'Clean Tokens')
)
total_tokens, total_clean_tokens = (
    totals_row.get_column(name)[0] for name in ('Tokens', 'Clean Tokens')
)

In [11]:
# Unclean Title 42 ratio: the fraction of all tokens that are in Title 42,
# using the counts that include tag attributes. The two counts are printed,
# followed by their quotient to three decimal places.
print(f'{usc42_tokens} / {total_tokens} = '
      f'{(usc42_tokens / total_tokens):.3F}')

33311058 / 207668909 = 0.160


In [12]:
# Clean title 42 ratio.
# The "clean" counts are the ones taken with tag attributes removed. The two
# counts are printed, followed by their quotient to three decimal places.
print(f'{usc42_clean_tokens} / {total_clean_tokens} = '
      f'{(usc42_clean_tokens / total_clean_tokens):.3F}')

18184537 / 111189563 = 0.164
