In [1]:
import logging
from pathlib import Path

from lxml import etree as ET
import polars as pl

import embed
from embed.demos import usc

In [2]:
# Configure the root logger to emit messages at INFO level and above.
# Note: basicConfig() does nothing if the root logger already has handlers.
# Consider switching to DEBUG if this doesn't produce enough info.
logging.basicConfig(level=logging.INFO)

In [17]:
# Display up to 100 characters of each string value when a frame is rendered,
# so the serialized XML in the "Text" column is not cut off too early.
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [3]:
# Location of the USC data, relative to this notebook.
data_dir = Path('../../data/usc/')

# Make the USC XML files available under data_dir.
# NOTE(review): download=True presumably fetches the archive when it is
# missing -- confirm against embed.demos.usc.extract_usc.
usc.extract_usc(data_dir, download=True)

# Parse Title 42 and keep its root element for the cells below.
usc42_path = data_dir / usc.USC_STEM / 'usc42.xml'
usc42root = ET.parse(usc42_path).getroot()

In [4]:
# For now, we work just with sections small enough to embed without splitting.
# This only collects the sections of Title 42; the size filter itself is
# applied later, when the DataFrame is built.
sections = usc.get_direct_sections(usc42root)

In [5]:
# What we want to do:
#  Save serializations of sections that are embeddable

In [6]:
# Columns before embedding: identifier, status, text
# Columns after embedding:  identifier, status, text, embedding

In [7]:
# One record per section whose cleaned XML serialization fits within
# embed.TOKEN_LIMIT tokens; sections that are too long are skipped for now.
section_records = [
    {
        "Identifier": section.attrib['identifier'],
        "Status": section.get('status'),  # None when the attribute is absent.
        "Text": text,
    }
    for section in sections[:10]  # FIXME: After initial testing, stop slicing.
    if embed.count_tokens(text := usc.serialize_xml_clean(section)) <= embed.TOKEN_LIMIT
]

# Declare the schema explicitly instead of relying on inference. "Status" is
# frequently null, so an inferred dtype would be Null whenever no sampled
# section carries a status, and an empty result would otherwise produce a
# frame with no columns at all (breaking later lookups by column name).
df = pl.DataFrame(
    section_records,
    schema={"Identifier": pl.Utf8, "Status": pl.Utf8, "Text": pl.Utf8},
)

In [19]:
# Preview the assembled frame; a bare last expression is rendered as a table.
df

Identifier,Status,Text
str,str,str
"""/us/usc/t42/s1...1j""","""repealed""","""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§§ 1 to 1j.</num><heading> Repealed. <r…"
"""/us/usc/t42/s2""","""omitted""","""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 2.</num><heading> Omitted</heading><n…"
"""/us/usc/t42/s3 /us/usc/t42/s4""","""repealed""","""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§§ 3, 4.</num><heading> Repealed. <ref>…"
"""/us/usc/t42/s5""","""omitted""","""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 5.</num><heading> Omitted</heading><n…"
"""/us/usc/t42/s6...15a""","""repealed""","""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§§ 6 to 15a.</num><heading> Repealed. <…"
"""/us/usc/t42/s16""","""omitted""","""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 16.</num><heading> Omitted</heading><…"
"""/us/usc/t42/s17...25e""","""repealed""","""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§§ 17 to 25e.</num><heading> Repealed. …"
"""/us/usc/t42/s26""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 26.</num><heading> Isolation of civil…"
"""/us/usc/t42/s27""",,"""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§ 27.</num><heading> Definitions</headi…"
"""/us/usc/t42/s28...43""","""repealed""","""<section xmlns=""http://xml.house.gov/schemas/uslm/1.0""><num>§§ 28 to 43.</num><heading> Repealed. <…"


In [27]:
# Look up one section by identifier and show its complete "Text" value as a
# plain Python string, which is not subject to the frame's display truncation.
matching_rows = df.filter(pl.col('Identifier') == '/us/usc/t42/s1...1j')
matching_rows.get_column('Text')[0]

'<section xmlns="http://xml.house.gov/schemas/uslm/1.0"><num>§§\u202f1 to 1j.</num><heading> Repealed. <ref>July 1, 1944, ch. 373</ref>, title XIII, §\u202f1313, <ref>58 Stat. 714</ref></heading><notes>\n<note>\n<p>Section 1, acts <ref>July 1, 1902, ch. 1370, §\u202f1</ref>, <ref>32 Stat. 712</ref>; <ref>Aug. 14, 1912, ch. 288, §\u202f1</ref>, <ref>37 Stat. 309</ref>, provided that Public Health and Marine Hospital Service should be known as the Public Health Service. See <ref>section 202 of this title</ref>.</p>\n</note>\n<note>\n<p>Section 1a, <ref>act Nov. 11, 1943, ch. 298, §\u202f1</ref>, <ref>57 Stat. 587</ref>, provided for organization and function of Public Health Service. See <ref>section 203 of this title</ref>.</p>\n</note>\n<note>\n<p>Section 1b, <ref>act Nov. 11, 1943, ch. 298, §\u202f2</ref>, <ref>57 Stat. 587</ref>, provided for appointment of Assistant Surgeons General, their grade, pay, and allowances. See sections 206, 207, and 210 of this title.</p>\n</note>\n<note>