In [1]:
from collections import Counter
import contextlib
import copy
import itertools
from pathlib import Path
from pprint import pp

from IPython.display import display
from clock_timer import ClockLogger
import ipywidgets
from lxml import etree as ET

import embed
from embed.demos import usc

In [2]:
def output_box(enable=True, *, height='28em'):
    """
    Return a context manager that confines cell output to a scrollable box.

    While the returned context is active, output is routed into an
    ``ipywidgets.Output`` widget whose layout height is ``height``. Passing a
    false ``enable`` returns a do-nothing context instead, so a cell can be
    "unboxed" and "reboxed" by flipping one argument.

    Be aware that boxed output is not typically saved in the .ipynb file.
    """
    @contextlib.contextmanager
    def boxed():
        box = ipywidgets.Output(layout=ipywidgets.Layout(height=height))
        display(box)
        box.clear_output()  # Avoid doubled output.
        with box:
            yield

    # Nothing above has run yet: the widget is only created on __enter__.
    return boxed() if enable else contextlib.nullcontext()

In [3]:
# § 1 - 1j
S1 = """<section status="repealed" style="-uslm-lc:I80" id="idb0fba64f-d970-11ed-a113-e530e1711693" identifier="/us/usc/t42/s1...1j"><num value="1 to 1j">§§ 1 to 1j.</num><heading> Repealed. <ref href="/us/act/1944-07-01/ch373">July 1, 1944, ch. 373</ref>, title XIII, § 1313, <ref href="/us/stat/58/714">58 Stat. 714</ref></heading><notes type="uscNote" id="idb0fba650-d970-11ed-a113-e530e1711693">
<note topic="removalDescription" id="idb0fba651-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1, acts <ref href="/us/act/1902-07-01/ch1370/s1">July 1, 1902, ch. 1370, § 1</ref>, <ref href="/us/stat/32/712">32 Stat. 712</ref>; <ref href="/us/act/1912-08-14/ch288/s1">Aug. 14, 1912, ch. 288, § 1</ref>, <ref href="/us/stat/37/309">37 Stat. 309</ref>, provided that Public Health and Marine Hospital Service should be known as the Public Health Service. See <ref href="/us/usc/t42/s202">section 202 of this title</ref>.</p>
</note>
<note topic="removalDescription" id="idb0fba652-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1a, <ref href="/us/act/1943-11-11/ch298/s1">act Nov. 11, 1943, ch. 298, § 1</ref>, <ref href="/us/stat/57/587">57 Stat. 587</ref>, provided for organization and function of Public Health Service. See <ref href="/us/usc/t42/s203">section 203 of this title</ref>.</p>
</note>
<note topic="removalDescription" id="idb0fba653-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1b, <ref href="/us/act/1943-11-11/ch298/s2">act Nov. 11, 1943, ch. 298, § 2</ref>, <ref href="/us/stat/57/587">57 Stat. 587</ref>, provided for appointment of Assistant Surgeons General, their grade, pay, and allowances. See sections 206, 207, and 210 of this title.</p>
</note>
<note topic="removalDescription" id="idb0fba654-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1c, <ref href="/us/act/1943-11-11/ch298/s3">act Nov. 11, 1943, ch. 298, § 3</ref>, <ref href="/us/stat/57/587">57 Stat. 587</ref>, provided for chiefs of divisions, their grade, pay and allowances, and creation of a Dental Division and a Sanitary Engineering Division. See sections 206, 207, and 210 of this title.</p>
</note>
<note topic="removalDescription" id="idb0fba655-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1d, <ref href="/us/act/1943-11-11/ch298/s4">act Nov. 11, 1943, ch. 298, § 4</ref>, <ref href="/us/stat/57/587">57 Stat. 587</ref>, provided for temporary promotions in regular corps in time of war. See <ref href="/us/usc/t42/s211">section 211 of this title</ref>.</p>
</note>
<note topic="removalDescription" id="idb0fba656-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1e, <ref href="/us/act/1943-11-11/ch298/s5">act Nov. 11, 1943, ch. 298, § 5</ref>, <ref href="/us/stat/58/588">58 Stat. 588</ref>, provided for review of record of officers above grade of assistant surgeon and their separation from service. See <ref href="/us/usc/t42/s211">section 211 of this title</ref>.</p>
</note>
<note topic="removalDescription" id="idb0fba657-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1f, <ref href="/us/act/1943-11-11/ch298/s6">act Nov. 11, 1943, ch. 298, § 6</ref>, <ref href="/us/stat/58/588">58 Stat. 588</ref>, provided for an acting Surgeon General during absence of Surgeon General and Assistant to Surgeon General. See <ref href="/us/usc/t42/s206">section 206 of this title</ref>.</p>
</note>
<note topic="removalDescription" id="idb0fba658-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1g, <ref href="/us/act/1943-11-11/ch298/s7">act Nov. 11, 1943, ch. 298, § 7</ref>, <ref href="/us/stat/57/588">57 Stat. 588</ref>, provided for death and disability benefits of commissioned officers during war and for transfer of Service to military forces. See sections 213 and 217 of this title.</p>
</note>
<note topic="removalDescription" id="idb0fba659-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1h, <ref href="/us/act/1943-11-11/ch298/s8">act Nov. 11, 1943, ch. 298, § 8</ref>, <ref href="/us/stat/57/589">57 Stat. 589</ref>, provided for commissioned officers’ benefits as civil officers and employees of United States and election of benefits. See Title 5, Government Organization and Employees.</p>
</note>
<note topic="removalDescription" id="idb0fba65a-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1i, <ref href="/us/act/1943-11-11/ch298/s9">act Nov. 11, 1943, ch. 298, § 9</ref>, <ref href="/us/stat/57/589">57 Stat. 589</ref>, provided for beneficiaries’ benefits where commissioned officer lost his life on active duty between <date date="1941-12-07">Dec. 7, 1941</date>, and <date date="1943-11-11">Nov. 11, 1943</date>.</p>
</note>
<note topic="removalDescription" id="idb0fba65b-d970-11ed-a113-e530e1711693">
<p style="-uslm-lc:I21" class="indent0">Section 1j, <ref href="/us/act/1943-11-11/ch298/s11">act Nov. 11, 1943, ch. 298, § 11</ref>, <ref href="/us/stat/57/589">57 Stat. 589</ref>, provided for transfer of appropriations to continue transferred functions. See note set out under <ref href="/us/usc/t42/s201">section 201 of this title</ref>.</p>
</note>
<note style="-uslm-lc:I74" role="crossHeading" topic="statutoryNotes" id="idb0fba65c-d970-11ed-a113-e530e1711693"><heading class="centered"><b>Statutory Notes and Related Subsidiaries</b></heading></note>
<note style="-uslm-lc:I74" topic="miscellaneous" id="idb0fba65d-d970-11ed-a113-e530e1711693"><heading class="centered smallCaps">Renumbering and Repeal of Repealing Act</heading><p style="-uslm-lc:I21" class="indent0">Title XIII, § 1313, formerly title VI, § 611, of act <date date="1944-07-01">July 1, 1944</date>, which repealed these sections, was renumbered title VII, § 711, by <ref href="/us/act/1946-08-13/ch958/s5">act Aug. 13, 1946, ch. 958, § 5</ref>, <ref href="/us/stat/60/1049">60 Stat. 1049</ref>; § 713, by <ref href="/us/act/1948-02-28/ch83/s9/b">act Feb. 28, 1948, ch. 83, § 9(b)</ref>, <ref href="/us/stat/62/47">62 Stat. 47</ref>; title VIII, § 813, by <ref href="/us/act/1956-07-30/ch779/s3/b">act July 30, 1956, ch. 779, § 3(b)</ref>, <ref href="/us/stat/70/721">70 Stat. 721</ref>; title IX, § 913, by <ref href="/us/pl/88/581/s4/b">Pub. L. 88–581, § 4(b)</ref>, <date date="1964-09-04">Sept. 4, 1964</date>, <ref href="/us/stat/78/919">78 Stat. 919</ref>; title X, § 1013, by <ref href="/us/pl/89/239/s3/b">Pub. L. 89–239, § 3(b)</ref>, <date date="1965-10-06">Oct. 6, 1965</date>, <ref href="/us/stat/79/931">79 Stat. 931</ref>; title XI, § 1113, by <ref href="/us/pl/91/572/s6/b">Pub. L. 91–572, § 6(b)</ref>, <date date="1970-12-24">Dec. 24, 1970</date>, <ref href="/us/stat/84/1506">84 Stat. 1506</ref>; title XII, § 1213, by <ref href="/us/pl/92/294/s3/b">Pub. L. 92–294, § 3(b)</ref>, <date date="1972-05-16">May 16, 1972</date>, <ref href="/us/stat/86/137">86 Stat. 137</ref>; title XIII, § 1313, by <ref href="/us/pl/93/154/s2/b/2">Pub. L. 93–154, § 2(b)(2)</ref>, <date date="1973-11-16">Nov. 16, 1973</date>, <ref href="/us/stat/87/604">87 Stat. 604</ref>, and was repealed by <ref href="/us/pl/93/222/s7/b">Pub. L. 93–222, § 7(b)</ref>, <date date="1973-12-29">Dec. 29, 1973</date>, <ref href="/us/stat/87/936">87 Stat. 936</ref>.</p>
</note>
</notes>
</section>"""

In [4]:
embed.count_tokens(S1)

2817

In [5]:
data_dir = Path('../../data/')
usc.extract_usc(data_dir, download=True)
usc42 = data_dir / usc.USC_STEM / 'usc42.xml'

In [6]:
usc42textorig = usc42.read_text(encoding='utf-8')

In [7]:
# embed.count_tokens(usc42textorig)

In [8]:
usc42tree = ET.parse(usc42)

In [9]:
usc42root = usc42tree.getroot()

In [10]:
usc42text = usc.serialize_xml(usc42root)

In [11]:
usc42text.count('<section')

8810

In [12]:
usc42textorig.count('<section')

8810

In [13]:
usc42textorig.count('status="repealed"')

703

In [14]:
usc42root.nsmap

{None: 'http://xml.house.gov/schemas/uslm/1.0',
 'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
 'dc': 'http://purl.org/dc/elements/1.1/',
 'dcterms': 'http://purl.org/dc/terms/'}

In [15]:
schema_prefix = usc.get_schema_prefix(usc42root)
section_tag = schema_prefix + 'section'
sections = usc42tree.findall(f'.//{section_tag}')

In [16]:
len(sections)

8810

In [17]:
# sum(
#     1 for section in sections
#     if usc.count_tokens_xml(section) > embed.CONTEXT_LENGTH
# )

In [18]:
sum(1 for section in sections if usc.is_repealed(section))

698

In [19]:
sum(1 for element in usc42tree.iter() if usc.is_repealed(element))

703

In [20]:
repealed_sections = {section for section in sections if usc.is_repealed(section)}

In [21]:
repealed_elements = {element for element in usc42tree.iter() if usc.is_repealed(element)}

In [22]:
repealed_sections.issubset(repealed_elements)

True

In [23]:
repealed_non_sections = repealed_elements - repealed_sections
len(repealed_non_sections)

5

In [24]:
repealed_non_sections

{<Element {http://xml.house.gov/schemas/uslm/1.0}subsection at 0x18f01a4f300>,
 <Element {http://xml.house.gov/schemas/uslm/1.0}subsection at 0x18f01a4fc00>,
 <Element {http://xml.house.gov/schemas/uslm/1.0}subsection at 0x18f01a4fe40>,
 <Element {http://xml.house.gov/schemas/uslm/1.0}subsection at 0x18f01aef4c0>,
 <Element {http://xml.house.gov/schemas/uslm/1.0}subsection at 0x18f01b06140>}

In [25]:
some_repealed_non_section = next(iter(repealed_non_sections))
print(usc.serialize_xml(some_repealed_non_section))

<subsection xmlns="http://xml.house.gov/schemas/uslm/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" style="-uslm-lc:I21" class="indent0" status="repealed"><num value="g">“[(g)</num><content> Repealed. <ref href="/us/pl/101/508/tIV/s4118/i/2">Pub. L. 101–508, title IV, § 4118(i)(2)</ref>, <date date="1990-11-05">Nov. 5, 1990</date>, <ref href="/us/stat/104/1388-70">104 Stat. 1388–70</ref>.]</content>
</subsection>



In [26]:
cleaned_copied = [usc.serialize_xml_clean(section) for section in sections]

In [27]:
too_long_counts = [
    count for section in sections
    if (count := usc.count_tokens_xml_clean(section)) > embed.CONTEXT_LENGTH
]

In [28]:
len(too_long_counts)

358

In [29]:
with output_box():
    display(sorted(too_long_counts))

Output(layout=Layout(height='28em'))

In [30]:
# Map the identifier of each over-long section to its token count.
too_long = Counter({
    section.attrib['identifier']: count
    for section in sections
    if (count := usc.count_tokens_xml_clean(section)) > embed.CONTEXT_LENGTH
})

with output_box():
    # pp() prints its argument and returns None, so it must not be wrapped in
    # display(): that would additionally show a stray "None".
    pp(too_long)

Output(layout=Layout(height='28em'))

In [31]:
section1396a = usc42tree.find(f".//{section_tag}[@identifier = '/us/usc/t42/s1396a']")
section1396a

<Element {http://xml.house.gov/schemas/uslm/1.0}section at 0x18f01ab6fc0>

In [32]:
usc.count_tokens_xml_clean(section1396a)

245488

In [33]:
with output_box():
    display(list(section1396a))

Output(layout=Layout(height='28em'))

In [34]:
list(usc42root)

[<Element {http://xml.house.gov/schemas/uslm/1.0}meta at 0x18f0a90be80>,
 <Element {http://xml.house.gov/schemas/uslm/1.0}main at 0x18f0a935680>]

In [35]:
section1396a.tag

'{http://xml.house.gov/schemas/uslm/1.0}section'

In [36]:
{child.tag for child in section1396a}

{'{http://xml.house.gov/schemas/uslm/1.0}heading',
 '{http://xml.house.gov/schemas/uslm/1.0}notes',
 '{http://xml.house.gov/schemas/uslm/1.0}num',
 '{http://xml.house.gov/schemas/uslm/1.0}sourceCredit',
 '{http://xml.house.gov/schemas/uslm/1.0}subsection'}

In [37]:
unique_tags = {child.tag for section in sections for child in section}
unique_tags

{'{http://xml.house.gov/schemas/uslm/1.0}chapeau',
 '{http://xml.house.gov/schemas/uslm/1.0}clause',
 '{http://xml.house.gov/schemas/uslm/1.0}content',
 '{http://xml.house.gov/schemas/uslm/1.0}continuation',
 '{http://xml.house.gov/schemas/uslm/1.0}heading',
 '{http://xml.house.gov/schemas/uslm/1.0}level',
 '{http://xml.house.gov/schemas/uslm/1.0}note',
 '{http://xml.house.gov/schemas/uslm/1.0}notes',
 '{http://xml.house.gov/schemas/uslm/1.0}num',
 '{http://xml.house.gov/schemas/uslm/1.0}paragraph',
 '{http://xml.house.gov/schemas/uslm/1.0}sourceCredit',
 '{http://xml.house.gov/schemas/uslm/1.0}subparagraph',
 '{http://xml.house.gov/schemas/uslm/1.0}subsection'}

In [38]:
direct_subparagraphs = [
    child.get('identifier')
    for section in sections
    for child in section
    if child.tag.endswith('}subparagraph')
]

with output_box():
    display(direct_subparagraphs)

Output(layout=Layout(height='28em'))

In [39]:
len(direct_subparagraphs)

41

In [40]:
no_id = [section for section in sections if section.get('identifier') is None]

with output_box():
    display(no_id)

Output(layout=Layout(height='28em'))

In [41]:
usc.count_tokens_xml(no_id[0])

1005

In [42]:
usc.show_wrapped(no_id[0])

<section xmlns="http://xml.house.gov/schemas/uslm/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" style="-uslm-lc:I00" class="inline"><num
value=""/><chapeau>“In the case of any individual—</chapeau><subparagraph style="-uslm-lc:I22" class="indent1"><num
value="A">“(A)</num><content> who performed active service (i) as a commissioned officer of the Public Health Service at any time during the
period beginning <date date="1952-07-04">July 4, 1952</date>, and ending <date date="1956-12-31">December 31, 1956</date>, or (ii) as a
commissioned officer of the Coast and Geodetic Survey at any time during the period beginning <date date="1945-07-29">July 29, 1945</date>,
and ending <date date="1956-12-31">December 31, 1956</date>; and</content> </subparagraph> <subparagraph style="-uslm-lc:I22"
class="indent1"><num value="B">“(B)</num><clause style="-uslm-lc:I22" class="indent1"><num value="i">(i)<

In [43]:
usc.show_wrapped(no_id[-1])

<section xmlns="http://xml.house.gov/schemas/uslm/1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" style="-uslm-lc:I580467"><num value="5">“SEC.
5.</num><heading> AUTHORIZATION OF APPROPRIATIONS.</heading><chapeau style="-uslm-lc:I21" class="indent0">“There are authorized to be
appropriated—</chapeau><paragraph style="-uslm-lc:I22" class="indent1"><num value="1">“(1)</num><content> for grants, contracts, or
cooperative agreements under section 3(a), such sums as may be necessary for fiscal year 2017 and each of the 4 succeeding fiscal
years;</content> </paragraph> <paragraph style="-uslm-lc:I22" class="indent1"><num value="2">“(2)</num><content> for grants, contracts, or
cooperative agreements under section 3(b), such sums as may be necessary for fiscal year 2017 and each of the 4 succeeding fiscal years;
and</content> </paragraph> <paragraph style="-uslm-lc:I22" class="indent1"><num value="

In [44]:
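# This fails with:
#     SyntaxError: prefix 'ancestor' not found in prefix map
# because findall() only understands the ElementPath subset of XPath, which
# has no `ancestor::` axis and no `not()` function. Full XPath 1.0 needs
# .xpath() with a namespaces mapping that gives the default namespace a prefix.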

# direct_sections = usc42root.findall(f'{schema_prefix}title//{section_tag}[not(ancestor::{section_tag})]')

In [45]:
with output_box():
    treeit = usc.walk_tag(usc42root, 'section')
    for event, element in treeit: 
        print(element)

Output(layout=Layout(height='28em'))

In [46]:
sum(1 for _ in usc.walk_tag(usc42root, 'section'))  

8810

In [47]:
direct_sections = usc.get_direct_sections(usc42root)

In [48]:
with output_box():
    display(direct_sections)

Output(layout=Layout(height='28em'))

In [49]:
len(direct_sections)

8434

In [50]:
[section for section in direct_sections if section.get('identifier') is None]

[]

In [51]:
too_long_counts_direct = [
    count for section in direct_sections
    if (count := usc.count_tokens_xml_clean(section)) > embed.CONTEXT_LENGTH
]

In [52]:
len(too_long_counts_direct)

358

In [53]:
# with output_box():
#     display(sorted(too_long_counts_direct))

In [54]:
with ClockLogger() as timer:
    direct_section_token_counts = [
        usc.count_tokens_xml_clean(section) for section in direct_sections
    ]

Total time elapsed: 19.516774s


In [55]:
# We might try to embed it this way (one embedding per row element).
breakdowns = [usc.get_embeddable_elements(section) for section in direct_sections]

In [56]:
len(breakdowns)  # Same as number of direct sections.

8434

In [57]:
all(row for row in breakdowns)  # No rows should be empty.

True

In [58]:
# Same as number of too-long direct sections.
sum(1 for row in breakdowns if len(row) > 1)

358

In [59]:
# This shows lots of <p> elements. Unlike <paragraph>, the <p> element is from
# HTML and I'm not sure it really makes sense to consider a <p> element as a
# logical unit within the U.S. Code, conceptually.
Counter(elem.tag for row in breakdowns for elem in row)

Counter({'{http://xml.house.gov/schemas/uslm/1.0}p': 11713,
         '{http://xml.house.gov/schemas/uslm/1.0}section': 8139,
         '{http://xml.house.gov/schemas/uslm/1.0}note': 4393,
         '{http://xml.house.gov/schemas/uslm/1.0}subsection': 3408,
         '{http://xml.house.gov/schemas/uslm/1.0}paragraph': 776,
         '{http://xml.house.gov/schemas/uslm/1.0}heading': 445,
         '{http://xml.house.gov/schemas/uslm/1.0}num': 395,
         '{http://xml.house.gov/schemas/uslm/1.0}sourceCredit': 358,
         '{http://xml.house.gov/schemas/uslm/1.0}notes': 213,
         '{http://xml.house.gov/schemas/uslm/1.0}ref': 101,
         '{http://xml.house.gov/schemas/uslm/1.0}subparagraph': 64,
         '{http://xml.house.gov/schemas/uslm/1.0}date': 50,
         '{http://xml.house.gov/schemas/uslm/1.0}chapeau': 28,
         '{http://xml.house.gov/schemas/uslm/1.0}content': 10,
         '{http://xml.house.gov/schemas/uslm/1.0}continuation': 7,
         '{http://xml.house.gov/schemas/usl

In [60]:
with output_box():
    display({
        section.attrib['identifier']: len(row)
        for section, row in zip(direct_sections, breakdowns)
        if len(row) > 1
    })

Output(layout=Layout(height='28em'))

In [61]:
def abbreviate_runs(values):
    """
    Collapse consecutive equal items into ``(item, run_length)`` pairs.

    Items are grouped with ``itertools.groupby``, so only adjacent repeats are
    merged; a value that reappears later starts a new pair. Returns a list.
    """
    runs = []
    for item, run in itertools.groupby(values):
        runs.append((item, len(list(run))))
    return runs

In [62]:
def print_with_aligned_keys(mapping):
    """
    Display a mapping with string keys in a compact multiline form.

    Each entry is printed on its own line as ``key: value``, with the keys
    right-aligned to the width of the longest key. An empty mapping prints
    nothing.
    """
    # default=0 keeps max() from raising ValueError when the mapping is empty.
    width = max((len(key) for key in mapping), default=0)
    for key, value in mapping.items():
        print(f'{key:>{width}}: {value}')

In [63]:
breakdown_summaries = {
    section.attrib['identifier']:
        abbreviate_runs(ET.QName(elem).localname for elem in row)
    for section, row in zip(direct_sections, breakdowns)
    if len(row) > 1
}

In [64]:
with output_box(height='40em'):
    print_with_aligned_keys(breakdown_summaries)

Output(layout=Layout(height='40em'))

### Still to do

1. Solve the `<p>` element issue.
2. Solve the `<num>` and `<heading>` issue.
3. Look for text that is directly in an element whose children we traverse to. Don't lose this text!
4. Experiment with semantic search allowing repealed sections to be included or omitted from the search.

In [66]:
all_notes = [element for _, element in usc.walk_tag(usc42root, 'notes')]
len(all_notes)

7349

In [67]:
Counter(element.getparent().tag for element in all_notes)

Counter({'{http://xml.house.gov/schemas/uslm/1.0}section': 7135,
         '{http://xml.house.gov/schemas/uslm/1.0}part': 98,
         '{http://xml.house.gov/schemas/uslm/1.0}subchapter': 60,
         '{http://xml.house.gov/schemas/uslm/1.0}subpart': 38,
         '{http://xml.house.gov/schemas/uslm/1.0}chapter': 12,
         '{http://xml.house.gov/schemas/uslm/1.0}division': 6})

In [68]:
# Direct sections whose cleaned XML exceeds the embedding context length.
# The per-section counts were already computed (in the same order) into
# direct_section_token_counts, so reuse them instead of re-tokenizing.
big_direct_sections = [
    section
    for section, count in zip(direct_sections, direct_section_token_counts)
    if count > embed.CONTEXT_LENGTH
]

In [69]:
len(big_direct_sections)

358

In [70]:
# Qualified tag of <notes> elements, built the same way as section_tag.
# Defined here so the cell does not depend on leftover kernel state.
notes_tag = schema_prefix + 'notes'

# Work on deep copies so the sections in the parsed tree stay untouched.
big_direct_sections_no_notes = copy.deepcopy(big_direct_sections)
separated_notes = []

for section in big_direct_sections_no_notes:
    # Iterate over a snapshot of the children: removing a child while
    # iterating the live element could skip siblings.
    for element in list(section):
        if element.tag == notes_tag:
            # Save a copy (not original), so ET.tostring omits tag namespaces.
            separated_notes.append(copy.deepcopy(element))

            # Prune the notes element from this section.
            section.remove(element)

# NOTE(review): later cells zip these two lists together, which assumes every
# big section contributed exactly one <notes> child - TODO confirm.

In [71]:
sum(
    1 for section in big_direct_sections_no_notes
    if usc.count_tokens_xml_clean(section) > embed.CONTEXT_LENGTH
)

141

In [72]:
sum(
    1 for notes_element in separated_notes
    if usc.count_tokens_xml_clean(notes_element) > embed.CONTEXT_LENGTH
)

144

In [73]:
too_long_without_notes = Counter({
    section.attrib['identifier']: count
    for section in big_direct_sections_no_notes
    if (count := usc.count_tokens_xml_clean(section)) > embed.CONTEXT_LENGTH
})

with output_box():
    pp(too_long_without_notes)

Output(layout=Layout(height='28em'))

In [74]:
# This doesn't work, because the parents don't always have "identifier" attributes.

# too_long_separated_notes = Counter({
#    'notes in ' + notes_element.getparent().attrib['identifier']: count
#    for notes_element in separated_notes
#    if (count := usc.count_tokens_xml_clean(notes_element)) > embed.CONTEXT_LENGTH
# })
#
# pp(too_long_separated_notes)

In [75]:
# Instead, just examine the token counts for "notes" elements too big to embed.
too_long_separated_notes_counts = [
    count for notes_element in separated_notes
    if (count := usc.count_tokens_xml_clean(notes_element)) > embed.CONTEXT_LENGTH
]

with output_box():
    display(sorted(too_long_separated_notes_counts, reverse=True))

Output(layout=Layout(height='28em'))

In [76]:
# Compare to usc.get_embeddable_direct_sections.
# Direct sections that fit within the embedding context length. The counts in
# direct_section_token_counts are in direct_sections order, so reuse them
# instead of re-tokenizing every section.
small_direct_sections = [
    section
    for section, count in zip(direct_sections, direct_section_token_counts)
    if count <= embed.CONTEXT_LENGTH
]

In [77]:
len(small_direct_sections)

8076

In [78]:
all_small_count = sum(
    usc.count_tokens_xml_clean(section) for section in small_direct_sections
)
all_small_count

10424631

In [79]:
all_big_count = sum(
    usc.count_tokens_xml_clean(section) for section in big_direct_sections
)
all_big_count

7730094

In [80]:
all_count = all_small_count + all_big_count

In [81]:
all_small_count / all_count

0.574210350198089

In [82]:
all_big_count / all_count

0.4257896498019111

In [83]:
# Total tokens over the (section, notes) pairs in which BOTH halves fit within
# embed.CONTEXT_LENGTH once the notes are split off.
#
# The two chained `if` clauses act like `and`: the notes are only tokenized
# (and notes_count only rebound) when the section itself already fits, and the
# summed expression is only evaluated when both conditions passed.
#
# NOTE(review): zip() pairs the i-th section with the i-th separated notes
# element, which assumes one <notes> per big section - TODO confirm.
ok_with_separated_notes_count = sum(
    section_count + notes_count
    for section, notes in zip(big_direct_sections_no_notes, separated_notes)
    if (section_count := usc.count_tokens_xml_clean(section))
        <= embed.CONTEXT_LENGTH
    if (notes_count := usc.count_tokens_xml_clean(notes))
        <= embed.CONTEXT_LENGTH
)
ok_with_separated_notes_count

1512189

In [84]:
# Total tokens over the (section, notes) pairs in which AT LEAST ONE half is
# still too long to embed after the notes are split off.
#
# The condition deliberately combines the two comparisons with `|` rather than
# `or`. `|` does not short-circuit, so both walrus assignments run for every
# pair. With `or`, notes_count would not be rebound whenever the section alone
# is too long, and the sum would use a stale value.
big_with_separated_notes_count = sum(
    section_count + notes_count
    for section, notes in zip(big_direct_sections_no_notes, separated_notes)
    if ((section_count := usc.count_tokens_xml_clean(section))
        > embed.CONTEXT_LENGTH)
     | ((notes_count := usc.count_tokens_xml_clean(notes))
        > embed.CONTEXT_LENGTH)
)
big_with_separated_notes_count

6212826

In [85]:
all_big_split_count = ok_with_separated_notes_count + big_with_separated_notes_count

In [86]:
# A simple check for the above logic (should be True).
all_big_split_count == sum(
    usc.count_tokens_xml_clean(section) + usc.count_tokens_xml_clean(notes)
    for section, notes in zip(big_direct_sections_no_notes, separated_notes)
)

True

In [87]:
all_big_split_count

7725015

In [88]:
ok_with_separated_notes_count / all_big_split_count

0.19575224125778395

In [89]:
# This is a fairly small positive number, which makes intuitive sense, but we
# should probably look into it to account for these tokens.
all_big_count - all_big_split_count

5079

In [90]:
# Check that some separated notes seem to look okay.
usc.show_wrapped(separated_notes[0], limit=3000)

<notes xmlns="http://xml.house.gov/schemas/uslm/1.0" type="uscNote" id="id8bd97011-0605-11ee-a715-ee60fdbbcc39"> <note style="-uslm-lc:I74"
role="crossHeading" topic="editorialNotes" id="id8bd97012-0605-11ee-a715-ee60fdbbcc39"><heading class="centered"><b>Editorial
Notes</b></heading></note> <note style="-uslm-lc:I75" topic="referencesInText" id="id8bd97013-0605-11ee-a715-ee60fdbbcc39"> <heading
class="centered smallCaps">References in Text</heading><p style="-uslm-lc:I21" class="indent0"><ref href="/us/usc/t42/s246/g">Section 246(g)
of this title</ref>, referred to in subsec. (f), was repealed by <ref href="/us/pl/96/398/tI/s107/d">Pub. L. 96–398, title I,
§ 107(d)</ref>, <date date="1980-10-07">Oct. 7, 1980</date>, <ref href="/us/stat/94/1571">94 Stat. 1571</ref>.</p> <p style="-uslm-lc:I21"
class="indent0"><ref href="/us/usc/t42/s247c/c/1">Section 247c(c)(1) of this title</ref>, referred to in subsec. (f), was repealed by <ref
href="/us/pl/94/317/tII/s203/f/1">Pub. L. 94–317, title 

In [91]:
# Check that a section whose notes were removed seems to look okay.
usc.show_wrapped(big_direct_sections_no_notes[0], limit=3000)

<section xmlns="http://xml.house.gov/schemas/uslm/1.0" style="-uslm-lc:I80" id="id8bd96ffe-0605-11ee-a715-ee60fdbbcc39"
identifier="/us/usc/t42/s201"><num value="201">§ 201.</num><heading> Definitions</heading> <chapeau style="-uslm-lc:I11"
class="indent0">When used in this chapter—</chapeau><subsection style="-uslm-lc:I11" class="indent0"
id="id8bd96fff-0605-11ee-a715-ee60fdbbcc39" identifier="/us/usc/t42/s201/a"><num value="a">(a)</num><content> The term “Service” means the
Public Health Service;</content> </subsection> <subsection style="-uslm-lc:I11" class="indent0" id="id8bd97000-0605-11ee-a715-ee60fdbbcc39"
identifier="/us/usc/t42/s201/b"><num value="b">(b)</num><content> The term “Surgeon General” means the Surgeon General of the Public Health
Service;</content> </subsection> <subsection style="-uslm-lc:I11" class="indent0" id="id8bd97001-0605-11ee-a715-ee60fdbbcc39"
identifier="/us/usc/t42/s201/c"><num value="c">(c)</num><content> Unless the context otherwise requires, the term