Parse the lxml for the USC 26

# Plan

Start small again. Subsection is enough granularity

Basically we want dictionaries of:

{"Subtitle", "Chapter", "Part", "Section", "Subsection", "Content"}

The nwe can feed this into the document creator if enough of the content sections are non-empty

In [None]:
from lxml import etree

NS = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0',
      'xhtml': 'http://www.w3.org/1999/xhtml'}

In [2]:
# quick & dirty, get all text

def extract_all_text_from_usc26(file_path):
    tree = etree.parse(file_path)
    root = tree.getroot()

    # This will collect all text, ignoring tags but preserving whitespace inside elements
    all_text = "".join(root.itertext())

    # Optional: clean excessive whitespace
    cleaned_text = " ".join(all_text.split())
    return cleaned_text

text = extract_all_text_from_usc26("./usc26.xml")
print(text[:1000])  # Print first 1000 chars to check

Title 26 USCTitle 26 Online@119-12 no OLRC 2025-02-10T11:46:33 USCConverter 1.7.2 Title 26—INTERNAL REVENUE CODE Current through 119-12 Act Aug. 16, 1954, ch. 736, 68A Stat. 3The following tables have been prepared as aids in comparing provisions of the Internal Revenue Code of 1954 (redesignated the Internal Revenue Code of 1986 by Pub. L. 99–514, § 2, Oct. 22, 1986, 100 Stat. 2095) with provisions of the Internal Revenue Code of 1939. No inferences, implications, or presumptions of legislative construction or intent are to be drawn or made by reason of such tables. Citations to “R.A.” refer to the sections of earlier Revenue Acts. Table I 1939 Codesection number1986 Codesection number 1Omitted 27806(a) 3, 4Omitted 111 12(a), (b)(1), (2)Omitted 12(b)(3), (c)1 12(d)2 12(e)Omitted 12(f)1 12(g), 13(a)Omitted 13(b)11 13(c)–(f), 14Omitted 15(a), (b)11 15(c)1551 2163 22(a)61 22(b)(1)101 22(b)(2)(A)72 22(b)(2)(B)72, 403 22(b)(2)(C)72 22(b)(3)–(5)102–104 22(b)(6)107 22(b)(7)894 22(b)(8)115, 5

Finer Granularity -- Section wise chunks

In [3]:
def parse_sections(file_path):
    with open(file_path, 'rb') as f:
        tree = etree.parse(f)
    
    sections = tree.findall('.//uslm:section', namespaces=NS)
    parsed = []

    for section in sections:
        heading = section.find('uslm:heading', namespaces=NS)
        heading_text = heading.text.strip() if heading is not None else ""

        # Get all paragraphs (and any nested elements)
        content_texts = []
        for p in section.findall('.//uslm:p', namespaces=NS):
            text = ' '.join(p.itertext()).strip()
            if text:
                content_texts.append(text)

        parsed.append({
            "section_head": heading_text,
            "content": "\n".join(content_texts)
        })

    return parsed

In [25]:
sections = parse_sections("usc26.xml")
print(sections[0])

{'section_head': 'Tax imposed', 'content': 'There is hereby imposed on the taxable income of every head of a household (as defined in section 2(b)) a tax determined in accordance with the following table: \n \n \n \n \n \n \n If taxable income is: The tax is: \n \n \n Not over $29,600 15% of taxable income. \n Over $29,600 but not over $76,400 $4,440, plus 28% of the excess over $29,600. \n Over $76,400 but not over $127,500 $17,544, plus 31% of the excess over $76,400. \n Over $127,500 but not over $250,000 $33,385, plus 36% of the excess over $127,500. \n Over $250,000 $77,485, plus 39.6% of the excess over $250,000.\nThere is hereby imposed on the taxable income of every individual (other than a surviving spouse as defined in section 2(a) or the head of a household as defined in section 2(b)) who is not a married individual (as defined in section 7703) a tax determined in accordance with the following table: \n \n \n \n \n \n \n If taxable income is: The tax is: \n \n \n Not over $2

In [26]:
len(sections)  # 2257 sections

2257

In [23]:
list_of_content_lengths = list(map(lambda d: len(d['content']), sections))

sum(list_of_content_lengths)/len(list_of_content_lengths)  # Average num chars per section is ~7554

sum(list(map(lambda e: e == 0, list_of_content_lengths)))/len(list_of_content_lengths)  # ~5% are 0 length

0.06025697828976517

Section with more Metadata Parse

In [None]:
def get_ancestor_heading_text(section, tag, ns):
    ancestor = section.getparent()
    while ancestor is not None:
        if ancestor.tag == f"{{{ns['uslm']}}}{tag}":
            heading = ancestor.find('uslm:heading', namespaces=ns)
            return heading.text.strip() if heading is not None else ""
        ancestor = ancestor.getparent()
    return ""

def parse_sections_with_metadata(file_path):
    with open(file_path, 'rb') as f:
        tree = etree.parse(f)
    
    sections = tree.findall('.//uslm:section', namespaces=NS)
    parsed = []

    for section in sections:
        heading = section.find('uslm:heading', namespaces=NS)
        heading_text = heading.text.strip() if heading is not None else ""

        # Get all paragraphs (and any nested elements)
        content_texts = []
        for p in section.findall('.//uslm:p', namespaces=NS):
            text = ' '.join(p.itertext()).strip()
            if text:
                content_texts.append(text)

        # Get ancestors: subtitle, chapter, part
        subtitle = get_ancestor_heading_text(section, 'subtitle', NS)
        chapter = get_ancestor_heading_text(section, 'chapter', NS)
        part = get_ancestor_heading_text(section, 'part', NS)

        parsed.append({
            "section_head": heading_text,
            "subtitle": subtitle,
            "chapter": chapter,
            "part": part,
            "content": "\n".join(content_texts)
        })

    return parsed

In [35]:
sections = parse_sections_with_metadata("usc26.xml")
print(sections[0])

{'section_head': 'Tax imposed', 'content': 'There is hereby imposed on the taxable income of every head of a household (as defined in section 2(b)) a tax determined in accordance with the following table: \n \n \n \n \n \n \n If taxable income is: The tax is: \n \n \n Not over $29,600 15% of taxable income. \n Over $29,600 but not over $76,400 $4,440, plus 28% of the excess over $29,600. \n Over $76,400 but not over $127,500 $17,544, plus 31% of the excess over $76,400. \n Over $127,500 but not over $250,000 $33,385, plus 36% of the excess over $127,500. \n Over $250,000 $77,485, plus 39.6% of the excess over $250,000.\nThere is hereby imposed on the taxable income of every individual (other than a surviving spouse as defined in section 2(a) or the head of a household as defined in section 2(b)) who is not a married individual (as defined in section 7703) a tax determined in accordance with the following table: \n \n \n \n \n \n \n If taxable income is: The tax is: \n \n \n Not over $2

Section-Subsection Parse

In [16]:

def parse_sections_with_subsections(file_path):
    with open(file_path, 'rb') as f:
        tree = etree.parse(f)
    
    sections = tree.findall('.//uslm:section', namespaces=NS)
    parsed = []

    for section in sections:
        section_heading = section.find('uslm:heading', namespaces=NS)
        section_text = section_heading.text.strip() if section_heading is not None else ""

        # First check if there are subsections
        subsections = section.findall('uslm:subsection', namespaces=NS)

        if subsections:
            for subsection in subsections:
                subsection_label = subsection.get('identifier') or ""
                content_texts = []
                for p in subsection.findall('.//uslm:p', namespaces=NS):
                    text = ' '.join(p.itertext()).strip()
                    if text:
                        content_texts.append(text)

                parsed.append({
                    "section": section_text,
                    "subsection": subsection_label,
                    "content": "\n".join(content_texts)
                })
        else:
            # No subsections, treat entire section as one block
            content_texts = []
            for p in section.findall('.//uslm:p', namespaces=NS):
                text = ' '.join(p.itertext()).strip()
                if text:
                    content_texts.append(text)

            parsed.append({
                "section": section_text,
                "subsection": "",
                "content": "\n".join(content_texts)
            })

    return parsed

In [17]:
sections_subsections = parse_sections_with_subsections("usc26.xml")
sections_subsections[:5]

[{'section': 'Tax imposed', 'subsection': '/us/usc/t26/s1/a', 'content': ''},
 {'section': 'Tax imposed',
  'subsection': '/us/usc/t26/s1/b',
  'content': 'There is hereby imposed on the taxable income of every head of a household (as defined in section 2(b)) a tax determined in accordance with the following table: \n \n \n \n \n \n \n If taxable income is: The tax is: \n \n \n Not over $29,600 15% of taxable income. \n Over $29,600 but not over $76,400 $4,440, plus 28% of the excess over $29,600. \n Over $76,400 but not over $127,500 $17,544, plus 31% of the excess over $76,400. \n Over $127,500 but not over $250,000 $33,385, plus 36% of the excess over $127,500. \n Over $250,000 $77,485, plus 39.6% of the excess over $250,000.'},
 {'section': 'Tax imposed',
  'subsection': '/us/usc/t26/s1/c',
  'content': 'There is hereby imposed on the taxable income of every individual (other than a surviving spouse as defined in section 2(a) or the head of a household as defined in section 2(b)) w

In [6]:
len(sections_subsections)

8292

In [22]:
list_of_content_lengths = list(map(lambda d: len(d['content']), sections_subsections))

sum(list_of_content_lengths)/len(list_of_content_lengths)  # Average num chars per section is ~585

sum(list(map(lambda e: e == 0, list_of_content_lengths)))/len(list_of_content_lengths)  # ~25% are 0 length

0.25120598166907865

Section-Subsection-Hierarchy Parse

In [None]:
def get_parent_heading(element, tag):
    parent = element.getparent()
    while parent is not None:
        if parent.tag.endswith(tag):
            num = parent.find('uslm:num', namespaces=NS)
            heading = parent.find('uslm:heading', namespaces=NS)
            num_text = num.text.strip() if num is not None else ""
            heading_text = heading.text.strip() if heading is not None else ""
            return f"{num_text} {heading_text}".strip()
        parent = parent.getparent()
    return ""



def parse_sections_with_subsections_and_hierarchy(file_path):
    with open(file_path, 'rb') as f:
        tree = etree.parse(f)

    sections = tree.findall('.//uslm:section', namespaces=NS)
    parsed = []

    for section in sections:
        # Section heading
        section_heading = section.find('uslm:heading', namespaces=NS)
        section_title = section_heading.text.strip() if section_heading is not None else ""

        # Hierarchy: title and chapter
        subtitle = get_parent_heading(section, 'subtitle')
        part = get_parent_heading(section, "part")
        chapter = get_parent_heading(section, 'chapter')

        # Subsections
        subsections = section.findall('uslm:subsection', namespaces=NS)

        if subsections:
            for subsection in subsections:
                subsection_label = subsection.get('identifier') or ""
                subsection_heading = subsection.find('uslm:heading', namespaces=NS)
                subsection_title = subsection_heading.text.strip() if subsection_heading is not None and subsection_heading.text else ""

                content_texts = []
                for p in subsection.findall('.//uslm:p', namespaces=NS):
                    text = ' '.join(p.itertext()).strip()
                    if text:
                        content_texts.append(text)

                parsed.append({
                    "subtitle": subtitle,
                    "chapter": chapter,
                    "part": part,
                    "section": section_title,
                    "subsection_label": subsection_label,
                    "subsection_title": subsection_title,
                    "content": "\n".join(content_texts)
                })
        else:
            # No subsections — grab whole section content
            content_texts = []
            for p in section.findall('.//uslm:p', namespaces=NS):
                text = ' '.join(p.itertext()).strip()
                if text:
                    content_texts.append(text)

            parsed.append({
                "subtitle": subtitle,
                "chapter": chapter,
                "part": part,
                "section": section_title,
                "subsection_label": "",
                "subsection_title": "",
                "content": "\n".join(content_texts)
            })

    return parsed

In [21]:
sections_subsections_hierarchy = parse_sections_with_subsections_and_hierarchy("usc26.xml")
sections_subsections_hierarchy[:5]

[{'subtitle': 'Subtitle A— Income Taxes',
  'chapter': 'Subchapter A— Determination of Tax Liability',
  'part': 'PART I— TAX ON INDIVIDUALS',
  'section': 'Tax imposed',
  'subsection_label': '/us/usc/t26/s1/a',
  'subsection_title': 'Married individuals filing joint returns and surviving spouses',
  'content': ''},
 {'subtitle': 'Subtitle A— Income Taxes',
  'chapter': 'Subchapter A— Determination of Tax Liability',
  'part': 'PART I— TAX ON INDIVIDUALS',
  'section': 'Tax imposed',
  'subsection_label': '/us/usc/t26/s1/b',
  'subsection_title': 'Heads of households',
  'content': 'There is hereby imposed on the taxable income of every head of a household (as defined in section 2(b)) a tax determined in accordance with the following table: \n \n \n \n \n \n \n If taxable income is: The tax is: \n \n \n Not over $29,600 15% of taxable income. \n Over $29,600 but not over $76,400 $4,440, plus 28% of the excess over $29,600. \n Over $76,400 but not over $127,500 $17,544, plus 31% of the

In [22]:
len(sections_subsections_hierarchy)

8292

V2 Parser

In [44]:


def parse_paragraph(p):
    # Extract text content for a paragraph, could be enhanced
    return ''.join(p.itertext()).strip()

def parse_table(table):
    # Extract table rows and cells as nested lists
    rows = []
    for tr in table.findall('.//xhtml:tr', namespaces=NS):
        row = []
        for td in tr.findall('./xhtml:td|./xhtml:th', namespaces=NS):
            cell_text = ''.join(td.itertext()).strip()
            row.append(cell_text)
        rows.append(row)
    return rows

def parse_subsection(subsection):
    data = {
        'id': subsection.get('id'),
        'identifier': subsection.get('identifier'),
        'num': subsection.findtext('uslm:num', namespaces=NS),
        'heading': subsection.findtext('uslm:heading', namespaces=NS),
        'chapeau': None,
        'paragraphs': [],
        'tables': []
    }
    chapeau = subsection.find('uslm:chapeau', namespaces=NS)
    if chapeau is not None:
        data['chapeau'] = ''.join(chapeau.itertext()).strip()
    for p in subsection.findall('uslm:paragraph', namespaces=NS):
        data['paragraphs'].append(parse_paragraph(p))
        # You could enhance to parse numbered paragraphs etc.
    for cont in subsection.findall('uslm:continuation', namespaces=NS):
        table = cont.find('xhtml:table', namespaces=NS)
        if table is not None:
            data['tables'].append(parse_table(table))
    return data

def parse_section(section):
    data = {
        'id': section.get('id'),
        'identifier': section.get('identifier'),
        'num': section.findtext('uslm:num', namespaces=NS),
        'heading': section.findtext('uslm:heading', namespaces=NS),
        'subsections': []
    }
    for subsection in section.findall('uslm:subsection', namespaces=NS):
        data['subsections'].append(parse_subsection(subsection))
    return data

def parse_part(part):
    data = {
        'id': part.get('id'),
        'identifier': part.get('identifier'),
        'num': part.findtext('uslm:num', namespaces=NS),
        'heading': part.findtext('uslm:heading', namespaces=NS),
        'sections': []
    }
    for section in part.findall('uslm:section', namespaces=NS):
        data['sections'].append(parse_section(section))
    return data

def parse_usc26(xml_root):
    usc_data = {
        'parts': []
    }
    for part in xml_root.findall('.//uslm:part', namespaces=NS):
        usc_data['parts'].append(parse_part(part))
    return usc_data

# Usage example:
# xml = etree.parse('your_usc26_file.xml')
# data = parse_usc26(xml.getroot())
# print(data)


In [47]:
xml = etree.parse('usc26.xml')
data = parse_usc26(xml.getroot())


In [52]:
len(data)
print(data['parts'][:1])

[{'id': 'ida4c57d93-e7ce-11ef-b868-e6270ec5930c', 'identifier': '/us/usc/t26/stA/ch1/schA/ptI', 'num': 'PART I—', 'heading': 'TAX ON INDIVIDUALS', 'sections': [{'id': 'ida4c57d98-e7ce-11ef-b868-e6270ec5930c', 'identifier': '/us/usc/t26/s1', 'num': '§\u202f1.', 'heading': ' Tax imposed', 'subsections': [{'id': 'ida4c57d99-e7ce-11ef-b868-e6270ec5930c', 'identifier': '/us/usc/t26/s1/a', 'num': '(a)', 'heading': ' Married individuals filing joint returns and surviving spouses', 'chapeau': 'There is hereby imposed on the taxable income of—', 'paragraphs': ['(1) every married individual (as defined in section 7703) who makes a single return jointly with his spouse under section 6013, and', '(2) every surviving spouse (as defined in section 2(a)),'], 'tables': [[[], [], [], [], [], []]]}, {'id': 'ida4c57d9d-e7ce-11ef-b868-e6270ec5930c', 'identifier': '/us/usc/t26/s1/b', 'num': '(b)', 'heading': ' Heads of households', 'chapeau': None, 'paragraphs': [], 'tables': []}, {'id': 'ida4c57d9f-e7ce-1

Langchain Unstructured XML Parse

In [53]:
from langchain.document_loaders import UnstructuredXMLLoader

loader = UnstructuredXMLLoader("usc26.xml")
documents = loader.load()

KeyboardInterrupt: 