Parse the USC Title 26 XML with lxml

In [7]:
from lxml import etree

# Prefix-to-URI mapping handed to find()/findall() via `namespaces=` so that
# element paths in this notebook can be written with the `uslm:` prefix.
NS = {
    'uslm': 'http://xml.house.gov/schemas/uslm/1.0',
    # Add other namespaces if needed; the single 'uslm' entry is the only
    # prefix the queries in this notebook use.
}

In [None]:
# quick & dirty, get all text

def extract_all_text_from_usc26(file_path):
    """Return all text in the XML file as one whitespace-normalized string.

    The document at ``file_path`` is parsed with ``etree.parse``. Every text
    fragment found under the root is concatenated with the markup discarded,
    and each run of whitespace is then reduced to a single space.
    """
    document_root = etree.parse(file_path).getroot()

    # Gather the raw text fragments (tags dropped) and glue them together as-is.
    fragments = list(document_root.itertext())
    raw_text = "".join(fragments)

    # Splitting on whitespace and re-joining squeezes newlines and indentation
    # down to single spaces and trims both ends of the result.
    words = raw_text.split()
    return " ".join(words)

# Run the whole-document extractor on usc26.xml; the path is relative, so it
# resolves against the notebook's current working directory.
text = extract_all_text_from_usc26("./usc26/usc26.xml")
print(text[:1000])  # Print first 1000 chars to check

Title 26 USCTitle 26 Online@119-12 no OLRC 2025-02-10T11:46:33 USCConverter 1.7.2 Title 26—INTERNAL REVENUE CODE Current through 119-12 Act Aug. 16, 1954, ch. 736, 68A Stat. 3The following tables have been prepared as aids in comparing provisions of the Internal Revenue Code of 1954 (redesignated the Internal Revenue Code of 1986 by Pub. L. 99–514, § 2, Oct. 22, 1986, 100 Stat. 2095) with provisions of the Internal Revenue Code of 1939. No inferences, implications, or presumptions of legislative construction or intent are to be drawn or made by reason of such tables. Citations to “R.A.” refer to the sections of earlier Revenue Acts. Table I 1939 Codesection number1986 Codesection number 1Omitted 27806(a) 3, 4Omitted 111 12(a), (b)(1), (2)Omitted 12(b)(3), (c)1 12(d)2 12(e)Omitted 12(f)1 12(g), 13(a)Omitted 13(b)11 13(c)–(f), 14Omitted 15(a), (b)11 15(c)1551 2163 22(a)61 22(b)(1)101 22(b)(2)(A)72 22(b)(2)(B)72, 403 22(b)(2)(C)72 22(b)(3)–(5)102–104 22(b)(6)107 22(b)(7)894 22(b)(8)115, 5

Finer granularity: section-wise chunks

In [8]:
def parse_sections(file_path, namespaces=None):
    """Split the XML document into one record per ``uslm:section`` element.

    Args:
        file_path: Path of the XML file to parse.
        namespaces: Optional prefix-to-URI mapping used to resolve the
            ``uslm:`` prefix in the element queries. When omitted, the
            module-level ``NS`` mapping is used.

    Returns:
        A list of dicts, one per ``uslm:section`` found anywhere in the tree,
        each with two keys:

        * ``"section_head"``: stripped text of the section's direct
          ``uslm:heading`` child, or ``""`` when the section has no heading
          or the heading carries no text.
        * ``"content"``: the non-empty text of every ``uslm:p`` descendant
          of the section, joined with newlines.
    """
    if namespaces is None:
        namespaces = NS

    with open(file_path, 'rb') as f:
        tree = etree.parse(f)

    parsed = []
    for section in tree.findall('.//uslm:section', namespaces=namespaces):
        heading = section.find('uslm:heading', namespaces=namespaces)
        # Read the heading through itertext() rather than .text: .text is None
        # for an empty heading or one that begins with an inline child (so
        # .strip() would raise AttributeError), and it also loses any text
        # that follows an inline child element.
        if heading is not None:
            heading_text = ''.join(heading.itertext()).strip()
        else:
            heading_text = ""

        # Text of each paragraph (including nested inline elements); fragments
        # are space-joined so adjacent pieces do not fuse together, and
        # paragraphs that are blank after stripping are skipped.
        content_texts = []
        for p in section.findall('.//uslm:p', namespaces=namespaces):
            text = ' '.join(p.itertext()).strip()
            if text:
                content_texts.append(text)

        parsed.append({
            "section_head": heading_text,
            "content": "\n".join(content_texts)
        })

    return parsed

In [None]:
# Parse usc26.xml into per-section records and inspect the first record.
sections = parse_sections("usc26/usc26.xml")
print(sections[0])


{'section_head': 'Tax imposed', 'content': 'There is hereby imposed on the taxable income of every head of a household (as defined in section 2(b)) a tax determined in accordance with the following table: \n \n \n \n \n \n \n If taxable income is: The tax is: \n \n \n Not over $29,600 15% of taxable income. \n Over $29,600 but not over $76,400 $4,440, plus 28% of the excess over $29,600. \n Over $76,400 but not over $127,500 $17,544, plus 31% of the excess over $76,400. \n Over $127,500 but not over $250,000 $33,385, plus 36% of the excess over $127,500. \n Over $250,000 $77,485, plus 39.6% of the excess over $250,000.\nThere is hereby imposed on the taxable income of every individual (other than a surviving spouse as defined in section 2(a) or the head of a household as defined in section 2(b)) who is not a married individual (as defined in section 7703) a tax determined in accordance with the following table: \n \n \n \n \n \n \n If taxable income is: The tax is: \n \n \n Not over $2

In [None]:
# Number of characters in the first section's joined paragraph text.
# NOTE(review): the saved output for this cell reads `str`, which len() cannot
# produce — the output looks stale; re-run the cell to refresh it.
len(sections[0]['content'])

str