In [1]:
import glob

# set the path to the folder containing the XML files
folder_path = "../xml_files/"

# use glob to create a list of all XML files in the folder
xml_files = glob.glob(f"{folder_path}/*.xml")

# print the list of XML files
print(xml_files)


['../xml_files\\BILLS-117hr1036rs.xml', '../xml_files\\BILLS-117hr1057eh.xml', '../xml_files\\BILLS-117hr1057enr.xml', '../xml_files\\BILLS-117hr1057rds.xml', '../xml_files\\BILLS-117hr1066eh.xml', '../xml_files\\BILLS-117hr1066rds.xml', '../xml_files\\BILLS-117hr1066rh.xml', '../xml_files\\BILLS-117hr1082eas.xml', '../xml_files\\BILLS-117hr1082eh.xml', '../xml_files\\BILLS-117hr1082enr.xml', '../xml_files\\BILLS-117hr1082rds.xml', '../xml_files\\BILLS-117hr1095eh.xml', '../xml_files\\BILLS-117hr1095enr.xml', '../xml_files\\BILLS-117hr1095rfs.xml', '../xml_files\\BILLS-117hr1095rs.xml', '../xml_files\\BILLS-117hr1146rh.xml', '../xml_files\\BILLS-117hr1154enr.xml', '../xml_files\\BILLS-117hr1155rds.xml', '../xml_files\\BILLS-117hr1170enr.xml', '../xml_files\\BILLS-117hr1192enr.xml', '../xml_files\\BILLS-117hr1193eas.xml', '../xml_files\\BILLS-117hr1193enr.xml', '../xml_files\\BILLS-117hr1193rfs.xml', '../xml_files\\BILLS-117hr1193rs.xml', '../xml_files\\BILLS-117hr1215rfs.xml', '../xml_

In [2]:
import xml.etree.ElementTree as ET

# Load the XML file
tree = ET.parse("..\\xml_files\\BILLS-117hr8pcs.xml")

# Get the root element
root = tree.getroot()

# Collect all unique root tags
roots = set()
for elem in root.iter():
    roots.add(elem.tag)

# Print the unique root tags
print("Possible roots in the XML file:")
for root_tag in roots:
    print(root_tag)


Possible roots in the XML file:
metadata
action-desc
subsection
bill
external-xref
text
legis-type
header
official-title
quoted-block
paragraph
subparagraph
form
attestation-group
dublinCore
attestor
calendar
section
session
{http://purl.org/dc/elements/1.1/}publisher
after-quoted-block
enum
subclause
{http://purl.org/dc/elements/1.1/}date
action-date
quote
current-chamber
{http://purl.org/dc/elements/1.1/}rights
congress
{http://purl.org/dc/elements/1.1/}title
{http://purl.org/dc/elements/1.1/}format
role
attestation
legis-num
action
enum-in-header
attestation-date
clause
legis-body
endorsement
{http://purl.org/dc/elements/1.1/}language
short-title
distribution-code


In [3]:
import os
import pandas as pd

# Define the directory containing the XML files
dir_path = "..\\xml_files"

# Create empty lists to hold the extracted bill numbers and raw text
bill_numbers = []
raw_text_list = []

# Loop through all the XML files in the directory
for file_name in os.listdir(dir_path):
    if file_name.endswith(".xml"):
        # Construct the full file path
        file_path = os.path.join(dir_path, file_name)

        # Extract the relevant portion of the file name
        bill_number = file_name.split("-")[1].split(".")[0]

        # Add the bill number to the list
        bill_numbers.append(bill_number)

        # Extract the raw text
        tree = ET.parse(file_path)
        root = tree.getroot()
        raw_text = ""
        for text_elem in root.findall('.//text'):
            if text_elem.text is not None:
                raw_text += text_elem.text
        
        # Add the raw text to the list
        raw_text_list.append(raw_text)

# Create a DataFrame with the bill numbers and raw text as columns
bill_text_df = pd.DataFrame({"Bill Number": bill_numbers, "Raw Text": raw_text_list})




In [5]:
bill_text_df.to_csv('../Data/bill_text_df.csv')