In [1]:
import os
import re
import zipfile
import itertools
from lxml import etree
from io import BytesIO
from tqdm import tqdm

In [2]:
# Load every bill-version document found in the zipped LOB archives.
# Only archive members under 'bill_version/' with a '.lob' extension are parsed;
# each successfully parsed tree is kept in memory in `xml_files`.
LOB_DIR = '../etl_data/LOB/'

xml_files = []
zipped_files = [name for name in os.listdir(os.path.dirname(LOB_DIR)) if name.endswith('.zip')]

for archive_name in zipped_files:
    zip_file = os.path.join(LOB_DIR, archive_name)
    with zipfile.ZipFile(zip_file, 'r') as z:
        bill_version_files = [
            member for member in z.namelist()
            if member.startswith('bill_version/') and member.endswith('.lob')
        ]
        for member_name in bill_version_files:
            with z.open(member_name) as f:
                try:
                    tree = etree.parse(BytesIO(f.read()))
                except Exception:
                    # Best effort: report the unreadable member and keep going.
                    # `Exception` (not a bare except) so Ctrl-C / SystemExit
                    # still stop the load.
                    print(member_name)
                else:
                    xml_files.append(tree)
    print(f"Processed {zip_file}")



Processed ../etl_data/LOB/1999_lob_files.zip
Processed ../etl_data/LOB/2001_lob_files.zip
Processed ../etl_data/LOB/2005_lob_files.zip
Processed ../etl_data/LOB/2015_lob_files.zip
Processed ../etl_data/LOB/1989_lob_files.zip
Processed ../etl_data/LOB/2021_lob_files.zip
Processed ../etl_data/LOB/1997_lob_files.zip
Processed ../etl_data/LOB/2025_lob_files.zip
Processed ../etl_data/LOB/1993_lob_files.zip
Processed ../etl_data/LOB/2019_lob_files.zip
Processed ../etl_data/LOB/1991_lob_files.zip
Processed ../etl_data/LOB/2009_lob_files.zip
Processed ../etl_data/LOB/1995_lob_files.zip
Processed ../etl_data/LOB/2023_lob_files.zip
Processed ../etl_data/LOB/2007_lob_files.zip
Processed ../etl_data/LOB/2003_lob_files.zip
Processed ../etl_data/LOB/2013_lob_files.zip
Processed ../etl_data/LOB/2017_lob_files.zip


In [3]:
def process_child_xml(child):
    """Extract the cleaned text that follows the first ``<?xm...?>`` instruction in ``child``.

    The element is serialized to an XML string and searched for the last word
    of ``child.text`` followed (after optional whitespace) by ``<?xm``. When
    that is found, everything after the end of that first processing
    instruction is taken, insertion marks, deletion marks and ``</p>`` closers
    are stripped, and runs of whitespace are collapsed to single spaces.
    Note that the returned string starts *after* the processing instruction,
    so it does not include ``child.text`` itself, and any other markup in the
    serialized remainder is left in place.

    Args:
        child: element with a non-None ``.text`` that ``etree.tostring`` can
            serialize.

    Returns:
        The cleaned trailing text, or ``child.text`` unchanged when the
        last-word / ``<?xm`` sequence is not present in the serialization.

    Raises:
        AttributeError: if ``child.text`` is None, or if no ``?>`` follows the
            matched ``<?xm``.
    """
    start_text = child.text
    last_word = start_text.strip().split(' ')[-1]
    xml_string = etree.tostring(child, encoding='unicode', method='xml')
    # The last word is literal text, not a pattern: escape it so words such as
    # "(1)" or "$500" match themselves and an unbalanced "(" cannot raise.
    xml_start = re.search(rf'{re.escape(last_word)}\s*<\?xm', xml_string)
    if xml_start is None:
        return child.text

    text_ = xml_string[xml_start.end():]
    # Skip the remainder of the processing instruction that was just matched.
    text_begin = re.search(r'\?>', text_).end()
    # Non-greedy `.*?` keeps each deletion mark self-contained; a greedy match
    # would also delete the real text between two marks on the same line.
    batch = re.sub(
        r'(?:<\?xm-insertion_mark_((?:start)|(?:end))\?>)|(?:<\?xm-deletion_mark data=\".*?\"\?>)|(?:</p>)',
        '',
        text_[text_begin:],
    )
    return re.sub(r'\s+', ' ', batch)

def xml_to_dict(xml_, digestText='Default', billContent='Default', title='Default'):
    """Flatten one parsed bill-version document into an ``(id, entry)`` pair.

    The children of the root's second and third child elements are scanned and
    dispatched on their namespace-stripped tag name. Each of the three mode
    arguments selects between a strict extraction (``'Default'``) and a more
    tolerant one (any other value; the notebook passes ``'alt'``) for one part
    of the document. The strict paths are allowed to raise on unexpected
    structure: the driver loop in this notebook relies on those exceptions to
    retry with another mode combination, so they are intentionally not caught.

    Args:
        xml_: parsed tree exposing ``getroot()``; elements must provide
            ``tag``, ``text`` and ``getchildren()``.
        digestText: ``'Default'`` concatenates the ``text`` of every
            DigestText child; otherwise children are filtered and those with
            two or more sub-elements go through ``process_child_xml``.
        billContent: ``'Default'`` joins the texts of the children of each
            BillSection's second child, with filtering; otherwise the
            ``Content`` children of each BillSection are walked individually.
        title: ``'Default'`` stores the Title element's ``text``; otherwise
            the title is recovered from the serialized element with editing
            marks removed.

    Returns:
        tuple: ``(id, entry)`` where ``id`` is built from SessionYear,
        SessionNum, MeasureType, MeasureNum, VersionNum and MeasureState, and
        ``entry`` is a dict of the extracted fields plus ``'BillContent'``.

    Raises:
        KeyError: if any field needed for the id was not found.
        TypeError, AttributeError, IndexError: when the document does not have
            the structure the selected mode expects (e.g. an element whose
            ``text`` is None in a strict path).
    """
    entry = {}
    root_children = xml_.getroot().getchildren()
    elements = root_children[1].getchildren() + root_children[2].getchildren()

    bill_content = []
    for e in elements:
        # Strip the '{namespace}' prefix from the qualified tag.
        tag_name = e.tag.split("}")[-1]
        if tag_name == 'VersionNum':
            entry[tag_name] = e.text
        elif tag_name == 'Title':
            if title == 'Default':
                entry[tag_name] = e.text
            else:
                serialized = etree.tostring(e, encoding='unicode', method='xml')
                # Keep what lies between `instance">` and the closing Title tag.
                # If that pattern is absent, no Title is recorded at all.
                title_match = re.search(r'(?<=instance\">)(.*)(?=</caml:Title>)', serialized)
                if title_match is not None:
                    # Non-greedy `.*?` so text between two deletion marks on
                    # the same line is not removed along with the marks.
                    without_marks = re.sub(
                        r'(?:<\?xm-insertion_mark_((?:start)|(?:end))\?>)|(?:<\?xm-deletion_mark data=\".*?\"\?>)|(?:</p>)',
                        '',
                        title_match.group(1),
                    )
                    entry[tag_name] = re.sub(r'\s+', ' ', without_marks)
        elif tag_name == 'History':
            # Each child contributes one mapping: first grandchild's text ->
            # second grandchild's text.
            actions = {}
            for action_fields in [h.getchildren() for h in e.getchildren()]:
                actions[action_fields[0].text] = action_fields[1].text
            entry[tag_name] = actions
        elif tag_name == 'LegislativeInfo':
            # Children are merged straight into the entry (SessionYear,
            # MeasureType, ... used for the id below are expected here or in
            # VersionNum).
            info = {}
            for child in e.getchildren():
                info[child.tag.split("}")[-1]] = child.text
            entry.update(info)
        elif tag_name == 'Authors':
            # Positional fields of each author element: type, name, house.
            author_rows = [author.getchildren() for author in e.getchildren()]
            # Group by author type; keys appear in first-seen order.
            authors = {row[0].text: {} for row in author_rows}
            for row in author_rows:
                author_type = row[0].text
                author_name = row[1].text
                author_house = row[2].text
                authors[author_type].update({author_name: author_house})
            entry[tag_name] = authors
        elif tag_name == 'GeneralSubject':
            entry[tag_name] = e.getchildren()[0].text
        elif tag_name == 'DigestText':
            if digestText == 'Default':
                # Raises TypeError if any child has no text; callers use that
                # to fall back to the tolerant mode.
                entry[tag_name] = "".join([h.text for h in e.getchildren()])
            else:
                pieces = []
                for child in e.getchildren():
                    if len(child.getchildren()) < 2:
                        # Skip paragraphs that are nothing but a "(n)" marker.
                        if re.sub(r'\(\d\)', '', child.text) != '':
                            pieces.append(child.text)
                    else:
                        if (child.text is not None) and (re.sub(r'\(\d\)', '', child.text) != ''):
                            pieces.append(process_child_xml(child))
                entry[tag_name] = " ".join(pieces)
        elif tag_name == 'BillSection':
            if billContent == 'Default':
                paragraphs = e.getchildren()[1].getchildren()
                # Drop findings boilerplate and any paragraph containing a
                # single-character parenthesised marker such as "(a)".
                kept = [
                    c.text for c in paragraphs
                    if ('The Legislature hereby finds' not in c.text)
                    and (re.search(r'\(\w\)', c.text) is None)
                ]
                bill_content.append(" ".join(kept))
            else:
                for child in e.getchildren():
                    if child.tag.split("}")[-1] == 'Content':
                        for c in child.getchildren():
                            if len(c.getchildren()) == 0:
                                bill_content.append(c.text)
                            else:
                                text_ = etree.tostring(c.getchildren()[0], encoding='unicode', method='xml')
                                end = re.search(r'class=\".*\"/>', text_).end()
                                # NOTE(review): lowercase "legislature" here vs
                                # "Legislature" in the Default branch -- this
                                # filter may never trigger; confirm intent.
                                if 'The legislature hereby finds' not in text_:
                                    bill_content.append(text_[end:])

    session_year = entry['SessionYear']
    session_num = entry['SessionNum']
    measure_type = entry['MeasureType']
    measure_num = entry['MeasureNum']
    version_num = entry['VersionNum']
    measure_state = entry['MeasureState']
    bill_id = f"{session_year}{session_num}{measure_type}__{measure_num}{version_num}{measure_state}"

    entry['BillContent'] = " ".join(bill_content)
    return bill_id, entry

In [4]:
# Mode combinations passed to xml_to_dict, tried in this order for every
# document: the first one that does not raise wins.
PARSE_ATTEMPTS = (
    {'digestText': 'Default', 'billContent': 'Default', 'title': 'Default'},
    {'digestText': 'alt', 'billContent': 'alt', 'title': 'Default'},
    {'digestText': 'alt', 'billContent': 'alt', 'title': 'alt'},
    {'digestText': 'Default', 'billContent': 'alt', 'title': 'alt'},
)

bill_versions = {}
unparsed_count = 0  # documents that failed under every mode combination
for tree in tqdm(xml_files):
    for options in PARSE_ATTEMPTS:
        try:
            version_id, entry = xml_to_dict(tree, **options)
        except Exception:
            # This mode does not fit the document; try the next one.
            # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
            continue
        bill_versions[version_id] = entry
        break
    else:
        # No mode worked: the document is skipped, but counted rather than
        # silently dropped.
        unparsed_count += 1

print(f"Stored {len(bill_versions)} bill versions; {unparsed_count} documents could not be parsed in any mode")


100%|██████████| 267408/267408 [01:42<00:00, 2613.00it/s]


In [5]:
import json

In [6]:
# Persist the parsed bill versions as indented JSON for downstream use.
output_path = "../legislation_data/bill_version_text.json"
with open(output_path, mode="w") as json_out:
    json.dump(bill_versions, json_out, indent=4)