In [244]:
import os
import re
import zipfile
from xml.dom.minidom import parse
from io import BytesIO
from tqdm import tqdm
import concurrent.futures
from unidecode import unidecode
import pandas as pd

In [18]:
# Parse every bill-version document stored in the LOB zip archives.
# Each successfully parsed minidom document is appended to ``xml_files``;
# archive members that fail to parse are reported by name and skipped.
lob_dir = '../etl_data/LOB/'
xml_files = []
# os.listdir returns entries in arbitrary order; sort so that repeated runs
# load the archives (and therefore fill ``xml_files``) in the same order.
zipped_files = sorted(
    name for name in os.listdir(os.path.dirname(lob_dir)) if name.endswith('.zip')
)

for zip_name in zipped_files:
    zip_file = os.path.join(lob_dir, zip_name)
    with zipfile.ZipFile(zip_file, 'r') as z:
        bill_version_files = [
            member for member in z.namelist()
            if member.startswith('bill_version/') and member.endswith('.lob')
        ]
        # A distinct loop variable keeps the archive name from being clobbered
        # by the member name.
        for member in bill_version_files:
            with z.open(member) as f:
                try:
                    tree = parse(BytesIO(f.read()))
                except Exception:
                    # Best effort: report the unreadable member and move on.
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    print(member)
                else:
                    xml_files.append(tree)
    print(f"Processed {zip_file}")



Processed ../etl_data/LOB/1999_lob_files.zip
Processed ../etl_data/LOB/2001_lob_files.zip
Processed ../etl_data/LOB/2005_lob_files.zip
Processed ../etl_data/LOB/2015_lob_files.zip
Processed ../etl_data/LOB/1989_lob_files.zip
Processed ../etl_data/LOB/2021_lob_files.zip
Processed ../etl_data/LOB/1997_lob_files.zip
Processed ../etl_data/LOB/2025_lob_files.zip
Processed ../etl_data/LOB/1993_lob_files.zip
Processed ../etl_data/LOB/2019_lob_files.zip
Processed ../etl_data/LOB/1991_lob_files.zip
Processed ../etl_data/LOB/2009_lob_files.zip
Processed ../etl_data/LOB/1995_lob_files.zip
Processed ../etl_data/LOB/2023_lob_files.zip
Processed ../etl_data/LOB/2007_lob_files.zip
Processed ../etl_data/LOB/2003_lob_files.zip
Processed ../etl_data/LOB/2013_lob_files.zip
Processed ../etl_data/LOB/2017_lob_files.zip


In [237]:
def getElementByCAML(dom, tagName):
    """Look up elements by their ``caml:``-prefixed tag name.

    Args:
        dom: Any object exposing ``getElementsByTagName`` (a minidom
            document or element in this notebook).
        tagName: Tag name without the ``caml:`` prefix.

    Returns:
        Whatever ``dom.getElementsByTagName('caml:<tagName>')`` returns.
    """
    qualified_name = 'caml:{}'.format(tagName)
    matches = dom.getElementsByTagName(qualified_name)
    return matches

def _single_node_text(node):
    """Return the character data of ``node`` itself or of its first child.

    Text and CDATA nodes carry their own ``data``. Any other node is expected
    to hold the text in its first child; if it does not, the attribute access
    raises and the caller's best-effort handler skips the field.
    """
    if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
        return node.data
    return node.firstChild.data


def xml_data(dom):
    """Extract bill-version fields from a parsed document.

    Extraction is best effort: each field group is read independently and a
    group that cannot be read (missing element, unexpected node layout) is
    simply left out of the result.

    Args:
        dom: Parsed document; its first child node is searched for
            ``caml:``-prefixed elements.

    Returns:
        A ``(bill_id, data)`` tuple. ``data`` is a dict that may contain the
        scalar tags listed below plus ``'Title'``, ``'History'`` (dict of
        action text -> date), ``'Authors'`` (list of dicts with
        ``contribution``/``house``/``name``), ``'GeneralSubject'``,
        ``'DigestText'`` and ``'DigestChanges'``. ``bill_id`` is the ``Id``
        value with underscores removed; without an ``Id`` it is built from
        the session/measure fields, or is ``None`` if any of those is missing.
    """
    data = {}
    xml = dom.childNodes[0]

    # Simple scalar fields: text content of the first matching element.
    scalar_tags = [
        'Id', 'VersionNum', 'SessionNum', 'SessionYear', 'MeasureType',
        'MeasureNum', 'MeasureState', 'VoteRequired', 'FiscalCommittee',
        'Appropriation', 'LocalProgram', 'Urgency', 'TaxLevy',
    ]
    for tag in scalar_tags:
        try:
            data[tag] = getElementByCAML(xml, tag)[0].firstChild.data
        except Exception:
            continue

    # Title: a single child may be the text itself or an element wrapping it;
    # with several children only the direct text nodes are concatenated.
    try:
        title = getElementByCAML(xml, 'Title')[0].childNodes
        if len(title) == 1:
            data['Title'] = _single_node_text(title[0])
        else:
            tilt = [node.data for node in title if node.nodeName == '#text']
            data['Title'] = ''.join(tilt)
    except Exception:
        pass

    # History: map each action's text (first child) to its date (second child).
    # Repeated action texts keep the last date seen.
    try:
        history = getElementByCAML(xml, 'History')[0].getElementsByTagName('caml:Action')
        actions = {}
        for action in history:
            act = action.childNodes[0].firstChild.data
            date = action.childNodes[1].firstChild.data
            actions[act] = date
        data['History'] = actions
    except Exception:
        pass

    # Authors: the first three children of every author node, in order.
    try:
        auth = []
        authors = getElementByCAML(xml, 'Authors')[0].childNodes
        for author in authors:
            contribution = author.childNodes[0].firstChild.data
            house = author.childNodes[1].firstChild.data
            name = author.childNodes[2].firstChild.data
            auth.append({'contribution': contribution, 'house': house, 'name': name})
        data['Authors'] = auth
    except Exception:
        pass

    # General subject: children of the first node under GeneralSubject,
    # handled like the title (a lone text node is now read directly).
    try:
        subject = getElementByCAML(xml, 'GeneralSubject')[0].childNodes[0].childNodes
        if len(subject) > 1:
            node_content = [node.data for node in subject if node.nodeName == '#text']
            data['GeneralSubject'] = ''.join(node_content)
        else:
            data['GeneralSubject'] = _single_node_text(subject[0])
    except Exception:
        pass

    # Digest: join the <p> paragraphs; when other nodes are present, also
    # collect inserted text and cleaned-up deleted text.
    try:
        digest = getElementByCAML(xml, 'DigestText')[0].childNodes
        if all(node.nodeName == 'p' for node in digest):
            digestText = " ".join(digest[i].firstChild.data for i in range(len(digest)))
        else:
            digestText = " ".join(digest[i].firstChild.data for i in range(len(digest)) if digest[i].nodeName == 'p')
            changes = {'added': [], 'removed': []}
            for i in range(len(digest)):
                if digest[i].nodeName == 'xm-insertion_mark_start':
                    # The inserted text is read from the node following the marker.
                    changes['added'].append(digest[i+1].firstChild.data)
                elif digest[i].nodeName == 'xm-deletion_mark':
                    # The marker's data holds escaped markup; strip it down to plain text.
                    datt = digest[i].data
                    d1 = re.sub(r'&lt;/p>&lt;p>', ' ', datt)
                    d2 = re.sub(r'(?:data=\"&lt;p>)*\(\d\)&lt;span class=&quot;EnSpace&quot;/>', ' ', d1)
                    d3 = re.sub(r'&lt;/*p>', ' ', d2)
                    d4 = re.sub(r'\"', r'', unidecode(d3))
                    d5 = re.sub(r'[\n\t\\]', ' ', d4)
                    d6 = re.sub(r'\s+', ' ', d5).strip()
                    changes['removed'].append(d6)
            data['DigestChanges'] = changes
        data['DigestText'] = digestText
    except Exception:
        pass

    # Identifier: prefer the explicit Id (underscores stripped), otherwise
    # compose one from the session/measure fields.
    if 'Id' in data:
        bill_id = re.sub(r'_+', '', data['Id'])
    else:
        try:
            session_year = data['SessionYear']
            session_num = data['SessionNum']
            measure_type = data['MeasureType']
            measure_num = data['MeasureNum']
            version_num = data['VersionNum']
            measure_state = data['MeasureState']
            bill_id = f"{session_year}{session_num}{measure_type}{measure_num}{version_num}{measure_state}"
        except KeyError:
            bill_id = None
    return bill_id, data

In [239]:
def process_xml(dom):
    """Worker callable for the executor: run ``xml_data`` on one parsed document.

    Returns exactly what ``xml_data(dom)`` returns.
    """
    extracted = xml_data(dom)
    return extracted

def chunked(iterable, chunk_size):
    """Lazily yield consecutive slices of ``iterable``.

    ``iterable`` must support ``len()`` and slicing. Every slice holds
    ``chunk_size`` items except possibly the last, which holds the remainder.
    """
    starts = range(0, len(iterable), chunk_size)
    yield from (iterable[start:start + chunk_size] for start in starts)

# Run ``process_xml`` over every parsed document and index the results by id.
# Documents for which no id could be derived are counted in the progress bar
# but not stored; a repeated id overwrites the earlier entry.
bill_versions = {}
max_workers = 6
chunk_size = 10000
# One executor serves all chunks instead of being rebuilt per chunk. Work is
# still submitted chunk by chunk, so at most ``chunk_size`` tasks are queued.
with tqdm(total=len(xml_files), desc="Processing XML files") as pbar, \
        concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    for chunk in chunked(xml_files, chunk_size):
        # ``bill_id`` rather than ``id`` so the builtin is not shadowed in later cells.
        for bill_id, data in executor.map(process_xml, chunk):
            if bill_id:
                bill_versions[bill_id] = data
            pbar.update(1)

Processing XML files: 100%|██████████| 267408/267408 [05:58<00:00, 746.64it/s] 


In [243]:
# Scalar bill attributes copied, in this order, into each "flat" row.
flat_keys = [
    'MeasureType',
    'Urgency',
    'MeasureNum',
    'GeneralSubject',
    'VersionNum',
    'Appropriation',
    'SessionYear',
    'SessionNum',
    'VoteRequired',
    'LocalProgram',
    'FiscalCommittee',
    'MeasureState',
    'TaxLevy',
    'Title',
]


def extract_entry(item):
    """Split one ``(id, data)`` pair into rows for the four output tables.

    Args:
        item: ``(id_val, d)`` where ``d`` is the per-bill dict.

    Returns:
        A dict with:
        ``'digest'``  - ``(id_val, DigestText or '')``;
        ``'authors'`` - one ``(id_val, contribution, house, name)`` tuple per
        dict entry of ``d['Authors']`` (non-dict entries are ignored);
        ``'history'`` - one ``(id_val, action, date)`` tuple per item of
        ``d['History']``;
        ``'flat'``    - ``[id_val]`` followed by ``d.get(k)`` for each of
        ``flat_keys``.
    """
    id_val, d = item

    author_tuples = []
    for a in d.get('Authors', []):
        if not isinstance(a, dict):
            continue
        author_tuples.append(
            (id_val, a.get('contribution'), a.get('house'), a.get('name'))
        )

    history_tuples = []
    for action, date in d.get('History', {}).items():
        history_tuples.append((id_val, action, date))

    flat_row = [id_val]
    flat_row.extend(d.get(k) for k in flat_keys)

    return {
        'digest': (id_val, d.get('DigestText', '')),
        'authors': author_tuples,
        'history': history_tuples,
        'flat': flat_row,
    }
# Row accumulators, one list per output table.
digest_rows, author_rows, history_rows, other_rows = [], [], [], []
max_workers = 4
item_list = list(bill_versions.items())

# Fan ``extract_entry`` out over the pool; results come back in input order.
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    for result in tqdm(iterable=executor.map(extract_entry, item_list), total=len(item_list)):
        # One digest row and one flat row per bill ...
        digest_rows.append(result['digest'])
        other_rows.append(result['flat'])
        # ... and zero or more author / history rows.
        author_rows += result['authors']
        history_rows += result['history']

100%|██████████| 267229/267229 [00:01<00:00, 137764.46it/s]


In [245]:
# Turn the accumulated row lists into one DataFrame per output table.
digest_df = pd.DataFrame(data=digest_rows, columns=['bill_id', 'DigestText'])
authors_df = pd.DataFrame(
    data=author_rows,
    columns=['bill_id', 'Contribution', 'House', 'Name'],
)
history_df = pd.DataFrame(data=history_rows, columns=['bill_id', 'Action', 'Date'])
# The flat table has the id column followed by every key in ``flat_keys``.
other_df = pd.DataFrame(data=other_rows, columns=['bill_id', *flat_keys])

In [246]:
# Write the digest table to CSV without the DataFrame index.
digest_df.to_csv(path_or_buf='../legislation_data/digest.csv', index=False)

In [247]:
# Write the authors table to CSV without the DataFrame index.
authors_df.to_csv(path_or_buf='../legislation_data/authors.csv', index=False)

In [248]:
# Write the history table to CSV without the DataFrame index.
history_df.to_csv(path_or_buf='../legislation_data/history.csv', index=False)

In [None]:
# Write the flat per-version attribute table to CSV without the DataFrame index.
other_df.to_csv(path_or_buf='../legislation_data/bill_versions.csv', index=False)

: 