In [1]:
import psycopg2
import pandas as pd
from tqdm import tqdm

In [2]:
DB_CONFIG = {
    'database': 'legislation_db',
    'host': "localhost",
    'user': "legislation",
    'password': "password",
    'port': 5432,
}
years = [i for i in range(1989, 2027, 2) if i != 2011]

In [11]:
def table_name(table):
    return f"legislation_db.{table}"

TABLE_DEFS = {
    'bill_analysis_tbl': {
        'table_name': table_name('bill_analysis_tbl'),
        'columns': [
            "analysis_id",
            "bill_id",
            "house",
            "analysis_type",
            "committee_code",
            "committee_name",
            "amendment_author",
            "analysis_date",
            "amendment_date",
            'source_doc',
            "released_floor",
        ]
    },
    'bill_detail_vote_tbl': {
        'table_name': table_name('bill_detail_vote_tbl'),
        'columns': [
            "bill_id",
            "location_code",
            "legislator_name",
            "vote_date_time",
            "vote_date_seq",
            "vote_code",
            "motion_id",
            "member_order",
            "session_date",
        ]
    },
    'bill_history_tbl': {
        'table_name': table_name('bill_history_tbl'),
        'columns': [
            "bill_id",
            "bill_history_id",
            "action_date",
            "action_",
            "action_sequence",
            'action_code',
            'action_status',
            'primary_location',
            'secondary_location',
            'end_status',
        ]
    },
    'bill_motion_tbl': {
        'table_name': table_name('bill_motion_tbl'),
        'columns': [
            'motion_id',
            'motion_text',
        ]
    },
    'bill_summary_vote_tbl': {
        'table_name': table_name('bill_summary_vote_tbl'),
        'columns': [
            'bill_id',
            'location_code',
            'vote_date_time',
            'motion_id',
            'ayes',
            'noes',
            'abstain',
            'vote_result'
        ]
    },
    'bill_tbl': {
        'table_name': table_name('bill_tbl'),
        'columns': [
            "bill_id",
            "session_year",
            "session_num",
            "measure_num",
            "measure_state",
            "chapter_year",
            "chapter_type",
            "chapter_session_num",
            "chapter_num",
            "latest_bill_version_id",
            "current_location",
            "current_status",
        ]
    },
    'bill_version_author_tbl': {
        'table_name': table_name('bill_version_author_tbl'),
        'columns': [
            'bill_version_id',
            'type',
            'house',
            'name',
            'contribution',
            'committee_members',
            'primary_author_flg'
        ]
    },
    'bill_version_tbl': {
        'table_name': table_name('bill_version_tbl'),
        'columns': [
            "bill_version_id",
            "bill_id",
            "version_num",
            "bill_version_action_date",
            "bill_version_action",
            "request_num",
            "subject",
            "vote_required",
            "appropriation",
            "fiscal_committee",
            "local_program",
            "substantive_changes",
            "urgency",
            "taxlevy",
            'bill_xml'
        ]
    },
    'committee_hearing_tbl': {
        'table_name': table_name('committee_hearing_tbl'),
        'columns': [
            "bill_id",
            "committee_type",
            "committee_nr",
            "location_code"
        ]
    },
    'codes_tbl': {
        'table_name': table_name('codes_tbl'),
        'columns': ['code', 'title']
    }
}

In [12]:
def db_connect():
    return psycopg2.connect(**DB_CONFIG)

In [13]:
def pull_data(table_name, year, columns):
    partition_name = f"{table_name}_{year}"
    col_str = ', '.join(columns)
    query = f"SELECT {col_str} FROM {partition_name};"
    try:
        conn = db_connect()
        cursor = conn.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
        data = pd.DataFrame(rows)
        if data.empty:
            return 'missing'
        data.columns = columns
        return data
    except:
        return 'missing'
    finally:
        if 'cursor' in locals():
            cursor.close()
        if 'conn' in locals():
            conn.close()

In [14]:
def get_table_data(table_key, year):
    if table_key not in TABLE_DEFS:
        raise ValueError(f"Unknown table key: {table_key}")

    table_info = TABLE_DEFS[table_key]
    table_name = table_info['table_name']
    table_columns = table_info['columns']
    data = pull_data(table_name, year, table_columns)
    if isinstance(data, pd.DataFrame):
        return data
    else:
        return 'missing'

def get_all_years(table_key):
    all_data = []
    for year in years:
        data = get_table_data(table_key, year)
        if isinstance(data, pd.DataFrame):
            data['year'] = year
            all_data.append(data)
    if all_data:
        return pd.concat(all_data)
    else:
        return pd.DataFrame()

In [16]:
for table_key in tqdm(TABLE_DEFS.keys()):
    data = get_all_years(table_key)
    file_name = f'../legislation_data/{table_key}.csv'
    data.to_csv(file_name, index=False)

  return pd.concat(all_data)
100%|██████████| 10/10 [00:56<00:00,  5.69s/it]


In [8]:
versions = pd.read_csv('../legislation_data/bill_version_tbl.csv')
versions

Unnamed: 0,bill_version_id,bill_id,version_num,bill_version_action_date,bill_version_action,request_num,subject,vote_required,appropriation,fiscal_committee,local_program,substantive_changes,urgency,taxlevy,bill_xml,year
0,19890SB57094CHP,198919900SB570,94,1989-09-26,Chaptered,,Trademarks: innocent infringer or innocent vi...,,,,,,,,BILL_VERSION_TBL_899.lob,1989
1,19890SB29495CHP,198919900SB294,95,1989-09-26,Chaptered,,Controlled substance offenders: registration.,,,,,,,,BILL_VERSION_TBL_900.lob,1989
2,19890SB18694CHP,198919900SB186,94,1989-09-26,Chaptered,,Land use: wild land fire protection.,,,,,,,,BILL_VERSION_TBL_901.lob,1989
3,19890SB11396CHP,198919900SB113,96,1989-09-26,Chaptered,,Validations.,,,,,,,,BILL_VERSION_TBL_902.lob,1989
4,19890AB102095CHP,198919900AB1020,95,1989-09-27,Chaptered,,Beverage containers: curbside programs.,,,,,,,,BILL_VERSION_TBL_903.lob,1989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270527,20250SB46098AMD,202520260SB460,98,2025-03-26,Amended Senate,,Joint Sunset Review Committee.,Majority,No,Yes,No,,No,No,BILL_VERSION_TBL_4300.lob,2025
270528,20250SB46099INT,202520260SB460,99,2025-02-19,Introduced,,Administrative regulations.,Majority,No,No,No,,No,No,BILL_VERSION_TBL_4301.lob,2025
270529,20250SB46198AMD,202520260SB461,98,2025-03-24,Amended Senate,,State real property: City of Imperial.,Majority,No,Yes,No,,No,No,BILL_VERSION_TBL_4302.lob,2025
270530,20250SB46199INT,202520260SB461,99,2025-02-19,Introduced,,State surplus property: City of Imperial.,Majority,No,Yes,No,,No,No,BILL_VERSION_TBL_4303.lob,2025


In [4]:
versions.bill_version_action.unique()

array(['Chaptered', 'Amended Assembly', 'Enrolled', 'Introduced',
       'Amended Senate', 'Proposed', 'Amended', 'Vetoed'], dtype=object)

In [5]:
analysis = pd.read_csv('../legislation_data/bill_analysis_tbl.csv')

  analysis = pd.read_csv('../legislation_data/bill_analysis_tbl.csv')


In [6]:
analysis

Unnamed: 0,analysis_id,bill_id,house,analysis_type,committee_code,committee_name,amendment_author,analysis_date,amendment_date,source_doc,released_floor,year
0,46669,199319940AB10,A,CMTE,,Assembly Committee,,1993-02-08,,<memory at 0x10919b100>,Y,1993
1,46670,199319940AB10,S,CMTE,,Senate Committee,,1993-03-25,,<memory at 0x10919b280>,Y,1993
2,46671,199319940AB10,S,CMTE,,Senate Committee,,1993-03-25,,<memory at 0x10919af80>,Y,1993
3,46672,199319940AB10,S,CMTE,,Senate Committee,,1993-07-07,,<memory at 0x10919b1c0>,Y,1993
4,46673,199319940AB10,S,CMTE,,Senate Committee,,1993-09-08,,<memory at 0x10919b340>,Y,1993
...,...,...,...,...,...,...,...,...,...,...,...,...
328038,382101,202520260SR7,S,FLOOR,CZ09,Sen. Floor Analyses,OchoaBogh,2025-01-22 00:00:00,2025-01-06 00:00:00,<memory at 0x160a58f40>,Y,2025
328039,382102,202520260SR9,S,FLOOR,CZ09,Sen. Floor Analyses,Gonzalez,2025-01-22 00:00:00,2025-01-07 00:00:00,<memory at 0x160a59000>,Y,2025
328040,382120,202520261AB4,S,FLOOR,CZ09,Sen. Floor Analyses,Gabriel,2025-01-23 00:00:00,2025-01-20 00:00:00,<memory at 0x160a590c0>,Y,2025
328041,382107,202520261AB4,A,ATR,CZ01,ASSEMBLY FLOOR ANALYSIS,GABRIEL,2025-01-22 00:00:00,2025-01-20 00:00:00,<memory at 0x160a59180>,Y,2025


In [11]:
conn = psycopg2.connect(**DB_CONFIG)
cursor = conn.cursor()
query = "SELECT lo_read(source_doc) FROM legislation_db.bill_analysis_tbl_2025;"
rows = cursor.execute(query)
data = pd.DataFrame(rows)
data

UndefinedFunction: function lo_read(bytea) does not exist
LINE 1: SELECT lo_read(source_doc) FROM legislation_db.bill_analysis...
               ^
HINT:  No function matches the given name and argument types. You might need to add explicit type casts.


In [12]:
[a for a in analysis['bill_id'].unique() if a in versions['bill_id'].unique()]

['199319940AB10',
 '199319940AB11',
 '199319940AB13',
 '199319940AB14',
 '199319940AB17',
 '199319940AB18',
 '199319940AB19',
 '199319940AB21',
 '199319940AB25',
 '199319940AB28',
 '199319940AB31',
 '199319940AB32',
 '199319940AB33',
 '199319940AB34',
 '199319940AB35',
 '199319940AB36',
 '199319940AB37',
 '199319940AB39',
 '199319940AB40',
 '199319940AB41',
 '199319940AB42',
 '199319940AB43',
 '199319940AB46',
 '199319940AB47',
 '199319940AB48',
 '199319940AB49',
 '199319940AB5',
 '199319940AB6',
 '199319940AB7',
 '199319940AB8',
 '199319941AB13',
 '199319941AB19',
 '199319941AB1',
 '199319941AB25',
 '199319941AB27',
 '199319941AB29',
 '199319941AB30',
 '199319941AB36',
 '199319941AB3',
 '199319941AB45',
 '199319941AB49',
 '199319941AB4',
 '199319941AB8',
 '199319940ACA17',
 '199319940ACA37',
 '199319940ACA46',
 '199319940ACA8',
 '199319940ACR10',
 '199319940ACR11',
 '199319940ACR12',
 '199319940ACR13',
 '199319940ACR15',
 '199319940ACR17',
 '199319940ACR18',
 '199319940ACR19',
 '19931