In [2]:
import os

import pandas as pd
import psycopg2
from tqdm import tqdm

In [3]:
# Connection settings passed as keyword arguments to psycopg2.connect().
# Each value can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# original local-development values.
DB_CONFIG = {
    'database': os.environ.get('LEGISLATION_DB_NAME', 'legislation_db'),
    'host': os.environ.get('LEGISLATION_DB_HOST', 'localhost'),
    'user': os.environ.get('LEGISLATION_DB_USER', 'legislation'),
    'password': os.environ.get('LEGISLATION_DB_PASSWORD', 'password'),
    # Environment values are strings; the port is kept as an int as before.
    'port': int(os.environ.get('LEGISLATION_DB_PORT', '5432')),
}
# Every second year from 1989 through 2025, with 2011 excluded.
years = [year for year in range(1989, 2027, 2) if year != 2011]

In [4]:
def table_name(table):
    """Return ``table`` qualified with the ``legislation_db.`` prefix."""
    qualified = "legislation_db.{}".format(table)
    return qualified

# Columns to export for each table, keyed by table key. Insertion order is
# preserved, so it is also the order in which TABLE_DEFS is iterated.
_TABLE_COLUMNS = {
    'bill_analysis_tbl': [
        'analysis_id',
        'bill_id',
        'house',
        'analysis_type',
        'committee_code',
        'committee_name',
        'amendment_author',
        'analysis_date',
        'amendment_date',
        'source_doc',
        'released_floor',
    ],
    'bill_detail_vote_tbl': [
        'bill_id',
        'location_code',
        'legislator_name',
        'vote_date_time',
        'vote_date_seq',
        'vote_code',
        'motion_id',
        'member_order',
        'session_date',
    ],
    'bill_history_tbl': [
        'bill_id',
        'bill_history_id',
        'action_date',
        'action_',
        'action_sequence',
        'action_code',
        'action_status',
        'primary_location',
        'secondary_location',
        'end_status',
    ],
    'bill_motion_tbl': [
        'motion_id',
        'motion_text',
    ],
    'bill_summary_vote_tbl': [
        'bill_id',
        'location_code',
        'vote_date_time',
        'motion_id',
        'ayes',
        'noes',
        'abstain',
        'vote_result',
    ],
    'bill_tbl': [
        'bill_id',
        'session_year',
        'session_num',
        'measure_num',
        'measure_state',
        'chapter_year',
        'chapter_type',
        'chapter_session_num',
        'chapter_num',
        'latest_bill_version_id',
        'current_location',
        'current_status',
    ],
    'bill_version_author_tbl': [
        'bill_version_id',
        'type',
        'house',
        'name',
        'contribution',
        'committee_members',
        'primary_author_flg',
    ],
    'bill_version_tbl': [
        'bill_version_id',
        'bill_id',
        'version_num',
        'bill_version_action_date',
        'bill_version_action',
        'request_num',
        'subject',
        'vote_required',
        'appropriation',
        'fiscal_committee',
        'local_program',
        'substantive_changes',
        'urgency',
        'taxlevy',
        'bill_xml',
    ],
    'committee_hearing_tbl': [
        'bill_id',
        'committee_type',
        'committee_nr',
        'location_code',
    ],
    'codes_tbl': ['code', 'title'],
}

# Per-table definition: the qualified table name plus the columns to select.
TABLE_DEFS = {
    key: {'table_name': table_name(key), 'columns': cols}
    for key, cols in _TABLE_COLUMNS.items()
}

In [5]:
def db_connect():
    """Open and return a new psycopg2 connection built from ``DB_CONFIG``."""
    connection = psycopg2.connect(**DB_CONFIG)
    return connection

In [6]:
def pull_data(table_name, year, columns):
    """Fetch the selected columns of one yearly partition as a DataFrame.

    Runs ``SELECT <columns> FROM <table_name>_<year>`` on a fresh connection
    obtained from ``db_connect()`` and closes the cursor and connection
    afterwards.

    Args:
        table_name: Table name; ``_<year>`` is appended to form the partition name.
        year: Suffix identifying the partition to read.
        columns: Column names to select; also used as the DataFrame column labels.

    Returns:
        A DataFrame with one row per fetched record, or the string ``'missing'``
        when the query returns no rows or a database error occurs (connection
        failure, missing partition, bad column, ...).
    """
    partition_name = f"{table_name}_{year}"
    col_str = ', '.join(columns)
    # Identifiers are interpolated directly into the SQL text, so only trusted,
    # code-defined table and column names must be passed in.
    query = f"SELECT {col_str} FROM {partition_name};"
    # Initialised up front so the cleanup below can tell what was actually opened.
    conn = None
    cursor = None
    try:
        conn = db_connect()
        cursor = conn.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
        if not rows:
            return 'missing'
        return pd.DataFrame(rows, columns=columns)
    except psycopg2.Error:
        # Only database errors are treated as "no data"; KeyboardInterrupt and
        # programming errors are allowed to propagate instead of being swallowed.
        return 'missing'
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

In [7]:
def get_table_data(table_key, year):
    """Load one year of data for the table registered under ``table_key``.

    Args:
        table_key: Key into ``TABLE_DEFS``.
        year: Year passed through to ``pull_data``.

    Returns:
        The DataFrame produced by ``pull_data``, or the string ``'missing'``
        when ``pull_data`` did not return a DataFrame.

    Raises:
        ValueError: If ``table_key`` is not present in ``TABLE_DEFS``.
    """
    spec = TABLE_DEFS.get(table_key)
    if spec is None:
        raise ValueError(f"Unknown table key: {table_key}")

    result = pull_data(spec['table_name'], year, spec['columns'])
    return result if isinstance(result, pd.DataFrame) else 'missing'

def get_all_years(table_key):
    """Collect the data for ``table_key`` across every year in ``years``.

    Years for which ``get_table_data`` does not return a DataFrame are skipped.
    Each retained frame gets a ``year`` column holding the year it came from.

    Args:
        table_key: Key into ``TABLE_DEFS``.

    Returns:
        A single DataFrame with all retained years stacked and a fresh
        0..n-1 index, or an empty DataFrame when no year produced data.
    """
    yearly_frames = []
    for year in years:
        data = get_table_data(table_key, year)
        if isinstance(data, pd.DataFrame):
            data['year'] = year
            yearly_frames.append(data)
    if not yearly_frames:
        return pd.DataFrame()
    # Every per-year frame carries its own default 0..n-1 index; without
    # ignore_index the combined frame would contain repeated index labels.
    return pd.concat(yearly_frames, ignore_index=True)

In [8]:
# Export every table in TABLE_DEFS to its own CSV file, all years combined.
for table_key in tqdm(TABLE_DEFS.keys()):
    # Destination is relative to the notebook's working directory.
    file_name = f'../legislation_data/{table_key}.csv'
    data = get_all_years(table_key)
    # The DataFrame index is not written to the file.
    data.to_csv(file_name, index=False)

  return pd.concat(all_data)
100%|██████████| 10/10 [00:59<00:00,  5.94s/it]
