In [23]:
import pandas as pd
import json
import re
import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
# Bill table; a later cell joins it in through its `bill_id` column.
bills = pd.read_csv(filepath_or_buffer='ca_leg/legislation_data/bill_tbl.csv')

In [3]:
# Bill history table. The four status/location columns are forced to `str`
# instead of leaving their type to pandas' inference.
bill_history = pd.read_csv(
    'ca_leg/legislation_data/bill_history_tbl.csv',
    dtype=dict.fromkeys(
        ['action_status', 'primary_location', 'secondary_location', 'end_status'],
        str,
    ),
)

In [4]:
# Bill version table; a later cell matches author records against its
# `bill_version_id` column.
bill_versions = pd.read_csv(filepath_or_buffer='ca_leg/legislation_data/bill_version_tbl.csv')

In [5]:
# Load bill_version_text.json. The encoding is pinned to UTF-8 so the read does
# not depend on the platform's locale default encoding.
with open("ca_leg/legislation_data/bill_version_text.json", "r", encoding="utf-8") as f:
    bill_text = json.load(f)

# Keep only the 'Authors' mapping of each entry; entries without an 'Authors'
# key are skipped. Built after the `with` block so the file is already closed.
authors_data = {
    bill_id: bill_info['Authors']
    for bill_id, bill_info in bill_text.items()
    if 'Authors' in bill_info
}


In [6]:
# Distinct top-level keys (author types) found across every bill's 'Authors'
# mapping. The result comes from a set, so its order is arbitrary.
list({author_type for authors in authors_data.values() for author_type in authors.keys()})

['null', 'PRINCIPAL_COAUTHOR', 'COAUTHOR', 'LEAD_AUTHOR']

In [7]:
# Flatten the nested {bill_id: {author_type: {house_name: author_name}}} mapping
# into one row per innermost entry. A house key of 'UNKNOWN' is recorded as
# "COMMITTEE"; every other house key is kept as-is.
records = [
    [
        bill_id,
        author_type,
        "COMMITTEE" if house_name == 'UNKNOWN' else house_name,
        author_name,
    ]
    for bill_id, authors in authors_data.items()
    for author_type, house in authors.items()
    for house_name, author_name in house.items()
]

df = pd.DataFrame(records, columns=['bill_id', 'author_type', 'house', 'author_name'])

# Remove every double underscore from the id before it is used as a join key.
df['bill_id'] = df['bill_id'].map(lambda raw_id: re.sub(r'__', '', raw_id))

# Left join: every author row is kept, whether or not a bill version matches.
# NOTE(review): the key is matched against `bill_version_id`, so `bill_id` here
# appears to hold bill-version ids rather than bill ids - confirm.
combined = pd.merge(
    df,
    bill_versions,
    how='left',
    left_on='bill_id',
    right_on='bill_version_id',
)

In [8]:
# Keep only rows whose `bill_version_action` is non-null (author rows that found
# no bill version in the preceding left join have it null), then left-join the
# bill table using the version table's bill id (`bill_id_y`).
has_version_action = combined['bill_version_action'].notna()
full = pd.merge(
    combined[has_version_action],
    bills,
    how='left',
    left_on='bill_id_y',
    right_on='bill_id',
)

In [9]:
# Write the joined author/version/bill table to disk without the row index.
full.to_csv(path_or_buf='ca_leg/legislation_data/combined_table.csv', index=False)

In [5]:
# Lobbying disclosure table; every column is read as text.
disclosure = pd.read_csv(filepath_or_buffer='lobbying/CVR_LOBBY_DISCLOSURE_CD.csv', dtype=str)

In [6]:
# Lobbying expenditure table; every column is read as text.
expenditure = pd.read_csv(filepath_or_buffer='lobbying/LEXP_CD.csv', dtype=str)

In [10]:
# Attach the expenditure rows to each disclosure's (FILING_ID, FIRM_NAME) pair.
# Left join: disclosures without any expenditure row are kept.
# NOTE(review): the join key is FILING_ID alone; if either table holds several
# rows per FILING_ID the result multiplies rows - confirm that is intended.
lobbying = pd.merge(
    disclosure[['FILING_ID', 'FIRM_NAME']],
    expenditure,
    how='left',
    on='FILING_ID',
)
lobbying.to_csv(path_or_buf='lobbying/lobbying.csv', index=False)

In [30]:
# Cleaned assembly committee assignments.
assembly_committees = pd.read_csv(filepath_or_buffer='pdf_parsing/assembly_committees_clean.csv')

In [36]:
# Partition committee rows by the characters found in `politician`:
#   doubles - the value contains a comma
#   hyphens - the value contains a hyphen
#   neither - the value contains neither character
# A value holding both characters appears in both `doubles` and `hyphens`.
#
# na=False makes a missing politician count as "no match", so both masks stay
# strictly boolean; without it a NaN in the mask makes `.loc` and `~` raise.
# Such rows end up in `neither`. regex=False matches the characters literally.
has_comma = assembly_committees['politician'].str.contains(',', regex=False, na=False)
has_hyphen = assembly_committees['politician'].str.contains('-', regex=False, na=False)

doubles = assembly_committees.loc[has_comma]
hyphens = assembly_committees.loc[has_hyphen]
neither = assembly_committees.loc[~(has_comma | has_hyphen)]

Unnamed: 0,term,politician,committee,position,committee_clean
0,2001-2002,Alquist,Select Committee on the Aging of the Baby Boomers,Chair,Select Committee on the Aging of the Baby Boomers
1,2001-2002,Alquist,Joint Committee to Develop a Master Plan for E...,Co-Vice Chair,Joint Committee to Develop a Master Plan for E...
2,2001-2002,Alquist,Aging and Long-Term Care,Member,Aging and Long-Term Care
3,2001-2002,Alquist,Appropriations,Member,Appropriations
4,2001-2002,Alquist,Banking and Finance,Member,Banking and Finance
...,...,...,...,...,...
10887,2025-2026,Zbur,Judiciary,Member,Judiciary
10888,2025-2026,Zbur,Natural Resources,Member,Natural Resources
10889,2025-2026,Zbur,Rules,Member,Rules
10890,2025-2026,Zbur,Utilities and Energy,Member,Utilities and Energy


In [21]:
# Assembly member roster.
assembly_roster = pd.read_csv(filepath_or_buffer='pdf_parsing/assembly_roster.csv')

In [20]:
# Bare last expression: the notebook renders the roster frame as the cell output.
assembly_roster

Unnamed: 0,Occupation,Party,District No.,Seat No.,Term,First,Last,Position
0,OralSurgeon,R,3.0,60.0,2001-2002,Sam,Aanestad,
1,Businesswoman/Educator,D,22.0,15.0,2001-2002,Elaine,Alquist,
2,StateSocialServicesSpecialist,D,14.0,8.0,2001-2002,Dion,Aroner,
3,Legislator,R,32.0,65.0,2001-2002,Roy,Ashburn,
4,Legislator,R,73.0,64.0,2001-2002,Patricia,Bates,
...,...,...,...,...,...,...,...,...
1040,FulltimeLegislator,D,78.0,37.0,2025-2026,Christopher M,Ward,
1041,FulltimeLegislator,D,14.0,76.0,2025-2026,Buffy,Wicks,
1042,Pastor/FullTimeLegislator,D,11.0,29.0,2025-2026,Lori D,Wilson,
1043,CivilRightsAttorney,D,51.0,45.0,2025-2026,Rick Chavez,Zbur,
