In [None]:
import pandas as pd
import re
import numpy as np
import warnings
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [3]:
# Load the bill history table. The four status/location columns are read as
# plain strings (presumably to avoid mixed-dtype inference — confirm).
bill_history = pd.read_csv(
    'ca_leg/legislation_data/bill_history_tbl.csv',
    dtype=dict.fromkeys(
        ('action_status', 'primary_location', 'secondary_location', 'end_status'),
        str,
    ),
)

In [4]:
# List the column names of the bill history table.
bill_history.columns

Index(['bill_id', 'bill_history_id', 'action_date', 'action_',
       'action_sequence', 'action_code', 'action_status', 'primary_location',
       'secondary_location', 'end_status', 'year'],
      dtype='object')

In [5]:
# Display the bill history table (pandas shows a truncated head/tail preview).
bill_history

Unnamed: 0,bill_id,bill_history_id,action_date,action_,action_sequence,action_code,action_status,primary_location,secondary_location,end_status,year
0,199920000AB630,1388441,1999-02-19,Read first time. To print.,,,,,,,1999
1,199920000ACA9,1388442,2000-11-30,From committee without further action.,,,,,,,1999
2,199920000ACA9,1388443,1999-05-11,In committee: Set first hearing. Failed pass...,,,,,,,1999
3,199920000ACA9,1388444,1999-04-27,Re-referred to Com. on PUB. S.,,,,,,,1999
4,199920000ACA9,1388445,1999-04-22,"From committee chair, with author's amendments...",,,,,,,1999
...,...,...,...,...,...,...,...,...,...,...,...
1086867,202520261SCR2,117393745,2025-02-04,Died on third reading.,3.0,245.0,Applied,Senate,Floor,Died,2025
1086868,202520261SCR2,117393743,2025-01-23,Introduced. Referred to Com. on RLS.,1.0,2.0,Applied,Senate,Committee,In Committee Process,2025
1086869,202520261SR1,117390240,2025-01-07,Introduced. Referred to Com. on RLS.,1.0,2.0,Applied,Senate,Committee,In Committee Process,2025
1086870,202520261SR1,117390241,2025-01-09,From committee: Ordered to third reading.,2.0,12.0,Applied,Senate,Floor,In Floor Process,2025


In [11]:
# Load the combined table, reading the two "current_*" columns as plain strings.
full = pd.read_csv(
    'ca_leg/legislation_data/combined_table.csv',
    dtype=dict.fromkeys(('current_location', 'current_status'), str),
)

In [4]:
# List the column names of the combined table.
full.columns

Index(['bill_id_x', 'author_type', 'house', 'author_name', 'bill_version_id',
       'bill_id_y', 'version_num', 'bill_version_action_date',
       'bill_version_action', 'request_num', 'subject', 'vote_required',
       'appropriation', 'fiscal_committee', 'local_program',
       'substantive_changes', 'urgency', 'taxlevy', 'bill_xml', 'year_x',
       'bill_id', 'session_year', 'session_num', 'measure_num',
       'measure_state', 'chapter_year', 'chapter_type', 'chapter_session_num',
       'chapter_num', 'latest_bill_version_id', 'current_location',
       'current_status', 'year_y'],
      dtype='object')

In [12]:
# Display the combined table (pandas shows a truncated head/tail preview).
full

Unnamed: 0,bill_id_x,author_type,house,author_name,bill_version_id,bill_id_y,version_num,bill_version_action_date,bill_version_action,request_num,...,measure_num,measure_state,chapter_year,chapter_type,chapter_session_num,chapter_num,latest_bill_version_id,current_location,current_status,year_y
0,19990AB26893ENR,LEAD_AUTHOR,ASSEMBLY,Nakano,19990AB26893ENR,199920000AB268,93.0,1999-07-15,Enrolled,,...,268,Chaptered,1999.0,CHP,0.0,193.0,19990AB26892CHP,,,1999
1,19990AB26893ENR,COAUTHOR,ASSEMBLY,Zettel,19990AB26893ENR,199920000AB268,93.0,1999-07-15,Enrolled,,...,268,Chaptered,1999.0,CHP,0.0,193.0,19990AB26892CHP,,,1999
2,19990AB26893ENR,COAUTHOR,SENATE,Solis,19990AB26893ENR,199920000AB268,93.0,1999-07-15,Enrolled,,...,268,Chaptered,1999.0,CHP,0.0,193.0,19990AB26892CHP,,,1999
3,19990AB51398AMD,LEAD_AUTHOR,ASSEMBLY,Strom-Martin,19990AB51398AMD,199920000AB513,98.0,1999-04-28,Amended Assembly,,...,513,Amended Assembly,,,,,19990AB51397AMD,,,1999
4,19990AB51398AMD,COAUTHOR,ASSEMBLY,Campbell,19990AB51398AMD,199920000AB513,98.0,1999-04-28,Amended Assembly,,...,513,Amended Assembly,,,,,19990AB51397AMD,,,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266710,20170SB115192CHP,COAUTHOR,ASSEMBLY,Chávez,20170SB115192CHP,201720180SB1151,92.0,2018-09-19,Chaptered,,...,1151,Chaptered,2018.0,CHP,0.0,564.0,20170SB115192CHP,,Chaptered,2017
266711,20170AB102799INT,LEAD_AUTHOR,ASSEMBLY,Acosta,20170AB102799INT,201720180AB1027,99.0,2017-02-16,Introduced,,...,1027,Chaptered,2017.0,CHP,0.0,205.0,20170AB102796CHP,,Chaptered,2017
266712,20170AB102799INT,PRINCIPAL_COAUTHOR,SENATE,Wilk,20170AB102799INT,201720180AB1027,99.0,2017-02-16,Introduced,,...,1027,Chaptered,2017.0,CHP,0.0,205.0,20170AB102796CHP,,Chaptered,2017
266713,20170SB61193CHP,LEAD_AUTHOR,SENATE,Allen,20170SB61193CHP,201720180SB611,93.0,2017-10-04,Chaptered,,...,611,Chaptered,2017.0,CHP,0.0,485.0,20170SB61193CHP,,Chaptered,2017


authors and versions

In [6]:
# Load the per-legislator vote detail table; session_date is parsed as a datetime column.
bill_votes = pd.read_csv(
    'ca_leg/legislation_data/bill_detail_vote_tbl.csv',
    parse_dates=['session_date'],
)

In [7]:
# List the column names of the vote detail table.
bill_votes.columns

Index(['bill_id', 'location_code', 'legislator_name', 'vote_date_time',
       'vote_date_seq', 'vote_code', 'motion_id', 'member_order',
       'session_date', 'year'],
      dtype='object')

In [8]:
# Display the vote detail table (pandas shows a truncated head/tail preview).
bill_votes

Unnamed: 0,bill_id,location_code,legislator_name,vote_date_time,vote_date_seq,vote_code,motion_id,member_order,session_date,year
0,199920000AB1387,AFLOOR,Cox,1999-09-01 13:16:00,1011,NOE,107178,,NaT,1999
1,199920000AB1387,AFLOOR,Cunneen,1999-09-01 13:16:00,1011,NOE,107178,,NaT,1999
2,199920000AB1387,AFLOOR,Davis,1999-09-01 13:16:00,1011,AYE,107178,,NaT,1999
3,199920000AB1387,AFLOOR,Dickerson,1999-09-01 13:16:00,1011,NOE,107178,,NaT,1999
4,199920000AB1387,AFLOOR,Ducheny,1999-09-01 13:16:00,1011,AYE,107178,,NaT,1999
...,...,...,...,...,...,...,...,...,...,...
7445020,202520260SCR9,CX20,Irwin,2025-02-10 00:00:00,1,AYE,147493,6.0,2025-02-10,2025
7445021,202520260SCR9,CX20,Krell,2025-02-10 00:00:00,1,AYE,147493,7.0,2025-02-10,2025
7445022,202520260SCR9,CX20,Patel,2025-02-10 00:00:00,1,AYE,147493,8.0,2025-02-10,2025
7445023,202520260SCR9,CX20,Michelle Rodriguez,2025-02-10 00:00:00,1,AYE,147493,9.0,2025-02-10,2025


In [36]:
# Load the bill summary vote table with pandas' default dtype inference.
bill_summary = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')

In [41]:
# Display the bill summary vote table.
# NOTE(review): the output saved with this cell is a NameError, so it was last
# run in a kernel where the cell that loads `bill_summary` had not been executed.
# Re-run the notebook top to bottom to refresh it.
bill_summary

NameError: name 'bill_summary' is not defined

In [62]:
# Load the committee code -> committee name lookup table.
locations = pd.read_csv('ca_leg/legislation_data/committee_codes.csv')

In [44]:
# List the column names of the committee lookup table.
locations.columns

Index(['committee_code', 'committee_name'], dtype='object')

In [None]:
def _clean_committee_name(code, name):
    """Normalise one committee row to a lower-case, chamber-prefixed name.

    Parameters
    ----------
    code : value of ``committee_code`` for the row.
    name : value of ``committee_name`` for the row.

    Returns
    -------
    str or None
        The cleaned name, or ``None`` when the row is left out: a code that
        starts with ``'CZ'``, or a code/name that is not a string (e.g. NaN).
    """
    if not isinstance(code, str) or not isinstance(name, str):
        return None
    if code.startswith('CZ'):
        return None
    if code.startswith('CX'):
        # CX codes get an 'assembly' prefix unless the name already has one.
        # This branch takes precedence over the 'Sen.' expansion below.
        lowered = name.lower()
        return lowered if lowered.startswith('assembly') else 'assembly ' + lowered
    if name.startswith('Sen.'):
        # Expand only the leading abbreviation. A regex such as r'Sen. ' treats
        # the dot as a wildcard and rewrites matches anywhere in the name.
        return ('Senate ' + name[len('Sen.'):].lstrip()).lower()
    # Every other row (e.g. names already spelled 'Senate ...') is just
    # lower-cased, so no row reuses the previous row's value.
    return name.lower()


# NOTE(review): collecting the cleaned names is the presumed intent of this
# cell; confirm that a list of names (rather than code/name pairs) is what
# the downstream cells expect.
clean_coms = [
    cleaned
    for cleaned in map(
        _clean_committee_name,
        locations['committee_code'],
        locations['committee_name'],
    )
    if cleaned is not None
]



Unnamed: 0,committee_code,committee_name
0,CZ09,Sen. Floor Analyses
1,CS61,Sen. Appropriations
2,CS69,"Sen. Banking, Finance and Insurance"
3,CS71,"Sen. Energy, Utilities and Communications"
4,CS59,Sen. Transportation and Housing
...,...,...
139,CX38,Assembly Military and Veterans Affairs
140,CS66,Senate Committee on Military and Veterans Affairs
141,CX23,Assembly Committee on Utilities and Energy
142,CX04,Assembly Elections


In [21]:
# Load the bill motion table with pandas' default dtype inference.
bill_motions = pd.read_csv('ca_leg/legislation_data/bill_motion_tbl.csv')

In [64]:
# Load the politicians table with pandas' default dtype inference.
politicians = pd.read_csv('ca_leg/legislation_data/politicians.csv')

In [65]:
# List the column names of the politicians table.
politicians.columns

Index(['committee_clean', 'position', 'Occupation', 'Party', 'District No.',
       'Seat No.', 'Term', 'Last', 'full_name', 'chamber'],
      dtype='object')

legislators and committees

In [43]:
# Load the cleaned lobbying records, reading PAYEE_NAMS and BAKREF_TID as plain strings.
lobbying = pd.read_csv(
    'calaccess/lobbying_clean2.csv',
    dtype=dict.fromkeys(('PAYEE_NAMS', 'BAKREF_TID'), str),
)

In [44]:
# List the column names of the lobbying table.
lobbying.columns

Index(['FILING_ID', 'FIRM_NAME', 'AMEND_ID', 'LINE_ITEM', 'REC_TYPE',
       'FORM_TYPE', 'TRAN_ID', 'RECSUBTYPE', 'ENTITY_CD', 'PAYEE_NAML',
       'PAYEE_NAMF', 'PAYEE_NAMT', 'PAYEE_NAMS', 'PAYEE_CITY', 'PAYEE_ST',
       'PAYEE_ZIP4', 'CREDCARDCO', 'BENE_NAME', 'BENE_POSIT', 'BENE_AMT',
       'EXPN_DSCR', 'EXPN_DATE', 'AMOUNT', 'MEMO_CODE', 'MEMO_REFNO',
       'BAKREF_TID', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27',
       'Unnamed: 28', 'clean_beneficiary'],
      dtype='object')

In [9]:
# Load the matched Assembly expenditure records; TargetPropositionName is read as a string.
expend_assembly = pd.read_csv(
    'calaccess/expend_assembly_matched.csv',
    dtype=dict(TargetPropositionName=str),
)

In [10]:
# List the column names of the Assembly expenditure table.
expend_assembly.columns

Index(['TargetCandidateName', 'TargetCandidateOffice', 'TargetPropositionName',
       'ExpenderPosition', 'ExpenderName', 'ExpenderID', 'Amount',
       'ExpenditureDscr', 'PayeeName', 'DateStart', 'DateEnd', 'DateRange',
       'year', 'term_x', 'matched_target_name', 'term_y', 'politician',
       'committee', 'position', 'committee_clean', 'Occupation', 'Party',
       'District No.', 'Seat No.', 'Term', 'First', 'Last', 'Position',
       'full_name', 'target_name'],
      dtype='object')

In [11]:
# Load the matched Senate expenditure records; TargetPropositionName is read as a string.
expend_senate = pd.read_csv(
    'calaccess/expend_senate_matched.csv',
    dtype=dict(TargetPropositionName=str),
)

In [12]:
# List the column names of the Senate expenditure table.
expend_senate.columns

Index(['TargetCandidateName', 'TargetCandidateOffice', 'TargetPropositionName',
       'ExpenderPosition', 'ExpenderName', 'ExpenderID', 'Amount',
       'ExpenditureDscr', 'PayeeName', 'DateStart', 'DateEnd', 'DateRange',
       'matched_target_name', 'term', 'politician', 'committee', 'position',
       'committee_clean', 'Name', 'Occupation', 'Party', 'District No.',
       'Seat No.', 'Phone', 'Counties', 'pages', 'Last', 'Term', 'full_name'],
      dtype='object')

In [14]:
# Load the bill text table with pandas' default dtype inference.
bill_text = pd.read_csv('ca_leg/legislation_data/bill_text.csv')

In [14]:
# List the column names of the bill text table.
bill_text.columns

Index(['bill_id', 'title', 'general_subject', 'digest_text', 'content'], dtype='object')

In [36]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model and pin it to the CPU.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

In [40]:
# Collect the general_subject text of every bill whose id starts with '2'.
# Any non-string entry (e.g. a missing value) becomes '' so the encoder only
# receives strings.
subjects = [
    subject if isinstance(subject, str) else ''
    for subject in bill_text.loc[
        bill_text['bill_id'].str.startswith('2'), 'general_subject'
    ].tolist()
]

# Encode all subjects in batches of 100 and keep the un-normalised result as a tensor.
# NOTE(review): num_workers is handed to encode() unchanged — confirm that the
# installed sentence-transformers release accepts this argument.
embeddings = model.encode(
    subjects,
    normalize_embeddings=False,
    convert_to_tensor=True,
    show_progress_bar=True,
    batch_size=100,
    num_workers=5,
)

Batches:   0%|          | 0/1702 [00:00<?, ?it/s]

In [51]:
# Distinct firm names from lobbying rows that have a clean_beneficiary value
# and a string FIRM_NAME.
lobbying_firm_names = lobbying.loc[
    lobbying['clean_beneficiary'].notna()
    & lobbying['FIRM_NAME'].apply(lambda x: isinstance(x, str)),
    'FIRM_NAME',
].unique().tolist()

# Encode all names in one batched call instead of one encode() call per firm,
# then map each firm name to its embedding row. Batched padding can change
# the values by floating-point noise compared with encoding one at a time.
lobbying_firms_embeddings = dict(
    zip(lobbying_firm_names, model.encode(lobbying_firm_names))
)

In [53]:
# Distinct committee names from the politicians table. Non-string entries
# (e.g. NaN for a missing committee) are dropped because the encoder can only
# tokenize text.
committee_names = [
    committee
    for committee in politicians['committee_clean'].unique().tolist()
    if isinstance(committee, str)
]

# Encode all names in one batched call instead of one encode() call per
# committee, then map each committee name to its embedding row. Batched
# padding can change the values by floating-point noise compared with
# encoding one at a time.
committee_embeddings = dict(zip(committee_names, model.encode(committee_names)))

In [54]:
import networkx as nx

In [56]:
# Distinct full_name values from the politicians table, in order of first appearance.
legislators = politicians['full_name'].unique().tolist()

In [59]:
# Distinct bill ids that have at least one history row with year after 1999.
bills = bill_history.loc[
    bill_history['year'] > 1999, 'bill_id'
].unique().tolist()

In [60]:
# Distinct bill_text ids starting with '2', with every underscore removed.
# De-duplication happens before the underscores are stripped.
bill_versions = [
    version_id.replace('_', '')
    for version_id in bill_text.loc[
        bill_text['bill_id'].str.startswith('2'), 'bill_id'
    ].unique().tolist()
]

In [68]:
# Start from an empty networkx MultiDiGraph (directed graph that allows parallel edges).
G = nx.MultiDiGraph()