In [221]:
import pandas as pd
import numpy as np
import geopandas as gpd
import re, json, ast, pathlib, zipfile, tempfile, datetime as _dt, warnings, torch, os
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict, deque
from rapidfuzz import fuzz, process

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [146]:
summary_votes = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')
bill_history = pd.read_csv('ca_leg/legislation_data/bill_history_tbl.csv', dtype={'action_status': str, 'primary_location': str, 'secondary_location': str, 'end_status': str})
authors = pd.read_csv('ca_leg/legislation_data/authors.csv')
history = pd.read_csv('ca_leg/legislation_data/history.csv')
versions = pd.read_csv('ca_leg/legislation_data/bill_versions.csv')
bill_votes = pd.read_csv('ca_leg/legislation_data/bill_detail_vote_tbl.csv', parse_dates=['session_date'])
bill_summary = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')
bill_motions = pd.read_csv('ca_leg/legislation_data/bill_motion_tbl.csv')
locations = pd.read_csv('ca_leg/legislation_data/committee_codes.csv')
politicians = pd.read_csv('ca_leg/legislation_data/politicians.csv')
lobbying = pd.read_csv('calaccess/lobbying_clean2.csv', dtype={'PAYEE_NAMS': str, 'BAKREF_TID': str})
expend_assembly = pd.read_csv('calaccess/expend_assembly_matched.csv', dtype={'TargetPropositionName': str})
expend_senate = pd.read_csv('calaccess/expend_senate_matched.csv', dtype={'TargetPropositionName': str})
digests = pd.read_csv('ca_leg/legislation_data/digest.csv')
hearings = pd.read_csv('ca_leg/legislation_data/committee_hearing_tbl.csv')

In [147]:
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time']).apply(lambda x: x.strftime('%Y-%m-%d'))
bill_votes['legislator_name'] = bill_votes['legislator_name'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

In [148]:
ACTION_KEYWORDS = ["Assembly Third Reading","Assembly 3rd reading","senate 3rd reading","Senate Third Reading","Concurrence - Urgency Added","Concurrence in Senate Amendments","Do pass as amended, and re-refer","Do pass as amended, but re-refer","Do pass as amended","Do pass and be re-referred","Concurrence","Consent Calendar","Urgency Clause","Special Consent","Motion to Reconsider","Do pass","Reconsideration","Committee amendments","W/O REF. TO FILE","Be re-referred to the Committee","Lay on the Table","Amend by","Unfinished Business","Placed on Appropriations Suspense File"]

def extract_action(motion_text):
    if not isinstance(motion_text, str) or motion_text is None:
        return None
    motion = motion_text.upper()
    action = next((act for act in ACTION_KEYWORDS if act.upper() in motion), None)
    if action != 'Reconsideration' and 'RECONSIDER' in motion:
        if action is not None:
            action += ' Reconsideration'
        else:
            action = 'Reconsideration'
    return action if action else None

bill_motions['simplified_motion'] = bill_motions['motion_text'].apply(extract_action)

In [149]:
clean_coms = {}
for i, row in locations.iterrows():
    if row['committee_code'].startswith('CZ'):
        continue
    name = row['committee_name']
    if row['committee_code'].startswith('CS'):
        if name.startswith('Sen.'):
            cname = re.sub(r'Sen. ', 'senate ', name).lower()
        elif name.startswith('Senate '):
            cname = name.lower()
        else:
            cname = 'senate ' + name.lower()
    elif row['committee_code'].startswith('CX'):
        if name.lower().startswith('assembly'):
            cname = name.lower()
        else:
            cname = 'assembly ' + name.lower()
    if re.search(r'x\d$', cname) is not None:
        cname = re.sub(r'x(?=\d$)', 'no. ', cname)
    clean_coms[row['committee_code']] = cname

leg_committees = [f"{row['chamber']} {row['committee_clean']}".lower() for _, row in politicians[['committee_clean', 'chamber']].drop_duplicates().iterrows()]

In [150]:
def match_committees(_names, clean_coms, threshold=92):
    clean_c = list(clean_coms.values())
    clean_codes = list(clean_coms.keys())
    name_mapping = {}
    for i, clean in enumerate(clean_c):
        code = clean_codes[i]
        matches = []
        matches.append(process.extractOne(clean, _names, scorer=fuzz.token_sort_ratio, score_cutoff=threshold))
        matches.append(process.extractOne(clean, _names, scorer=fuzz.partial_ratio, score_cutoff=threshold))
        valid_matches = [m for m in matches if m is not None]
        if len(valid_matches) > 0:
            best_match = max(valid_matches, key=lambda x: x[1])
            name_mapping[code] = best_match[0]
        else:
            fall_back = process.extractOne(clean, _names, scorer=fuzz.token_sort_ratio, score_cutoff=threshold - 8)
            if fall_back is not None:
                name_mapping[code] = fall_back[0]
            else:
                name_mapping[code] = None
    return name_mapping

committee_matches = match_committees(leg_committees, clean_coms)

In [151]:
locations['committee_clean'] = locations['committee_code'].map(committee_matches)
locations.loc[locations['committee_name'] == 'EDUCATION X5', 'committee_clean'] = 'Budget and Fiscal Review: Education'
locations.loc[locations['committee_code'] == 'CX12', 'committee_clean'] = 'Budget No. 1 on Health and Human Services'
locations.loc[locations['committee_code'] == 'CS68', 'committee_clean'] = 'Budget No. 3 - Health and Human Services'
locations.loc[locations['committee_code'] == 'CS66', 'committee_clean'] = 'Senate Veterans Affairs'
locations.loc[locations['committee_code'] == 'CS56', 'committee_clean'] = 'Senate Public Employment and Retirement'
locations.loc[locations['committee_code'] == 'CS62', 'committee_clean'] = 'Senate Budget and Fiscal Review'
locations.loc[locations['committee_code'] == 'CX23', 'committee_clean'] = 'Assembly Utilities and Commerce'

motion_codes = {row['motion_id']: row['simplified_motion'] for _, row in bill_motions.iterrows()}
summary_votes['motion_text'] = summary_votes['motion_id'].map(motion_codes)

In [152]:
def repair_bill_id(id):
    front, end = id[:4], id[4:]
    if re.search(r'\d{4}$', front):
        return f"{front}{int(front) + 1}{end}"
    else:
        return id

versions['ID'] = versions['bill_id'].apply(lambda x: repair_bill_id(x))
bill_vers = versions.loc[versions['bill_id'].str.startswith('2')].copy()
for i, row in bill_vers.iterrows():
    tail = f"{row['VersionNum']}{row['MeasureState']}"
    repaired = repair_bill_id(re.sub(tail, '', row['bill_id']))
    end = int(repaired[-4:])
    bill_vers.loc[i, 'bill_ID'] = f"{repaired[:-4]}{end}"

In [153]:
leg_parties = {row['full_name']: row['Party'] for _, row in politicians[['full_name', 'Party']].drop_duplicates().iterrows()}
bill_ids = list(set(bill_votes.loc[bill_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist() + summary_votes.loc[summary_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist()))
bill_id_codes = {row['bill_id']: row['bill_ID'] for _, row in bill_vers.drop_duplicates(subset=['bill_id', 'bill_ID']).iterrows()}
history['bill_ID'] = history['bill_id'].map(bill_id_codes)
history['Date'] = pd.to_datetime(history['Date'])

introduction_dates = {}
for v, group in history.loc[history['bill_ID'].isin(bill_ids)].groupby('bill_ID'):
    introduction_dates[v] = {'Dates': group['Date'].unique().tolist(), 'Actions': group.sort_values('Date', ascending=True).drop_duplicates(subset=['Action', 'Date'])['Action'].tolist()}

version_id_mapping = {i: list(group.values) for i, group in bill_vers.groupby('bill_ID')['ID']}
version_id_mapping2 = {i: list(group.values) for i, group in bill_vers.groupby('bill_ID')['bill_id']}
bv2b = {v: k for k, val in version_id_mapping2.items() for v in val}

In [154]:
date_ranges = {}
for k, v in introduction_dates.items():
    first, last = min(v['Dates']), max(v['Dates'])
    date_ranges[k] = {'First_action': first, 'Last_action': last}

outcomes = history.loc[history['bill_ID'].notna()].sort_values('Date', ascending=False).groupby('bill_ID').first().reset_index()[['bill_ID', 'Action']]
outcomes.loc[outcomes['Action'].isin(['CHAPTERED', 'ENROLLED', 'FILED', 'APPROVED']), 'Outcome'] = 1
outcomes.loc[outcomes['Action'] == 'VETOED', 'Outcome'] = -1
outcomes.loc[outcomes['Outcome'].isna(), 'Outcome'] = 0
outcome = outcomes.set_index('bill_ID')['Outcome'].to_dict()

In [155]:
vote_bill_ids = {}
for i in summary_votes.loc[summary_votes['bill_id'].isin(bill_ids)].groupby(['year', 'motion_id'])['bill_id'].value_counts().index:
    year, motion_id, bill_id = i
    if (year, motion_id) not in vote_bill_ids.keys():
        vote_bill_ids[(year, motion_id)] = [bill_id]
    else:
        vote_bill_ids[(year, motion_id)].append(bill_id)

In [156]:
bill_vers_dig = bill_vers.merge(digests, on='bill_id', how='inner')
features = {row['ID']: {'digest': row['DigestText'], 'MeasureState': row['MeasureState'], 'VoteRequired': row['VoteRequired'] if row['VoteRequired'] is not None else 'No', 'VersionNum': row['VersionNum'] if row['VersionNum'] is not None else 'No', 'LocalProgram': row['LocalProgram'] if row['LocalProgram'] is not None else 'No', 'FiscalCommittee': row['FiscalCommittee'] if row['FiscalCommittee'] is not None else 'No', 'TaxLevy': row['TaxLevy'] if row['TaxLevy'] is not None else 'No', 'Urgency': row['Urgency'] if row['Urgency'] is not None else 'No'} for _, row in bill_vers_dig.iterrows()}

committee_codes = {v.lower(): k for k, v in enumerate(politicians['committee_clean'].unique().tolist())}

In [157]:
bill_votes['chamber'] = bill_votes['location_code'].apply(lambda x: 'assembly' if x == 'AFLOOR' or str(x).startswith('CX') else 'senate' if x == 'SFLOOR' or str(x).startswith('CS') else 'full')
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time'])
bill_votes['term'] = bill_votes['vote_date_time'].apply(lambda x: f"{x.year}-{x.year + 1}" if x.year % 2 == 1 else f"{x.year - 1}-{x.year}" if x.year % 2 == 0 and x < pd.Timestamp(year=x.year, month=11, day=2) else f"{x.year + 1}-{x.year + 2}")

In [158]:
author_locations = authors.loc[(authors['House'] == 'UNKNOWN') & (authors['bill_id'].map(bill_id_codes).isin(bill_ids)), ['bill_id', 'Name']].drop_duplicates()
for i, row in author_locations.iterrows():
    if 'AB' in row['bill_id']:
        author_locations.loc[i, 'name'] = 'Assembly ' + row['Name']
    elif 'SB' in row['bill_id']:
        author_locations.loc[i, 'name'] = 'Senate ' + row['Name']
    else:
        author_locations.loc[i, 'name'] = 'Joint ' + row['Name']

In [159]:
def fuzzy_strings(source_list, target_list):
    def preprocess_name(name):
        if not isinstance(name, str):
            return ""
        name = name.lower()
        name = re.sub(r'\(.*?\)', '', name)
        name = re.sub(r'committee on', '', name)
        name = re.sub(r'[^a-z\s]', ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        return name
    clean_source = [preprocess_name(c) for c in source_list]
    clean_target = [preprocess_name(c) for c in target_list]
    keywords = ["education","health","finance","budget","transportation","judiciary","environment","agriculture","energy","labor","housing","veterans affairs","public safety","insurance","banking","public health","small business","redistricting","public utilities","natural resources","water","technology","communications","elections","government","appropriations","rules","ethics","criminal justice","environmental protection","college and university","human services","reproductive health","mental health","technology","aggriculture","urban development","renewable energy","gun violence","commerce","privacy","cybersecurity","infrastructure","disaster preparedness","prisons","aging"]
    def get_committee_keywords(name):
        return set(kw for kw in keywords if kw in name)
    target_keywords = [get_committee_keywords(name) for name in clean_target]
    def calculate_similarity(source_idx, target_idx):
        source = clean_source[source_idx]
        target = clean_target[target_idx]
        if not source or not target:
            return 0
        if source == target:
            return 100
        token_sort = fuzz.token_sort_ratio(source, target)
        token_set = fuzz.token_set_ratio(source, target)
        partial = fuzz.partial_ratio(source, target)
        source_kw = get_committee_keywords(source)
        keyword_overlap = len(source_kw.intersection(target_keywords[target_idx]))
        keyword_bonus = min(20, keyword_overlap * 10)
        weighted_score = (token_sort * 0.3) + (token_set * 0.5) + (partial * 0.2) + keyword_bonus
        return weighted_score
    matches = {}
    for i, source in enumerate(source_list):
        scores = [calculate_similarity(i, j) for j in range(len(target_list))]
        if not scores or max(scores) < 60:
            matches[source] = None
        else:
            best_idx = np.argmax(scores)
            confidence = scores[best_idx]
            if confidence >= 60:
                matches[source] = target_list[best_idx]
            else:
                matches[source] = None
    return matches

author_com_matches = fuzzy_strings(author_locations['name'].unique().tolist(), leg_committees)
author_locations['name'] = author_locations['name'].map(author_com_matches)

In [160]:
sponsors = authors.loc[authors['bill_id'].map(bill_id_codes).isin(bill_ids)]
sponsors['term'] = sponsors['bill_id'].apply(lambda x: f"{x[:4]}-{int(x[:4]) + 1}" if int(x[:4]) % 2 == 1 else f"{int(x[:4]) - 1}-{x[:4]}" if int(x[:4]) % 2 == 0 and int(x[:4]) < 2009 else f"{x[:4]}-{int(x[:4]) + 1}")

lob = lobbying.loc[lobbying['clean_beneficiary'].notna(), ['FIRM_NAME', 'EXPN_DSCR', 'clean_beneficiary', 'EXPN_DATE', 'BENE_AMT']]
lob['EXPN_DATE'] = pd.to_datetime(lob['EXPN_DATE'])
lob['term'] = lob['EXPN_DATE'].apply(lambda x: f"{x.year}-{x.year + 1}" if x.year % 2 == 1 else f"{x.year - 1}-{x.year}" if x.year % 2 == 0 and x < pd.Timestamp(year=x.year, month=11, day=2) else f"{x.year}-{x.year + 1}")

In [161]:
for i, row in politicians.loc[politicians['full_name'].apply(lambda x: isinstance(x, float)), ['Term', 'Last', 'chamber']].drop_duplicates().iterrows():
    term, last = row['Term'], row['Last']
    a = politicians.loc[(politicians['Last'] == last) & (politicians['Term'] == term) & (politicians['full_name'].apply(lambda x: isinstance(x, str)))]
    if len(a) > 0:
        politicians.loc[(politicians['Term'] == term) & (politicians['Last'] == last) & (politicians['chamber'] == row['chamber']), 'full_name'] = a['full_name'].values[0]
        continue
    else:
        a = politicians.loc[(politicians['Last'] == last) & (politicians['full_name'].apply(lambda x: isinstance(x, str)))]
    if len(a) > 0:
        politicians.loc[(politicians['Term'] == term) & (politicians['Last'] == last) & (politicians['chamber'] == row['chamber']), 'full_name'] = a['full_name'].values[0]

In [162]:
pol_names_terms = {}
for _, row in politicians[['full_name', 'Term', 'chamber']].drop_duplicates().iterrows():
    if ',' in row['full_name']:
        name = row['full_name'].split(',')[1].strip() + ' ' + row['full_name'].split(',')[0].strip()
    else:
        name = row['full_name']
    pol_names_terms[(row['full_name'].lower(), row['Term'])] = {'chamber': row['chamber'], 'name': name}

In [163]:
expend_assembly = expend_assembly.rename(columns={'term': 'Term'})
expend_assembly['chamber'] = 'assembly'
expend_senate = expend_senate.rename(columns={'term': 'Term'})
expend_senate['chamber'] = 'senate'

campaign_contributions = pd.concat([expend_assembly.loc[expend_assembly['matched_target_name'].notna(), ['ExpenderName', 'Amount', 'matched_target_name', 'Term', 'chamber', 'DateEnd']].drop_duplicates(subset=['ExpenderName', 'Amount', 'matched_target_name', 'DateEnd']), expend_senate.loc[expend_senate['matched_target_name'].notna(), ['ExpenderName', 'Amount', 'matched_target_name', 'Term', 'chamber', 'DateEnd']].drop_duplicates(subset=['ExpenderName', 'Amount', 'matched_target_name', 'DateEnd'])])
campaign_contributions['DateEnd'] = pd.to_datetime(campaign_contributions['DateEnd'])
sponsors['bill_ID'] = sponsors['bill_id'].apply(repair_bill_id)

In [164]:
voting = history.merge(bill_votes, left_on=['bill_ID', 'Date'], right_on=['bill_id', 'vote_date_time'], how='inner').rename(columns={'bill_id_x': 'bill_version'}).drop('bill_id_y', axis=1)
voting['bv_id'] = voting['bill_version'].apply(repair_bill_id)

voting_places = {}
for i, row in voting.groupby(['motion_id', 'term', 'chamber', 'Date']).agg({'legislator_name': lambda x: list(x)}).iterrows():
    motion_id, term, chamber, date = i
    g = politicians.loc[(politicians['chamber'] == chamber) & (politicians['Term'] == term) & (politicians['Last'].isin(row['legislator_name']))]
    voting_places[(motion_id, term, chamber, date)] = {'most_common_committee': g.groupby('committee_clean').size().sort_values(ascending=False).head(1).index[0] if len(g) > 0 else None}
voting['voting_place'] = voting.apply(lambda row: voting_places.get((row['motion_id'], row['term'], row['chamber'], row['Date']), {}).get('most_common_committee', None), axis=1)

In [165]:
hear = hearings[['bill_id', 'location_code']].merge(locations[['committee_code', 'committee_clean']], left_on='location_code', right_on='committee_code', how='left')[['bill_id', 'committee_clean']].drop_duplicates()
hear['year'] = hear['bill_id'].apply(lambda x: int(x[:4]))

positions = {p: 'Democratic Alternate' if re.search(r'Democratic\s*Alternate', p) is not None else 'Vice Chair' if re.search(r'V\s*i\s*c\s*e\s*-*\s*C\s*h\s*a\s*i\s*r\s*', p) is not None else 'Co-Chair' if re.search(r'Co\s*-\s*Chair', p) is not None else 'Chair' if re.search(r'Cha\s*i\s*r', p) is not None else 'Republican Alternate' if re.search(r'\s*Republican\s*Alternate', p) is not None else p for p in politicians['position'].unique()}
vnums = bill_vers.set_index('ID')['VersionNum'].to_dict()
vid_map = {v: k for k, val in version_id_mapping.items() for v in val}

In [166]:
def _safe_dt(s):
    return pd.to_datetime(s, errors='coerce')

def _canon_name(n):
    n = re.sub(r'[^\w\s]', ' ', str(n)).lower()
    n = re.sub(r'\s+', ' ', n).strip()
    return n

def _infer_origin_chamber_from_bill_id(bill_id):
    s = str(bill_id)
    if 'AB' in s: return 'assembly'
    if 'SB' in s: return 'senate'
    return None

def _term_from_date(ts):
    if pd.isna(ts): return np.nan
    y = ts.year
    if y % 2 == 1:
        return f"{y}-{y+1}"
    else:
        if ts.month < 11:
            return f"{y-1}-{y}"
        return f"{y+1}-{y+2}"

def _tokenize(s):
    s = str(s).lower()
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return [t for t in s.split(' ') if t]

def _jaccard(a_set, b_set):
    if not a_set and not b_set: return 1.0
    i = len(a_set & b_set)
    u = len(a_set | b_set)
    return i / u if u else 0.0

def read_zip(zip_path, crs=3857):
    tmp = tempfile.TemporaryDirectory()
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(tmp.name)
    shp = next(pathlib.Path(tmp.name).rglob("*.shp"))
    gdf = gpd.read_file(shp).set_crs(epsg=crs)
    gdf = gdf.to_crs(epsg=3857)
    return gdf, tmp

def district_cycle(year):
    if year <= 2012: return "2001"
    if year <= 2022: return "2011"
    return "current"

In [167]:
populations = pd.read_csv('E-4_2010-2020-Internet-Version.csv', skiprows=1).iloc[:, :12]
for c in populations.columns:
    populations[c] = populations[c].astype(str)

for c in populations.iloc[:, 1:].columns:
    populations[c] = populations[c].apply(lambda x: re.sub(r'[^0-9]', '', x)).astype(int)

pops = {}
for _, row in populations.iterrows():
    county = f'{row["COUNTY"].strip()} County'
    pops[county] = np.mean(row.iloc[1:])

In [168]:
counties_gdf, _ = read_zip('dashboard/backend/data/ca_counties.zip')
counties_gdf = counties_gdf[['COUNTYFP', 'NAMELSAD', 'geometry']]
counties_gdf['county_area_m2'] = counties_gdf.geometry.area
counties_gdf['county_id'] = counties_gdf['COUNTYFP'].astype(int)
counties_gdf['population'] = counties_gdf['NAMELSAD'].map(pops).astype('float64')
counties_gdf['pop_density_per_m2'] = counties_gdf['population'] / counties_gdf['county_area_m2']
data_dir = pathlib.Path('dashboard/backend/data')
asm11_zip = data_dir / '2011_assembly_state_shp.zip'
sen11_zip = data_dir / '2011_senate_state_shp.zip'
asmcur_zip = data_dir / '2021_AD_Final_shp.zip'
sencur_zip = data_dir / '2021_SD_Final_shp.zip'
dist_info = [(asm11_zip, "assembly", "2011", 4019),(sen11_zip, "senate", "2011", 4019),(asmcur_zip, "assembly","current", 4269),(sencur_zip, "senate",  "current", 4269)]
weight_records = []
tmps = []
for zp, house, cycle, crs in dist_info:
    gdf, tmp = read_zip(zp, crs)
    tmps.append(tmp)
    gdf = gdf.rename(columns={gdf.columns[0]: "district_id"})[["district_id", "geometry"]]
    gdf["house"] = house
    gdf["cycle"] = cycle
    gdf = gdf.to_crs(3857)
    gdf['dist_area_m2'] = gdf.geometry.area
    inter = gpd.overlay(gdf, counties_gdf, how="intersection")
    inter = inter[['house', 'cycle', 'district_id', 'COUNTYFP', 'NAMELSAD', 'geometry', 'dist_area_m2', 'county_area_m2', 'population', 'pop_density_per_m2']]
    inter['fragment_area_m2'] = inter.geometry.area
    inter['est_overlay_pop'] = inter['pop_density_per_m2'] * inter["fragment_area_m2"]

    inter["pop_denominator"] = inter.groupby(["house","cycle","district_id"])["est_overlay_pop"].transform("sum")

    inter["area_denominator"] = inter.groupby(["house","cycle","district_id"])["fragment_area_m2"].transform("sum")
    inter["w_area"] = np.where(inter["area_denominator"] > 0,
                               inter["fragment_area_m2"] / inter["area_denominator"],
                               np.nan)

    inter["w_pop"] = inter["est_overlay_pop"] / inter["pop_denominator"]
    bad = ~np.isfinite(inter["w_pop"]) | (inter["w_pop"] < 0)
    inter.loc[bad, "w_pop"] = inter.loc[bad, "w_area"]
    inter["district_share_in_county_area"] = inter["fragment_area_m2"] / inter["dist_area_m2"]
    inter["district_share_in_county_pop"] = inter["w_pop"]

    weight_records.append(
        inter[[
            "house","cycle","district_id","COUNTYFP","NAMELSAD",
            "fragment_area_m2","dist_area_m2","county_area_m2",
            "population","pop_density_per_m2","est_overlay_pop",
            "w_pop","w_area","district_share_in_county_pop","district_share_in_county_area"
        ]].reset_index(drop=True)
    )

weights = pd.concat(weight_records, ignore_index=True)

In [169]:
fix = politicians.loc[politicians['District No.'].isna(), ['full_name', 'Term']].drop_duplicates()
fix['District No.'] = [78, 30, 26, 30, 30, 29, 29, 22, 29, 22, 36, 29, 22, 22, 6]
for i, row in fix.iterrows():
    politicians.loc[(politicians['full_name'] == row['full_name']) & (politicians['Term'] == row['Term']), 'District No.'] = row['District No.']

In [170]:
lob['clean_beneficiary'] = lob['clean_beneficiary'].apply(lambda x: x.strip().lower() if isinstance(x, str) else x)
lobb = lob.groupby(['clean_beneficiary', 'term']).agg({'BENE_AMT': 'sum'}).reset_index().rename(columns={'BENE_AMT': 'AMOUNT'})
exp_as = expend_assembly[['Amount', 'year', 'matched_target_name']].drop_duplicates().groupby(['matched_target_name', 'year']).agg({'Amount': 'sum'}).reset_index().rename(columns={'year': 'term'})
exp_sen = expend_senate.groupby(['matched_target_name', 'year']).agg({'Amount': 'sum'}).reset_index().rename(columns={"year": 'term'})
politicians['lower'] = politicians['full_name'].str.lower()
def name_swap(name):
    return re.sub(r'\,', '', name.lower()).strip()

In [171]:
politicians['name2'] = politicians['full_name'].apply(name_swap)
politicians.loc[politicians['name2'].isin([p for p in politicians['lower'].unique() if p in lobb['clean_beneficiary'].unique()]), 'name2'] = politicians.loc[politicians['name2'].isin([p for p in politicians['lower'].unique() if p in lobb['clean_beneficiary'].unique()]), 'lower']
pl = politicians[['Party', 'District No.', 'Seat No.', 'Term', 'full_name', 'chamber', 'name2']].drop_duplicates().merge(lobb, left_on=['Term', 'name2'], right_on=['term', 'clean_beneficiary'], how='left').rename(columns={'AMOUNT': 'total_lobbying'})
exp_as['name2'] = exp_as['matched_target_name'].apply(lambda x: re.sub(r'\,', '', x.lower()))

In [172]:
def _term_year(x):
    m = re.search(r'(\d{4})', str(x))
    return int(m.group(1)) if m else np.nan

don_as = expend_assembly[['ExpenderName','matched_target_name','Amount','year']].dropna(subset=['ExpenderName','matched_target_name','Amount']).rename(columns={'year':'Term'})
don_as['amount'] = pd.to_numeric(don_as['Amount'], errors='coerce').fillna(0.0)
don_as['name2'] = don_as['matched_target_name'].astype(str).apply(name_swap)
don_as['kind'] = 'Donations'

don_sen = expend_senate[['ExpenderName','matched_target_name','Amount','year']].dropna(subset=['ExpenderName','matched_target_name','Amount']).rename(columns={'year':'Term'})
don_sen['amount'] = pd.to_numeric(don_sen['Amount'], errors='coerce').fillna(0.0)
don_sen['name2'] = don_sen['matched_target_name'].astype(str).apply(name_swap)
don_sen['kind'] = 'Donations'

don_all = pd.concat([don_as, don_sen], ignore_index=True)
pol_key = politicians[['name2','Term','chamber','District No.']].drop_duplicates()
pol_key['term'] = pol_key['Term'].apply(lambda x: x.split('-')[0]).astype(int)
don_all['term'] = don_all['Term'].apply(lambda x: x - 1 if x % 2 == 0 else x).astype(int)
don_m = don_all.merge(pol_key, on=['name2','term'], how='left').dropna(subset=['District No.','chamber'])
don_m['district_id'] = don_m['District No.'].astype(float).astype(int).astype(str)
don_m['house'] = don_m['chamber'].str.lower()
don_m['term_year'] = don_m['term'].apply(_term_year).astype('Int64')
don_m['name'] = don_m['ExpenderName'].astype(str)
don_g = don_m.groupby(['house','district_id','term','term_year','kind','name'], dropna=False, as_index=False)['amount'].sum()
don_g = don_g.rename(columns={'term':'term'})
don_g['donations'] = don_g['amount']
don_g['lobbying'] = 0.0
don_g['total'] = don_g['donations']

lob2 = lob[['clean_beneficiary','term','BENE_AMT']].dropna(subset=['clean_beneficiary','term','BENE_AMT']).copy()
lob2['amount'] = pd.to_numeric(lob2['BENE_AMT'], errors='coerce').fillna(0.0)
lob2['name2'] = lob2['clean_beneficiary'].astype(str).apply(name_swap)
lob2['term'] = lob2['term'].apply(lambda x: int(x.split('-')[0])).astype(int)
lob2['kind'] = 'Lobbying'
lob_m = lob2.merge(pol_key, on=['name2','term'], how='left').dropna(subset=['District No.','chamber'])
lob_m['district_id'] = lob_m['District No.'].apply(lambda x: re.sub(r' ', '', str(x))).astype(float).astype(int).astype(str)
lob_m['house'] = lob_m['chamber'].str.lower()
lob_m['term_year'] = lob_m['term'].apply(_term_year).astype('Int64')
lob_m['name'] = 'All lobby firms'
lob_g = lob_m.groupby(['house','district_id','term','term_year','kind','name'], dropna=False, as_index=False)['amount'].sum()
lob_g['lobbying'] = lob_g['amount']
lob_g['donations'] = 0.0
lob_g['total'] = lob_g['lobbying']

district_funders_time = pd.concat([don_g[['house','district_id','term','term_year','kind','name','donations','lobbying','total']], lob_g[['house','district_id','term','term_year','kind','name','donations','lobbying','total']]], ignore_index=True)

In [173]:
pl['term'] = pl['Term'].apply(lambda x: int(x.split('-')[-1]))
exp_as.loc[exp_as['term'] % 2 == 1, 'term'] = exp_as.loc[exp_as['term'] % 2 == 1, 'term'] + 1
exp_sen.loc[exp_sen['term'] % 2 == 1, 'term'] = exp_sen.loc[exp_sen['term'] % 2 == 1, 'term'] + 1

In [174]:
pld = pl.merge(exp_as, on=['term', 'name2'], how='left').rename(columns={'Amount': 'total_donations_'})
exp_sen['name2'] = exp_sen['matched_target_name'].apply(lambda x: re.sub(r'\,', '', x.lower()))
pldd = pld.merge(exp_sen, on=['term', 'name2'], how='left')
pldd['total_donations_'] = pldd[['total_donations_', 'Amount']].sum(skipna=True, axis=1)
pldd = pldd.rename(columns={'total_donations_': 'total_donations'})
pldd['total_received'] = pldd['total_donations'] + pldd['total_lobbying']
for c in ['total_donations', 'total_lobbying', 'total_received']:
    pldd[c] = pldd[c].fillna(0).astype(float)

In [175]:
lfund = pldd.copy()
lfund['District No.'] = lfund['District No.'].astype(str).apply(lambda x: re.sub(r'\s', '', x)).astype(float).astype(int)
lfund_ = lfund.groupby(['Term', 'District No.', 'chamber']).agg({'total_donations': 'sum','total_lobbying': 'sum','total_received': 'sum'}).reset_index()
lfund_['District No.'] = lfund_['District No.'].astype(float)
lfund_['cycle'] = lfund_['Term'].apply(lambda x: '2011' if int(x.split('-')[0]) <= 2012 else 'current')
reg_funds = lfund_.merge(weights, left_on=['cycle', 'District No.', 'chamber'], right_on=['cycle', 'district_id', 'house'], how='inner')

In [176]:
reg_funds['total_donations'] *= reg_funds['w_pop']
reg_funds['total_lobbying'] *= reg_funds['w_pop']
reg_funds['total_received'] *= reg_funds['w_pop']
reg_funds['county_id'] = reg_funds['COUNTYFP'].astype(int)
reg_funds_ = reg_funds.groupby(['county_id', 'house']).agg({'total_donations': 'sum','total_lobbying': 'sum','total_received': 'sum'}).reset_index()
co_cal = reg_funds_.merge(counties_gdf, on='county_id', how='left')
ca_legislator_funding = gpd.GeoDataFrame(co_cal, geometry='geometry')

In [177]:
vote_map = {'AYE':1,'YES':1,'NOE':-1,'NO':-1}
voting['vote_num'] = voting['vote_code'].str.upper().map(vote_map).fillna(0).astype(int)
motion_dict = bill_motions.set_index('motion_id')['motion_text'].to_dict()
roll_cols = ['bill_ID','bill_version','Date','motion_id','chamber','voting_place']
roll = (voting.groupby(roll_cols, dropna=False).agg(yes=('vote_num', lambda x: int((np.array(x)>0).sum())), no=('vote_num', lambda x: int((np.array(x)<0).sum())), total=('vote_num','count')).reset_index())
roll['pass'] = (roll['yes'] > roll['no'])
bill_votes['vote_num'] = bill_votes['vote_code'].str.upper().map(vote_map).fillna(0).astype(int)
bill_votes['Date'] = pd.to_datetime(bill_votes['vote_date_time']).dt.date
roll_cols2 = ['bill_id','Date','motion_id','chamber','location_code']

In [178]:
summary_roll = (bill_votes.groupby(roll_cols2, dropna=False).agg(yes=('vote_num', lambda x: int((np.array(x)>0).sum())), no=('vote_num', lambda x: int((np.array(x)<0).sum())), total=('vote_num','count')).reset_index())

In [179]:
summary_roll = (bill_votes.groupby(roll_cols2, dropna=False).agg(yes=('vote_num', lambda x: int((np.array(x)>0).sum())), no=('vote_num', lambda x: int((np.array(x)<0).sum())), total=('vote_num','count')).reset_index())
summary_roll['pass'] = (summary_roll['yes'] > summary_roll['no'])

In [180]:
def _stage_timing(group):
    g = group.sort_values('Date')
    intro = g['Date'].min()
    is_committee = ~(g['voting_place'].isin(['Assembly Floor','Senate Floor']))
    comm_ref = g.loc[is_committee, 'Date'].min() if is_committee.any() else pd.NaT
    first_read = g['Date'].min() if not g.empty else pd.NaT
    second_read = pd.NaT
    if pd.notna(first_read):
        _after1 = g[(g['Date'] > first_read) & (is_committee)]
        if not _after1.empty:
            second_read = _after1['Date'].min()
    third_read = pd.NaT
    if pd.notna(second_read):
        _after2 = g[(g['Date'] > second_read)]
        if not _after2.empty:
            third_read = _after2['Date'].min()
    is_floor = summary_roll.loc[(summary_roll['bill_id'] == g['bill_ID'].iloc[0]) & (summary_roll['location_code'].isin(['AFLOOR','SFLOOR']))]
    asm_floor_pass = pd.NaT
    sen_floor_pass = pd.NaT
    if not is_floor.empty:
        asm_floor_data = is_floor[(is_floor['location_code'] == 'AFLOOR') & (is_floor['pass'])]
        if not asm_floor_data.empty:
            asm_floor_pass = asm_floor_data['Date'].min()
        sen_floor_data = is_floor[(is_floor['location_code'] == 'SFLOOR') & (is_floor['pass'])]
        if not sen_floor_data.empty:
            sen_floor_pass = sen_floor_data['Date'].min()
    return pd.Series({'intro': intro, 'comm_ref': comm_ref, 'first_read': first_read, 'second_read': second_read, 'third_read': third_read, 'asm_floor_pass': asm_floor_pass, 'sen_floor_pass': sen_floor_pass})

stages_df = roll.groupby('bill_ID', group_keys=False).apply(_stage_timing).reset_index()

  stages_df = roll.groupby('bill_ID', group_keys=False).apply(_stage_timing).reset_index()


In [181]:
with open('bill_labels_updated.json', 'r') as f:
    bill_labels = json.load(f)

outcomes = (history.dropna(subset=['bill_ID']).sort_values('Date', ascending=False).groupby('bill_ID').first().reset_index()[['bill_ID','Action', 'bill_id']])
outcomes.loc[outcomes['Action'].isin(['CHAPTERED','ENROLLED','FILED','APPROVED']),'Outcome'] = 1
outcomes.loc[outcomes['Action'].isin(['VETOED']),'Outcome'] = -1
outcomes.loc[outcomes['bill_id'].str.endswith('ENR'), 'Outcome'] = 1
outcomes['Outcome'] = outcomes['Outcome'].fillna(0).astype(int)
y_df = outcomes[['bill_ID','Outcome']].rename(columns={'Outcome':'outcome'})

first_last = (history.dropna(subset=['bill_ID']).groupby('bill_ID')['Date'].agg(First_action='min', Last_action='max').reset_index())

In [182]:
dig = digests[['bill_id','DigestText']].copy()
dig['bill_ID'] = dig['bill_id'].map(bv2b)
ver = versions[['bill_id','VersionNum']].copy()
ver['bill_ID'] = ver['bill_id'].map(bv2b)
dv = (ver.merge(dig, on=['bill_id','bill_ID'], how='inner').dropna(subset=['DigestText']))
def _digest_stats(df):
    df = df.sort_values('VersionNum')
    toks = [set(_tokenize(t)) for t in df['DigestText']]
    sims=[]
    for i in range(1,len(toks)):
        sims.append(_jaccard(toks[i-1], toks[i]))
    return pd.Series({'n_versions': len(df), 'median_sim': float(np.median(sims)) if sims else np.nan})
amendment_churn = (
    dv.groupby('bill_ID')
      .apply(_digest_stats)
      .reset_index()
)
amendment_churn['topic'] = amendment_churn['bill_ID'].map(bill_labels)
amendment_churn = amendment_churn.loc[amendment_churn['topic'].notna()]

  .apply(_digest_stats)


In [183]:
vc = voting[['legislator_name','vote_code','location_code']].copy()
vc['is_floor'] = vc['location_code'].isin(['AFLOOR','SFLOOR'])
vc['yes'] = vc['vote_code'].str.upper().isin(['AYE','YES']).astype(int)
leg_comm = vc[~vc['is_floor']].groupby('legislator_name')['yes'].mean().rename('comm_yes')
leg_floor = vc[vc['is_floor']].groupby('legislator_name')['yes'].mean().rename('floor_yes')
committee_floor_drift = (pd.concat([leg_comm, leg_floor], axis=1).reset_index())
committee_floor_drift['drift'] = committee_floor_drift['floor_yes'] - committee_floor_drift['comm_yes']

In [184]:
ca = pd.concat([
    expend_assembly[['ExpenderName','Amount','matched_target_name','Term','DateEnd']].dropna(subset=['ExpenderName','Amount','matched_target_name']),
    expend_senate  [['ExpenderName','Amount','matched_target_name','Term','DateEnd']].dropna(subset=['ExpenderName','Amount','matched_target_name'])
], ignore_index=True)

In [185]:
vt = voting[['legislator_name','vote_code','vote_date_time']].copy()
pol_last_names = politicians[['Last', 'full_name']].dropna().drop_duplicates()
pol_last_names['canon'] = pol_last_names['full_name'].apply(_canon_name)
pln_map = dict(zip(pol_last_names['Last'].str.lower(), pol_last_names['canon']))

vt['canon'] = vt['legislator_name'].apply(_canon_name).map(pln_map)
vt['term'] = vt['vote_date_time'].apply(_term_from_date)
vt['yes'] = vt['vote_code'].str.upper().isin(['AYE','YES']).astype(int)
leg_term_rate = vt.groupby(['canon','term'])['yes'].mean().reset_index().rename(columns={'yes':'yes_rate'})

In [186]:
don = ca.copy()
don['canon'] = don['matched_target_name'].apply(_canon_name)
fund = (don.groupby(['canon','Term'])['Amount'].sum().reset_index().rename(columns={'Term':'term','Amount':'funding'}))
ft = fund.merge(leg_term_rate, on=['canon','term'], how='inner')

In [187]:
ft2 = politicians[['Party', 'name2']].drop_duplicates().merge(ft, left_on='name2', right_on='canon', how='inner')

In [188]:
def _quartiles(g):
    if g.empty: return pd.Series({'yes_rate_top':np.nan,'yes_rate_bottom':np.nan,'delta':np.nan,'n_top':0,'n_bottom':0})
    q = g['funding'].quantile([0.25,0.75]).values
    low = g[g['funding']<=q[0]]; high = g[g['funding']>=q[1]]
    return pd.Series({'yes_rate_top': float(high['yes_rate'].mean()) if not high.empty else np.nan, 'yes_rate_bottom': float(low['yes_rate'].mean()) if not low.empty else np.nan, 'delta': float((high['yes_rate'].mean() - low['yes_rate'].mean())) if (not high.empty and not low.empty) else np.nan, 'n_top': int(high.shape[0]), 'n_bottom': int(low.shape[0])})
money_vote_alignment = ft.groupby('term').apply(_quartiles, include_groups=False).reset_index()
money_vote_party_alignment = ft2.groupby(['term','Party']).apply(_quartiles, include_groups=False).reset_index()

In [189]:
bill_dates_df = first_last.copy()
bill_dates_df['longevity_days'] = (bill_dates_df['Last_action'] - bill_dates_df['First_action']).dt.days
signals = (roll.groupby('bill_ID').apply(lambda g: float(np.mean((g['yes']/(g['total'].replace(0, np.nan))) >= 0.5))).reset_index().rename(columns={0:'vote_signal'}))
n_versions = (versions.assign(bill_ID=lambda d: d['bill_id'].map(bv2b)).dropna(subset=['bill_ID']).groupby('bill_ID')['VersionNum'].nunique().reset_index().rename(columns={'VersionNum':'bill_version_count'}))
y_df['topic'] = y_df['bill_ID'].map(bill_labels)
y_df2 = y_df.loc[y_df['topic'].notna()].copy()
bills_table = (y_df2.merge(bill_dates_df[['bill_ID','First_action','longevity_days', 'Last_action']], on='bill_ID', how='left').merge(signals, on='bill_ID', how='left').merge(amendment_churn[['bill_ID','n_versions','median_sim']], on='bill_ID', how='left').merge(n_versions, on='bill_ID', how='left'))
bills_table['First_action'] = pd.to_datetime(bills_table['First_action']).dt.strftime('%Y-%m-%d')
bills_table['Last_action'] = pd.to_datetime(bills_table['Last_action']).dt.strftime('%Y-%m-%d')

  signals = (roll.groupby('bill_ID').apply(lambda g: float(np.mean((g['yes']/(g['total'].replace(0, np.nan))) >= 0.5))).reset_index().rename(columns={0:'vote_signal'}))


In [190]:
vv = voting[['bill_ID','legislator_name','vote_code','chamber','term', 'location_code', 'Date']].copy()
vv['last'] = vv['legislator_name'].str.lower().str.strip()
vv['yes'] = vv['vote_code'].str.upper().isin(['AYE','YES']).astype(int)
legislators_last_names = {}
for _, row in politicians[['chamber', 'Last', 'Term', 'full_name']].drop_duplicates().iterrows():
    legislators_last_names[(row['chamber'], row['Last'].lower(), row['Term'])] = row['full_name']
def _resolve_full_name(row):
    return legislators_last_names.get((row['chamber'], row['last'], row['term']), np.nan)
vv['full_name'] = vv.apply(_resolve_full_name, axis=1)
vv['party'] = vv['full_name'].map(leg_parties)
vv['topic'] = vv['bill_ID'].map(bill_labels)
vv = vv.loc[vv['topic'].notna()]
vv_major = vv[vv['party'].isin(['D','R'])].copy()
rc = (vv_major.groupby(['bill_ID','term','topic','party'])['yes'].mean().unstack('party').reset_index().rename(columns={'D':'yes_D','R':'yes_R'}))
for c in ['yes_D','yes_R']:
    if c not in rc.columns: rc[c] = np.nan
rc['polarization'] = (rc['yes_D'] - rc['yes_R']).abs()
rc['party_line_split'] = np.where(((rc['yes_D']>0.5) & (rc['yes_R']<0.5)) | ((rc['yes_D']<0.5) & (rc['yes_R']>0.5)), 1, 0)
topic_controversy = (rc.groupby(['topic','term']).agg(n_rollcalls=('bill_ID','nunique'), mean_polarization=('polarization','mean'), median_polarization=('polarization','median'), party_line_share=('party_line_split','mean'), dem_yes_rate=('yes_D','mean'), rep_yes_rate=('yes_R','mean')).reset_index())
rollcall_party_splits = rc[['bill_ID','term','topic','yes_D','yes_R','polarization','party_line_split']].copy()


In [191]:
vv2 = vv.copy()
vv2['canon'] = vv2['legislator_name'].apply(_canon_name)
vv2['any_vote'] = 1
_weight_col = 'yes'
topic_votes = (vv2.dropna(subset=['topic']).groupby(['canon','term','topic'])[_weight_col].sum().reset_index(name='topic_votes'))
total_votes = (vv2.groupby(['canon','term'])[_weight_col].sum().reset_index(name='total_votes'))
weights_topics = (topic_votes.merge(total_votes, on=['canon','term'], how='left'))
weights_topics['topic_share'] = np.where(weights_topics['total_votes']>0, weights_topics['topic_votes']/weights_topics['total_votes'], 0.0)

In [192]:
for df in (expend_assembly, expend_senate):
    df['year'] = df['Term'].str.extract(r'^(\d{4})').astype(int)
    df['term'] = np.where((df['year']%2==0), df['year']-1, df['year'])
    df['term'] = df['term'].astype(int).astype(str) + '-' + (df['term']+1).astype(int).astype(str)
    df['canon'] = df['matched_target_name'].apply(_canon_name)

In [193]:
don_by_leg_term = (pd.concat([expend_assembly, expend_senate], ignore_index=True).groupby(['canon','term'])['Amount'].sum().reset_index().rename(columns={'Amount':'donations'}))
lb2 = lobbying[['clean_beneficiary','EXPN_DATE','BENE_AMT']].dropna().copy()
lb2['EXPN_DATE'] = pd.to_datetime(lb2['EXPN_DATE'], errors='coerce')
lb2['term'] = lb2['EXPN_DATE'].apply(lambda x: np.nan if pd.isna(x) else (f"{x.year-1}-{x.year}" if (x.year%2==0 and x.month<11) else f"{x.year+1}-{x.year+2}" if (x.year%2==0) else f"{x.year}-{x.year+1}"))
lb2['canon'] = lb2['clean_beneficiary'].apply(_canon_name)
lob_by_leg_term = (lb2.groupby(['canon','term'])['BENE_AMT'].sum().reset_index().rename(columns={'BENE_AMT':'lobbying'}))
fund_leg_term = (don_by_leg_term.merge(lob_by_leg_term, on=['canon','term'], how='outer').fillna({'donations':0.0,'lobbying':0.0}))
fund_leg_term['total_received'] = fund_leg_term['donations'] + fund_leg_term['lobbying']
weights_topics['canon'] = weights_topics['canon'].map(pln_map)
alloc = (weights_topics.merge(fund_leg_term, on=['canon','term'], how='left').fillna({'donations':0.0,'lobbying':0.0,'total_received':0.0}))
alloc['donations_topic'] = alloc['donations'] * alloc['topic_share']
alloc['lobbying_topic']  = alloc['lobbying']  * alloc['topic_share']
alloc['total_topic'] = alloc['total_received'] * alloc['topic_share']
topic_funding_by_term = (alloc.groupby(['topic','term']).agg(total_donations=('donations_topic','sum'), total_lobbying=('lobbying_topic','sum'), total_received=('total_topic','sum')).reset_index())
topic_funding_by_leg = (alloc.groupby(['canon','term','topic']).agg(donations=('donations_topic','sum'), lobbying=('lobbying_topic','sum'), total=('total_topic','sum')).reset_index())
don_leg_term = (pd.concat([expend_assembly, expend_senate], ignore_index=True).assign(canon=lambda d: d['matched_target_name'].apply(_canon_name)).rename(columns={'Amount':'donation'}))
don_alloc = (don_leg_term.merge(weights_topics[['canon','term','topic','topic_share']], on=['canon','term'], how='left').fillna({'topic_share':0.0}))
don_alloc['donation_topic'] = don_alloc['donation'] * don_alloc['topic_share']
donor_topic_by_term = (don_alloc.groupby(['ExpenderName','topic','term'])['donation_topic'].sum().reset_index().rename(columns={'donation_topic':'donations_allocated'}))
for _df in (topic_funding_by_term, topic_funding_by_leg, donor_topic_by_term):
    _df['term'] = _df['term'].astype(str)

In [194]:
summary_roll_with_bill = summary_roll.copy()
summary_roll_with_bill = summary_roll_with_bill.dropna(subset=['bill_id'])
floor_only = summary_roll_with_bill[summary_roll_with_bill['location_code'].isin(['AFLOOR','SFLOOR'])].copy()
floor_only['Date'] = pd.to_datetime(floor_only['Date'])
last_floor = floor_only.sort_values('Date').groupby('bill_id').tail(1)[['bill_id','yes','no','total','Date']]

In [195]:
vv_floor = vv[vv['location_code'].isin(['AFLOOR','SFLOOR'])].copy().rename(columns={'bill_ID':'bill_id'})
vv_floor['Date'] = pd.to_datetime(vv_floor['Date'])
vv_floor = vv_floor.merge(last_floor[['bill_id','Date']], on=['bill_id','Date'], how='inner')
vv_floor_major = vv_floor[vv_floor['party'].isin(['D','R'])]
bill_party_rates = vv_floor_major.groupby(['bill_id','party'])['yes'].mean().unstack('party').reset_index().rename(columns={'D':'yes_D_last','R':'yes_R_last'})
bill_party_rates[['yes_D_last','yes_R_last']] = bill_party_rates[['yes_D_last','yes_R_last']].astype(float)

closest_vote = summary_roll_with_bill.copy()
closest_vote['diff'] = (closest_vote['yes'] - closest_vote['no']).abs()
closest_vote = closest_vote.sort_values(['bill_id','diff'])
closest_pick = closest_vote.groupby('bill_id').head(1)[['bill_id','yes','no','total']]

In [196]:
def _entropy_row(r):
    y = float(r['yes']); n = float(r['no']); t = float(r['total'])
    if t<=0: return 0.0
    a = max(t - y - n, 0.0)
    p = np.array([y,t - y - n,n], dtype=np.float64)/t
    p = p[p>0]
    return float(-(p*np.log(p)).sum())
closest_pick['controversiality'] = 1 - (closest_pick['yes'] - closest_pick['no']).abs()/closest_pick['total'].replace(0,np.nan)
closest_pick['vote_entropy'] = closest_pick.apply(_entropy_row, axis=1)

mean_yes_ratio_versions = roll.groupby('bill_ID').apply(lambda g: float(np.nanmean((g['yes']/g['total'].replace(0,np.nan)).values)) if len(g)>0 else np.nan).reset_index().rename(columns={0:'mean_yes_ratio_versions'})

bill_term = first_last[['bill_ID','First_action']].copy()
bill_term['term'] = _safe_dt(bill_term['First_action']).apply(_term_from_date)

  mean_yes_ratio_versions = roll.groupby('bill_ID').apply(lambda g: float(np.nanmean((g['yes']/g['total'].replace(0,np.nan)).values)) if len(g)>0 else np.nan).reset_index().rename(columns={0:'mean_yes_ratio_versions'})


In [197]:
bt = bills_table[['bill_ID', 'Last_action']].drop_duplicates()
bt['Last_action'] = pd.to_datetime(bt['Last_action'])
bt = bt.merge(history, left_on=['bill_ID', 'Last_action'], right_on=['bill_ID', 'Date'], how='left')
bt = bt.merge(authors[['bill_id', 'Contribution', 'Name']], on='bill_id', how='inner')

author_type_map = {
    'LEAD_AUTHOR': 'LEAD_AUTHOR',
    'PRINCIPAL_COAUTHOR': 'PRINCIPAL_COAUTHOR',
    'COAUTHOR': 'COAUTHOR',
    'data="COAUTHOR"': 'COAUTHOR',
    'data="LEAD_AUTHOR"': 'LEAD_AUTHOR',
    'data="PRINCIPAL_COAUTHOR"': 'PRINCIPAL_COAUTHOR',
    'nan': 'AUTHOR'
}
author_levels = {
    'AUTHOR': 1,
    'COAUTHOR': 1,
    'PRINCIPAL_COAUTHOR': 2,
    'LEAD_AUTHOR': 3
}

bt['author_type'] = bt['Contribution'].map(author_type_map).fillna('AUTHOR')
bt['author_level'] = bt['author_type'].map(author_levels).fillna(0).astype(int)
def _primary_authors(g):
    g = g.sort_values('author_level', ascending=False)
    primary = g[g['author_level'] == g['author_level'].max()]
    return list(set(primary['Name'].tolist()))

bt['primary_authors'] = bt.groupby('bill_ID', group_keys=False).apply(_primary_authors).reset_index(level=0, drop=True)

  bt['primary_authors'] = bt.groupby('bill_ID', group_keys=False).apply(_primary_authors).reset_index(level=0, drop=True)


In [198]:
lead_author_df = bt[['bill_ID', 'primary_authors']].copy()

In [199]:
bill_insights = pd.DataFrame({'bill_ID': list(bill_labels.keys())})
bill_insights = bill_insights.merge(pd.DataFrame({'bill_ID': list(version_id_mapping2.keys()), 'bill_id_raw': [version_id_mapping2[k][0] if len(version_id_mapping2[k])>0 else np.nan for k in version_id_mapping2.keys()]}), on='bill_ID', how='left')
bill_insights['topic'] = bill_insights['bill_ID'].map(bill_labels)

In [200]:
last_floor['bill_ID'] = last_floor['bill_id']
closest_pick['bill_ID'] = closest_pick['bill_id']
bill_party_rates['bill_ID'] = bill_party_rates['bill_id']

In [201]:
bill_insights = bill_insights.merge(lead_author_df, on='bill_ID', how='left').merge(mean_yes_ratio_versions, on='bill_ID', how='left').merge(bill_party_rates, on='bill_ID', how='left').merge(last_floor[['bill_ID','yes','no','total']], on='bill_ID', how='left').merge(closest_pick[['bill_ID','controversiality','vote_entropy']], on='bill_ID', how='left').merge(bill_term[['bill_ID','term']], on='bill_ID', how='left')
bill_insights['bill_polarization'] = (bill_insights['yes_D_last'] - bill_insights['yes_R_last']).abs()
bill_insights['bill_party_line'] = np.where(((bill_insights['yes_D_last']>0.5) & (bill_insights['yes_R_last']<0.5)) | ((bill_insights['yes_D_last']<0.5) & (bill_insights['yes_R_last']>0.5)), 1, 0)
bill_insights = bill_insights.rename(columns={'controversiality':'bill_controversiality','vote_entropy':'bill_vote_entropy'})
bill_insights = bill_insights[['bill_ID','bill_id_raw','topic','term','primary_authors','mean_yes_ratio_versions','bill_polarization','bill_party_line','bill_controversiality','bill_vote_entropy']]

In [202]:
topic_partisanship_summary = (bill_insights.groupby(['topic','term']).agg(mean_polarization=('bill_polarization','mean'), party_line_share=('bill_party_line','mean'), controversiality_index=('bill_controversiality','median'), vote_entropy=('bill_vote_entropy','median'), n_bills=('bill_ID','nunique')).reset_index())

In [203]:
votes = voting[['Date', 'bill_ID', 'voting_place', 'chamber']].drop_duplicates().rename(columns={'bill_ID': 'bill_id'})
votes['committee_clean'] = votes['chamber'] + ' ' + votes['voting_place']
hear_month = hear.copy().merge(votes, on=['bill_id', 'committee_clean'], how='left')

In [204]:
hear_month['month'] = pd.to_datetime(hear_month['Date']).dt.to_period('M').astype(str)
hear_month = hear_month.dropna(subset=['committee_clean','month'])

mload = hear_month.groupby(['committee_clean','month'])['bill_id'].nunique().reset_index(name='hearings')
committee_workload_median = mload.groupby('committee_clean')['hearings'].median().reset_index().rename(columns={'committee_clean':'committee','hearings':'median_monthly_hearings'})

In [205]:
files = Path("dashboard/backend/data/outs").glob("*.parquet")
dfs = {}
for f in files:
    df = pd.read_parquet(f)
    dfs[f.name.replace('.parquet','')] = df

In [206]:
import torch, pickle, datetime
from torch_geometric.transforms import ToUndirected, RemoveIsolatedNodes

legislators = pickle.load(open('legislators.pkl', 'rb'))

def leg_term_to_name(leg_term_id):
    if isinstance(leg_term_id, str):
        num = int(leg_term_id.split('_')[0])
        return legislators.get(num, None)
    else:
        return None

def leg_term_to_term(leg_term_id):
    if isinstance(leg_term_id, str):
        a = leg_term_id.split('_')[1]
        return int(a.split('-')[0]) if a else None
    else:
        return None

with open('node_id_map.json', 'r') as f:
    node_id_map = json.load(f)

leg_ids = {v: k for k, v in node_id_map['legislator_term'].items()}

In [207]:
data = torch.load('data4.pt', map_location='cpu', weights_only=False)
data = ToUndirected()(data)
data = RemoveIsolatedNodes()(data)

In [208]:
ei = data[("legislator_term","wrote","bill_version")].edge_index.numpy()
ea = data[("legislator_term","wrote","bill_version")].edge_attr.numpy()
author_edge = pd.DataFrame({"legterm_id": ei[0], "bill_id": ei[1], "type": ea[:,0]})

eib = data[('bill_version','is_version', 'bill')].edge_index.numpy()
eib = pd.DataFrame({"src": eib[0], "dst": eib[1], 'outcome': data['bill'].y[eib[1]]})
eib['src'] = eib['src'].astype(int)
eib['dst'] = eib['dst'].astype(int)
author_edge['bill_id'] = author_edge['bill_id'].astype(int)

In [209]:
bv_ids = {v: k for k, v in node_id_map['bill_version'].items()}
bids = {v: k for k, v in node_id_map['bill'].items()}
v2b_edge = tuple([et for et in data.edge_types
                if et[0] == "bill_version" and et[2] == "bill"])[0]
src, dst = data[v2b_edge].edge_index.numpy()
d = [bids.get(s, None) for s in dst]

bv_df = pd.DataFrame({"bill_version": src, "bill_id": d})
bv_df['bill_version_id'] = bv_df['bill_version'].map(bv_ids)

In [210]:
author_edge = author_edge.merge(eib, left_on='bill_id', right_on='src', how='inner')
author_edge['outcome'] = (author_edge['outcome'] == 1).astype(int)
author_levels = {1: 'COAUTHOR', 2: 'PRINCIPAL_COAUTHOR', 3: 'LEAD_AUTHOR'}
author_edge['author_type'] = author_edge['type'].map(author_levels)

ve = data[('bill_version', 'rev_voted_on', 'legislator_term')].edge_index.numpy()
va = data[('bill_version', 'rev_voted_on', 'legislator_term')].edge_attr.numpy()
vote_edge = pd.DataFrame({'bill_version': ve[0], 'legislator_term': ve[1], 'vote_signal': va[:, 0]})
vote_edge = vote_edge.merge(eib, left_on='bill_version', right_on='src', how='left').merge(bv_df, on='bill_version', how='left')
vote_edge['full_name'] = vote_edge['legislator_term'].map(leg_ids).apply(leg_term_to_name)
vote_edge['term'] = vote_edge['legislator_term'].map(leg_ids).apply(leg_term_to_term)
signals = vote_edge.groupby('bill_id').agg({'outcome': 'max', 'vote_signal': lambda x: (x > 0).sum() / len(x)})
signals.loc[(signals['outcome'] == 0.0) & (signals['vote_signal'] == 1.0), 'vote_signal'] = 0.0
a3 = author_edge.merge(bv_df, left_on='bill_id', right_on='bill_version', how='left').groupby('legterm_id').agg({
    'outcome': 'mean',
    'author_type': lambda x: sum(x == 'LEAD_AUTHOR'),
    'bill_version': 'nunique'
}).reset_index()

a3['full_name'] = a3['legterm_id'].map(leg_ids).apply(leg_term_to_name)
a3['term'] = a3['legterm_id'].map(leg_ids).apply(leg_term_to_term)
lfund['term'] = (lfund['term'] - 1).astype(float)

a4 = a3.merge(lfund, on=['full_name', 'term'], how='left')

In [211]:
d1 = dfs['leginflu_v10_overall_influence']
d1l = d1.loc[d1['actor_type'] == 'legislator_term']
a5 = a4.merge(d1l, left_on='legterm_id', right_on='actor_idx', how='left')
leg_terms = a5[['outcome', 'author_type', 'bill_version', 'full_name', 'term', 'Party', 'chamber', 'total_lobbying', 'total_donations', 'influence']]

In [212]:
lt = topic_funding_by_leg.copy()
if 'topic_weight' not in lt.columns:
    for c in ['alignment','weight','share','prob']:
        if c in lt.columns:
            lt = lt.rename(columns={c:'topic_weight'})
            break
    if 'topic_weight' not in lt.columns:
        lt['topic_weight'] = 1.0
lt['lower'] = lt['canon'].astype(str).apply(name_swap)
lt['Term'] = lt['term'].astype(str)
lk = politicians[['name2','Term','chamber','District No.']].drop_duplicates().rename(columns={'name2':'lower'})
lm = lt.merge(lk, on=['lower','Term'], how='left').dropna(subset=['District No.','chamber'])
lm['district_id'] = lm['District No.'].apply(lambda x: re.sub(r' ', '', str(x))).astype(float).astype(int).astype(str)
lm['house'] = lm['chamber'].str.lower()
lm['term_year'] = lm['Term'].apply(_term_year).astype('Int64')
dist_align = lm.groupby(['house','district_id','Term','term_year','topic'], dropna=False, as_index=False)['topic_weight'].mean().rename(columns={'Term':'term','topic_weight':'topic_weight_mean'})
overall = lm.groupby(['house','district_id','topic'], dropna=False, as_index=False)['topic_weight'].mean()
overall['term'] = 'Overall'
overall['term_year'] = pd.NA
overall = overall.rename(columns={'topic_weight':'topic_weight_mean'})
district_legislator_topic_alignment = pd.concat([dist_align, overall], ignore_index=True)

  district_legislator_topic_alignment = pd.concat([dist_align, overall], ignore_index=True)


In [213]:
bills_table = bills_table.merge(bill_vers[['bill_ID', 'GeneralSubject', 'Urgency', 'VoteRequired', 'LocalProgram', 'TaxLevy']], on='bill_ID', how='inner').reset_index(drop=True)

In [214]:
lb_topics = lobbying[['FIRM_NAME','clean_beneficiary','EXPN_DATE','BENE_AMT']].dropna(subset=['FIRM_NAME','clean_beneficiary','EXPN_DATE','BENE_AMT']).copy()
lb_topics['EXPN_DATE'] = pd.to_datetime(lb_topics['EXPN_DATE'], errors='coerce')
lb_topics = lb_topics[lb_topics['EXPN_DATE'].notna()]
lb_topics['term'] = lb_topics['EXPN_DATE'].apply(_term_from_date)
lb_topics = lb_topics[lb_topics['term'].notna()]
lb_topics['canon'] = lb_topics['clean_beneficiary'].apply(_canon_name)
lb_topics['BENE_AMT'] = pd.to_numeric(lb_topics['BENE_AMT'], errors='coerce').fillna(0.0)
lb_topics = lb_topics.merge(weights_topics[['canon','term','topic','topic_share']], on=['canon','term'], how='left')
lb_topics = lb_topics[lb_topics['topic_share'].notna()]
lb_topics['alloc'] = lb_topics['BENE_AMT'] * lb_topics['topic_share']
lobby_firm_topic_by_term = lb_topics.groupby(['FIRM_NAME','topic','term'], as_index=False)['alloc'].sum().rename(columns={'alloc':'lobby_allocated'})

def _cycle_from_year(y):
    return '2011' if y <= 2012 else 'current'

In [215]:
don_gc = don_g.copy()
don_gc['district_id'] = don_gc['district_id'].apply(lambda x: re.sub(r' ', '', str(x))).astype(float).astype(int)
don_gc['base_year'] = pd.to_numeric(don_gc['term_year'].fillna(don_gc['term']), errors='coerce')
don_gc = don_gc[don_gc['base_year'].notna()]
don_gc['cycle'] = don_gc['base_year'].apply(_cycle_from_year)
don_gc = don_gc.merge(
    weights[['house','cycle','district_id','COUNTYFP','w_pop']],
    on=['house','cycle','district_id'],
    how='inner'
)
don_gc['county_id'] = don_gc['COUNTYFP'].astype(int)
don_gc['amount_weighted'] = don_gc['total'] * don_gc['w_pop']
donor_county = (
    don_gc.groupby(['county_id','name'], as_index=False)['amount_weighted']
         .sum()
         .rename(columns={'name':'funder','amount_weighted':'total_amount'})
)
donor_county['kind'] = 'Donor'

In [216]:
pol_key2 = pol_key[['name2','term','chamber','District No.']].drop_duplicates().rename(columns={'term':'term_start'})
lob_f = lobbying[['FIRM_NAME','clean_beneficiary','EXPN_DATE','BENE_AMT']].dropna(subset=['FIRM_NAME','clean_beneficiary','EXPN_DATE','BENE_AMT']).copy()
lob_f['EXPN_DATE'] = pd.to_datetime(lob_f['EXPN_DATE'], errors='coerce')
lob_f = lob_f[lob_f['EXPN_DATE'].notna()]
lob_f['term'] = lob_f['EXPN_DATE'].apply(_term_from_date)
lob_f = lob_f[lob_f['term'].notna()]
lob_f['term_start'] = lob_f['term'].str.slice(0,4).astype(int)
lob_f['name2'] = lob_f['clean_beneficiary'].astype(str).apply(name_swap)
lob_f = lob_f.merge(pol_key2, on=['name2','term_start'], how='left')
lob_f = lob_f.dropna(subset=['District No.','chamber'])
lob_f['district_id'] = lob_f['District No.'].apply(lambda x: re.sub(r' ', '', str(x))).astype(float).astype(int)
lob_f['house'] = lob_f['chamber'].str.lower()
lob_f['amount'] = pd.to_numeric(lob_f['BENE_AMT'], errors='coerce').fillna(0.0)
lob_f['cycle'] = lob_f['term_start'].apply(_cycle_from_year)
lob_gc = lob_f.merge(
    weights[['house','cycle','district_id','COUNTYFP','w_pop']],
    on=['house','cycle','district_id'],
    how='inner'
)
lob_gc['county_id'] = lob_gc['COUNTYFP'].astype(int)
lob_gc['amount_weighted'] = lob_gc['amount'] * lob_gc['w_pop']
lobby_county = (
    lob_gc.groupby(['county_id','FIRM_NAME'], as_index=False)['amount_weighted']
          .sum()
          .rename(columns={'FIRM_NAME':'funder','amount_weighted':'total_amount'})
)
lobby_county['kind'] = 'Lobbying'

In [217]:
county_funders = pd.concat([donor_county, lobby_county], ignore_index=True)
county_funders = county_funders.merge(
    counties_gdf[['county_id','NAMELSAD']],
    on='county_id',
    how='left'
).rename(columns={'NAMELSAD':'county_name'})


donor_topics = (
    donor_topic_by_term.groupby(['ExpenderName','topic'], as_index=False)['donations_allocated']
    .sum()
)
donor_topics['rank'] = donor_topics.groupby('ExpenderName')['donations_allocated'].rank(ascending=False, method='first')
donor_topics_top = donor_topics[donor_topics['rank'] <= 3].sort_values(['ExpenderName','rank'])

In [218]:
donor_topics_top['topic'] = donor_topics_top['topic'].astype(int).astype(str)

donor_topics_map = donor_topics_top.groupby('ExpenderName')['topic'].apply(lambda x: ', '.join(list(set(x)))).to_dict()

lobby_topics = (
    lobby_firm_topic_by_term.groupby(['FIRM_NAME','topic'], as_index=False)['lobby_allocated']
    .sum()
)
lobby_topics['rank'] = lobby_topics.groupby('FIRM_NAME')['lobby_allocated'].rank(ascending=False, method='first')
lobby_topics_top = lobby_topics[lobby_topics['rank'] <= 3].sort_values(['FIRM_NAME','rank'])
lobby_topics_top['topic'] = lobby_topics_top['topic'].astype(int).astype(str)
lobby_topics_map = lobby_topics_top.groupby('FIRM_NAME')['topic'].apply(lambda x: ', '.join(list(set(x)))).to_dict()

county_funders['top_topics'] = np.nan
mask_d = county_funders['kind'] == 'Donor'
mask_l = county_funders['kind'] == 'Lobbying'
county_funders.loc[mask_d, 'top_topics'] = county_funders.loc[mask_d, 'funder'].map(donor_topics_map)
county_funders.loc[mask_l, 'top_topics'] = county_funders.loc[mask_l, 'funder'].map(lobby_topics_map)

county_funders['rank_in_county'] = county_funders.groupby(['county_id','kind'])['total_amount'].rank(ascending=False, method='first')
county_top_funders = county_funders[county_funders['rank_in_county'] <= 15].reset_index(drop=True)

 '3, 62, 18']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  county_funders.loc[mask_d, 'top_topics'] = county_funders.loc[mask_d, 'funder'].map(donor_topics_map)


In [220]:
for df_name in ['route_archetypes','amendment_churn','risk_list','committee_gatekeeping','committee_workload_median','cross_chamber_friction','survival_curves','vote_similarity_edges','vote_communities','committee_floor_drift','text_lift_top_tokens','donor_portfolios_hhi','money_vote_alignment','money_event_time_curve','ca_legislator_funding','bills_table','topic_controversy','rollcall_party_splits','topic_funding_by_term','topic_funding_by_leg','donor_topic_by_term','topic_momentum','topic_funnel_obs','topic_funnel_modeled','route_baseline','bill_insights','topic_partisanship_summary','route_entropy','committee_betweenness_proxy','author_coalition_breadth']:
    if df_name in locals():
        df = locals()[df_name]
        if isinstance(df, pd.DataFrame) and 'term' in df.columns:
            locals()[df_name]['term'] = locals()[df_name]['term'].astype(str)

precomp_outputs = {
    'amendment_churn': amendment_churn,
    'committee_workload_median': committee_workload_median,
    'committee_floor_drift': committee_floor_drift,
    'money_vote_alignment': money_vote_alignment,
    'money_vote_party_alignment': money_vote_party_alignment,
    'ca_legislator_funding_geo': ca_legislator_funding,
    'ca_legislator_funding': reg_funds_,
    'bills_table': bills_table,
    'topic_controversy': topic_controversy,
    'rollcall_party_splits': rollcall_party_splits,
    'topic_funding_by_term': topic_funding_by_term,
    'topic_funding_by_leg': topic_funding_by_leg,
    'donor_topic_by_term': donor_topic_by_term,
    'bill_insights': bill_insights,
    'topic_partisanship_summary': topic_partisanship_summary,
    'district_funders_time': district_funders_time,
    'county_top_funders': county_top_funders,
    'leg_terms': leg_terms
}

for k, v in precomp_outputs.items():
    v.to_parquet(f'dashboard/backend/data/outs/{k}.parquet', index=False)

In [228]:
ca_legislator_funding = pd.read_parquet('dashboard/backend/data/outs/ca_legislator_funding_geo.parquet')
county_top_funders = pd.read_parquet('dashboard/backend/data/outs/county_top_funders.parquet')
bills_table = pd.read_parquet('dashboard/backend/data/outs/bills_table.parquet')
bill_insights = pd.read_parquet('dashboard/backend/data/outs/bill_insights.parquet')
rollcall_party_splits = pd.read_parquet('dashboard/backend/data/outs/rollcall_party_splits.parquet')
amendment_churn = pd.read_parquet('dashboard/backend/data/outs/amendment_churn.parquet')
leg_terms = pd.read_parquet('dashboard/backend/data/outs/leg_terms.parquet')
topic_funding_by_term = pd.read_parquet('dashboard/backend/data/outs/topic_funding_by_term.parquet')
maps = gpd.read_file('dashboard/backend/data/ca_counties.geojson').to_crs(epsg=4326)

In [231]:
tables = {
    'ca_legislator_funding': ca_legislator_funding,
    'county_top_funders': county_top_funders,
    'bills_table': bills_table,
    'bill_insights': bill_insights,
    'rollcall_party_splits': rollcall_party_splits,
    'amendment_churn': amendment_churn,
    'leg_terms': leg_terms,
    'topic_funding_by_term': topic_funding_by_term,
    'CA_counties.geojson': maps
}

for t, table in tables.items():
    print(t)
    print('\n')
    print(table.dtypes)
    print('\n')

ca_legislator_funding


county_id               int64
house                  object
total_donations       float64
total_lobbying        float64
total_received        float64
COUNTYFP               object
NAMELSAD               object
geometry               object
county_area_m2        float64
population            float64
pop_density_per_m2    float64
dtype: object


county_top_funders


county_id           int64
funder             object
total_amount      float64
kind               object
county_name        object
top_topics         object
rank_in_county    float64
dtype: object


bills_table


bill_ID                object
outcome                 int64
topic                 float64
First_action           object
longevity_days          int64
Last_action            object
vote_signal           float64
n_versions            float64
median_sim            float64
bill_version_count      int64
GeneralSubject         object
Urgency                object
VoteRequired           object
LocalPr