In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import re, json, ast, pathlib, zipfile, tempfile, datetime as _dt, warnings, torch
from tqdm import tqdm
from collections import defaultdict, deque
from rapidfuzz import fuzz, process

# Silence pandas' SettingWithCopyWarning for the whole notebook. Later cells
# assign columns on frames that were produced by .loc slicing.
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
# Load the raw tables. Every path is relative to the notebook's working directory.
# NOTE(review): `summary_votes` and `bill_summary` (below) read the same file,
# bill_summary_vote_tbl.csv — presumably only one copy is needed; confirm.
summary_votes = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')
# The listed columns are forced to str on read.
bill_history = pd.read_csv('ca_leg/legislation_data/bill_history_tbl.csv', dtype={'action_status': str, 'primary_location': str, 'secondary_location': str, 'end_status': str})
authors = pd.read_csv('ca_leg/legislation_data/authors.csv')
history = pd.read_csv('ca_leg/legislation_data/history.csv')
versions = pd.read_csv('ca_leg/legislation_data/bill_versions.csv')
# Only `session_date` is parsed as a date here; `vote_date_time` is converted in a later cell.
bill_votes = pd.read_csv('ca_leg/legislation_data/bill_detail_vote_tbl.csv', parse_dates=['session_date'])
bill_summary = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')
bill_motions = pd.read_csv('ca_leg/legislation_data/bill_motion_tbl.csv')
locations = pd.read_csv('ca_leg/legislation_data/committee_codes.csv')
politicians = pd.read_csv('ca_leg/legislation_data/politicians.csv')
# CalAccess lobbying and expenditure tables.
lobbying = pd.read_csv('calaccess/lobbying_clean2.csv', dtype={'PAYEE_NAMS': str, 'BAKREF_TID': str})
expend_assembly = pd.read_csv('calaccess/expend_assembly_matched.csv', dtype={'TargetPropositionName': str})
expend_senate = pd.read_csv('calaccess/expend_senate_matched.csv', dtype={'TargetPropositionName': str})
digests = pd.read_csv('ca_leg/legislation_data/digest.csv')
hearings = pd.read_csv('ca_leg/legislation_data/committee_hearing_tbl.csv')

In [3]:
# Reduce every vote timestamp to a 'YYYY-MM-DD' string, dropping the time of day.
# NOTE(review): a missing timestamp would presumably make strftime fail on NaT — confirm the column has no gaps.
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time']).apply(lambda x: x.strftime('%Y-%m-%d'))
# Replace each character that is neither a word character nor whitespace with a space.
# NOTE(review): re.sub needs a string, so a missing legislator_name would raise — confirm.
bill_votes['legislator_name'] = bill_votes['legislator_name'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

In [4]:
# Canonical motion labels. ORDER MATTERS: extract_action returns the first
# entry found in the motion text, so longer phrases must stay ahead of the
# shorter phrases they contain (e.g. "Do pass as amended" before "Do pass").
ACTION_KEYWORDS = [
    "Assembly Third Reading",
    "Assembly 3rd reading",
    "senate 3rd reading",
    "Senate Third Reading",
    "Concurrence - Urgency Added",
    "Concurrence in Senate Amendments",
    "Do pass as amended, and re-refer",
    "Do pass as amended, but re-refer",
    "Do pass as amended",
    "Do pass and be re-referred",
    "Concurrence",
    "Consent Calendar",
    "Urgency Clause",
    "Special Consent",
    "Motion to Reconsider",
    "Do pass",
    "Reconsideration",
    "Committee amendments",
    "W/O REF. TO FILE",
    "Be re-referred to the Committee",
    "Lay on the Table",
    "Amend by",
    "Unfinished Business",
    "Placed on Appropriations Suspense File",
]

def extract_action(motion_text):
    """Map a free-text motion description onto a canonical action label.

    Matching is case-insensitive. The first entry of ``ACTION_KEYWORDS`` that
    occurs in the text becomes the label. When the text also mentions
    reconsideration, ``' Reconsideration'`` is appended, unless the chosen label
    already says so (``'Reconsideration'`` or ``'Motion to Reconsider'``).
    Previously the latter became ``'Motion to Reconsider Reconsideration'``.

    Parameters
    ----------
    motion_text : str or any
        Raw motion text. Non-strings (None, NaN, numbers) are treated as missing.

    Returns
    -------
    str or None
        The canonical label, or None when the input is missing or nothing matches.
    """
    if not isinstance(motion_text, str):
        return None
    motion = motion_text.upper()
    action = next((act for act in ACTION_KEYWORDS if act.upper() in motion), None)
    if 'RECONSIDER' in motion:
        if action is None:
            # No keyword matched, but the text still talks about reconsidering.
            return 'Reconsideration'
        if 'RECONSIDER' not in action.upper():
            action += ' Reconsideration'
    return action

# Attach the canonical label for every motion. Rows whose text is missing or
# matches no keyword get None (see extract_action).
bill_motions['simplified_motion'] = bill_motions['motion_text'].apply(extract_action)

In [5]:
# Build {committee_code: "<chamber> <committee name>" in lower case} for fuzzy
# matching against the politicians table.
#   * 'CZ…' codes are skipped.
#   * 'CS…' codes are senate committees, 'CX…' codes are assembly committees.
#   * Any other prefix keeps its lower-cased name without a chamber prefix.
#     Previously `cname` was never assigned on that path, so the loop either
#     raised NameError or silently reused the previous row's name.
clean_coms = {}
for _, row in locations.iterrows():
    code = row['committee_code']
    if code.startswith('CZ'):
        continue
    name = row['committee_name']
    if code.startswith('CS'):
        if name.startswith('Sen.'):
            # Dot escaped so only the literal abbreviation "Sen. " is expanded.
            cname = re.sub(r'Sen\. ', 'senate ', name).lower()
        elif name.startswith('Senate '):
            cname = name.lower()
        else:
            cname = 'senate ' + name.lower()
    elif code.startswith('CX'):
        if name.lower().startswith('assembly'):
            cname = name.lower()
        else:
            cname = 'assembly ' + name.lower()
    else:
        cname = name.lower()
    # A trailing "x<digit>" becomes "no. <digit>" (e.g. "... x5" -> "... no. 5").
    if re.search(r'x\d$', cname) is not None:
        cname = re.sub(r'x(?=\d$)', 'no. ', cname)
    clean_coms[code] = cname

# Distinct "<chamber> <committee>" strings from the politicians table, lower-cased.
leg_committees = [f"{row['chamber']} {row['committee_clean']}".lower() for _, row in politicians[['committee_clean', 'chamber']].drop_duplicates().iterrows()]

In [6]:
def match_committees(_names, clean_coms, threshold=92):
    """Fuzzy-match each cleaned committee name to one of ``_names``.

    For every ``code -> cleaned name`` pair, the best candidate is looked up
    with both ``token_sort_ratio`` and ``partial_ratio`` at ``threshold``, and
    the higher-scoring hit wins. If neither scorer finds a hit, one relaxed
    ``token_sort_ratio`` attempt is made at ``threshold - 8``.

    Returns a dict mapping every committee code to the matched entry of
    ``_names``, or to None when nothing clears the relaxed cutoff.
    """
    name_mapping = {}
    for code, clean in clean_coms.items():
        strict_hits = [
            hit
            for hit in (
                process.extractOne(clean, _names, scorer=scorer, score_cutoff=threshold)
                for scorer in (fuzz.token_sort_ratio, fuzz.partial_ratio)
            )
            if hit is not None
        ]
        if strict_hits:
            # extractOne returns (choice, score, index); keep the higher score.
            name_mapping[code] = max(strict_hits, key=lambda hit: hit[1])[0]
            continue
        relaxed = process.extractOne(clean, _names, scorer=fuzz.token_sort_ratio, score_cutoff=threshold - 8)
        name_mapping[code] = relaxed[0] if relaxed is not None else None
    return name_mapping

# {committee_code: matching entry of leg_committees, or None when nothing matched}.
committee_matches = match_committees(leg_committees, clean_coms)

In [7]:
# Attach the fuzzy-matched committee name to every committee code. Codes absent
# from committee_matches get NaN.
locations['committee_clean'] = locations['committee_code'].map(committee_matches)
# Manual overrides for specific committees.
# NOTE(review): the fuzzy-matched values come from leg_committees, which are
# lower-case "<chamber> <name>" strings, while these overrides are title-case
# and some carry no chamber prefix — confirm downstream joins tolerate the mix.
locations.loc[locations['committee_name'] == 'EDUCATION X5', 'committee_clean'] = 'Budget and Fiscal Review: Education'
locations.loc[locations['committee_code'] == 'CX12', 'committee_clean'] = 'Budget No. 1 on Health and Human Services'
locations.loc[locations['committee_code'] == 'CS68', 'committee_clean'] = 'Budget No. 3 - Health and Human Services'
locations.loc[locations['committee_code'] == 'CS66', 'committee_clean'] = 'Senate Veterans Affairs'
locations.loc[locations['committee_code'] == 'CS56', 'committee_clean'] = 'Senate Public Employment and Retirement'
locations.loc[locations['committee_code'] == 'CS62', 'committee_clean'] = 'Senate Budget and Fiscal Review'
locations.loc[locations['committee_code'] == 'CX23', 'committee_clean'] = 'Assembly Utilities and Commerce'

# motion_id -> simplified motion label. If a motion_id repeats, the last row wins.
motion_codes = {row['motion_id']: row['simplified_motion'] for _, row in bill_motions.iterrows()}
# Overwrites/creates summary_votes['motion_text'] with the simplified label.
summary_votes['motion_text'] = summary_votes['motion_id'].map(motion_codes)

def repair_bill_id(id):
    """Expand a leading four-digit year into a two-year session prefix.

    When the first four characters of ``id`` are all digits, the following year
    is inserted right after them (``'2011…'`` becomes ``'20112012…'``).
    Any other id is returned unchanged.
    """
    session_start = id[:4]
    if re.search(r'\d{4}$', session_start) is None:
        return id
    remainder = id[4:]
    return f"{session_start}{int(session_start) + 1}{remainder}"

# Session-expanded id for every bill version (see repair_bill_id).
versions['ID'] = versions['bill_id'].apply(lambda x: repair_bill_id(x))
# Keep only versions whose id starts with '2'.
bill_vers = versions.loc[versions['bill_id'].str.startswith('2')].copy()
# Derive the parent bill id for each version: remove the
# "<VersionNum><MeasureState>" tail, expand the session prefix, then rewrite
# the last four characters without leading zeros.
for i, row in bill_vers.iterrows():
    tail = f"{row['VersionNum']}{row['MeasureState']}"
    # NOTE(review): `tail` is used as a regex pattern and is removed wherever it
    # occurs, not only at the end of the id — confirm that cannot misfire.
    repaired = repair_bill_id(re.sub(tail, '', row['bill_id']))
    # Assumes the last four characters are all digits (int() raises otherwise) — TODO confirm.
    end = int(repaired[-4:])
    bill_vers.loc[i, 'bill_ID'] = f"{repaired[:-4]}{end}"

In [8]:
# full_name -> Party. If a name appears with several parties, the last row wins.
leg_parties = {row['full_name']: row['Party'] for _, row in politicians[['full_name', 'Party']].drop_duplicates().iterrows()}
# Every bill id starting with '2' that appears in the detailed or the summary vote table.
bill_ids = list(set(bill_votes.loc[bill_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist() + summary_votes.loc[summary_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist()))
# version id (bill_vers.bill_id) -> parent bill id (bill_vers.bill_ID).
bill_id_codes = {row['bill_id']: row['bill_ID'] for _, row in bill_vers.drop_duplicates(subset=['bill_id', 'bill_ID']).iterrows()}
# History rows are keyed by version id; attach the parent bill id and parse dates.
history['bill_ID'] = history['bill_id'].map(bill_id_codes)
history['Date'] = pd.to_datetime(history['Date'])

# Per voted-on bill: its distinct action dates, and its actions in date order
# (deduplicated on the (Action, Date) pair).
introduction_dates = {}
for v, group in history.loc[history['bill_ID'].isin(bill_ids)].groupby('bill_ID'):
    introduction_dates[v] = {'Dates': group['Date'].unique().tolist(), 'Actions': group.sort_values('Date', ascending=True).drop_duplicates(subset=['Action', 'Date'])['Action'].tolist()}

# bill_ID -> list of session-expanded version ids ('ID' column).
version_id_mapping = {i: list(group.values) for i, group in bill_vers.groupby('bill_ID')['ID']}
# bill_ID -> list of raw version ids ('bill_id' column).
version_id_mapping2 = {i: list(group.values) for i, group in bill_vers.groupby('bill_ID')['bill_id']}
# Inverse of version_id_mapping2: raw version id -> bill_ID.
bv2b = {v: k for k, val in version_id_mapping2.items() for v in val}

In [9]:
# First and last recorded action date per bill.
date_ranges = {}
for k, v in introduction_dates.items():
    first, last = min(v['Dates']), max(v['Dates'])
    date_ranges[k] = {'First_action': first, 'Last_action': last}

# Outcome label from the most recent history row of each bill:
#   1 = last action is CHAPTERED / ENROLLED / FILED / APPROVED,
#  -1 = last action is VETOED,
#   0 = anything else.
# NOTE(review): when several actions share the latest date, which one counts as
# "last" depends on the sort's tie order — confirm this is acceptable.
outcomes = history.loc[history['bill_ID'].notna()].sort_values('Date', ascending=False).groupby('bill_ID').first().reset_index()[['bill_ID', 'Action']]
outcomes.loc[outcomes['Action'].isin(['CHAPTERED', 'ENROLLED', 'FILED', 'APPROVED']), 'Outcome'] = 1
outcomes.loc[outcomes['Action'] == 'VETOED', 'Outcome'] = -1
outcomes.loc[outcomes['Outcome'].isna(), 'Outcome'] = 0
# bill_ID -> outcome code.
outcome = outcomes.set_index('bill_ID')['Outcome'].to_dict()

In [10]:
# (year, motion_id) -> list of the distinct bill ids voted on under that motion,
# restricted to the bills collected in `bill_ids`.
vote_bill_ids = {}
for year, motion_id, bill_id in (
    summary_votes.loc[summary_votes['bill_id'].isin(bill_ids)]
    .groupby(['year', 'motion_id'])['bill_id']
    .value_counts()
    .index
):
    vote_bill_ids.setdefault((year, motion_id), []).append(bill_id)

In [11]:
# Bill versions that have a digest (inner join on the raw version id).
bill_vers_dig = bill_vers.merge(digests, on='bill_id', how='inner')

# Columns that fall back to 'No' when the value is missing.
_FEATURE_FLAG_COLUMNS = ['VoteRequired', 'VersionNum', 'LocalProgram', 'FiscalCommittee', 'TaxLevy', 'Urgency']

def _value_or_no(value):
    """Return ``value``, or ``'No'`` when it is missing (None or NaN).

    The previous ``is not None`` test never fired for values read from CSV,
    because pandas represents missing cells as NaN rather than None.
    """
    return 'No' if pd.isna(value) else value

# Session-expanded version id -> feature dict. If an ID repeats, the last row wins.
features = {
    row['ID']: {
        'digest': row['DigestText'],
        'MeasureState': row['MeasureState'],
        **{col: _value_or_no(row[col]) for col in _FEATURE_FLAG_COLUMNS},
    }
    for _, row in bill_vers_dig.iterrows()
}

# Lower-cased committee name -> integer code (position among the distinct names).
committee_codes = {v.lower(): k for k, v in enumerate(politicians['committee_clean'].unique().tolist())}

In [12]:
# Chamber from the vote location: 'AFLOOR' or a 'CX…' code -> assembly,
# 'SFLOOR' or a 'CS…' code -> senate, anything else -> 'full'.
bill_votes['chamber'] = bill_votes['location_code'].apply(lambda x: 'assembly' if x == 'AFLOOR' or str(x).startswith('CX') else 'senate' if x == 'SFLOOR' or str(x).startswith('CS') else 'full')
# Back to datetime (an earlier cell stored these as 'YYYY-MM-DD' strings).
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time'])
# Two-year term label "<start>-<end>":
#   odd year                     -> "<y>-<y+1>"
#   even year, before Nov 2      -> "<y-1>-<y>"
#   even year, on or after Nov 2 -> "<y+1>-<y+2>"
bill_votes['term'] = bill_votes['vote_date_time'].apply(lambda x: f"{x.year}-{x.year + 1}" if x.year % 2 == 1 else f"{x.year - 1}-{x.year}" if x.year % 2 == 0 and x < pd.Timestamp(year=x.year, month=11, day=2) else f"{x.year + 1}-{x.year + 2}")

In [13]:
# Authors whose House is 'UNKNOWN', limited to bills that were voted on.
author_locations = authors.loc[
    (authors['House'] == 'UNKNOWN') & (authors['bill_id'].map(bill_id_codes).isin(bill_ids)),
    ['bill_id', 'Name'],
].drop_duplicates()
# Prefix each author name with a chamber guessed from the bill id:
# 'AB' -> Assembly, 'SB' -> Senate, anything else -> Joint.
for idx, bill, author in zip(
    author_locations.index,
    author_locations['bill_id'].tolist(),
    author_locations['Name'].tolist(),
):
    if 'AB' in bill:
        prefix = 'Assembly '
    elif 'SB' in bill:
        prefix = 'Senate '
    else:
        prefix = 'Joint '
    author_locations.loc[idx, 'name'] = prefix + author

In [14]:
def fuzzy_strings(source_list, target_list):
    """Match every entry of ``source_list`` to its best entry in ``target_list``.

    Names are normalised first: lower-cased, parenthesised text and the phrase
    "committee on" removed, non-letters turned into spaces, whitespace collapsed.
    A source whose normalised form equals a normalised target is matched to the
    first such target outright. Otherwise each target is scored with
    ``0.3*token_sort + 0.5*token_set + 0.2*partial`` plus 10 points per shared
    topic keyword (capped at 20), and the best target is kept if it scores at
    least 60.

    Previously an exact match was scored 100 while fuzzy candidates could reach
    120 through the keyword bonus, so an exact match could lose.

    Parameters
    ----------
    source_list : list
        Names to resolve. Non-string entries never match.
    target_list : list
        Candidate names.

    Returns
    -------
    dict
        ``{source: matched target or None}``, keyed by the original source values.
    """
    min_confidence = 60

    def preprocess_name(name):
        if not isinstance(name, str):
            return ""
        name = name.lower()
        name = re.sub(r'\(.*?\)', '', name)
        name = re.sub(r'committee on', '', name)
        name = re.sub(r'[^a-z\s]', ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        return name

    clean_source = [preprocess_name(c) for c in source_list]
    clean_target = [preprocess_name(c) for c in target_list]
    # Topic keywords, matched as substrings of the normalised names.
    keywords = ("education", "health", "finance", "budget", "transportation", "judiciary", "environment", "agriculture", "energy", "labor", "housing", "veterans affairs", "public safety", "insurance", "banking", "public health", "small business", "redistricting", "public utilities", "natural resources", "water", "technology", "communications", "elections", "government", "appropriations", "rules", "ethics", "criminal justice", "environmental protection", "college and university", "human services", "reproductive health", "mental health", "urban development", "renewable energy", "gun violence", "commerce", "privacy", "cybersecurity", "infrastructure", "disaster preparedness", "prisons", "aging")

    def get_committee_keywords(name):
        return set(kw for kw in keywords if kw in name)

    # Computed once per name instead of once per (source, target) pair.
    source_keywords = [get_committee_keywords(name) for name in clean_source]
    target_keywords = [get_committee_keywords(name) for name in clean_target]

    def calculate_similarity(source_idx, target_idx):
        source = clean_source[source_idx]
        target = clean_target[target_idx]
        if not source or not target:
            return 0
        token_sort = fuzz.token_sort_ratio(source, target)
        token_set = fuzz.token_set_ratio(source, target)
        partial = fuzz.partial_ratio(source, target)
        keyword_overlap = len(source_keywords[source_idx] & target_keywords[target_idx])
        keyword_bonus = min(20, keyword_overlap * 10)
        return (token_sort * 0.3) + (token_set * 0.5) + (partial * 0.2) + keyword_bonus

    # First position of every non-empty normalised target, for exact lookups.
    exact_index = {}
    for j, name in enumerate(clean_target):
        if name:
            exact_index.setdefault(name, j)

    matches = {}
    for i, source in enumerate(source_list):
        exact = exact_index.get(clean_source[i])
        if exact is not None:
            matches[source] = target_list[exact]
            continue
        scores = [calculate_similarity(i, j) for j in range(len(target_list))]
        if not scores or max(scores) < min_confidence:
            matches[source] = None
        else:
            matches[source] = target_list[int(np.argmax(scores))]
    return matches

# Resolve each chamber-prefixed author name to an entry of leg_committees
# (None when nothing scores high enough), then overwrite `name` with the result.
author_com_matches = fuzzy_strings(author_locations['name'].unique().tolist(), leg_committees)
author_locations['name'] = author_locations['name'].map(author_com_matches)

In [15]:
# Author rows for bills that were voted on. Copied so the column assignments
# below act on an independent frame rather than a slice of `authors`.
sponsors = authors.loc[authors['bill_id'].map(bill_id_codes).isin(bill_ids)].copy()
# Term label from the four-digit year that prefixes the bill id.
# NOTE(review): even prefixes from 2009 on are labelled "<y>-<y+1>" while even
# prefixes before 2009 are labelled "<y-1>-<y>" — confirm this split is intended.
sponsors['term'] = sponsors['bill_id'].apply(lambda x: f"{x[:4]}-{int(x[:4]) + 1}" if int(x[:4]) % 2 == 1 else f"{int(x[:4]) - 1}-{x[:4]}" if int(x[:4]) % 2 == 0 and int(x[:4]) < 2009 else f"{x[:4]}-{int(x[:4]) + 1}")

# Lobbying payments that were matched to a beneficiary (copied for the same reason).
lob = lobbying.loc[lobbying['clean_beneficiary'].notna(), ['FIRM_NAME', 'EXPN_DSCR', 'clean_beneficiary', 'EXPN_DATE', 'BENE_AMT']].copy()
lob['EXPN_DATE'] = pd.to_datetime(lob['EXPN_DATE'])
# Same term rule as bill_votes['term']:
#   odd year                     -> "<y>-<y+1>"
#   even year, before Nov 2      -> "<y-1>-<y>"
#   even year, on or after Nov 2 -> "<y+1>-<y+2>"
# The last branch used to produce "<y>-<y+1>" (e.g. "2012-2013"), which does not
# follow the odd-start term pattern the other branches and bill_votes['term'] use.
lob['term'] = lob['EXPN_DATE'].apply(lambda x: f"{x.year}-{x.year + 1}" if x.year % 2 == 1 else f"{x.year - 1}-{x.year}" if x.year % 2 == 0 and x < pd.Timestamp(year=x.year, month=11, day=2) else f"{x.year + 1}-{x.year + 2}")

In [16]:
# Back-fill missing full_name values (stored as float NaN). For each
# (Term, Last, chamber) with a missing name, borrow a full_name from a row with
# the same last name: same term first, any term otherwise. The borrowed name is
# written to every row of that (Term, Last, chamber).
_missing_name_keys = politicians.loc[
    politicians['full_name'].apply(lambda x: isinstance(x, float)),
    ['Term', 'Last', 'chamber'],
].drop_duplicates()
for _, key in _missing_name_keys.iterrows():
    term, last = key['Term'], key['Last']
    # Recomputed on every pass: names filled earlier can serve later keys.
    named_same_last = (politicians['Last'] == last) & (politicians['full_name'].apply(lambda x: isinstance(x, str)))
    donors = politicians.loc[named_same_last & (politicians['Term'] == term)]
    if len(donors) == 0:
        donors = politicians.loc[named_same_last]
    if len(donors) == 0:
        continue
    target_rows = (politicians['Term'] == term) & (politicians['Last'] == last) & (politicians['chamber'] == key['chamber'])
    politicians.loc[target_rows, 'full_name'] = donors['full_name'].values[0]

In [17]:
# (lower-cased full_name, Term) -> {'chamber', 'name'}, where 'name' is the
# "First Last" form of a "Last, First" full_name (other names are kept as-is).
pol_names_terms = {}
for _, rec in politicians[['full_name', 'Term', 'chamber']].drop_duplicates().iterrows():
    full_name = rec['full_name']
    pieces = full_name.split(',') if ',' in full_name else None
    display_name = full_name if pieces is None else pieces[1].strip() + ' ' + pieces[0].strip()
    pol_names_terms[(full_name.lower(), rec['Term'])] = {'chamber': rec['chamber'], 'name': display_name}

In [18]:
# Give both expenditure tables a 'Term' column name and a chamber tag.
expend_assembly = expend_assembly.rename(columns={'term': 'Term'}).assign(chamber='assembly')
expend_senate = expend_senate.rename(columns={'term': 'Term'}).assign(chamber='senate')

def _matched_contributions(expend):
    """Rows with a matched target, deduplicated on expender/amount/target/end date."""
    kept = expend.loc[
        expend['matched_target_name'].notna(),
        ['ExpenderName', 'Amount', 'matched_target_name', 'Term', 'chamber', 'DateEnd'],
    ]
    return kept.drop_duplicates(subset=['ExpenderName', 'Amount', 'matched_target_name', 'DateEnd'])

# Assembly rows first, then senate rows; the original row labels are kept.
campaign_contributions = pd.concat([_matched_contributions(expend_assembly), _matched_contributions(expend_senate)])
campaign_contributions['DateEnd'] = pd.to_datetime(campaign_contributions['DateEnd'])
# Session-expanded version id for every sponsor row.
sponsors['bill_ID'] = sponsors['bill_id'].apply(repair_bill_id)

In [19]:
# Join history rows to individual votes cast on the same bill on the same date.
# After the merge, `bill_version` is the history-side version id.
voting = history.merge(bill_votes, left_on=['bill_ID', 'Date'], right_on=['bill_id', 'vote_date_time'], how='inner').rename(columns={'bill_id_x': 'bill_version'}).drop('bill_id_y', axis=1)
voting['bv_id'] = voting['bill_version'].apply(repair_bill_id)

# For every roll call (motion, term, chamber, date), guess where it took place:
# the committee that appears most often among that chamber/term's politicians
# whose last name is among the voters.
voting_places = {}
for (motion_id, term, chamber, date), row in voting.groupby(['motion_id', 'term', 'chamber', 'Date']).agg({'legislator_name': lambda x: list(x)}).iterrows():
    g = politicians.loc[(politicians['chamber'] == chamber) & (politicians['Term'] == term) & (politicians['Last'].isin(row['legislator_name']))]
    committee_sizes = g.groupby('committee_clean').size().sort_values(ascending=False)
    # groupby drops missing committee names, so the count table can be empty
    # even when `g` is not; index[0] on it used to raise IndexError.
    voting_places[(motion_id, term, chamber, date)] = {'most_common_committee': committee_sizes.index[0] if len(committee_sizes) > 0 else None}

# Look the place up per vote row. Keys are built column-wise instead of with a
# row-wise DataFrame.apply; the lookup itself is unchanged.
voting['voting_place'] = [
    voting_places.get(key, {}).get('most_common_committee', None)
    for key in zip(voting['motion_id'], voting['term'], voting['chamber'], voting['Date'])
]

In [20]:
# Distinct (bill, committee) pairs from the hearing table, with the committee
# name resolved through `locations`.
_hearings_named = hearings[['bill_id', 'location_code']].merge(
    locations[['committee_code', 'committee_clean']],
    left_on='location_code',
    right_on='committee_code',
    how='left',
)
hear = _hearings_named[['bill_id', 'committee_clean']].drop_duplicates()
hear['year'] = hear['bill_id'].apply(lambda x: int(x[:4]))

# Position patterns, tried in this order. The plain 'Chair' pattern would also
# match vice/co-chairs, so it has to come after them.
_POSITION_PATTERNS = (
    (r'Democratic\s*Alternate', 'Democratic Alternate'),
    (r'V\s*i\s*c\s*e\s*-*\s*C\s*h\s*a\s*i\s*r\s*', 'Vice Chair'),
    (r'Co\s*-\s*Chair', 'Co-Chair'),
    (r'Cha\s*i\s*r', 'Chair'),
    (r'\s*Republican\s*Alternate', 'Republican Alternate'),
)

def _canonical_position(raw):
    """Return the canonical label for a raw position string, or the string itself."""
    for pattern, label in _POSITION_PATTERNS:
        if re.search(pattern, raw) is not None:
            return label
    return raw

# Raw position text -> canonical position label.
positions = {p: _canonical_position(p) for p in politicians['position'].unique()}
# Session-expanded version id -> VersionNum.
vnums = bill_vers.set_index('ID')['VersionNum'].to_dict()
# Session-expanded version id -> parent bill_ID (inverse of version_id_mapping).
vid_map = {}
for _bill, _version_ids in version_id_mapping.items():
    for _vid in _version_ids:
        vid_map[_vid] = _bill

In [21]:
def _safe_dt(s):
    return pd.to_datetime(s, errors='coerce')

def _canon_name(n):
    n = re.sub(r'[^\w\s]', ' ', str(n)).lower()
    n = re.sub(r'\s+', ' ', n).strip()
    return n

def _infer_origin_chamber_from_bill_id(bill_id):
    s = str(bill_id)
    if 'AB' in s: return 'assembly'
    if 'SB' in s: return 'senate'
    return None

def _term_from_date(ts):
    if pd.isna(ts): return np.nan
    y = ts.year
    if y % 2 == 1:
        return f"{y}-{y+1}"
    else:
        if ts.month < 11:
            return f"{y-1}-{y}"
        return f"{y+1}-{y+2}"

def _tokenize(s):
    s = str(s).lower()
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return [t for t in s.split(' ') if t]

def _jaccard(a_set, b_set):
    if not a_set and not b_set: return 1.0
    i = len(a_set & b_set)
    u = len(a_set | b_set)
    return i / u if u else 0.0

def read_zip(zip_path, crs=3857):
    """Load the first shapefile found in a zip archive, reprojected to EPSG:3857.

    Parameters
    ----------
    zip_path : str or path-like
        Zip archive containing a shapefile and its sidecar files.
    crs : int, default 3857
        EPSG code to assume when the shapefile declares no CRS. A CRS declared
        by the file takes precedence. Previously ``set_crs`` was called
        unconditionally and raised if the declared CRS differed from ``crs``.

    Returns
    -------
    (GeoDataFrame, tempfile.TemporaryDirectory)
        The data in EPSG:3857 and the directory the archive was extracted to.
        The caller owns the directory; call ``cleanup()`` on it when done.

    Raises
    ------
    FileNotFoundError
        If the archive contains no ``.shp`` file.
    """
    tmp = tempfile.TemporaryDirectory()
    try:
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(tmp.name)
        shp = next(pathlib.Path(tmp.name).rglob("*.shp"), None)
        if shp is None:
            raise FileNotFoundError(f"no .shp file found inside {zip_path}")
        gdf = gpd.read_file(shp)
        if gdf.crs is None:
            gdf = gdf.set_crs(epsg=crs)
        gdf = gdf.to_crs(epsg=3857)
    except Exception:
        # Do not leave the extracted files behind when loading fails.
        tmp.cleanup()
        raise
    return gdf, tmp

def district_cycle(year):
    """District-map cycle label for a year: "2001" through 2012, "2011" through
    2022, "current" afterwards."""
    for last_year, label in ((2012, "2001"), (2022, "2011")):
        if year <= last_year:
            return label
    return "current"

In [22]:
# County polygons. read_zip is called with its default crs here and returns the
# data reprojected to EPSG:3857. The temporary directory handle is bound to `_`.
counties_gdf, _ = read_zip('dashboard/backend/data/ca_counties.zip')
counties_gdf = counties_gdf[['COUNTYFP', 'NAMELSAD', 'geometry']]
# Areas are in the units of EPSG:3857.
# NOTE(review): Web Mercator is not an equal-area projection, so the area
# ratios below are approximate — confirm that is acceptable.
counties_gdf['county_area'] = counties_gdf.geometry.area
counties_gdf['county_id'] = counties_gdf['COUNTYFP'].astype(int)
data_dir = pathlib.Path('dashboard/backend/data')
asm11_zip = data_dir / '2011_assembly_state_shp.zip'
sen11_zip = data_dir / '2011_senate_state_shp.zip'
asmcur_zip = data_dir / '2021_AD_Final_shp.zip'
sencur_zip = data_dir / '2021_SD_Final_shp.zip'
# (archive, house, cycle label, EPSG code passed to read_zip).
dist_info = [(asm11_zip, "assembly", "2011", 4019),(sen11_zip, "senate", "2011", 4019),(asmcur_zip, "assembly","current", 4269),(sencur_zip, "senate",  "current", 4269)]
weight_records = []
# Temporary directories returned by read_zip are collected here; nothing in
# this cell cleans them up.
tmps = []
for zp, house, cycle, crs in dist_info:
    gdf, tmp = read_zip(zp, crs)
    tmps.append(tmp)
    # Treats the FIRST column of each shapefile as the district number.
    # NOTE(review): depends on the column order of each file — confirm.
    gdf = gdf.rename(columns={gdf.columns[0]: "district_id"})[["district_id", "geometry"]]
    gdf["house"] = house
    gdf["cycle"] = cycle
    gdf["dist_area"] = gdf.geometry.area
    # One row per (district, county) overlap polygon.
    inter = gpd.overlay(gdf, counties_gdf, how="intersection")
    inter["fragment_area"] = inter.geometry.area
    weight_records.append(inter[["house", "cycle", "district_id", "county_id", "fragment_area", 'county_area', 'dist_area']].reset_index(drop=True))
weights = pd.concat(weight_records, ignore_index=True)
# Share of the county covered by the fragment.
weights['weight'] = weights['fragment_area'] / weights['county_area']
# Share of the district that lies inside the county.
weights['district_share_in_county'] = weights['fragment_area']/weights['dist_area']

In [23]:
# Fill in district numbers for (full_name, Term) pairs that have none.
fix = politicians.loc[politicians['District No.'].isna(), ['full_name', 'Term']].drop_duplicates()
# Hard-coded values, assigned by position to the rows of `fix`.
# NOTE(review): this relies on `fix` having exactly 15 rows in this order. Once
# the districts are filled, `fix` is empty on a re-run and this assignment
# would no longer fit — confirm/guard before re-running the cell.
fix['District No.'] = [78, 30, 26, 30, 30, 29, 29, 22, 29, 22, 36, 29, 22, 22, 6]
for i, row in fix.iterrows():
    politicians.loc[(politicians['full_name'] == row['full_name']) & (politicians['Term'] == row['Term']), 'District No.'] = row['District No.']

# Normalise beneficiary names (trimmed, lower-case), then total the lobbying
# amount per beneficiary and term.
lob['clean_beneficiary'] = lob['clean_beneficiary'].apply(lambda x: x.strip().lower() if isinstance(x, str) else x)
lobb = lob.groupby(['clean_beneficiary', 'term']).agg({'BENE_AMT': 'sum'}).reset_index().rename(columns={'BENE_AMT': 'AMOUNT'})
# Total expenditure amount per matched target and year; the year column is renamed 'term'.
# NOTE(review): the assembly table is deduplicated on (Amount, year, target)
# before summing, the senate table is not — confirm which is intended.
exp_as = expend_assembly[['Amount', 'year', 'matched_target_name']].drop_duplicates().groupby(['matched_target_name', 'year']).agg({'Amount': 'sum'}).reset_index().rename(columns={'year': 'term'})
exp_sen = expend_senate.groupby(['matched_target_name', 'year']).agg({'Amount': 'sum'}).reset_index().rename(columns={"year": 'term'})
politicians['lower'] = politicians['full_name'].str.lower()
def name_swap(name):
    """Lower-case ``name``, delete every comma and trim surrounding whitespace.

    Despite the function's name, the word order is left unchanged.
    """
    lowered = name.lower()
    return lowered.replace(',', '').strip()
# Join key for each politician: the comma-free, lower-cased full name.
politicians['name2'] = politicians['full_name'].apply(name_swap)
# Where `name2` equals a lower-cased full name that also occurs among the
# lobbying beneficiaries, replace it with the row's own lower-cased full name.
politicians.loc[politicians['name2'].isin([p for p in politicians['lower'].unique() if p in lobb['clean_beneficiary'].unique()]), 'name2'] = politicians.loc[politicians['name2'].isin([p for p in politicians['lower'].unique() if p in lobb['clean_beneficiary'].unique()]), 'lower']
# One row per politician/term/seat, with the lobbying total for that term
# (NaN when no beneficiary row matched).
pl = politicians[['Party', 'District No.', 'Seat No.', 'Term', 'full_name', 'chamber', 'name2']].drop_duplicates().merge(lobb, left_on=['Term', 'name2'], right_on=['term', 'clean_beneficiary'], how='left').rename(columns={'AMOUNT': 'total_lobbying'})
exp_as['name2'] = exp_as['matched_target_name'].apply(lambda x: re.sub(r'\,', '', x.lower()))
# From here on pl['term'] is the END year of the term as an int; it replaces
# the string term column brought in by the lobbying merge.
pl['term'] = pl['Term'].apply(lambda x: int(x.split('-')[-1]))
# Odd expenditure years are moved back one year to the preceding even year.
# NOTE(review): this attributes an odd year's amounts to the term that ENDED
# the year before — confirm this is the intended alignment.
exp_as.loc[exp_as['term'] % 2 == 1, 'term'] = exp_as.loc[exp_as['term'] % 2 == 1, 'term'] - 1
exp_sen.loc[exp_sen['term'] % 2 == 1, 'term'] = exp_sen.loc[exp_sen['term'] % 2 == 1, 'term'] - 1
# Assembly amounts arrive as 'total_donations_'; senate amounts arrive as 'Amount'.
pld = pl.merge(exp_as, on=['term', 'name2'], how='left').rename(columns={'Amount': 'total_donations_'})
exp_sen['name2'] = exp_sen['matched_target_name'].apply(lambda x: re.sub(r'\,', '', x.lower()))
pldd = pld.merge(exp_sen, on=['term', 'name2'], how='left')
# Donations = assembly amount + senate amount; rows with neither sum to 0.
pldd['total_donations_'] = pldd[['total_donations_', 'Amount']].sum(skipna=True, axis=1)
pldd = pldd.rename(columns={'total_donations_': 'total_donations'})
# Fill missing components BEFORE adding them. Previously the sum was taken
# first, so a politician with donations but no lobbying match got NaN, which
# was then zeroed, dropping the donations from total_received.
for c in ['total_donations', 'total_lobbying']:
    pldd[c] = pldd[c].fillna(0).astype(float)
pldd['total_received'] = pldd['total_donations'] + pldd['total_lobbying']
lfund = pldd.copy()
# District numbers -> int, after removing any whitespace from their string form.
lfund['District No.'] = lfund['District No.'].astype(str).apply(lambda x: re.sub(r'\s', '', x)).astype(float).astype(int)
# Totals per term, district and chamber.
lfund_ = lfund.groupby(['Term', 'District No.', 'chamber']).agg({'total_donations': 'sum','total_lobbying': 'sum','total_received': 'sum'}).reset_index()
# Which district map to use: terms starting in or before 2012 -> '2011', later -> 'current'.
# NOTE(review): district_cycle() maps years through 2012 to "2001" and years
# through 2022 to "2011", which disagrees with this rule — confirm which one
# reflects the maps that actually applied.
lfund_['cycle'] = lfund_['Term'].apply(lambda x: '2011' if int(x.split('-')[0]) <= 2012 else 'current')
# Attach every county fragment of each district.
# NOTE(review): assumes `district_id` from the shapefiles has the same dtype
# and numbering as the int 'District No.' — TODO confirm.
reg_funds = lfund_.merge(weights, left_on=['cycle', 'District No.', 'chamber'], right_on=['cycle', 'district_id', 'house'], how='left')
# Allocate each district's money to counties in proportion to the share of the
# district's area that lies in the county.
reg_funds['total_donations'] *= reg_funds['district_share_in_county']
reg_funds['total_lobbying'] *= reg_funds['district_share_in_county']
reg_funds['total_received'] *= reg_funds['district_share_in_county']
# County totals per house, summed over every term.
reg_funds_ = reg_funds.groupby(['county_id', 'house']).agg({'total_donations': 'sum','total_lobbying': 'sum','total_received': 'sum'}).reset_index()
co_cal = reg_funds_.merge(counties_gdf, on='county_id', how='left')
ca_legislator_funding = gpd.GeoDataFrame(co_cal, geometry='geometry')

In [24]:
# Numeric vote: AYE/YES -> 1, NOE/NO -> -1, every other code (or missing) -> 0.
vote_map = {'AYE':1,'YES':1,'NOE':-1,'NO':-1}
voting['vote_num'] = voting['vote_code'].str.upper().map(vote_map).fillna(0).astype(int)
# motion_id -> raw motion text.
motion_dict = bill_motions.set_index('motion_id')['motion_text'].to_dict()
# One roll call = one (bill, version, date, motion, chamber, place) group;
# groups with missing key values are kept (dropna=False).
roll_cols = ['bill_ID','bill_version','Date','motion_id','chamber','voting_place']
roll = (voting.groupby(roll_cols, dropna=False).agg(yes=('vote_num', lambda x: int((np.array(x)>0).sum())), no=('vote_num', lambda x: int((np.array(x)<0).sum())), total=('vote_num','count')).reset_index())
# A roll call "passes" when yes votes outnumber no votes.
# NOTE(review): this ignores any higher vote threshold a bill may require — confirm.
roll['pass'] = (roll['yes'] > roll['no'])
bill_votes['vote_num'] = bill_votes['vote_code'].str.upper().map(vote_map).fillna(0).astype(int)
# Adds a 'Date' column of plain datetime.date objects to bill_votes.
# NOTE(review): the earlier history/bill_votes merge also produces a 'Date'
# column, so re-running that cell after this one would presumably hit a name
# clash — confirm before running cells out of order.
bill_votes['Date'] = pd.to_datetime(bill_votes['vote_date_time']).dt.date
roll_cols2 = ['bill_id','Date','motion_id','chamber','location_code']

In [25]:
# Yes/no/total counts per roll call, straight from the detailed vote table.
# NOTE(review): the following cell recomputes `summary_roll` with the identical
# expression, so this cell looks redundant — confirm and remove one of them.
summary_roll = (bill_votes.groupby(roll_cols2, dropna=False).agg(yes=('vote_num', lambda x: int((np.array(x)>0).sum())), no=('vote_num', lambda x: int((np.array(x)<0).sum())), total=('vote_num','count')).reset_index())

In [27]:
def _count_yes(votes):
    """Number of positive (yes) votes in a group."""
    return int((np.array(votes) > 0).sum())

def _count_no(votes):
    """Number of negative (no) votes in a group."""
    return int((np.array(votes) < 0).sum())

# Yes/no/total counts per roll call from the detailed vote table; a roll call
# passes when yes votes outnumber no votes.
summary_roll = (
    bill_votes.groupby(roll_cols2, dropna=False)
    .agg(yes=('vote_num', _count_yes), no=('vote_num', _count_no), total=('vote_num', 'count'))
    .reset_index()
)
summary_roll['pass'] = summary_roll['yes'].gt(summary_roll['no'])

def _stage_timing(group):
    """Derive stage dates for one bill from its roll calls.

    ``group`` holds the rows of ``roll`` for a single bill. Returns a Series:

    * intro / first_read: earliest roll-call date;
    * comm_ref: earliest roll call whose voting_place is not a floor;
    * second_read: earliest non-floor roll call after first_read;
    * third_read: earliest roll call of any kind after second_read;
    * asm_floor_pass / sen_floor_pass: earliest passing AFLOOR / SFLOOR row in
      the module-level ``summary_roll`` for this bill.

    Stages that cannot be derived are NaT.
    """
    ordered = group.sort_values('Date')
    dates = ordered['Date']
    in_committee = ~(ordered['voting_place'].isin(['Assembly Floor', 'Senate Floor']))

    intro = dates.min()
    comm_ref = dates[in_committee].min() if in_committee.any() else pd.NaT
    first_read = dates.min() if not ordered.empty else pd.NaT

    second_read = pd.NaT
    if pd.notna(first_read):
        later_committee = dates[(dates > first_read) & in_committee]
        if not later_committee.empty:
            second_read = later_committee.min()

    third_read = pd.NaT
    if pd.notna(second_read):
        later_any = dates[dates > second_read]
        if not later_any.empty:
            third_read = later_any.min()

    # Floor votes for this bill come from the detailed vote summary.
    floor_votes = summary_roll.loc[
        (summary_roll['bill_id'] == ordered['bill_ID'].iloc[0])
        & (summary_roll['location_code'].isin(['AFLOOR', 'SFLOOR']))
    ]

    def _first_passing(location):
        passed = floor_votes[(floor_votes['location_code'] == location) & (floor_votes['pass'])]
        return passed['Date'].min() if not passed.empty else pd.NaT

    asm_floor_pass = _first_passing('AFLOOR')
    sen_floor_pass = _first_passing('SFLOOR')

    return pd.Series({
        'intro': intro,
        'comm_ref': comm_ref,
        'first_read': first_read,
        'second_read': second_read,
        'third_read': third_read,
        'asm_floor_pass': asm_floor_pass,
        'sen_floor_pass': sen_floor_pass,
    })

# One row of stage dates per bill.
stages_df = roll.groupby('bill_ID', group_keys=False).apply(_stage_timing).reset_index()

  stages_df = roll.groupby('bill_ID', group_keys=False).apply(_stage_timing).reset_index()


In [28]:
# bill_ID -> topic label, read from a JSON file in the working directory.
with open('bill_labels_updated.json', 'r') as f:
    bill_labels = json.load(f)
stages_df['topic'] = stages_df['bill_ID'].map(bill_labels)
# Only labelled bills are kept from here on.
stages_df = stages_df.loc[stages_df['topic'].notna()]

# Outcome per bill from its most recent history action: 1 for CHAPTERED /
# ENROLLED / FILED / APPROVED, -1 for VETOED, 0 otherwise. This rebinds the
# `outcomes` frame built in an earlier cell, now with an int Outcome column.
outcomes = (history.dropna(subset=['bill_ID']).sort_values('Date', ascending=False).groupby('bill_ID').first().reset_index()[['bill_ID','Action']])
outcomes.loc[outcomes['Action'].isin(['CHAPTERED','ENROLLED','FILED','APPROVED']),'Outcome'] = 1
outcomes.loc[outcomes['Action'].isin(['VETOED']),'Outcome'] = -1
outcomes['Outcome'] = outcomes['Outcome'].fillna(0).astype(int)
y_df = outcomes[['bill_ID','Outcome']].rename(columns={'Outcome':'outcome'})

# Earliest and latest history date per bill.
first_last = (history.dropna(subset=['bill_ID']).groupby('bill_ID')['Date'].agg(First_action='min', Last_action='max').reset_index())
# Stage dates + first/last action + outcome, one row per labelled bill.
pipe_base = (stages_df.merge(first_last, on='bill_ID', how='left').merge(y_df, on='bill_ID', how='left'))
# Stage columns present in pipe_base, in pipeline order.
stage_order = [c for c in ['intro','comm_ref','first_read','second_read','third_read','asm_floor_pass','sen_floor_pass'] if c in pipe_base.columns]


In [29]:
# Consecutive stage pairs, e.g. ('intro', 'comm_ref'), ('comm_ref', 'first_read'), ...
pairs = [(stage_order[i], stage_order[i+1]) for i in range(len(stage_order)-1)]

# For every transition: how many bills entered the first stage, how many also
# reached the next one, the median gap in days, and the bills that stalled.
rows = []
stuck_rows = []
for a, b in pairs:
    aa = _safe_dt(pipe_base[a]); bb = _safe_dt(pipe_base[b])
    dd = (bb - aa).dt.days
    entered = int(aa.notna().sum())
    advanced = int(((aa.notna()) & (bb.notna())).sum())
    rate = float(advanced / entered) if entered else np.nan
    mdays = float(np.median(dd.dropna().values)) if advanced else np.nan
    rows.append({'from':a,'to':b,'entered':entered,'advanced':advanced,'pass_rate':rate,'median_days':mdays})
    # 90th percentile of the gap among bills that did advance.
    q90 = np.nanpercentile(dd.dropna().values, 90) if dd.notna().any() else np.nan
    # Bills that reached stage `a` but have no date for stage `b`.
    sub = pipe_base[(aa.notna()) & (bb.isna())][['bill_ID','topic']].copy()
    if not sub.empty:
        sub['stage'] = a; sub['q90'] = q90
        stuck_rows.append(sub)
pipeline_stage_funnel = pd.DataFrame(rows)

# Stage dates plus "<stage>_ts" columns holding whole seconds since
# 1970-01-01, with <NA> where the stage was never reached. The previous
# astype('int64', errors='ignore') // 10**9 could not represent missing dates
# and assumed nanosecond resolution.
# Assumes the stage dates are timezone-naive — TODO confirm.
pipeline_timestamps_wide = pipe_base[['bill_ID','topic'] + stage_order].copy()
for s in stage_order:
    stamps = _safe_dt(pipeline_timestamps_wide[s])
    pipeline_timestamps_wide[f'{s}_ts'] = (stamps - pd.Timestamp('1970-01-01')).dt.total_seconds().round().astype('Int64')

pipeline_stuck_candidates = pd.concat(stuck_rows, ignore_index=True) if stuck_rows else pd.DataFrame(columns=['bill_ID','topic','stage','q90'])

In [30]:
# Every hearing row with its committee name resolved through `locations`.
hear_seq = (hearings[['bill_id','location_code']].merge(locations[['committee_code','committee_clean']], left_on='location_code', right_on='committee_code', how='left').rename(columns={'committee_clean':'committee'}))
# Per bill: the tuple of committees it was heard in, in the row order of the
# hearing table (unresolved committees are skipped).
# NOTE(review): rows are not sorted by hearing date here, so the route order is
# presumably the file order — confirm it is chronological.
route_df = (hear_seq.groupby('bill_id')['committee'].apply(lambda s: tuple([x for x in s.dropna().tolist() if x])).reset_index().rename(columns={'committee':'route'}))
# Readable key built from at most the first five committees; None for an empty route.
route_df['route_key'] = route_df['route'].apply(lambda r: ' > '.join(list(r)[:5]) if isinstance(r, tuple) and r else None)
# NOTE(review): assumes hearings.bill_id uses the same id format as the keys of
# bill_labels and y_df.bill_ID — TODO confirm.
route_df.rename(columns={'bill_id':'bill_ID'}, inplace=True)
route_df['topic'] = route_df['bill_ID'].map(bill_labels)
route_df = route_df.loc[route_df['topic'].notna()]
route_perf = route_df.merge(y_df, on='bill_ID', how='left')
# Per (topic, route): number of distinct bills and the share with outcome == 1.
# Bills without an outcome row count as not passed in that share.
route_archetypes = (route_perf.groupby(['topic','route_key']).agg(n=('bill_ID','nunique'), pass_rate=('outcome', lambda x: float(np.mean(np.array(x)==1)) if len(x)>0 else np.nan)).reset_index().sort_values(['topic','n'], ascending=[True,False]))

In [31]:
# Digest text and version number per bill version, each tagged with its parent bill id.
dig = digests[['bill_id', 'DigestText']].assign(bill_ID=lambda d: d['bill_id'].map(bv2b))
ver = versions[['bill_id', 'VersionNum']].assign(bill_ID=lambda d: d['bill_id'].map(bv2b))
# Versions that have a digest text.
dv = ver.merge(dig, on=['bill_id', 'bill_ID'], how='inner').dropna(subset=['DigestText'])

def _digest_stats(df):
    """Version count and median Jaccard similarity between the digests of
    neighbouring versions (ordered by VersionNum) for one bill. The median is
    NaN when the bill has a single version."""
    ordered = df.sort_values('VersionNum')
    token_sets = [set(_tokenize(text)) for text in ordered['DigestText']]
    sims = [_jaccard(previous, current) for previous, current in zip(token_sets, token_sets[1:])]
    median_sim = float(np.median(sims)) if sims else np.nan
    return pd.Series({'n_versions': len(ordered), 'median_sim': median_sim})

# One row per bill, restricted to bills that have a topic label.
amendment_churn = dv.groupby('bill_ID').apply(_digest_stats).reset_index()
amendment_churn['topic'] = amendment_churn['bill_ID'].map(bill_labels)
amendment_churn = amendment_churn.loc[amendment_churn['topic'].notna()]

  .apply(_digest_stats)


In [32]:
# Reference date for "days since stage". NOTE: depends on when the cell is run.
now = pd.Timestamp.now().date()
pb = pipe_base[['bill_ID','topic'] + stage_order].copy()
for c in stage_order:
    pb[c] = _safe_dt(pb[c])
# Most recent stage date reached by each bill.
pb['last_date'] = pb[stage_order].max(axis=1)
# 80th percentile of "days since the stage date", per stage.
by_stage_q80 = {}
for c in stage_order:
    dd = pb[c].apply(lambda x: (now - x.date()).days if pd.notna(x) else np.nan)
    by_stage_q80[c] = np.nanpercentile(dd.dropna().values, 80) if dd.notna().any() else np.nan
# Latest stage (in pipeline order) that any bill reached.
last_stage_col = None
for c in reversed(stage_order):
    if pb[c].notna().any():
        last_stage_col = c
        break

# Attach each bill's OWN route first, then the pass rate of that exact
# (topic, route). Previously the archetypes were merged on topic alone, before
# the bill's route was known: every bill was repeated once per route in its
# topic, and the reported route_key and pass_rate belonged to those routes
# rather than to the bill.
pb = pb.merge(route_df[['bill_ID','route_key']], on='bill_ID', how='left')
pb = pb.merge(route_archetypes[['route_key','topic','pass_rate']].drop_duplicates(subset=['route_key','topic']), on=['topic','route_key'], how='left')
pb = pb.merge(amendment_churn[['bill_ID','n_versions']], on='bill_ID', how='left')

def _risk_row(r):
    """Count the risk flags of one bill (0-2).

    * churn: at least 5 versions;
    * low_route: the bill's route passes less than 30% of the time.

    Missing values raise no flag. A pass rate of exactly 0.0 now counts as low;
    the former ``or 1.0`` fallback turned it into 1.0.
    """
    n_versions = r.get('n_versions')
    pass_rate = r.get('pass_rate')
    churn = bool(pd.notna(n_versions) and n_versions >= 5)
    low_route = bool(pd.notna(pass_rate) and pass_rate < 0.3)
    return int(churn) + int(low_route)

pb['risk'] = pb.apply(_risk_row, axis=1)
risk_list = pb[['bill_ID','topic','route_key','n_versions','risk']].copy()

In [33]:
# Bills that have a committee-referral value.
entries = stages_df[['bill_ID','comm_ref']].dropna()
# For each bill, flag whether any stage other than comm_ref is populated.
exits = stages_df[['bill_ID'] + [c for c in stage_order if c!='comm_ref']].copy()
exits['has_exit'] = exits.drop(columns=['bill_ID']).notna().any(axis=1)
gate = entries.merge(exits[['bill_ID','has_exit']], on='bill_ID', how='left')
# Distinct (bill, committee) pairs from hear_seq; its bill_id column is relabelled bill_ID for the join below.
heard = hear_seq[['bill_id','committee']].dropna().drop_duplicates().rename(columns={'bill_id':'bill_ID'})
gk = heard.merge(gate[['bill_ID','has_exit']], on='bill_ID', how='left')
# Per committee: number of distinct bills heard, and number of joined rows whose has_exit is True.
committee_gatekeeping = (gk.groupby('committee').agg(entries=('bill_ID','nunique'), exits=('has_exit', lambda x: int(np.nansum(np.array(x)==True)))).reset_index())
# gatekeeping = 1 - exits/entries; a zero entry count is turned into NaN instead of dividing by zero.
committee_gatekeeping['gatekeeping'] = 1 - (committee_gatekeeping['exits'] / committee_gatekeeping['entries'].replace(0, np.nan))
# History rows whose upper-cased Action text mentions a hearing, a referral or a committee.
hear_dates = history[['bill_ID','Date','Action']].copy()
hear_dates = hear_dates[hear_dates['Action'].str.upper().str.contains('HEARING|REFERRED|RE-REFERRED|COMMITTEE', na=False)]

In [34]:
# Inferred chamber of origin per bill (kept for downstream use; not used in the counts below).
origin = stages_df[['bill_ID']].copy()
origin['origin'] = origin['bill_ID'].apply(_infer_origin_chamber_from_bill_id)

# Bills that passed both floors, split by which floor passed first.
ccf = stages_df[['bill_ID','asm_floor_pass','sen_floor_pass']].copy()
_asm_dt = _safe_dt(ccf['asm_floor_pass'])
_sen_dt = _safe_dt(ccf['sen_floor_pass'])
_passed_both = ccf['asm_floor_pass'].notna() & ccf['sen_floor_pass'].notna()
# The two flags used to be the same expression (both-not-null), so the two
# directions were always equal. Order them by floor-pass date instead.
# Comparisons against NaT are False, and same-day passages fall in neither bucket.
ccf['A_then_S'] = _passed_both & (_asm_dt < _sen_dt)
ccf['S_then_A'] = _passed_both & (_sen_dt < _asm_dt)
ccf['topic'] = ccf['bill_ID'].map(bill_labels)
ccf = ccf.loc[ccf['topic'].notna()]
cross_chamber_friction = (
    ccf.groupby('topic')
       .agg(pass_Asm_then_Sen=('A_then_S', lambda x: int(np.nansum(x))),
            pass_Sen_then_Asm=('S_then_A', lambda x: int(np.nansum(x))))
       .reset_index()
)

In [35]:
# Survival input: one start (intro) and one end (Last_action) timestamp per bill, plus its topic.
sv = stages_df[['bill_ID','intro']].merge(y_df, on='bill_ID', how='left')
sv['start'] = _safe_dt(sv['intro'])
ends = first_last[['bill_ID','Last_action']].rename(columns={'Last_action':'end'})
sv = sv.merge(ends, on='bill_ID', how='left')
sv['end'] = _safe_dt(sv['end'])
sv['topic'] = sv['bill_ID'].map(bill_labels)
# Unlabelled bills are excluded from the curves.
sv = sv.loc[sv['topic'].notna()]
def _survival_topic(df):
    df = df.dropna(subset=['start'])
    if df.empty: return pd.DataFrame(columns=['t','survival'])
    t0 = df['start'].min()
    t1 = df['end'].max() if df['end'].notna().any() else t0 + pd.Timedelta(days=1)
    grid = pd.date_range(t0, t1, freq='7D')
    rows=[]
    for g in grid:
        alive = ((df['end'].isna()) | (df['end'] > g)).sum()
        total = len(df)
        rows.append({'t': g, 'survival': alive/total if total else np.nan})
    return pd.DataFrame(rows)
# Build one curve per topic and stack them, tagging each row with its topic.
_surv = []
for topic, g in sv.groupby('topic'):
    sdf = _survival_topic(g)
    sdf['topic'] = topic
    _surv.append(sdf)
# Fall back to an empty, correctly-shaped frame when there are no topics at all.
survival_curves = pd.concat(_surv, ignore_index=True) if _surv else pd.DataFrame(columns=['t','survival','topic'])

In [36]:
# Legislator-by-bill vote matrix: +1 for AYE/YES, -1 for NOE/NO, 0 for anything else or no vote.
v = voting[['bill_ID','legislator_name','vote_code','location_code']].copy()
vote_num_map = {'AYE':1,'YES':1,'NOE':-1,'NO':-1}
v['vote'] = v['vote_code'].str.upper().map(vote_num_map).fillna(0).astype(int)
# When a legislator has several votes on the same bill, the first one encountered is kept.
mat = v.pivot_table(index='legislator_name', columns='bill_ID', values='vote', aggfunc='first').fillna(0).astype(int)
l = mat.index.to_list()
sim_edges=[]
if mat.shape[0] >= 2:
    X = mat.to_numpy(dtype=np.float32)
    # Centre each row and scale it to unit length, so a dot product of two rows is
    # the correlation between the two legislators' vote vectors.
    Xc = X - X.mean(axis=1, keepdims=True)
    # Rows with zero variance get a denominator of 1 to avoid division by zero.
    denom = np.sqrt((Xc**2).sum(axis=1, keepdims=True)); denom[denom==0]=1.0
    Xn = Xc/denom
    for i in range(Xn.shape[0]):
        dots = Xn[i] @ Xn.T
        # Exclude the self-similarity from the threshold test.
        dots[i] = -1
        idx = np.where(dots>=0.6)[0]
        for j in idx:
            # Record each unordered pair once.
            if i<j:
                sim_edges.append((l[i], l[j], float(dots[j])))
vote_similarity_edges = pd.DataFrame(sim_edges, columns=['u','v','sim'])
# Undirected adjacency list over the similarity edges.
adj=defaultdict(list)
for _,r in vote_similarity_edges.iterrows():
    adj[r['u']].append(r['v']); adj[r['v']].append(r['u'])
# Communities are the connected components of that graph, found by breadth-first search.
# Legislators without any edge form singleton components.
visited=set(); comps=[]
for node in l:
    if node in visited: continue
    q=deque([node]); comp=[]
    while q:
        x=q.popleft()
        if x in visited: continue
        visited.add(x); comp.append(x)
        for nb in adj.get(x, []):
            if nb not in visited: q.append(nb)
    comps.append(comp)
# Community id is the discovery order of the component.
vote_communities = pd.DataFrame([(n,i) for i,comp in enumerate(comps) for n in comp], columns=['legislator_name','community'])

In [37]:
# Per-legislator yes-rate on floor votes versus all other (non-floor) votes.
vc = voting[['legislator_name','vote_code','location_code']].copy()
# Floor votes are those recorded at location codes AFLOOR or SFLOOR.
vc['is_floor'] = vc['location_code'].isin(['AFLOOR','SFLOOR'])
vc['yes'] = vc['vote_code'].str.upper().isin(['AYE','YES']).astype(int)
leg_comm = vc[~vc['is_floor']].groupby('legislator_name')['yes'].mean().rename('comm_yes')
leg_floor = vc[vc['is_floor']].groupby('legislator_name')['yes'].mean().rename('floor_yes')
# Outer alignment on legislator_name: a legislator missing from one side gets NaN there.
committee_floor_drift = (pd.concat([leg_comm, leg_floor], axis=1).reset_index())
# Positive drift means a higher yes-rate on the floor than off it.
committee_floor_drift['drift'] = committee_floor_drift['floor_yes'] - committee_floor_drift['comm_yes']

In [38]:
# Digest text joined to bill outcomes; bills missing from y_df keep a NaN outcome (left join).
dig_m = digests[['bill_id','DigestText']].copy()
dig_m['bill_ID'] = dig_m['bill_id'].map(bv2b)
dig_m = dig_m.dropna(subset=['bill_ID','DigestText'])
dig_m = dig_m.merge(y_df, on='bill_ID', how='left')

def _lift(df):
    """Rank digest tokens by how much more often they appear in passed bills.

    Each row contributes its distinct tokens once, to the "pass" counts when
    `outcome` equals 1 and to the "other" counts otherwise. A missing outcome
    (NaN, or no `outcome` column) is treated as "other" instead of raising.

    Returns a DataFrame with columns token, log_lift_pass_vs_other (log of the
    add-one smoothed count ratio), pos and neg, sorted by lift descending.
    """
    toks_pos = defaultdict(int)
    toks_neg = defaultdict(int)
    for _, r in df.iterrows():
        outcome = r.get('outcome', 0)
        # int(NaN) raises, so test for missing values before converting.
        passed = bool(pd.notna(outcome) and int(outcome) == 1)
        bucket = toks_pos if passed else toks_neg
        for t in set(_tokenize(r['DigestText'])):
            bucket[t] += 1
    rows = []
    # Iterate tokens in a fixed order so ties in the final sort are reproducible.
    for t in sorted(set(toks_pos) | set(toks_neg), key=str):
        pos = toks_pos.get(t, 0)
        neg = toks_neg.get(t, 0)
        rows.append((t, float(np.log((pos + 1) / (neg + 1))), pos, neg))
    return (
        pd.DataFrame(rows, columns=['token','log_lift_pass_vs_other','pos','neg'])
          .sort_values('log_lift_pass_vs_other', ascending=False)
    )

text_lift_top_tokens = _lift(dig_m)

In [39]:
# Stack Assembly and Senate expenditure rows that have an expender, an amount and a matched target.
ca = pd.concat([
    expend_assembly[['ExpenderName','Amount','matched_target_name','Term','DateEnd']].dropna(subset=['ExpenderName','Amount','matched_target_name']),
    expend_senate  [['ExpenderName','Amount','matched_target_name','Term','DateEnd']].dropna(subset=['ExpenderName','Amount','matched_target_name'])
], ignore_index=True)
# Total amount per (expender, target) pair: the expender's "portfolio".
port = ca.groupby(['ExpenderName','matched_target_name'])['Amount'].sum().reset_index()
def _hhi(g):
    s = g['Amount'].sum()
    if s<=0: return np.nan
    p = (g['Amount']/s).values
    return float(np.sum(p*p))
# One concentration score per expender. include_groups=False keeps the grouping
# column out of the frame passed to _hhi (it only reads 'Amount') and avoids the
# pandas deprecation warning for apply operating on grouping columns.
donor_portfolios_hhi = (
    port.groupby('ExpenderName')
        .apply(_hhi, include_groups=False)
        .reset_index()
        .rename(columns={0:'hhi'})
)

  donor_portfolios_hhi = (port.groupby('ExpenderName').apply(_hhi).reset_index().rename(columns={0:'hhi'}))


In [40]:
# Yes-rate per (legislator, term).
vt = voting[['legislator_name','vote_code','vote_date_time']].copy()
# Lookup from lower-cased last name to the canonical form of the politician's full name.
# If two politicians share a last name, the later row overwrites the earlier one in the dict.
pol_last_names = politicians[['Last', 'full_name']].dropna().drop_duplicates()
pol_last_names['canon'] = pol_last_names['full_name'].apply(_canon_name)
pln_map = dict(zip(pol_last_names['Last'].str.lower(), pol_last_names['canon']))

# NOTE(review): this lookup only hits when _canon_name(legislator_name) equals a lower-cased
# last name; unmatched names become NaN and drop out of the groupby below. Confirm against _canon_name.
vt['canon'] = vt['legislator_name'].apply(_canon_name).map(pln_map)
vt['term'] = vt['vote_date_time'].apply(_term_from_date)
vt['yes'] = vt['vote_code'].str.upper().isin(['AYE','YES']).astype(int)
leg_term_rate = vt.groupby(['canon','term'])['yes'].mean().reset_index().rename(columns={'yes':'yes_rate'})

In [41]:
# Total expenditure amount per (canonical target name, Term), relabelled to (canon, term, funding).
don = ca.copy()
don['canon'] = don['matched_target_name'].apply(_canon_name)
fund = (don.groupby(['canon','Term'])['Amount'].sum().reset_index().rename(columns={'Term':'term','Amount':'funding'}))
# Keep only (canon, term) pairs present in both the funding and the yes-rate tables.
# NOTE(review): the join assumes the raw 'Term' values use the same format as _term_from_date output - confirm.
ft = fund.merge(leg_term_rate, on=['canon','term'], how='inner')
def _quartiles(g):
    if g.empty: return pd.Series({'yes_rate_top':np.nan,'yes_rate_bottom':np.nan,'delta':np.nan,'n_top':0,'n_bottom':0})
    q = g['funding'].quantile([0.25,0.75]).values
    low = g[g['funding']<=q[0]]; high = g[g['funding']>=q[1]]
    return pd.Series({'yes_rate_top': float(high['yes_rate'].mean()) if not high.empty else np.nan, 'yes_rate_bottom': float(low['yes_rate'].mean()) if not low.empty else np.nan, 'delta': float((high['yes_rate'].mean() - low['yes_rate'].mean())) if (not high.empty and not low.empty) else np.nan, 'n_top': int(high.shape[0]), 'n_bottom': int(low.shape[0])})
# One quartile comparison per term; the grouping column is excluded from the frame passed to _quartiles.
money_vote_alignment = ft.groupby('term').apply(_quartiles, include_groups=False).reset_index()

In [42]:
# Mean expenditure amount by day offset from a common reference date.
cc2 = ca[['DateEnd','Amount']].dropna().copy()
cc2['DateEnd'] = pd.to_datetime(_safe_dt(cc2['DateEnd'])).dt.date
# Reference date: the earliest First_action when any is present, otherwise the earliest DateEnd.
start_min = pd.to_datetime(_safe_dt(first_last['First_action']).min() if first_last['First_action'].notna().any() else cc2['DateEnd'].min()).date()
# Day offset of each expenditure from the reference date (negative when it precedes it).
cc2['t'] = cc2['DateEnd'].apply(lambda d: (d - start_min).days if pd.notna(d) else np.nan)
money_event_time_curve = (cc2.groupby('t')['Amount'].mean().reset_index().sort_values('t'))

In [43]:
# Days between a bill's first and last recorded action.
bill_dates_df = first_last.copy()
bill_dates_df['longevity_days'] = (bill_dates_df['Last_action'] - bill_dates_df['First_action']).dt.days

# vote_signal: share of a bill's roll calls where yes/total is at least 0.5.
# A zero total becomes NaN, which fails the comparison and counts as "not a majority".
# include_groups=False keeps the grouping column out of the frame handed to the
# lambda (it only reads 'yes' and 'total') and avoids the pandas deprecation
# warning for apply operating on grouping columns.
signals = (
    roll.groupby('bill_ID')
        .apply(lambda g: float(np.mean((g['yes']/(g['total'].replace(0, np.nan))) >= 0.5)), include_groups=False)
        .reset_index()
        .rename(columns={0:'vote_signal'})
)

# Number of distinct version numbers per bill.
n_versions = (
    versions.assign(bill_ID=lambda d: d['bill_id'].map(bv2b))
            .dropna(subset=['bill_ID'])
            .groupby('bill_ID')['VersionNum'].nunique()
            .reset_index()
            .rename(columns={'VersionNum':'bill_version_count'})
)

# Adds a 'topic' column to y_df in place; only labelled bills go into the table.
y_df['topic'] = y_df['bill_ID'].map(bill_labels)
y_df2 = y_df.loc[y_df['topic'].notna()].copy()
bills_table = (
    y_df2.merge(bill_dates_df[['bill_ID','First_action','longevity_days', 'Last_action']], on='bill_ID', how='left')
         .merge(signals, on='bill_ID', how='left')
         .merge(amendment_churn[['bill_ID','n_versions','median_sim']], on='bill_ID', how='left')
         .merge(n_versions, on='bill_ID', how='left')
)
# Dates are stored as 'YYYY-MM-DD' strings in the final table.
bills_table['First_action'] = pd.to_datetime(bills_table['First_action']).dt.strftime('%Y-%m-%d')
bills_table['Last_action'] = pd.to_datetime(bills_table['Last_action']).dt.strftime('%Y-%m-%d')

  signals = (roll.groupby('bill_ID').apply(lambda g: float(np.mean((g['yes']/(g['total'].replace(0, np.nan))) >= 0.5))).reset_index().rename(columns={0:'vote_signal'}))


In [44]:
# Individual votes enriched with full name, party and topic.
vv = voting[['bill_ID','legislator_name','vote_code','chamber','term', 'location_code', 'Date']].copy()
# Lookup key: lower-cased, stripped legislator_name (presumably a last name - verify against the data).
vv['last'] = vv['legislator_name'].str.lower().str.strip()
vv['yes'] = vv['vote_code'].str.upper().isin(['AYE','YES']).astype(int)
# (chamber, lower-cased last name, Term) -> full_name; later duplicates of a key overwrite earlier ones.
legislators_last_names = {}
for _, row in politicians[['chamber', 'Last', 'Term', 'full_name']].drop_duplicates().iterrows():
    legislators_last_names[(row['chamber'], row['Last'].lower(), row['Term'])] = row['full_name']
def _resolve_full_name(row):
    """Return the full name for a vote row's (chamber, last, term) key, or NaN when unknown."""
    return legislators_last_names.get((row['chamber'], row['last'], row['term']), np.nan)
vv['full_name'] = vv.apply(_resolve_full_name, axis=1)
vv['party'] = vv['full_name'].map(leg_parties)
vv['topic'] = vv['bill_ID'].map(bill_labels)
vv = vv.loc[vv['topic'].notna()]
# Only votes by legislators whose party is 'D' or 'R' enter the party comparison.
vv_major = vv[vv['party'].isin(['D','R'])].copy()
# Mean yes-rate per party for each (bill, term, topic), one column per party.
rc = (vv_major.groupby(['bill_ID','term','topic','party'])['yes'].mean().unstack('party').reset_index().rename(columns={'D':'yes_D','R':'yes_R'}))
# Guarantee both party columns exist even when one party never voted.
for c in ['yes_D','yes_R']:
    if c not in rc.columns: rc[c] = np.nan
rc['polarization'] = (rc['yes_D'] - rc['yes_R']).abs()
# 1 when the two parties' yes-rates fall strictly on opposite sides of 0.5, else 0 (including NaN cases).
rc['party_line_split'] = np.where(((rc['yes_D']>0.5) & (rc['yes_R']<0.5)) | ((rc['yes_D']<0.5) & (rc['yes_R']>0.5)), 1, 0)
# n_rollcalls counts distinct bill_ID values within each (topic, term).
topic_controversy = (rc.groupby(['topic','term']).agg(n_rollcalls=('bill_ID','nunique'), mean_polarization=('polarization','mean'), median_polarization=('polarization','median'), party_line_share=('party_line_split','mean'), dem_yes_rate=('yes_D','mean'), rep_yes_rate=('yes_R','mean')).reset_index())
rollcall_party_splits = rc[['bill_ID','term','topic','yes_D','yes_R','polarization','party_line_split']].copy()


In [45]:
# Share of each legislator's yes-votes in a term that fall on each topic.
vv2 = vv.copy()
vv2['canon'] = vv2['legislator_name'].apply(_canon_name)
# 'any_vote' is created here but the weighting below uses 'yes' (see _weight_col).
vv2['any_vote'] = 1
_weight_col = 'yes'
topic_votes = (vv2.dropna(subset=['topic']).groupby(['canon','term','topic'])[_weight_col].sum().reset_index(name='topic_votes'))
total_votes = (vv2.groupby(['canon','term'])[_weight_col].sum().reset_index(name='total_votes'))
weights_topics = (topic_votes.merge(total_votes, on=['canon','term'], how='left'))
# A legislator with no yes-votes in a term gets share 0.0 instead of a division by zero.
weights_topics['topic_share'] = np.where(weights_topics['total_votes']>0, weights_topics['topic_votes']/weights_topics['total_votes'], 0.0)

In [46]:
# Adds 'year', 'term' and 'canon' columns to both expenditure frames in place.
for df in (expend_assembly, expend_senate):
    # Leading four digits of the 'Term' string.
    df['year'] = df['Term'].str.extract(r'^(\d{4})').astype(int)
    # Even years are shifted back to the preceding odd year.
    df['term'] = np.where((df['year']%2==0), df['year']-1, df['year'])
    # Final label is 'start-end' with end = start + 1, e.g. 2017 -> '2017-2018'.
    df['term'] = df['term'].astype(int).astype(str) + '-' + (df['term']+1).astype(int).astype(str)
    df['canon'] = df['matched_target_name'].apply(_canon_name)

In [47]:
# Total donations per (canon, term) across both chambers.
don_by_leg_term = (pd.concat([expend_assembly, expend_senate], ignore_index=True).groupby(['canon','term'])['Amount'].sum().reset_index().rename(columns={'Amount':'donations'}))
# Lobbying amounts per (canon, term).
lb2 = lobbying[['clean_beneficiary','EXPN_DATE','BENE_AMT']].dropna().copy()
lb2['EXPN_DATE'] = pd.to_datetime(lb2['EXPN_DATE'], errors='coerce')
# Term label from the expense date: an even year before November maps to '(year-1)-year',
# an even year in November/December maps to '(year+1)-(year+2)', and an odd year maps to
# 'year-(year+1)'. Unparseable dates give NaN.
lb2['term'] = lb2['EXPN_DATE'].apply(lambda x: np.nan if pd.isna(x) else (f"{x.year-1}-{x.year}" if (x.year%2==0 and x.month<11) else f"{x.year+1}-{x.year+2}" if (x.year%2==0) else f"{x.year}-{x.year+1}"))
lb2['canon'] = lb2['clean_beneficiary'].apply(_canon_name)
lob_by_leg_term = (lb2.groupby(['canon','term'])['BENE_AMT'].sum().reset_index().rename(columns={'BENE_AMT':'lobbying'}))
# Outer join so a legislator-term with only one funding source is kept; the missing side becomes 0.
fund_leg_term = (don_by_leg_term.merge(lob_by_leg_term, on=['canon','term'], how='outer').fillna({'donations':0.0,'lobbying':0.0}))
fund_leg_term['total_received'] = fund_leg_term['donations'] + fund_leg_term['lobbying']
# NOTE(review): this re-maps weights_topics['canon'] in place through pln_map. Running the cell a
# second time maps the already-mapped values again, which presumably yields NaN - re-run the cell
# that builds weights_topics first.
weights_topics['canon'] = weights_topics['canon'].map(pln_map)
alloc = (weights_topics.merge(fund_leg_term, on=['canon','term'], how='left').fillna({'donations':0.0,'lobbying':0.0,'total_received':0.0}))
# Allocate each legislator-term's money to topics in proportion to topic_share.
alloc['donations_topic'] = alloc['donations'] * alloc['topic_share']
alloc['lobbying_topic']  = alloc['lobbying']  * alloc['topic_share']
alloc['total_topic'] = alloc['total_received'] * alloc['topic_share']
topic_funding_by_term = (alloc.groupby(['topic','term']).agg(total_donations=('donations_topic','sum'), total_lobbying=('lobbying_topic','sum'), total_received=('total_topic','sum')).reset_index())
topic_funding_by_leg = (alloc.groupby(['canon','term','topic']).agg(donations=('donations_topic','sum'), lobbying=('lobbying_topic','sum'), total=('total_topic','sum')).reset_index())
# Donor-level view: each individual donation is split across the recipient's topics.
don_leg_term = (pd.concat([expend_assembly, expend_senate], ignore_index=True).assign(canon=lambda d: d['matched_target_name'].apply(_canon_name)).rename(columns={'Amount':'donation'}))
# Donations whose recipient has no topic weights get topic_share 0 (and a NaN topic).
don_alloc = (don_leg_term.merge(weights_topics[['canon','term','topic','topic_share']], on=['canon','term'], how='left').fillna({'topic_share':0.0}))
don_alloc['donation_topic'] = don_alloc['donation'] * don_alloc['topic_share']
donor_topic_by_term = (don_alloc.groupby(['ExpenderName','topic','term'])['donation_topic'].sum().reset_index().rename(columns={'donation_topic':'donations_allocated'}))
# Store 'term' as a string in all three output tables.
for _df in (topic_funding_by_term, topic_funding_by_leg, donor_topic_by_term):
    _df['term'] = _df['term'].astype(str)

In [48]:
# Distinct bills introduced per (topic, calendar month).
monthly_counts = (stages_df.dropna(subset=['intro']).assign(month=lambda d: _safe_dt(d['intro']).dt.to_period('M').astype(str)).groupby(['topic','month']).agg(introduced=('bill_ID','nunique')).reset_index())
# Convert the 'YYYY-MM' label into a timestamp on the first day of the month.
monthly_counts['month'] = pd.to_datetime(monthly_counts['month']+'-01')
# topic_momentum starts as a list of per-topic frames and is replaced by their concatenation below.
topic_momentum = []
for t, g in monthly_counts.sort_values('month').groupby('topic'):
    s = g.set_index('month')['introduced'].astype(float)
    # Exponential moving average with span 12 over the observed months; months with no
    # introductions are absent from the series rather than counted as zero.
    if len(s)>=2:
        ema = s.ewm(span=12, adjust=False).mean()
    else:
        ema = s.copy()
    topic_momentum.append(pd.DataFrame({'topic':t,'month':ema.index,'ema_introduced':ema.values}))
topic_momentum = pd.concat(topic_momentum, ignore_index=True) if topic_momentum else pd.DataFrame(columns=['topic','month','ema_introduced'])

In [49]:
# Observed funnel per topic: distinct bills introduced, rows with a committee referral,
# rows with at least one floor pass (Assembly or Senate), and rows whose outcome equals 1.
topic_funnel_obs = (pipe_base.assign(any_comm=pipe_base['comm_ref'].notna().astype(int), any_floor=((pipe_base['asm_floor_pass'].notna()) | (pipe_base['sen_floor_pass'].notna())).astype(int)).groupby('topic').agg(introduced=('bill_ID','nunique'), reached_committee=('any_comm','sum'), reached_floor=('any_floor','sum'), passed_outcome=('outcome', lambda x: int(np.sum(np.array(x)==1)))).reset_index())


In [50]:
def _to_cpu(x):
    if isinstance(x, torch.Tensor):
        return x.detach().cpu()
    if isinstance(x, dict):
        return {k: _to_cpu(v) for k, v in x.items()}
    if isinstance(x, (list, tuple)):
        return [ _to_cpu(v) for v in x ]
    return x

def _to_list(x):
    if isinstance(x, torch.Tensor):
        return x.detach().cpu().numpy().tolist()
    if isinstance(x, np.ndarray):
        return x.tolist()
    return x

def _scalarize(v):
    if isinstance(v, torch.Tensor):
        if v.numel() == 1:
            return v.item()
        return v.detach().cpu().numpy().tolist()
    if isinstance(v, dict):
        return {k: _scalarize(u) for k, u in v.items()}
    if isinstance(v, list):
        return [ _scalarize(u) for u in v ]
    if isinstance(v, np.ndarray):
        return v.tolist()
    return v

def _as_df(obj):
    if isinstance(obj, pd.DataFrame):
        return obj
    if isinstance(obj, list) and len(obj) and isinstance(obj[0], dict):
        return pd.DataFrame([{k: _scalarize(v) for k, v in row.items()} for row in obj])
    if isinstance(obj, dict):
        keys = list(obj.keys())
        if all(isinstance(obj[k], (list, np.ndarray, torch.Tensor)) for k in keys):
            return pd.DataFrame({k: _to_list(obj[k]) for k in keys})
    if isinstance(obj, torch.Tensor):
        x = obj.detach().cpu().numpy()
        if x.ndim == 1:
            return pd.DataFrame({0: x})
        return pd.DataFrame(x)
    try:
        return pd.DataFrame(obj)
    except:
        return pd.DataFrame()

def _find(obj, name):
    if isinstance(obj, dict):
        if name in obj:
            return obj[name]
        for v in obj.values():
            r = _find(v, name)
            if r is not None:
                return r
    if isinstance(obj, (list, tuple)):
        for v in obj:
            r = _find(v, name)
            if r is not None:
                return r
    return None

# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects, so this
# file must come from a trusted source.
# map_location='cpu' lets a checkpoint saved with CUDA tensors load on a machine
# without a GPU; without it torch.load fails before _to_cpu can run.
ckpt = torch.load('legnn_eff_outputs.pt', map_location='cpu', weights_only=False)
ckpt = _to_cpu(ckpt)

# Locate the four result tables anywhere inside the checkpoint structure.
raw_per_bill = _find(ckpt, 'per_bill')
raw_actor_topic = _find(ckpt, 'actor_topic')
raw_actor_overall = _find(ckpt, 'actor_overall')
raw_committee_overall = _find(ckpt, 'committee_overall')

# Convert each table to a DataFrame; a missing table becomes an empty frame with the expected columns.
per_bill = _as_df(raw_per_bill) if raw_per_bill is not None else pd.DataFrame(columns=['bill_id','topic_id','p_pass_total'])
actor_topic = _as_df(raw_actor_topic) if raw_actor_topic is not None else pd.DataFrame(columns=['actor_id','actor_type','topic_id','stance','certainty','influence_delta_mean','engagement'])
actor_overall = _as_df(raw_actor_overall) if raw_actor_overall is not None else pd.DataFrame(columns=['actor_id','actor_type','overall_influence'])
committee_overall = _as_df(raw_committee_overall) if raw_committee_overall is not None else pd.DataFrame(columns=['committee_id','overall_influence','gate_index'])

# Normalise alternative column spellings to the names used downstream.
if 'bill_id' not in per_bill.columns and 'bill_ID' in per_bill.columns:
    per_bill = per_bill.rename(columns={'bill_ID':'bill_id'})
if 'topic' in per_bill.columns and 'topic_id' not in per_bill.columns:
    per_bill = per_bill.rename(columns={'topic':'topic_id'})
if 'p_pass' in per_bill.columns and 'p_pass_total' not in per_bill.columns:
    per_bill = per_bill.rename(columns={'p_pass':'p_pass_total'})

if 'actor' in actor_topic.columns and 'actor_id' not in actor_topic.columns:
    actor_topic = actor_topic.rename(columns={'actor':'actor_id'})
if 'overall' in actor_overall.columns and 'overall_influence' not in actor_overall.columns:
    actor_overall = actor_overall.rename(columns={'overall':'overall_influence'})
# When no gate index was exported, reuse the overall influence as a stand-in.
if 'gate_index' not in committee_overall.columns and 'overall_influence' in committee_overall.columns:
    committee_overall['gate_index'] = committee_overall['overall_influence']


In [51]:
# Baseline per topic: the unweighted mean of its routes' pass rates (each route counts once,
# regardless of how many bills took it).
topic_route_baseline = route_archetypes.groupby('topic')['pass_rate'].mean().reset_index().rename(columns={'pass_rate':'topic_route_baseline'})
route_baseline = route_archetypes.merge(topic_route_baseline, on='topic', how='left')
# Positive uplift: the route passes more often than the average route in its topic.
route_baseline['route_uplift_vs_topic'] = route_baseline['pass_rate'] - route_baseline['topic_route_baseline']

In [52]:
# Add the mean modelled pass probability per topic to the observed funnel.
if not per_bill.empty and {'bill_id','p_pass_total'}.issubset(per_bill.columns):
    pb_topic = pd.DataFrame({'bill_ID': list(bill_labels.keys()), 'topic': list(bill_labels.values())})
    # The topic used for grouping comes from bill_labels. per_bill's own topic_id is
    # kept under its original name: renaming it to 'topic' would collide with
    # pb_topic['topic'], produce topic_x/topic_y and break groupby('topic').
    per_bill_for_join = (
        per_bill.rename(columns={'bill_id':'bill_ID'})
                .drop(columns=['topic'], errors='ignore')
    )
    modeled_join = pb_topic.merge(per_bill_for_join, on='bill_ID', how='left')
    topic_modeled_pass = modeled_join.groupby('topic')['p_pass_total'].mean().reset_index().rename(columns={'p_pass_total':'modeled_pass_mean'})
    topic_funnel_modeled = topic_funnel_obs.merge(topic_modeled_pass, on='topic', how='left')
else:
    # No usable model output: keep the observed funnel with an empty modelled column.
    topic_funnel_modeled = topic_funnel_obs.assign(modeled_pass_mean=np.nan)



In [53]:
# Digest rows that have both a bill_id and text.
readability_df = digests[['bill_id','DigestText']].dropna().copy()
def _readability(s):
    txt = str(s)
    words = re.findall(r'\w+', txt)
    sents = re.split(r'[.!?]+', txt)
    syllables = sum([len(re.findall(r'[aeiouyAEIOUY]+', w)) for w in words])
    W = max(1,len(words)); S = max(1,len([x for x in sents if x.strip()]))
    fk = 206.835 - 1.015*(W/S) - 84.6*(syllables/W)
    return fk
# Score every digest. NOTE(review): the column is named 'flesch_kincaid', but the constants in
# _readability (206.835, 1.015, 84.6) are those of the Flesch Reading Ease score - confirm the
# intended metric before renaming anything.
readability_df['flesch_kincaid'] = readability_df['DigestText'].apply(_readability)
readability_df['bill_ID'] = readability_df['bill_id'].map(bv2b)
readability_df = readability_df.dropna(subset=['bill_ID'])
readability_outcomes = readability_df.merge(y_df[['bill_ID','outcome']], on='bill_ID', how='left')
# Mean score across all digest rows of each bill.
readability_stats = readability_outcomes.groupby('bill_ID').agg(fk_mean=('flesch_kincaid','mean')).reset_index()

In [54]:
# Version/digest rows joined with the floor-pass stage columns.
# The late_drift_flags computation later in this cell groups `dv`, not this frame.
late_drift = dv.copy()
late_drift = late_drift.merge(stages_df[['bill_ID','asm_floor_pass','sen_floor_pass']], on='bill_ID', how='left')
def _late_flag(df):
    """Flag a bill whose final digest change is among its largest.

    Versions are ordered by 'VersionNum' and each consecutive pair is scored as
    1 - `_jaccard` of their token sets. 'last_delta' is the score of the final
    pair, and 'late_drift_flag' is 1 when the 95th percentile of all scores is
    at or below it. With fewer than two versions the result is a flag of 0 and
    a NaN delta.
    """
    ordered = df.sort_values('VersionNum')
    token_sets = [set(_tokenize(text)) for text in ordered['DigestText']]
    deltas = [1-_jaccard(prev, cur) for prev, cur in zip(token_sets, token_sets[1:])]
    if not deltas:
        return pd.Series({'late_drift_flag':0,'last_delta':np.nan})
    final_delta = deltas[-1]
    flagged = int(np.percentile(deltas,95)<=final_delta)
    return pd.Series({'late_drift_flag': flagged, 'last_delta': final_delta})

# Per-bill late-drift flag and size of the last digest change.
# include_groups=False keeps the grouping column out of the frame handed to
# _late_flag (it only reads VersionNum and DigestText) and avoids the pandas
# deprecation warning for apply operating on grouping columns.
late_drift_flags = (
    dv.groupby('bill_ID')
      .apply(_late_flag, include_groups=False)
      .reset_index()
)

  .apply(_late_flag)


In [55]:
# Append the readability and late-drift columns to bills_table.
# NOTE: bills_table is reassigned from itself, so re-running this cell without rebuilding
# bills_table merges the same columns again (pandas then adds _x/_y suffixes).
bills_table = bills_table.merge(readability_stats, on='bill_ID', how='left').merge(late_drift_flags, on='bill_ID', how='left')

In [56]:
# Roll-call summaries that carry a bill_id.
summary_roll_with_bill = summary_roll.copy()
summary_roll_with_bill = summary_roll_with_bill.dropna(subset=['bill_id'])
# Floor roll calls only (location codes AFLOOR / SFLOOR).
floor_only = summary_roll_with_bill[summary_roll_with_bill['location_code'].isin(['AFLOOR','SFLOOR'])].copy()
floor_only['Date'] = pd.to_datetime(floor_only['Date'])
# The floor roll call with the latest Date for each bill_id.
last_floor = floor_only.sort_values('Date').groupby('bill_id').tail(1)[['bill_id','yes','no','total','Date']]

In [57]:
# Individual floor votes cast on the date of each bill's last floor roll call.
vv_floor = vv[vv['location_code'].isin(['AFLOOR','SFLOOR'])].copy().rename(columns={'bill_ID':'bill_id'})
vv_floor['Date'] = pd.to_datetime(vv_floor['Date'])
vv_floor = vv_floor.merge(last_floor[['bill_id','Date']], on=['bill_id','Date'], how='inner')
vv_floor_major = vv_floor[vv_floor['party'].isin(['D','R'])]
# Mean yes-rate per party on that final floor vote, one column per party.
bill_party_rates = (
    vv_floor_major.groupby(['bill_id','party'])['yes'].mean()
                  .unstack('party')
                  .reset_index()
                  .rename(columns={'D':'yes_D_last','R':'yes_R_last'})
)
# Guarantee both party columns exist even when one party cast no matching floor
# vote (same guard as the yes_D / yes_R handling for roll-call splits); otherwise
# the column selection below raises KeyError.
for _party_col in ['yes_D_last','yes_R_last']:
    if _party_col not in bill_party_rates.columns:
        bill_party_rates[_party_col] = np.nan
bill_party_rates[['yes_D_last','yes_R_last']] = bill_party_rates[['yes_D_last','yes_R_last']].astype(float)

# For each bill_id, pick the roll call with the smallest absolute yes-minus-no margin.
closest_vote = summary_roll_with_bill.copy()
closest_vote['diff'] = (closest_vote['yes'] - closest_vote['no']).abs()
closest_vote = closest_vote.sort_values(['bill_id','diff'])
# After sorting, the first row of each bill_id group is its closest vote.
closest_pick = closest_vote.groupby('bill_id').head(1)[['bill_id','yes','no','total']]

In [58]:
def _entropy_row(r):
    y = float(r['yes']); n = float(r['no']); t = float(r['total'])
    if t<=0: return 0.0
    a = max(t - y - n, 0.0)
    p = np.array([y,t - y - n,n], dtype=np.float64)/t
    p = p[p>0]
    return float(-(p*np.log(p)).sum())
# controversiality = 1 - |yes - no| / total on the closest vote: 1 for a tie, 0 for a
# unanimous vote, NaN when total is 0.
closest_pick['controversiality'] = 1 - (closest_pick['yes'] - closest_pick['no']).abs()/closest_pick['total'].replace(0,np.nan)
closest_pick['vote_entropy'] = closest_pick.apply(_entropy_row, axis=1)

# Mean yes/total ratio across a bill's roll calls (roll calls with total == 0 are ignored).
# include_groups=False keeps the grouping column out of the frame handed to the
# lambda (it only reads 'yes' and 'total') and avoids the pandas deprecation
# warning for apply operating on grouping columns.
mean_yes_ratio_versions = (
    roll.groupby('bill_ID')
        .apply(lambda g: float(np.nanmean((g['yes']/g['total'].replace(0,np.nan)).values)) if len(g)>0 else np.nan, include_groups=False)
        .reset_index()
        .rename(columns={0:'mean_yes_ratio_versions'})
)

# Legislative term of each bill, derived from its first action date.
bill_term = first_last[['bill_ID','First_action']].copy()
bill_term['term'] = _safe_dt(bill_term['First_action']).apply(_term_from_date)

  mean_yes_ratio_versions = roll.groupby('bill_ID').apply(lambda g: float(np.nanmean((g['yes']/g['total'].replace(0,np.nan)).values)) if len(g)>0 else np.nan).reset_index().rename(columns={0:'mean_yes_ratio_versions'})


In [59]:
# Start from one row per (bill_ID, Last_action) and attach the history rows dated on that last action.
bt = bills_table[['bill_ID', 'Last_action']].drop_duplicates()
bt['Last_action'] = pd.to_datetime(bt['Last_action'])
bt = bt.merge(history, left_on=['bill_ID', 'Last_action'], right_on=['bill_ID', 'Date'], how='left')
# NOTE(review): the 'bill_id' join key is not in bt's starting columns, so it presumably comes
# from `history` - confirm. Each matching author adds a row.
bt = bt.merge(authors[['bill_id', 'Contribution', 'Name']], on='bill_id', how='left')

# Normalise the raw Contribution strings (including the data="..." variants) to a role name.
author_type_map = {
    'LEAD_AUTHOR': 'LEAD_AUTHOR',
    'PRINCIPAL_COAUTHOR': 'PRINCIPAL_COAUTHOR',
    'COAUTHOR': 'COAUTHOR',
    'data="COAUTHOR"': 'COAUTHOR',
    'data="LEAD_AUTHOR"': 'LEAD_AUTHOR',
    'data="PRINCIPAL_COAUTHOR"': 'PRINCIPAL_COAUTHOR',
    'nan': 'AUTHOR'
}
# Rank of each role; a higher number is a more senior role.
author_levels = {
    'AUTHOR': 1,
    'COAUTHOR': 1,
    'PRINCIPAL_COAUTHOR': 2,
    'LEAD_AUTHOR': 3
}

# Unmapped or missing contributions default to 'AUTHOR'.
bt['author_type'] = bt['Contribution'].map(author_type_map).fillna('AUTHOR')
bt['author_level'] = bt['author_type'].map(author_levels).fillna(0).astype(int)

def _primary_authors(g):
    g = g.sort_values('author_level', ascending=False)
    primary = g[g['author_level'] == g['author_level'].max()]
    return list(set(primary['Name'].tolist()))

# Compute one author list per bill, then look it up for every row through that row's bill_ID.
# Assigning the per-bill result after reset_index aligns it by row position (0..k-1) and
# attaches the lists to unrelated rows; mapping on bill_ID keeps them on the right bill.
# include_groups=False: _primary_authors only reads 'author_level' and 'Name'.
_authors_by_bill = bt.groupby('bill_ID').apply(_primary_authors, include_groups=False)
bt['primary_authors'] = bt['bill_ID'].map(_authors_by_bill)

  bt['primary_authors'] = bt.groupby('bill_ID', group_keys=False).apply(_primary_authors).reset_index(level=0, drop=True)


In [60]:
# One row per bill. bt holds one row per (bill, history row, author), so without
# de-duplication the later merge on bill_ID multiplies bill_insights rows.
# Dedupe on bill_ID only, since the list-valued column is not hashable.
lead_author_df = bt[['bill_ID', 'primary_authors']].drop_duplicates(subset=['bill_ID']).copy()

In [76]:
# Start the per-bill insight table from every labelled bill.
bill_insights = pd.DataFrame({'bill_ID': list(bill_labels.keys())})
# bill_id_raw is the first entry of version_id_mapping2[bill_ID], or NaN when that entry is empty.
bill_insights = bill_insights.merge(pd.DataFrame({'bill_ID': list(version_id_mapping2.keys()), 'bill_id_raw': [version_id_mapping2[k][0] if len(version_id_mapping2[k])>0 else np.nan for k in version_id_mapping2.keys()]}), on='bill_ID', how='left')
bill_insights['topic'] = bill_insights['bill_ID'].map(bill_labels)

In [77]:
# Add a 'bill_ID' column (a copy of 'bill_id') so these frames can be merged on 'bill_ID' below.
# NOTE(review): this assumes their bill_id values are already in bill_ID form - confirm.
last_floor['bill_ID'] = last_floor['bill_id']
closest_pick['bill_ID'] = closest_pick['bill_id']
bill_party_rates['bill_ID'] = bill_party_rates['bill_id']

In [78]:
# Left-join every per-bill metric onto the labelled-bill list.
bill_insights = bill_insights.merge(lead_author_df, on='bill_ID', how='left').merge(mean_yes_ratio_versions, on='bill_ID', how='left').merge(bill_party_rates, on='bill_ID', how='left').merge(last_floor[['bill_ID','yes','no','total']], on='bill_ID', how='left').merge(closest_pick[['bill_ID','controversiality','vote_entropy']], on='bill_ID', how='left').merge(bill_term[['bill_ID','term']], on='bill_ID', how='left')
# Absolute gap between the two parties' yes-rates on the last floor vote.
bill_insights['bill_polarization'] = (bill_insights['yes_D_last'] - bill_insights['yes_R_last']).abs()
# 1 when the parties fall strictly on opposite sides of 0.5; bills with a missing rate get 0.
bill_insights['bill_party_line'] = np.where(((bill_insights['yes_D_last']>0.5) & (bill_insights['yes_R_last']<0.5)) | ((bill_insights['yes_D_last']<0.5) & (bill_insights['yes_R_last']>0.5)), 1, 0)
bill_insights = bill_insights.rename(columns={'controversiality':'bill_controversiality','vote_entropy':'bill_vote_entropy'})
# Keep only the published columns, in this order.
bill_insights = bill_insights[['bill_ID','bill_id_raw','topic','term','primary_authors','mean_yes_ratio_versions','bill_polarization','bill_party_line','bill_controversiality','bill_vote_entropy']]

In [81]:
# Per (topic, term): mean polarization, share of party-line bills, median controversiality,
# median vote entropy and the number of distinct bills.
topic_partisanship_summary = (bill_insights.groupby(['topic','term']).agg(mean_polarization=('bill_polarization','mean'), party_line_share=('bill_party_line','mean'), controversiality_index=('bill_controversiality','median'), vote_entropy=('bill_vote_entropy','median'), n_bills=('bill_ID','nunique')).reset_index())

In [82]:
# Routed bills with the legislative term derived from their first action date.
route_with_term = route_df.merge(first_last[['bill_ID','First_action']], on='bill_ID', how='left')
route_with_term['term'] = _safe_dt(route_with_term['First_action']).apply(_term_from_date)

def _route_key_entropy(keys):
    """Shannon entropy (natural log) of the route_key frequencies in one group.

    Returns NaN when the group has no non-missing route_key.
    """
    shares = pd.Series(keys).value_counts(normalize=True)
    if shares.empty:
        return np.nan
    return float(-np.sum(shares.apply(lambda p: p*np.log(p))))

# The emptiness guard belongs inside the per-group function. As a conditional
# around the agg tuple it tested the whole frame once and handed np.nan to
# .agg() as an aggregation spec whenever the frame was empty.
route_entropy = (
    route_with_term.groupby(['topic','term'])
                   .agg(route_entropy=('route_key', _route_key_entropy),
                        n_routes=('route_key','nunique'))
                   .reset_index()
)

def _betweenness_rows(r):
    if not isinstance(r, tuple) or len(r)<3:
        return []
    return list(r[1:-1])
# One record per (interior committee, bill): a committee counts as "interior"
# when it is neither the first nor the last stop of the bill's route.
tmp = route_with_term[['bill_ID','topic','term','route']].dropna()
mid_rows = [
    (committee, topic, term, bill)
    for bill, topic, term, route in tmp.itertuples(index=False, name=None)
    for committee in _betweenness_rows(route)
]
mid_df = pd.DataFrame(mid_rows, columns=['committee','topic','term','bill_ID'])

if not mid_df.empty:
    # Distinct bills on which each committee is an interior stop, per topic/term.
    counts = (
        mid_df
        .groupby(['committee','topic','term'])['bill_ID']
        .nunique()
        .reset_index(name='mid_count')
    )
    # Distinct bills per topic/term across all routes: the denominator.
    totals = (
        route_with_term
        .groupby(['topic','term'])['bill_ID']
        .nunique()
        .reset_index(name='n_bills')
    )
    committee_betweenness_proxy = counts.merge(totals, on=['topic','term'], how='left')
    # A zero denominator becomes NaN rather than producing inf.
    _denominator = committee_betweenness_proxy['n_bills'].replace(0,np.nan)
    committee_betweenness_proxy['proxy'] = committee_betweenness_proxy['mid_count']/_denominator
    committee_betweenness_proxy = committee_betweenness_proxy[['committee','topic','term','proxy']]
else:
    committee_betweenness_proxy = pd.DataFrame(columns=['committee','topic','term','proxy'])

In [92]:
# Distinct (date, bill, voting place) combinations present in `voting`.
voting.loc[:, ['Date', 'bill_ID', 'voting_place']].drop_duplicates()

Unnamed: 0,Date,bill_ID,voting_place
0,2002-01-24,200120020AB323,appropriations
21,2002-08-21,200120020SB902,budget
101,2001-05-29,200120020ACR6,transportation
140,2001-05-03,200120020ACR6,budget
219,2001-09-13,200120020SB273,transportation
...,...,...,...
5962452,2018-05-25,201720180AB2184,appropriations
5962829,2018-08-23,201720180AB2143,rules
5962909,2018-08-20,201720180AB2143,budget and fiscal review
5963071,2017-09-11,201720180AB1264,budget


In [95]:
def _ordered_unique_committees(route):
    """Return the string entries of `route`, de-duplicated, in first-seen order."""
    seen = {}
    for stop in list(route):
        if isinstance(stop, str):
            seen.setdefault(stop, None)
    return list(seen)


# Sponsor/bill pairs, re-keyed to `bill_ID` through the `bv2b` mapping.
sponsors_min = sponsors[['bill_id','Name']].dropna().copy()
sponsors_min['bill_ID'] = sponsors_min['bill_id'].map(bv2b)

# Keep only sponsored bills that have a route, then emit one row per committee.
coal = (
    sponsors_min
    .merge(route_with_term[['bill_ID','route']], on='bill_ID', how='left')
    .dropna(subset=['route'])
)
coal['committees'] = coal['route'].apply(_ordered_unique_committees)
coal = coal.explode('committees')

# Distinct committees per (author, bill), then summed over each author's bills.
auth_comms = (
    coal
    .groupby(['Name','bill_ID'])['committees']
    .nunique()
    .reset_index()
)
auth_span = (
    auth_comms
    .groupby('Name')
    .agg(bills=('bill_ID','nunique'), committees=('committees','sum'))
    .reset_index()
)

# Average number of committees touched per bill; NaN for authors with no bills.
_committees_per_bill = auth_span['committees']/auth_span['bills']
auth_span['breadth_norm'] = np.where(auth_span['bills']>0, _committees_per_bill, np.nan)
author_coalition_breadth = auth_span[['Name','breadth_norm']].rename(columns={'Name':'author'})

In [106]:
# Distinct vote events, keyed like `hear`: lowercase `bill_id` plus a
# "<chamber> <voting_place>" committee label.
votes = (
    voting[['Date', 'bill_ID', 'voting_place', 'chamber']]
    .drop_duplicates()
    .rename(columns={'bill_ID': 'bill_id'})
    .assign(committee_clean=lambda v: v['chamber'] + ' ' + v['voting_place'])
)

# Left join: hearings without a matching vote event keep NaN in the vote columns.
hear_month = hear.merge(votes, on=['bill_id', 'committee_clean'], how='left')

In [107]:
# Calendar month ("YYYY-MM") of each row's `Date`.
# Converting a missing date with `.dt.to_period('M').astype(str)` yields the
# literal string 'NaT', which `dropna` does not treat as missing. Mask those
# rows back to NaN so the `dropna` below removes them instead of counting
# 'NaT' as a month in the workload statistics.
_hear_dates = pd.to_datetime(hear_month['Date'])
hear_month['month'] = _hear_dates.dt.to_period('M').astype(str).where(_hear_dates.notna())
hear_month = hear_month.dropna(subset=['committee_clean','month'])

# Distinct bills per committee and month, then the median month per committee.
mload = hear_month.groupby(['committee_clean','month'])['bill_id'].nunique().reset_index(name='hearings')
committee_workload_median = mload.groupby('committee_clean')['hearings'].median().reset_index().rename(columns={'committee_clean':'committee','hearings':'median_monthly_hearings'})

In [111]:
def stringify_term_column(frame):
    """Cast `frame['term']` to `str` in place.

    Does nothing unless `frame` is a DataFrame that has a `term` column.
    Returns True when the column was cast and False when `frame` was left
    untouched.
    """
    if isinstance(frame, pd.DataFrame) and 'term' in frame.columns:
        frame['term'] = frame['term'].astype(str)
        return True
    return False


# Frames are looked up by *variable* name, and names that are not defined are
# skipped. `reg_funds_` is listed explicitly because it is exported under the
# key 'ca_legislator_funding' and would otherwise miss the `term` cast that the
# other exported frames receive.
for df_name in ['pipeline_stage_funnel','pipeline_timestamps_wide','pipeline_stuck_candidates','route_archetypes','amendment_churn','risk_list','committee_gatekeeping','committee_workload_median','cross_chamber_friction','survival_curves','vote_similarity_edges','vote_communities','committee_floor_drift','text_lift_top_tokens','donor_portfolios_hhi','money_vote_alignment','money_event_time_curve','ca_legislator_funding','bills_table','topic_controversy','rollcall_party_splits','topic_funding_by_term','topic_funding_by_leg','donor_topic_by_term','topic_momentum','topic_funnel_obs','topic_funnel_modeled','route_baseline','bill_insights','topic_partisanship_summary','route_entropy','committee_betweenness_proxy','author_coalition_breadth','reg_funds_']:
    if df_name in globals():
        stringify_term_column(globals()[df_name])

# Export name -> precomputed frame. Three export names differ from the variable
# they point at: `route_uplift_baseline` (route_baseline),
# `ca_legislator_funding_geo` (ca_legislator_funding) and
# `ca_legislator_funding` (reg_funds_).
precomp_outputs = dict(
    pipeline_stage_funnel=pipeline_stage_funnel,
    pipeline_timestamps_wide=pipeline_timestamps_wide,
    pipeline_stuck_candidates=pipeline_stuck_candidates,
    route_archetypes=route_archetypes,
    route_uplift_baseline=route_baseline,
    amendment_churn=amendment_churn,
    risk_list=risk_list,
    committee_gatekeeping=committee_gatekeeping,
    committee_workload_median=committee_workload_median,
    cross_chamber_friction=cross_chamber_friction,
    survival_curves=survival_curves,
    vote_similarity_edges=vote_similarity_edges,
    vote_communities=vote_communities,
    committee_floor_drift=committee_floor_drift,
    text_lift_top_tokens=text_lift_top_tokens,
    donor_portfolios_hhi=donor_portfolios_hhi,
    money_vote_alignment=money_vote_alignment,
    money_event_time_curve=money_event_time_curve,
    ca_legislator_funding_geo=ca_legislator_funding,
    ca_legislator_funding=reg_funds_,
    bills_table=bills_table,
    topic_controversy=topic_controversy,
    rollcall_party_splits=rollcall_party_splits,
    topic_funding_by_term=topic_funding_by_term,
    topic_funding_by_leg=topic_funding_by_leg,
    donor_topic_by_term=donor_topic_by_term,
    topic_momentum=topic_momentum,
    topic_funnel_obs=topic_funnel_obs,
    topic_funnel_modeled=topic_funnel_modeled,
    bill_insights=bill_insights,
    topic_partisanship_summary=topic_partisanship_summary,
    route_entropy=route_entropy,
    committee_betweenness_proxy=committee_betweenness_proxy,
    author_coalition_breadth=author_coalition_breadth,
)

# Export name -> model-derived frame; here each name equals its variable.
model_outputs = dict(
    per_bill=per_bill,
    actor_topic=actor_topic,
    actor_overall=actor_overall,
)

In [114]:
# Write every collected frame to outs/<name>.parquet without its index.
# `to_parquet` does not create missing directories, so make sure the target
# folder exists first; otherwise the export fails on a fresh checkout.
pathlib.Path('outs').mkdir(parents=True, exist_ok=True)
for d in [precomp_outputs, model_outputs]:
    for k, v in d.items():
        v.to_parquet(f'outs/{k}.parquet', index=False)