In [68]:
import pandas as pd
import re
import numpy as np
import warnings
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch
from collections import defaultdict, deque

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
# Load the raw CSV inputs. All paths are relative to the notebook's working directory.
summary_votes = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')

# Status/location columns are read as str.
bill_history = pd.read_csv('ca_leg/legislation_data/bill_history_tbl.csv', dtype={'action_status': str, 'primary_location': str, 'secondary_location': str, 'end_status': str})

authors = pd.read_csv('ca_leg/legislation_data/authors.csv')

history = pd.read_csv('ca_leg/legislation_data/history.csv')

versions = pd.read_csv('ca_leg/legislation_data/bill_versions.csv')

# Member-level votes; only 'session_date' is parsed as a date on load.
bill_votes = pd.read_csv('ca_leg/legislation_data/bill_detail_vote_tbl.csv', parse_dates=['session_date'])

# NOTE(review): same file as `summary_votes` above, loaded a second time under another name.
bill_summary = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')

bill_motions = pd.read_csv('ca_leg/legislation_data/bill_motion_tbl.csv')

locations = pd.read_csv('ca_leg/legislation_data/committee_codes.csv')


politicians = pd.read_csv('ca_leg/legislation_data/politicians.csv')


# Lobbying and campaign-expenditure tables live under calaccess/.
lobbying = pd.read_csv('calaccess/lobbying_clean2.csv', dtype={'PAYEE_NAMS': str, 'BAKREF_TID': str})


expend_assembly = pd.read_csv('calaccess/expend_assembly_matched.csv', dtype={'TargetPropositionName': str})


expend_senate = pd.read_csv('calaccess/expend_senate_matched.csv', dtype={'TargetPropositionName': str})


digests = pd.read_csv('ca_leg/legislation_data/digest.csv')


hearings = pd.read_csv('ca_leg/legislation_data/committee_hearing_tbl.csv')

In [3]:
# Reduce the vote timestamp to a 'YYYY-MM-DD' string (time of day is dropped).
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time']).apply(lambda x: x.strftime('%Y-%m-%d'))

# Replace every character that is neither a word character nor whitespace with a space.
# NOTE(review): re.sub raises TypeError on a missing (NaN) legislator_name — confirm the column has none.
bill_votes['legislator_name'] = bill_votes['legislator_name'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

# Motion labels recognised by extract_action. Order matters: the first keyword
# found in the motion text wins, so longer/more specific phrases come before
# their shorter prefixes (e.g. "Do pass as amended" before "Do pass").
ACTION_KEYWORDS = [
    "Assembly Third Reading", "Assembly 3rd reading", 'senate 3rd reading', "Senate Third Reading",
    "Concurrence - Urgency Added", "Concurrence in Senate Amendments",
    "Do pass as amended, and re-refer", "Do pass as amended, but re-refer",
    "Do pass as amended", "Do pass and be re-referred",
    "Concurrence", "Consent Calendar", "Urgency Clause", "Special Consent",
    "Motion to Reconsider", "Do pass", "Reconsideration", "Committee amendments",
    "W/O REF. TO FILE", "Be re-referred to the Committee",
    "Lay on the Table", "Amend by", "Unfinished Business", "Placed on Appropriations Suspense File",
]


def extract_action(motion_text):
    """Map free-form motion text to a simplified action label.

    The text is matched case-insensitively against ACTION_KEYWORDS and the
    first keyword contained in it is returned. If the text also mentions
    "reconsider", the label is suffixed with " Reconsideration" unless the
    label already expresses reconsideration.

    Parameters
    ----------
    motion_text : str or any
        Raw motion text. Non-string values (None, NaN, ...) yield None.

    Returns
    -------
    str or None
        The simplified label, or None when nothing matches.
    """
    if not isinstance(motion_text, str):
        return None
    motion = motion_text.upper()

    action = next((act for act in ACTION_KEYWORDS if act.upper() in motion), None)
    if 'RECONSIDER' in motion:
        if action is None:
            return 'Reconsideration'
        # Avoid labels such as "Motion to Reconsider Reconsideration".
        if 'RECONSIDER' not in action.upper():
            action += ' Reconsideration'

    return action

bill_motions['simplified_motion'] = bill_motions['motion_text'].apply(extract_action)

# Build {committee_code: normalised "<chamber> <committee name>"} for Senate (CS*)
# and Assembly (CX*) committee codes. CZ* codes are skipped on purpose.
clean_coms = {}
for i, row in locations.iterrows():
    code = row['committee_code']
    if code.startswith('CZ'):
        continue
    name = row['committee_name']
    if code.startswith('CS'):
        if name.startswith('Sen.'):
            # Anchored and escaped so only the leading abbreviation is expanded.
            cname = re.sub(r'^Sen\.\s*', 'senate ', name).lower()
        elif name.startswith('Senate '):
            cname = name.lower()
        else:
            cname = 'senate ' + name.lower()
    elif code.startswith('CX'):
        if name.lower().startswith('assembly'):
            cname = name.lower()
        else:
            cname = 'assembly ' + name.lower()
    else:
        # Any other prefix previously fell through and silently reused the
        # previous row's cname (or raised NameError on the first row).
        continue
    # A trailing "x<digit>" is rewritten as "no. <digit>".
    if re.search(r'x\d$', cname) is not None:
        cname = re.sub(r'x(?=\d$)', 'no. ', cname)
    clean_coms[code] = cname


In [4]:
from rapidfuzz import fuzz, process

# Distinct "<chamber> <committee>" labels (lower-cased) for every committee/chamber pair in `politicians`.
_committee_pairs = politicians[['committee_clean', 'chamber']].drop_duplicates()
leg_committees = [
    f"{chamber} {committee}".lower()
    for committee, chamber in zip(_committee_pairs['committee_clean'], _committee_pairs['chamber'])
]

def match_committees(_names, clean_coms, threshold=92, fallback_margin=8):
    """Fuzzy-match each cleaned committee name to one of `_names`.

    For every (code, name) in `clean_coms` two rapidfuzz scorers are tried
    (token_sort_ratio, then partial_ratio) with `threshold` as cutoff and the
    higher-scoring hit is kept (token_sort_ratio wins ties). If neither
    scorer produces a hit, token_sort_ratio is retried with the cutoff lowered
    by `fallback_margin`.

    Parameters
    ----------
    _names : list of str
        Candidate names to match against.
    clean_coms : dict
        Mapping of committee code -> cleaned committee name.
    threshold : int or float, default 92
        Minimum score for the primary matching pass.
    fallback_margin : int or float, default 8
        How far the cutoff is relaxed for the fallback pass.

    Returns
    -------
    dict
        committee code -> matched name from `_names`, or None if no match.
    """
    # Imported here on purpose: a later cell rebinds the global name `fuzz`
    # to fuzzywuzzy, which would silently change the scorers if this function
    # were (re)run after that cell.
    from rapidfuzz import fuzz, process

    name_mapping = {}
    for code, clean in clean_coms.items():
        primary_hits = [
            process.extractOne(clean, _names, scorer=scorer, score_cutoff=threshold)
            for scorer in (fuzz.token_sort_ratio, fuzz.partial_ratio)
        ]
        primary_hits = [hit for hit in primary_hits if hit is not None]
        if primary_hits:
            # Each hit is (choice, score, index); keep the best-scoring choice.
            name_mapping[code] = max(primary_hits, key=lambda hit: hit[1])[0]
            continue

        fall_back = process.extractOne(
            clean,
            _names,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=threshold - fallback_margin,
        )
        name_mapping[code] = fall_back[0] if fall_back is not None else None
    return name_mapping

# committee_code -> best-matching entry of leg_committees (None when nothing clears the cutoffs).
committee_matches = match_committees(leg_committees, clean_coms)

In [5]:
# Attach the fuzzy-matched committee name to every committee code.
locations['committee_clean'] = locations['committee_code'].map(committee_matches)

# Manual overrides for specific committee names/codes.
# NOTE(review): these literals are mixed-case and some lack a chamber prefix, whereas the
# fuzzy-matched values come from the lower-cased leg_committees list — confirm that
# downstream comparisons on committee_clean are case-insensitive.
locations.loc[locations['committee_name'] == 'EDUCATION X5', 'committee_clean'] = 'Budget and Fiscal Review: Education'
locations.loc[locations['committee_code'] == 'CX12', 'committee_clean'] = 'Budget No. 1 on Health and Human Services'
locations.loc[locations['committee_code'] == 'CS68', 'committee_clean'] = 'Budget No. 3 - Health and Human Services'
locations.loc[locations['committee_code'] == 'CS66', 'committee_clean'] = 'Senate Veterans Affairs'
locations.loc[locations['committee_code'] == 'CS56', 'committee_clean'] = 'Senate Public Employment and Retirement'
locations.loc[locations['committee_code'] == 'CS62', 'committee_clean'] = 'Senate Budget and Fiscal Review'
locations.loc[locations['committee_code'] == 'CX23', 'committee_clean'] = 'Assembly Utilities and Commerce'

# motion_id -> simplified motion label; for a repeated motion_id the last row wins.
motion_codes = {
    row['motion_id']: row['simplified_motion']
    for _, row in bill_motions.iterrows()
}

# Overwrites/creates summary_votes['motion_text'] with the simplified label.
summary_votes['motion_text'] = summary_votes['motion_id'].map(motion_codes)

def repair_bill_id(id):
    """Insert the following year after a leading four-digit year.

    When the first four characters of `id` are all digits, the returned string
    is ``<year><year + 1><rest of id>``; any other input is returned unchanged.
    """
    year_prefix = id[:4]
    if re.fullmatch(r'\d{4}', year_prefix) is None:
        return id
    following_year = int(year_prefix) + 1
    return f"{year_prefix}{following_year}{id[4:]}"

In [6]:
# 'ID' is the version-level bill_id with the following year inserted after a leading 4-digit year.
versions['ID'] = versions['bill_id'].apply(lambda x: repair_bill_id(x))

# Keep only versions whose bill_id starts with '2'.
# NOTE(review): bill_vers is a .loc slice of `versions` that is written to below; this only
# stays quiet because SettingWithCopyWarning is silenced at the top of the notebook.
bill_vers = versions.loc[versions['bill_id'].str.startswith('2')]
for i, row in bill_vers.iterrows():
    # Text formed from the version number and measure state, removed from bill_id below.
    # NOTE(review): `tail` is used as an unescaped regex and every occurrence is removed,
    # not just a trailing one — confirm this is safe for all bill_id values.
    tail = f"{row['VersionNum']}{row['MeasureState']}"
    repaired = repair_bill_id(re.sub(tail, '', row['bill_id']))
    # Last four characters must be numeric; int() drops their leading zeros.
    end = int(repaired[-4:])

    bill_vers.loc[i, 'bill_ID'] = f"{repaired[:-4]}{end}"


In [7]:
# Integer index -> entity name, built from the order of first appearance (unique()).
legislators = {i: pol for i, pol in enumerate(politicians['full_name'].unique().tolist())}

# full_name -> Party / Occupation; if a name has several distinct values the last row wins.
leg_parties = {row['full_name']: row['Party'] for _, row in politicians[['full_name', 'Party']].drop_duplicates().iterrows()}
leg_occupations = {row['full_name']: row['Occupation'] for _, row in politicians[['full_name', 'Occupation']].drop_duplicates().iterrows()}
committees = {i: com for i, com in enumerate(politicians['committee_clean'].unique().tolist())}
lobby_firms = {i: firm for i, firm in enumerate(lobbying['FIRM_NAME'].unique().tolist())}

# Union of expender names from both chambers.
# NOTE(review): list(set(...)) has no stable order, so donor indices can differ between runs.
donor_names = list(set(expend_assembly['ExpenderName'].unique().tolist() + expend_senate['ExpenderName'].unique().tolist()))
donors = {i: donor for i, donor in enumerate(donor_names)}


In [8]:
# bill_ID -> Title; when a bill has several distinct titles the last row wins.
bill_titles = {row['bill_ID']: row['Title'] for _, row in bill_vers[['bill_ID', 'Title']].drop_duplicates().iterrows()}

# bill_ID -> GeneralSubject, restricted to rows whose subject is a string (drops NaN).
bill_subjects = {row['bill_ID']: row['GeneralSubject'] for _, row in bill_vers.loc[bill_vers['GeneralSubject'].apply(lambda x: x is not None and isinstance(x, str)), ['bill_ID', 'GeneralSubject']].drop_duplicates().iterrows()}

# Every bill_id starting with '2' that appears in either vote table (unordered).
bill_ids = list(set(bill_votes.loc[bill_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist() + summary_votes.loc[summary_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist()))


In [9]:
# Version-level bill_id -> bill-level bill_ID.
bill_id_codes = {row['bill_id']: row['bill_ID'] for _, row in bill_vers.drop_duplicates(subset=['bill_id', 'bill_ID']).iterrows()}
history['bill_ID'] = history['bill_id'].map(bill_id_codes)

history['Date'] = pd.to_datetime(history['Date'])

# Per bill: the unique history dates and the date-sorted actions.
# 'Dates' keeps the original row order and is de-duplicated on Date only, while 'Actions' is
# sorted and de-duplicated on (Action, Date), so the two lists are not positionally aligned.
introduction_dates = {}
for v, group in history.loc[history['bill_ID'].isin(bill_ids)].groupby('bill_ID'):
    introduction_dates[v] = {'Dates': group['Date'].unique().tolist(), 'Actions': group.sort_values('Date', ascending=True).drop_duplicates(subset=['Action', 'Date'])['Action'].tolist()}


In [10]:
# bill_ID -> list of repaired version IDs ('ID' column).
version_id_mapping = {i: list(group.values) for i, group in bill_vers.groupby('bill_ID')['ID']}

# bill_ID -> list of raw version bill_ids, and its inverse (version bill_id -> bill_ID).
version_id_mapping2 = {i: list(group.values) for i, group in bill_vers.groupby('bill_ID')['bill_id']}
bv2b = {v: k for k, val in version_id_mapping2.items() for v in val}
# Overwrites the history['bill_ID'] column using the inverse mapping.
history['bill_ID'] = history['bill_id'].map(bv2b)

date_ranges = {}

# First and last history date per bill, taken from introduction_dates.
for k, v in introduction_dates.items():
    first, last = min(v['Dates']), max(v['Dates'])
    date_ranges[k] = {'First_action': first, 'Last_action': last}

# The outcome is derived from the Action on the most recent history row of each bill:
# 1 for the listed "passed" actions, -1 for VETOED, 0 otherwise.
# NOTE(review): when several rows share the latest Date, which one sorts first is not pinned down.
outcomes = history.loc[history['bill_ID'].notna()].sort_values('Date', ascending=False).groupby('bill_ID').first().reset_index()[['bill_ID', 'Action']]
outcomes.loc[outcomes['Action'].isin(['CHAPTERED', 'ENROLLED', 'FILED', 'APPROVED']), 'Outcome'] = 1
outcomes.loc[outcomes['Action'] == 'VETOED', 'Outcome'] = -1
outcomes.loc[outcomes['Outcome'].isna(), 'Outcome'] = 0

outcome = outcomes.set_index('bill_ID')['Outcome'].to_dict()

In [11]:
# (year, motion_id) -> list of distinct bill_ids voted on under that motion in that year.
vote_bill_ids = {}
for i in summary_votes.loc[summary_votes['bill_id'].isin(bill_ids)].groupby(['year', 'motion_id'])['bill_id'].value_counts().index:
    year, motion_id, bill_id = i
    if (year, motion_id) not in vote_bill_ids.keys():
        vote_bill_ids[(year, motion_id)] = [bill_id]
    else:
        vote_bill_ids[(year, motion_id)].append(bill_id)


# Versions that have a digest row (inner join on the version-level bill_id).
bill_vers_dig = bill_vers.merge(digests, on='bill_id', how='inner')

# (chamber, lower-cased last name, Term) -> full_name; the last row wins on collisions.
# NOTE(review): .lower() raises on a missing 'Last' value — confirm the column has none.
legislators_last_names = {}
for _, row in politicians[['chamber', 'Last', 'Term', 'full_name']].drop_duplicates().iterrows():
    legislators_last_names[(row['chamber'], row['Last'].lower(), row['Term'])] = row['full_name']

# Per-version feature dict keyed by the repaired version ID.
# Missing flag values default to 'No'. Values missing from a CSV are read by
# pandas as NaN (never None), so the previous `is not None` test never applied
# the default; pd.isna covers both None and NaN.
_FLAG_FIELDS = ('VoteRequired', 'VersionNum', 'LocalProgram', 'FiscalCommittee', 'TaxLevy', 'Urgency')

features = {}
for _, row in bill_vers_dig.iterrows():
    entry = {
        'digest': row['DigestText'],
        'MeasureState': row['MeasureState'],
    }
    for field in _FLAG_FIELDS:
        entry[field] = 'No' if pd.isna(row[field]) else row[field]
    # A repeated ID keeps the last row, as before.
    features[row['ID']] = entry



In [12]:
# Inverse lookups: name -> integer index.
legislator_codes = {v: k for k, v in legislators.items()}

# NOTE(review): v.lower() raises if `committees` contains a missing (NaN) committee name.
committee_codes = {v.lower(): k for k, v in committees.items()}

# Chamber from the location code: AFLOOR/CX* -> assembly, SFLOOR/CS* -> senate, anything else -> 'full'.
bill_votes['chamber'] = bill_votes['location_code'].apply(lambda x: 'assembly' if x == 'AFLOOR' or x.startswith('CX') else 'senate' if x == 'SFLOOR' or x.startswith('CS') else 'full')
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time'])
# Two-year term label "odd-even": odd years start a term; even years before Nov 2 belong to
# the term that started the previous year; even years from Nov 2 onward roll to the next term.
bill_votes['term'] = bill_votes['vote_date_time'].apply(lambda x: f"{x.year}-{x.year + 1}" if x.year % 2 == 1  else f"{x.year - 1}-{x.year}" if x.year % 2 == 0 and x < pd.Timestamp(year=x.year, month=11, day=2) else f"{x.year + 1}-{x.year + 2}")


In [13]:
# Authors whose House is 'UNKNOWN', limited to bills present in bill_ids.
author_locations = authors.loc[(authors['House'] == 'UNKNOWN') & (authors['bill_id'].map(bill_id_codes).isin(bill_ids)), ['bill_id', 'Name']].drop_duplicates()
# Prefix the author name with a chamber guessed from the bill_id text:
# 'AB' -> Assembly, 'SB' -> Senate, anything else -> Joint.
for i, row in author_locations.iterrows():
    if 'AB' in row['bill_id']:
        author_locations.loc[i, 'name'] = 'Assembly ' + row['Name']
    elif 'SB' in row['bill_id']:
        author_locations.loc[i, 'name'] = 'Senate ' + row['Name']
    else:
        author_locations.loc[i, 'name'] = 'Joint ' + row['Name']

In [14]:
from fuzzywuzzy import fuzz

def fuzzy_strings(source_list, target_list, threshold=60):
    """Match each source name to its most similar target name.

    Names are normalised (lower-cased, parenthesised text and "committee on"
    removed, non-letters collapsed to single spaces) and scored with a
    weighted blend of fuzzywuzzy ratios (token_sort 0.3, token_set 0.5,
    partial 0.2) plus a bonus of 10 per shared topic keyword, capped at 20.
    An exact match after normalisation always outranks any fuzzy match.

    Parameters
    ----------
    source_list : list
        Names to be matched; non-string entries never match.
    target_list : list
        Candidate names.
    threshold : int or float, default 60
        Minimum score required to accept the best candidate.

    Returns
    -------
    dict
        source name -> matched entry of `target_list`, or None.
    """
    # Imported here so the scorers do not depend on whichever library the
    # global name `fuzz` is currently bound to (the notebook imports `fuzz`
    # from two different packages).
    from fuzzywuzzy import fuzz

    def preprocess_name(name):
        if not isinstance(name, str):
            return ""
        name = name.lower()
        name = re.sub(r'\(.*?\)', '', name)
        name = re.sub(r'committee on', '', name)
        name = re.sub(r'[^a-z\s]', ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        return name

    clean_source = [preprocess_name(c) for c in source_list]
    clean_target = [preprocess_name(c) for c in target_list]

    keywords = ["education", "health", "finance", "budget", "transportation",
                "judiciary", "environment", "agriculture", "energy", "labor",
                "housing", "veterans affairs", "public safety", "insurance", "banking", "public health", "small business", "redistricting",
                "public utilities", "natural resources", "water",
                "technology", "communications", "elections", "government",
                "appropriations", "rules", "ethics", 'criminal justice', "environmental protection", "college and university", "human services", "reproductive health", "mental health", "technology", "aggriculture", "urban development", "renewable energy", "gun violence", "commerce", "privacy", "cybersecurity", "infrastructure", "disaster preparedness", "prisons", "aging"]

    def get_committee_keywords(name):
        return {kw for kw in keywords if kw in name}

    # Keyword sets are computed once per name instead of once per pair.
    source_keywords = [get_committee_keywords(name) for name in clean_source]
    target_keywords = [get_committee_keywords(name) for name in clean_target]

    max_keyword_bonus = 20
    # Strictly above the best possible fuzzy score (100 + bonus); previously an
    # exact match scored 100 and could lose to a fuzzy match with a bonus.
    exact_match_score = 100 + max_keyword_bonus + 1

    def calculate_similarity(source_idx, target_idx):
        source = clean_source[source_idx]
        target = clean_target[target_idx]

        if not source or not target:
            return 0

        if source == target:
            return exact_match_score

        token_sort = fuzz.token_sort_ratio(source, target)
        token_set = fuzz.token_set_ratio(source, target)
        partial = fuzz.partial_ratio(source, target)

        keyword_overlap = len(source_keywords[source_idx] & target_keywords[target_idx])
        keyword_bonus = min(max_keyword_bonus, keyword_overlap * 10)
        return (token_sort * 0.3) + (token_set * 0.5) + (partial * 0.2) + keyword_bonus

    matches = {}
    for i, source in enumerate(source_list):
        scores = [calculate_similarity(i, j) for j in range(len(target_list))]
        if not scores:
            matches[source] = None
            continue
        # argmax returns the first maximum, so earlier targets win ties.
        best_idx = int(np.argmax(scores))
        matches[source] = target_list[best_idx] if scores[best_idx] >= threshold else None

    return matches

In [15]:
# Chamber-prefixed author name -> best-matching "<chamber> <committee>" label (or None).
author_com_matches = fuzzy_strings(author_locations['name'].unique().tolist(), leg_committees)

# Replace the author name with its matched committee label; unmatched names become missing.
author_locations['name'] = author_locations['name'].map(author_com_matches)

In [16]:
# Authors of bills present in bill_ids. .copy() makes the column assignments
# below operate on an independent frame rather than a slice of `authors`.
sponsors = authors.loc[authors['bill_id'].map(bill_id_codes).isin(bill_ids)].copy()
# Term label derived from the 4-digit year prefix of bill_id.
sponsors['term'] = sponsors['bill_id'].apply(lambda x: f"{x[:4]}-{int(x[:4]) + 1}" if int(x[:4]) % 2 == 1 else f"{int(x[:4]) - 1}-{x[:4]}" if int(x[:4]) % 2 == 0 and int(x[:4]) < 2009 else f"{x[:4]}-{int(x[:4]) + 1}")

# Lobbying expenses that have a resolved beneficiary.
lob = lobbying.loc[lobbying['clean_beneficiary'].notna(), ['FIRM_NAME', 'EXPN_DSCR', 'clean_beneficiary', 'EXPN_DATE', 'BENE_AMT']].copy()
lob['EXPN_DATE'] = pd.to_datetime(lob['EXPN_DATE'])


def _lobbying_term(ts):
    """Return the two-year "odd-even" term label for an expense date.

    Uses the same rule as the vote-term assignment on bill_votes: odd years
    start a term; even years before Nov 2 belong to the term that started the
    previous year; even years from Nov 2 onward roll over to the next term.
    The previous expression returned "Y-(Y+1)" for that last case, a label
    that starts on an even year and matches no other term label built here.
    Missing dates yield NaN instead of the string "nan-nan".
    """
    if pd.isna(ts):
        return np.nan
    year = ts.year
    if year % 2 == 1:
        return f"{year}-{year + 1}"
    if ts < pd.Timestamp(year=year, month=11, day=2):
        return f"{year - 1}-{year}"
    return f"{year + 1}-{year + 2}"


lob['term'] = lob['EXPN_DATE'].apply(_lobbying_term)

# Back-fill missing full_name values (missing values show up as float NaN).
# For each (Term, Last, chamber) with a missing name, borrow the full_name of a row with the
# same last name: first within the same term, otherwise from any term. Unresolved cases are printed.
# NOTE(review): the donor row is not restricted to the same chamber, and the first hit is used
# even when several legislators share a last name.
for i, row in politicians.loc[politicians['full_name'].apply(lambda x: isinstance(x, float)), ['Term', 'Last', 'chamber']].drop_duplicates().iterrows():
    term, last = row['Term'], row['Last']
    a = politicians.loc[(politicians['Last'] == last) & (politicians['Term'] == term) & (politicians['full_name'].apply(lambda x: isinstance(x, str)))]
    if len(a) > 0:
        politicians.loc[(politicians['Term'] == term) & (politicians['Last'] == last) & (politicians['chamber'] == row['chamber']), 'full_name'] = a['full_name'].values[0]
        continue
    else:
        a = politicians.loc[(politicians['Last'] == last) & (politicians['full_name'].apply(lambda x: isinstance(x, str)))]
    if len(a) > 0:
        politicians.loc[(politicians['Term'] == term) & (politicians['Last'] == last) & (politicians['chamber'] == row['chamber']), 'full_name'] = a['full_name'].values[0]
    else:
        print(last, term)


# (lower-cased full_name, Term) -> {'chamber', 'name'}, where 'name' is "First Last":
# a "Last, First" full_name is flipped around its first comma, other names are kept as is.
# NOTE(review): raises TypeError if any full_name is still missing after the back-fill above.
pol_names_terms = {}
for _, row in politicians[['full_name', 'Term', 'chamber']].drop_duplicates().iterrows():
    if ',' in row['full_name']:
        name = row['full_name'].split(',')[1].strip() + ' ' + row['full_name'].split(',')[0].strip()
    else:
        name = row['full_name']
    pol_names_terms[(row['full_name'].lower(), row['Term'])] = {'chamber': row['chamber'], 'name': name}

# Harmonise the term column name and tag each expenditure table with its chamber.
expend_assembly = expend_assembly.rename(columns={'term': 'Term'})
expend_assembly['chamber'] = 'assembly'
expend_senate = expend_senate.rename(columns={'term': 'Term'})
expend_senate['chamber'] = 'senate'

# Stack both chambers, keeping only rows with a matched target and de-duplicating each table
# on (ExpenderName, Amount, matched_target_name, DateEnd). The original row indexes are kept,
# so index labels can repeat across the two halves.
campaign_contributions = pd.concat([expend_assembly.loc[expend_assembly['matched_target_name'].notna(), ['ExpenderName', 'Amount', 'matched_target_name', 'Term', 'chamber', 'DateEnd']].drop_duplicates(subset=['ExpenderName', 'Amount', 'matched_target_name', 'DateEnd']), expend_senate.loc[expend_senate['matched_target_name'].notna(), ['ExpenderName', 'Amount', 'matched_target_name', 'Term', 'chamber', 'DateEnd']].drop_duplicates(subset=['ExpenderName', 'Amount', 'matched_target_name', 'DateEnd'])])

campaign_contributions['DateEnd'] = pd.to_datetime(campaign_contributions['DateEnd'])


# NOTE(review): here 'bill_ID' is repair_bill_id(bill_id), whereas elsewhere 'bill_ID' comes
# from the bill_vers mapping — confirm both produce the same identifier format.
sponsors['bill_ID'] = sponsors['bill_id'].apply(repair_bill_id)

In [17]:
# Join history rows to member votes on (bill, date). history's own bill_id becomes
# 'bill_version'; the bill_id coming from bill_votes is dropped.
voting = history.merge(bill_votes, left_on=['bill_ID', 'Date'], right_on=['bill_id', 'vote_date_time'], how='inner').rename(columns={'bill_id_x': 'bill_version'}).drop('bill_id_y', axis=1)
voting['bv_id'] = voting['bill_version'].apply(repair_bill_id)

# For every roll call (motion, term, chamber, date) guess the venue as the committee that
# is most common among the legislators who voted, matched by last name.
# NOTE(review): this matches politicians['Last'] against legislator_name exactly — presumably
# legislator_name holds last names only; confirm against the vote table.
voting_places = {}
for i, row in voting.groupby(['motion_id', 'term', 'chamber', 'Date']).agg({'legislator_name': lambda x: list(x)}).iterrows():
    motion_id, term, chamber, date = i
    g = politicians.loc[(politicians['chamber'] == chamber) & (politicians['Term'] == term) & (politicians['Last'].isin(row['legislator_name']))]
    voting_places[(motion_id, term, chamber, date)] = {
        'most_common_committee': g.groupby('committee_clean').size().sort_values(ascending=False).head(1).index[0] if len(g) > 0 else None
    }
voting['voting_place'] = voting.apply(lambda row: voting_places.get((row['motion_id'], row['term'], row['chamber'], row['Date']), {}).get('most_common_committee', None), axis=1)


# Hearings with the cleaned committee name attached; 'year' is the 4-digit prefix of bill_id.
hear = hearings[['bill_id', 'location_code']].merge(locations[['committee_code', 'committee_clean']], left_on='location_code', right_on='committee_code', how='left')[['bill_id', 'committee_clean']].drop_duplicates()
hear['year'] = hear['bill_id'].apply(lambda x: int(x[:4]))

# Raw position text -> canonical role. The patterns tolerate stray whitespace inside the
# words, and the order matters: "Vice Chair" and "Co-Chair" are tested before plain "Chair".
# NOTE(review): re.search raises TypeError on a missing (NaN) position value.
positions = {p: 'Democratic Alternate' if re.search(r'Democratic\s*Alternate', p) is not None else 'Vice Chair' if re.search(r'V\s*i\s*c\s*e\s*-*\s*C\s*h\s*a\s*i\s*r\s*', p) is not None else 'Co-Chair' if re.search(r'Co\s*-\s*Chair', p) is not None else 'Chair' if re.search(r'Cha\s*i\s*r', p) is not None else 'Republican Alternate' if re.search(r'\s*Republican\s*Alternate', p) is not None else p for p in politicians['position'].unique()}

# Repaired version ID -> VersionNum, and repaired version ID -> bill_ID.
vnums = bill_vers.set_index('ID')['VersionNum'].to_dict()
vid_map = {v: k for k, val in version_id_mapping.items() for v in val}

In [None]:
# Display-only expression: the merged frame is not assigned to any name, so nothing
# downstream depends on this cell.
bill_votes.merge(bill_vers[['ID', 'bill_ID']], left_on='bill_id', right_on='ID', how='left').drop('ID', axis=1)

In [30]:
import json, ast, datetime as _dt
import geopandas as gpd
import tempfile, zipfile, pathlib

# ---------- helpers ----------
def _safe_dt(s):
    return pd.to_datetime(s, errors='coerce')

def _canon_name(n):
    n = re.sub(r'[^\w\s]', ' ', str(n)).lower()
    n = re.sub(r'\s+', ' ', n).strip()
    return n

def _infer_origin_chamber_from_bill_id(bill_id):
    s = str(bill_id)
    if 'AB' in s: return 'assembly'
    if 'SB' in s: return 'senate'
    return None

def _term_from_date(ts):
    # California 2-year terms; November crossover
    if pd.isna(ts): return np.nan
    y = ts.year
    if y % 2 == 1:
        return f"{y}-{y+1}"
    else:
        if ts.month < 11:
            return f"{y-1}-{y}"
        return f"{y+1}-{y+2}"

def _tokenize(s):
    s = str(s).lower()
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return [t for t in s.split(' ') if t]

def _jaccard(a_set, b_set):
    if not a_set and not b_set: return 1.0
    i = len(a_set & b_set)
    u = len(a_set | b_set)
    return i / u if u else 0.0

In [75]:
def read_zip(zip_path, crs=3857):
    """Load the shapefile contained in a zip archive as a GeoDataFrame in EPSG:3857.

    The archive is extracted into a temporary directory, the first ``*.shp``
    file (in sorted path order) is read, `crs` is declared as its coordinate
    reference system via ``set_crs`` and the result is reprojected to
    EPSG:3857.

    Parameters
    ----------
    zip_path : str or path-like
        Path to the zip archive.
    crs : int, default 3857
        EPSG code to declare for the shapefile's coordinates.

    Returns
    -------
    (GeoDataFrame, tempfile.TemporaryDirectory)
        The reprojected frame and the handle of the extraction directory.

    Raises
    ------
    FileNotFoundError
        If the archive contains no ``.shp`` file (previously a bare
        StopIteration escaped from ``next``).
    """
    tmp = tempfile.TemporaryDirectory()
    try:
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(tmp.name)
        # Sorted so the choice is deterministic if several shapefiles exist.
        shp_files = sorted(pathlib.Path(tmp.name).rglob("*.shp"))
        if not shp_files:
            raise FileNotFoundError(f"No .shp file found inside {zip_path}")
        gdf = gpd.read_file(shp_files[0]).set_crs(epsg=crs)
        gdf = gdf.to_crs(epsg=3857)
    except Exception:
        # Do not leave the extraction directory behind when loading fails.
        tmp.cleanup()
        raise
    return gdf, tmp

def district_cycle(year):
    """Map a year to a district-map label: "2001" up to 2012, "2011" up to 2022, else "current"."""
    for last_year, label in ((2012, "2001"), (2022, "2011")):
        if year <= last_year:
            return label
    return "current"

# County polygons (read_zip returns them in EPSG:3857); the temp-dir handle is discarded.
counties_gdf, _ = read_zip('dashboard/backend/data/ca_counties.zip')
counties_gdf = counties_gdf[['COUNTYFP', 'NAMELSAD', 'geometry']]
counties_gdf['county_area'] = counties_gdf.geometry.area
counties_gdf['county_id'] = counties_gdf['COUNTYFP'].astype(int)

# GeoJSON string of the counties, converted to WGS84.
cgdf = counties_gdf.to_json(na='drop', to_wgs84=True)
data_dir = pathlib.Path('dashboard/backend/data')

asm11_zip = data_dir / '2011_assembly_state_shp.zip'
sen11_zip = data_dir / '2011_senate_state_shp.zip'
asmcur_zip = data_dir / '2021_AD_Final_shp.zip'
sencur_zip = data_dir / '2021_SD_Final_shp.zip'

# (zip path, house, cycle label, EPSG code declared for the file's coordinates)
dist_info = [
    (asm11_zip, "assembly", "2011", 4019),
    (sen11_zip, "senate",   "2011", 4019),
    (asmcur_zip, "assembly","current", 4269),
    (sencur_zip, "senate",  "current", 4269)
]

weight_records = []
tmps = []  # keeps the TemporaryDirectory handles referenced for the rest of the session
for zp, house, cycle, crs in dist_info:
    gdf, tmp = read_zip(zp, crs)
    tmps.append(tmp)
    # NOTE(review): assumes the first column of every shapefile is the district identifier.
    gdf = gdf.rename(columns={gdf.columns[0]: "district_id"})[["district_id", "geometry"]]
    gdf["house"] = house
    gdf["cycle"] = cycle
    gdf["dist_area"] = gdf.geometry.area

    # One row per district/county overlap fragment.
    inter = gpd.overlay(gdf, counties_gdf, how="intersection")
    inter["fragment_area"] = inter.geometry.area

    weight_records.append(
        inter[["house", "cycle", "district_id", "county_id", "fragment_area", 'county_area', 'dist_area']].reset_index(drop=True)
    )

weights = pd.concat(weight_records, ignore_index=True)
# weight: share of the county's area covered by the fragment;
# district_share_in_county: share of the district's area that lies in the county.
# NOTE(review): all areas are measured in EPSG:3857 — confirm that projection is acceptable for area ratios.
weights['weight'] = weights['fragment_area'] / weights['county_area']
weights['district_share_in_county'] = weights['fragment_area']/weights['dist_area']

In [22]:
# Numeric vote: +1 for AYE/YES, -1 for NOE/NO, 0 for anything else (including missing).
vote_map = {'AYE':1,'YES':1,'NOE':-1,'NO':-1}
voting['vote_num'] = voting['vote_code'].str.upper().map(vote_map).fillna(0).astype(int)
motion_dict = bill_motions.set_index('motion_id')['motion_text'].to_dict()

# A roll call is identified by bill_ID + bill_version + Date + motion_id + chamber + voting_place.
roll_cols = ['bill_ID','bill_version','Date','motion_id','chamber','voting_place']
roll = (voting
        .groupby(roll_cols, dropna=False)
        .agg(yes=('vote_num', lambda x: int((np.array(x)>0).sum())),
             no =('vote_num', lambda x: int((np.array(x)<0).sum())),
             total=('vote_num','count'))
        .reset_index())
# Simple majority of recorded yes/no votes; ties and all-abstain roll calls count as not passed.
roll['pass'] = (roll['yes'] > roll['no'])

# Same tally computed directly on bill_votes, keyed by location_code instead of voting_place.
# Here 'Date' holds datetime.date objects, and `roll_cols` is rebound to a new column list.
bill_votes['vote_num'] = bill_votes['vote_code'].str.upper().map(vote_map).fillna(0).astype(int)
bill_votes['Date'] = pd.to_datetime(bill_votes['vote_date_time']).dt.date
roll_cols = ['bill_id','Date','motion_id','chamber','location_code']
summary_roll = (bill_votes
        .groupby(roll_cols, dropna=False)
        .agg(yes=('vote_num', lambda x: int((np.array(x)>0).sum())),
             no =('vote_num', lambda x: int((np.array(x)<0).sum())),
             total=('vote_num','count'))
        .reset_index())
summary_roll['pass'] = (summary_roll['yes'] > summary_roll['no'])

In [26]:
# ========= Stage timing from votes (committee/floor sequence only) =========
# Definitions (fully based on votes):
# - intro: earliest vote recorded
# - comm_ref: earliest committee vote (any committee)
# - first_read: earliest floor vote (any floor)
# - second_read: earliest committee vote strictly after first_read
# - third_read: earliest floor vote strictly after second_read
def _stage_timing(group):
    g = group.sort_values('Date')
    is_committee = ~(g['voting_place'].isin(['Assembly Floor','Senate Floor']))

    # Get the bill_ID from the group (should be consistent within group)
    bill_id = g['bill_ID'].iloc[0]
    is_floor = summary_roll.loc[(summary_roll['bill_id'] == bill_id) & (summary_roll['location_code'].isin(['AFLOOR','SFLOOR']))]

    intro = g['Date'].min()

    comm_ref = g.loc[is_committee, 'Date'].min() if is_committee.any() else pd.NaT
    first_read = g['Date'].min() if g['Date'].any() else pd.NaT

    second_read = pd.NaT
    if pd.notna(first_read):
        _after1 = g[(g['Date'] > first_read) & (is_committee)]
        if not _after1.empty:
            second_read = _after1['Date'].min()

    third_read = pd.NaT
    if pd.notna(second_read):
        _after2 = g[(g['Date'] > second_read)]
        if not _after2.empty:
            third_read = _after2['Date'].min()

    # mark which floors passed (simple yes>no rule)
    asm_floor_pass = pd.NaT
    sen_floor_pass = pd.NaT

    if not is_floor.empty:
        asm_floor_data = is_floor[(is_floor['location_code'] == 'AFLOOR') & (is_floor['pass'])]
        if not asm_floor_data.empty:
            asm_floor_pass = asm_floor_data['Date'].min()

        sen_floor_data = is_floor[(is_floor['location_code'] == 'SFLOOR') & (is_floor['pass'])]
        if not sen_floor_data.empty:
            sen_floor_pass = sen_floor_data['Date'].min()

    return pd.Series({
        'intro': intro, 'comm_ref': comm_ref, 'first_read': first_read,
        'second_read': second_read, 'third_read': third_read,
        'asm_floor_pass': asm_floor_pass, 'sen_floor_pass': sen_floor_pass
    })

# One row per bill_ID holding the stage dates returned by _stage_timing.
stages_df = roll.groupby('bill_ID').apply(_stage_timing).reset_index()

  first_read = g['Date'].min() if g['Date'].any() else pd.NaT
  stages_df = roll.groupby('bill_ID').apply(_stage_timing).reset_index()


In [31]:
# Outcome per bill from the Action on its most recent history row:
# 1 for the listed "passed" actions, -1 for VETOED, 0 otherwise. Rebinds `outcomes`.
outcomes = (history.dropna(subset=['bill_ID'])
            .sort_values('Date', ascending=False)
            .groupby('bill_ID').first().reset_index()[['bill_ID','Action']])
outcomes.loc[outcomes['Action'].isin(['CHAPTERED','ENROLLED','FILED','APPROVED']),'Outcome'] = 1
outcomes.loc[outcomes['Action'].isin(['VETOED']),'Outcome'] = -1
outcomes['Outcome'] = outcomes['Outcome'].fillna(0).astype(int)
y_df = outcomes[['bill_ID','Outcome']].rename(columns={'Outcome':'outcome'})

# ========= Topic / subject from versions/digests (non-modeled) =========
# One topic per bill: the GeneralSubject of the first version row that maps to the bill.
if 'GeneralSubject' in versions.columns:
    topic_map_df = (versions[['bill_id','GeneralSubject']]
                    .dropna()
                    .drop_duplicates()
                    .assign(bill_ID=lambda d: d['bill_id'].map(bv2b))
                    .dropna(subset=['bill_ID'])
                    .drop_duplicates('bill_ID')
                    .rename(columns={'GeneralSubject':'topic'})[['bill_ID','topic']])
else:
    topic_map_df = pd.DataFrame(columns=['bill_ID','topic'])

# ========= First/Last action windows for longevity =========
first_last = (history.dropna(subset=['bill_ID'])
              .groupby('bill_ID')['Date']
              .agg(First_action='min', Last_action='max')
              .reset_index())

# ========= Pipeline base =========
# Stage dates per bill, left-joined with action window, topic and outcome.
pipe_base = (stages_df
             .merge(first_last, on='bill_ID', how='left')
             .merge(topic_map_df, on='bill_ID', how='left')
             .merge(y_df, on='bill_ID', how='left'))

# ========= Funnel metrics (consecutive stage pairs) =========
stage_order = [c for c in ['intro','comm_ref','first_read','second_read','third_read', 'asm_floor_pass','sen_floor_pass'] if c in pipe_base.columns]
pairs = [(stage_order[i], stage_order[i+1]) for i in range(len(stage_order)-1)]

# For each consecutive pair (a, b): bills with a date at a ("entered"), bills with dates at
# both ("advanced"), their ratio, and the median number of days from a to b.
rows = []
for a,b in pairs:
    aa = _safe_dt(pipe_base[a]); bb = _safe_dt(pipe_base[b])
    entered  = int(aa.notna().sum())
    advanced = int(((aa.notna()) & (bb.notna())).sum())
    rate     = float(advanced / entered) if entered else np.nan
    mdays    = float(np.median((bb - aa).dt.days.dropna().values)) if advanced else np.nan
    rows.append({'from':a,'to':b,'entered':entered,'advanced':advanced,'pass_rate':rate,'median_days':mdays})
pipeline_stage_funnel = pd.DataFrame(rows)

# Wide table of stage dates plus "<stage>_ts" columns holding Unix epoch seconds.
# Stages that were not reached stay missing (<NA>) in a nullable Int64 column:
# casting NaT straight to int64 yields a meaningless sentinel or raises,
# depending on the pandas version.
pipeline_timestamps_wide = pipe_base[['bill_ID','topic'] + stage_order].copy()
_UNIX_EPOCH = pd.Timestamp('1970-01-01')
_ONE_SECOND = pd.Timedelta(seconds=1)
for s in stage_order:
    stage_ts = _safe_dt(pipeline_timestamps_wide[s])
    pipeline_timestamps_wide[f'{s}_ts'] = ((stage_ts - _UNIX_EPOCH) // _ONE_SECOND).astype('Int64')

# ========= Stuck candidates (present at a, missing b; with 90th pct ref time) =========
# For each consecutive stage pair, list the bills that reached stage a but have no date for
# stage b, together with the 90th percentile of the a->b duration (days) among the bills
# that did advance, as a reference value.
stuck_rows=[]
for a,b in pairs:
    aa = _safe_dt(pipe_base[a]); bb = _safe_dt(pipe_base[b])
    dd = (bb - aa).dt.days
    q90 = np.nanpercentile(dd.dropna().values, 90) if dd.notna().any() else np.nan
    sub = pipe_base[(aa.notna()) & (bb.isna())][['bill_ID','topic']].copy()
    if not sub.empty:
        sub['stage']=a; sub['q90']=q90
        stuck_rows.append(sub)
pipeline_stuck_candidates = pd.concat(stuck_rows, ignore_index=True) if stuck_rows else pd.DataFrame(columns=['bill_ID','topic','stage','q90'])

In [58]:
# Hearings with the cleaned committee name attached (row order follows `hearings`).
hear_seq = (hearings[['bill_id','location_code']]
            .merge(locations[['committee_code','committee_clean']],
                   left_on='location_code', right_on='committee_code', how='left')
            .rename(columns={'committee_clean':'committee'})
           )

# Route = tuple of the bill's non-empty committee names, in hear_seq row order.
# NOTE(review): the order is file order, not an explicit sort by hearing date — confirm.
route_df = (hear_seq.groupby('bill_id')['committee']
            .apply(lambda s: tuple([x for x in s.dropna().tolist() if x]))
            .reset_index()
            .rename(columns={'committee':'route'}))
# route_key joins at most the first five committees with ' > '; empty routes give None.
route_df['route_key'] = route_df['route'].apply(lambda r: ' > '.join(list(r)[:5]) if isinstance(r, tuple) and r else None)
# NOTE(review): this only renames hearings' bill_id to 'bill_ID'; the values are not converted,
# so the merges below match only if they already use the bill_ID format of y_df / topic_map_df.
route_df.rename(columns={'bill_id':'bill_ID'}, inplace=True)

route_perf = (route_df
              .merge(y_df, on='bill_ID', how='left')
              .merge(topic_map_df, on='bill_ID', how='left'))
# Per (topic, route_key): number of distinct bills and the share of rows with outcome == 1.
# Rows with a missing outcome count as "not passed" in pass_rate.
route_archetypes = (route_perf.groupby(['topic','route_key'])
                    .agg(n=('bill_ID','nunique'),
                         pass_rate=('outcome', lambda x: float(np.mean(np.array(x)==1)) if len(x)>0 else np.nan))
                    .reset_index()
                    .sort_values(['topic','n'], ascending=[True,False]))

In [33]:
# Digest text and version number per version-level bill_id, each mapped to its bill_ID.
dig = digests[['bill_id','DigestText']].copy()
dig['bill_ID'] = dig['bill_id'].map(bv2b)
ver = versions[['bill_id','VersionNum']].copy()
ver['bill_ID'] = ver['bill_id'].map(bv2b)
# Versions that have a non-missing digest text.
dv = (ver.merge(dig, on=['bill_id','bill_ID'], how='inner')
         .dropna(subset=['DigestText']))

def _digest_stats(df):
    """Summarise how much a bill's digest text changes between consecutive versions.

    Rows are ordered by 'VersionNum'; each digest is tokenised into a set and
    the Jaccard similarity of every adjacent pair is computed.

    Returns a Series with 'n_versions' (row count) and 'median_sim' (median
    adjacent similarity, NaN when there are fewer than two versions).
    """
    ordered = df.sort_values('VersionNum')
    token_sets = [set(_tokenize(text)) for text in ordered['DigestText']]
    similarities = [_jaccard(previous, current)
                    for previous, current in zip(token_sets, token_sets[1:])]
    median_sim = float(np.median(similarities)) if similarities else np.nan
    return pd.Series({'n_versions': len(ordered), 'median_sim': median_sim})

# Per bill: number of digest versions and median adjacent-version similarity, plus topic.
amendment_churn = (dv.groupby('bill_ID').apply(_digest_stats).reset_index()
                   .merge(topic_map_df, on='bill_ID', how='left'))


  amendment_churn = (dv.groupby('bill_ID').apply(_digest_stats).reset_index()


In [62]:
# NOTE(review): `now` makes the age percentiles below depend on the day the cell is run.
now = pd.Timestamp.now().date()
pb = pipe_base[['bill_ID','topic'] + stage_order].copy()
for c in stage_order: pb[c] = _safe_dt(pb[c])
pb['last_date'] = pb[stage_order].max(axis=1)

# 80th percentile of "days since the stage date" per stage column.
by_stage_q80 = {}
for c in stage_order:
    dd = pb[c].apply(lambda x: (now - x.date()).days if pd.notna(x) else np.nan)
    by_stage_q80[c] = np.nanpercentile(dd.dropna().values, 80) if dd.notna().any() else np.nan

# Last stage column (in stage_order) that has any date at all.
last_stage_col = None
for c in reversed(stage_order):
    if pb[c].notna().any(): last_stage_col = c; break

# Attach each bill's own route first, then look up the pass rate of exactly that
# (topic, route) archetype. Merging the archetypes on 'topic' alone produced one
# row per bill for every route seen in its topic, each carrying the pass rate of
# an unrelated route.
pb = pb.merge(route_df[['bill_ID','route_key']], on='bill_ID', how='left')
pb = pb.merge(route_archetypes[['route_key','topic','pass_rate']].drop_duplicates(subset=['route_key','topic']),
              on=['topic','route_key'], how='left')
# The route column used to be suffixed to 'route_key_x' by the old merge order;
# the name is kept so the column selection that builds risk_list keeps working.
pb = pb.rename(columns={'route_key': 'route_key_x'})
pb = pb.merge(amendment_churn[['bill_ID','n_versions']], on='bill_ID', how='left')

def _risk_row(r):
    q80 = by_stage_q80.get(last_stage_col, np.inf)
    churn  = (r.get('n_versions',0) or 0) >= 5
    low_route = (r.get('pass_rate',1.0) or 1.0) < 0.3
    return int(sum([churn, low_route]))

# Score every row of pb, then keep only the columns exported as the risk list.
risk_scores = pb.apply(_risk_row, axis=1)
pb['risk'] = risk_scores
risk_list = pb.loc[:, ['bill_ID','topic','route_key_x','n_versions','risk']].copy()

In [66]:
# Normalise the hearing table's bill key up front. rename() is a no-op once the
# column is already called 'bill_ID', so this cell can be re-run; the previous
# in-place rename at the end of the cell made a second run fail with a KeyError
# on 'bill_id'.
hear_seq = hear_seq.rename(columns={'bill_id':'bill_ID'})

# Bills that have a committee-referral date, flagged by whether any other stage
# column holds a value.
entries = stages_df[['bill_ID','comm_ref']].dropna()
exits = stages_df[['bill_ID'] + [c for c in stage_order if c!='comm_ref']].copy()
exits['has_exit'] = exits.drop(columns=['bill_ID']).notna().any(axis=1)
gate = entries.merge(exits[['bill_ID','has_exit']], on='bill_ID', how='left')

# One row per distinct (bill, committee) pair, with the bill's exit flag.
heard = hear_seq[['bill_ID','committee']].dropna().drop_duplicates()
gk = heard.merge(gate[['bill_ID','has_exit']], on='bill_ID', how='left')
committee_gatekeeping = (gk.groupby('committee')
                         .agg(entries=('bill_ID','nunique'),
                              exits=('has_exit', lambda x: int(np.nansum(np.array(x)==True))))
                         .reset_index())
# gatekeeping = share of a committee's bills without an exit (NaN when it has no entries).
committee_gatekeeping['gatekeeping'] = 1 - (committee_gatekeeping['exits'] /
                                            committee_gatekeeping['entries'].replace(0, np.nan))

# History actions whose text mentions a hearing, referral or committee.
hear_dates = history[['bill_ID','Date','Action']].copy()
hear_dates = hear_dates[hear_dates['Action'].str.upper()
                        .str.contains('HEARING|REFERRED|RE-REFERRED|COMMITTEE', na=False)]
# Drop rows whose date cannot be parsed: casting a missing period to str would
# otherwise create a literal 'NaT' bucket that is counted as if it were a week.
hear_week = _safe_dt(hear_dates['Date']).dt.to_period('W')
hear_dates = hear_dates[hear_week.notna()].copy()
hear_dates['week'] = hear_week[hear_week.notna()].astype(str)
committee_workload_median = (hear_seq.merge(hear_dates[['bill_ID','week']], on='bill_ID', how='left')
                             .groupby(['committee','week'])
                             .agg(bills=('bill_ID','nunique'))
                             .groupby('committee')
                             .agg(median_weekly_bills=('bills','median'))
                             .reset_index())

In [67]:
# Cross-chamber friction: per topic, how many bills passed the Assembly floor
# before the Senate floor, and how many the other way round.
origin = stages_df[['bill_ID']].copy()
origin['origin'] = origin['bill_ID'].apply(_infer_origin_chamber_from_bill_id)
cc = stages_df[['bill_ID','asm_floor_pass','sen_floor_pass']].copy()
# Order the two floor-passage dates. The earlier version used the same
# "both dates present" expression for both flags, so the two counts were
# always identical and carried no ordering information.
asm_pass = _safe_dt(cc['asm_floor_pass'])
sen_pass = _safe_dt(cc['sen_floor_pass'])
both_passed = asm_pass.notna() & sen_pass.notna()
# Bills with identical dates in both chambers are counted in neither bucket.
cc['A_then_S'] = both_passed & (asm_pass < sen_pass)
cc['S_then_A'] = both_passed & (sen_pass < asm_pass)
cross_chamber_friction = (cc.merge(topic_map_df, on='bill_ID', how='left')
                          .groupby('topic')
                          .agg(pass_Asm_then_Sen=('A_then_S', lambda x: int(np.nansum(x))),
                               pass_Sen_then_Asm=('S_then_A', lambda x: int(np.nansum(x))))
                          .reset_index())

# Survival input frame: introduction date as 'start', last recorded action as
# 'end', plus the outcome columns from y_df and the topic mapping.
ends = first_last[['bill_ID','Last_action']].rename(columns={'Last_action':'end'})
sv = stages_df[['bill_ID','intro']].merge(y_df, on='bill_ID', how='left')
sv = sv.assign(start=_safe_dt(sv['intro'])).merge(ends, on='bill_ID', how='left')
sv['end'] = _safe_dt(sv['end'])
sv = sv.merge(topic_map_df, on='bill_ID', how='left')

def _survival_topic(df):
    df = df.dropna(subset=['start'])
    if df.empty: return pd.DataFrame(columns=['t','survival'])
    t0 = df['start'].min()
    t1 = df['end'].max() if df['end'].notna().any() else t0 + pd.Timedelta(days=1)
    grid = pd.date_range(t0, t1, freq='7D')
    rows=[]
    for g in grid:
        alive = ((df['end'].isna()) | (df['end'] > g)).sum()
        total = len(df)
        rows.append({'t': g, 'survival': alive/total if total else np.nan})
    return pd.DataFrame(rows)

# One curve per topic, each tagged with its topic, stacked into a single frame.
_surv = []
for topic, g in sv.groupby('topic'):
    sdf = _survival_topic(g).assign(topic=topic)
    _surv.append(sdf)

if _surv:
    survival_curves = pd.concat(_surv, ignore_index=True)
else:
    survival_curves = pd.DataFrame(columns=['t','survival','topic'])


In [71]:
# Legislator x bill vote matrix: +1 for AYE/YES, -1 for NOE/NO, 0 for anything
# else (including bills the legislator has no recorded vote on).
v = voting[['bill_ID','legislator_name','vote_code','location_code']].copy()
vote_num = {'AYE':1,'YES':1,'NOE':-1,'NO':-1}
v['vote'] = v['vote_code'].str.upper().map(vote_num).fillna(0).astype(int)
mat = (v.pivot_table(index='legislator_name', columns='bill_ID', values='vote', aggfunc='first')
        .fillna(0)
        .astype(int))
l = mat.index.to_list()

# Rows are mean-centred and scaled to unit length, so a dot product between
# two rows is the correlation of the two vote vectors. Pairs scoring at least
# 0.6 become edges, recorded once with the lower row index first.
sim_edges = []
if mat.shape[0] >= 2:
    X = mat.to_numpy(dtype=np.float32)
    Xc = X - X.mean(axis=1, keepdims=True)
    denom = np.sqrt((Xc**2).sum(axis=1, keepdims=True))
    denom[denom==0] = 1.0          # constant rows stay all-zero instead of dividing by 0
    Xn = Xc/denom
    for i, row in enumerate(Xn):
        dots = row @ Xn.T
        dots[i] = -1               # never pair a legislator with themselves
        idx = np.flatnonzero(dots >= 0.6)
        for j in idx:
            if j > i:
                sim_edges.append((l[i], l[j], float(dots[j])))
vote_similarity_edges = pd.DataFrame(sim_edges, columns=['u','v','sim'])

# Communities = connected components of the similarity graph, found with a
# breadth-first search started from every not-yet-visited legislator.
adj = defaultdict(list)
for u_name, v_name in zip(vote_similarity_edges['u'], vote_similarity_edges['v']):
    adj[u_name].append(v_name)
    adj[v_name].append(u_name)

visited = set()
comps = []
for node in l:
    if node in visited:
        continue
    comp = []
    q = deque([node])
    while q:
        x = q.popleft()
        if x in visited:
            continue
        visited.add(x)
        comp.append(x)
        # .get avoids creating empty adjacency entries for isolated legislators.
        q.extend(nb for nb in adj.get(x, []) if nb not in visited)
    comps.append(comp)

vote_communities = pd.DataFrame(
    [(member, community_id) for community_id, comp in enumerate(comps) for member in comp],
    columns=['legislator_name','community'])

# Committee-vs-floor drift: each legislator's share of AYE/YES votes on the
# floor (location codes AFLOOR / SFLOOR) minus the share everywhere else.
vc = voting[['legislator_name','vote_code','location_code']].copy()
vc['is_floor'] = vc['location_code'].isin(['AFLOOR','SFLOOR'])
vc['yes'] = vc['vote_code'].str.upper().isin(['AYE','YES']).astype(int)

floor_mask = vc['is_floor']
leg_comm = vc.loc[~floor_mask].groupby('legislator_name')['yes'].mean().rename('comm_yes')
leg_floor = vc.loc[floor_mask].groupby('legislator_name')['yes'].mean().rename('floor_yes')

committee_floor_drift = pd.concat([leg_comm, leg_floor], axis=1).reset_index()
committee_floor_drift['drift'] = committee_floor_drift['floor_yes'].sub(committee_floor_drift['comm_yes'])

In [72]:
# Digest text keyed by bill_ID (via the bv2b mapping), with rows lacking a bill
# or text removed and the outcome columns from y_df attached.
dig_m = (digests[['bill_id','DigestText']]
         .assign(bill_ID=lambda d: d['bill_id'].map(bv2b))
         .dropna(subset=['bill_ID','DigestText'])
         .merge(y_df, on='bill_ID', how='left'))

def _lift(df):
    toks_pos=defaultdict(int); toks_neg=defaultdict(int)
    for _,r in df.iterrows():
        toks=set(_tokenize(r['DigestText']))
        if int(r.get('outcome',0))==1:
            for t in toks: toks_pos[t]+=1
        else:
            for t in toks: toks_neg[t]+=1
    rows=[]
    all_t = set(list(toks_pos.keys())+list(toks_neg.keys()))
    for t in all_t:
        pos = toks_pos.get(t,0)+1
        neg = toks_neg.get(t,0)+1
        rows.append((t, float(np.log(pos/neg)), toks_pos.get(t,0), toks_neg.get(t,0)))
    return (pd.DataFrame(rows, columns=['token','log_lift_pass_vs_other','pos','neg'])
              .sort_values('log_lift_pass_vs_other', ascending=False))

# Token-level lift table computed over every digest row in dig_m
# (sorted by _lift, highest log-lift first).
text_lift_top_tokens = _lift(dig_m)

In [74]:
# Assembly and Senate expenditure rows stacked into one frame; rows missing the
# expender, amount or matched target are dropped from each source first.
_expend_cols = ['ExpenderName','Amount','matched_target_name','Term','DateEnd']
_expend_required = ['ExpenderName','Amount','matched_target_name']
ca = pd.concat(
    [frame[_expend_cols].dropna(subset=_expend_required)
     for frame in (expend_assembly, expend_senate)],
    ignore_index=True)

# Total amount per (expender, target) pair.
port = (ca.groupby(['ExpenderName','matched_target_name'])['Amount']
          .sum()
          .reset_index())
def _hhi(g):
    s = g['Amount'].sum()
    if s<=0: return np.nan
    p = (g['Amount']/s).values
    return float(np.sum(p*p))
# Concentration index per expender. _hhi only reads 'Amount', so the grouping
# column is excluded explicitly; this avoids the pandas deprecation warning
# about apply operating on grouping columns (same convention as the
# money_vote_alignment cell).
donor_portfolios_hhi = (port.groupby('ExpenderName').apply(_hhi, include_groups=False)
                        .reset_index().rename(columns={0:'hhi'}))

# Lobbying firm × committee heatmap (beneficiary string mapped to known committees where possible)
known_committees = locations[['committee_code','committee_name']].dropna()
# Lower-cased committee code -> committee name.
kc = dict(zip(known_committees['committee_code'].str.lower(),
              known_committees['committee_name']))

lob = lobbying[['FIRM_NAME','clean_beneficiary','BENE_AMT']].dropna().copy()
lob['benef_code'] = lob['clean_beneficiary'].str.upper()
# Only beneficiaries whose code starts with CX or CS are looked up; every
# other row gets a missing committee and is dropped below.
_is_committee_code = lob['benef_code'].str.startswith(('CX','CS'))
lob['committee'] = lob['benef_code'].str.lower().map(kc).where(_is_committee_code, np.nan)

lobbying_firm_committee_heatmap = (
    lob.dropna(subset=['committee'])
       .groupby(['FIRM_NAME','committee'])
       .agg(contacts=('committee','count'), spend=('BENE_AMT','sum'))
       .reset_index())

# money-vote alignment (by term, crude quartile split)
# Share of AYE/YES votes per canonical legislator name and term.
vt = voting[['legislator_name','vote_code','vote_date_time']].copy()
vt = vt.assign(
    canon=vt['legislator_name'].apply(_canon_name),
    term=vt['vote_date_time'].apply(_term_from_date),
    yes=vt['vote_code'].str.upper().isin(['AYE','YES']).astype(int),
)
leg_term_rate = (vt.groupby(['canon','term'])['yes']
                   .mean()
                   .rename('yes_rate')
                   .reset_index())

# Total amount per canonical target name and term, joined to the vote shares.
don = ca.assign(canon=ca['matched_target_name'].apply(_canon_name))
fund = (don.groupby(['canon','Term'])['Amount']
           .sum()
           .rename('funding')
           .reset_index()
           .rename(columns={'Term':'term'}))
ft = fund.merge(leg_term_rate, on=['canon','term'], how='inner')

  donor_portfolios_hhi = (port.groupby('ExpenderName').apply(_hhi)


In [84]:
def _quartiles(g):
    if g.empty: return pd.Series({'yes_rate_top':np.nan,'yes_rate_bottom':np.nan,'delta':np.nan,'n_top':0,'n_bottom':0})
    q = g['funding'].quantile([0.25,0.75]).values
    low = g[g['funding']<=q[0]]; high = g[g['funding']>=q[1]]
    return pd.Series({'yes_rate_top': float(high['yes_rate'].mean()) if not high.empty else np.nan,
                      'yes_rate_bottom': float(low['yes_rate'].mean()) if not low.empty else np.nan,
                      'delta': float((high['yes_rate'].mean() - low['yes_rate'].mean())) if (not high.empty and not low.empty) else np.nan,
                      'n_top': int(high.shape[0]), 'n_bottom': int(low.shape[0])})
# Per-term comparison of yes-rates between high- and low-funding rows.
money_vote_alignment = (ft.groupby('term')
                          .apply(_quartiles, include_groups=False)
                          .reset_index())

# Event-time spending around legislative window
cc = ca[['DateEnd','Amount']].dropna().copy()
cc['DateEnd'] = pd.to_datetime(_safe_dt(cc['DateEnd'])).dt.date

# Day zero is the earliest First_action when one exists, otherwise the
# earliest expenditure end date.
if first_last['First_action'].notna().any():
    _window_start = _safe_dt(first_last['First_action']).min()
else:
    _window_start = cc['DateEnd'].min()
start_min = pd.to_datetime(_window_start).date()

def _days_from_start(d):
    """Whole days between start_min and d (NaN for a missing date)."""
    if pd.notna(d):
        return (d - start_min).days
    return np.nan

cc['t'] = cc['DateEnd'].apply(_days_from_start)
# Mean amount for every day offset, in chronological order.
money_event_time_curve = cc.groupby('t')['Amount'].mean().reset_index().sort_values('t')

In [85]:
# Legislator roster with a numeric district id (non-numeric districts become NaN).
pol_roster = politicians[['District No.','Term','full_name','chamber']].drop_duplicates().copy()
pol_roster['district_id'] = pd.to_numeric(pol_roster['District No.'], errors='coerce')

# donations (assembly/senate files)
def _donations_by_term_year(expend):
    """Tag expenditure rows with a term year and total them per target.

    The term year is the four-digit year at the start of 'Term', moved back by
    one when it is even. Rows whose 'Term' is not a string, or does not start
    with four digits, are skipped (the earlier integer cast raised on them).

    Parameters
    ----------
    expend : DataFrame with 'Term', 'matched_target_name' and 'Amount' columns.

    Returns
    -------
    (rows, totals): the retained rows with an integer 'term_year' column, and
    the summed 'Amount' per ('matched_target_name', 'term_year').
    """
    rows = expend[expend['Term'].apply(lambda x: isinstance(x, str))].copy()
    first_year = pd.to_numeric(rows['Term'].str.extract(r'^(\d{4})', expand=False),
                               errors='coerce')
    rows = rows[first_year.notna()].copy()
    first_year = first_year.dropna().astype(int)
    rows['term_year'] = first_year.where(first_year % 2 != 0, first_year - 1)
    totals = rows.groupby(['matched_target_name','term_year'])['Amount'].sum().reset_index()
    return rows, totals

# One helper for both chambers instead of two copy-pasted pipelines.
ea, ea_agg = _donations_by_term_year(expend_assembly)
es, es_agg = _donations_by_term_year(expend_senate)

# lobbying by legislator-term
lb = lobbying[['clean_beneficiary','EXPN_DATE','BENE_AMT']].dropna().copy()
lb['EXPN_DATE'] = _safe_dt(lb['EXPN_DATE'])

def _lobby_term_year(ts):
    """Map an expense date to a term year.

    Odd years map to themselves; even years map to the previous year before
    November and to the following year from November on. Missing dates give NaN.
    """
    if pd.isna(ts):
        return np.nan
    if ts.year % 2 != 0:
        return ts.year
    return ts.year - 1 if ts.month < 11 else ts.year + 1

lb['term_year'] = lb['EXPN_DATE'].apply(_lobby_term_year)
lob_ag = (lb.groupby(['clean_beneficiary','term_year'])['BENE_AMT']
            .sum()
            .reset_index()
            .rename(columns={'BENE_AMT':'total_lobbying'}))

def _nkey(s):
    """Join key for a person name: commas removed, then passed through _canon_name."""
    return _canon_name(re.sub(r',','', str(s)))

# Same name key on every table that is merged onto the roster below.
for _frame, _name_col in ((pol_roster, 'full_name'),
                          (ea_agg, 'matched_target_name'),
                          (es_agg, 'matched_target_name'),
                          (lob_ag, 'clean_beneficiary')):
    _frame['name_key'] = _frame[_name_col].apply(_nkey)
pol_roster['term_year'] = pol_roster['Term'].str.extract(r'^(\d{4})').astype(float)

# Roster rows enriched with lobbying and donation totals for the same
# (name_key, term_year).
_fund_keys = ['name_key','term_year']
lfund = pol_roster.merge(lob_ag[_fund_keys + ['total_lobbying']], on=_fund_keys, how='left')
lfund = lfund.merge(ea_agg[_fund_keys + ['Amount']].rename(columns={'Amount':'don_a'}),
                    on=_fund_keys, how='left')
lfund = lfund.merge(es_agg[_fund_keys + ['Amount']].rename(columns={'Amount':'don_s'}),
                    on=_fund_keys, how='left')

# Missing components count as zero in the totals.
lfund['total_donations'] = lfund[['don_a','don_s']].sum(axis=1, skipna=True)
lfund['total_lobbying']  = lfund['total_lobbying'].fillna(0)
lfund['total_received']  = lfund[['total_donations','total_lobbying']].sum(axis=1, skipna=True)

# map roster cycle to weights cycle
lfund['cycle'] = np.where(lfund['term_year']<=2012, '2011', 'current')

# allocate district totals to counties
_money_cols = ['total_donations','total_lobbying','total_received']
reg = (lfund[['cycle','district_id','chamber'] + _money_cols]
       .merge(weights, on=['cycle','district_id'], how='left'))
# Scale each district's totals by its share in the county.
reg[_money_cols] = reg[_money_cols].mul(reg['district_share_in_county'], axis=0)
reg_funds = (reg.groupby(['county_id','chamber'])
             .agg(total_donations=('total_donations','sum'),
                  total_lobbying=('total_lobbying','sum'),
                  total_received=('total_received','sum'))
             .reset_index())

# County geometries attached for mapping; the tabular copy has no geometry.
co_cal = reg_funds.merge(counties_gdf[['county_id','NAMELSAD','geometry']], on='county_id', how='left')
ca_legislator_funding_geo = gpd.GeoDataFrame(co_cal, geometry='geometry', crs=counties_gdf.crs).to_crs(epsg=4326)
ca_legislator_funding = reg_funds.copy()

In [86]:
# Bill-level summary table: topic, outcome, first action date, longevity,
# roll-call signal, digest churn and version count.
bill_dates_df = first_last.copy()
bill_dates_df['longevity_days'] = (bill_dates_df['Last_action'] - bill_dates_df['First_action']).dt.days
# vote_signal = share of a bill's roll calls where yes / total is at least 0.5
# (roll calls with total == 0 never qualify). The lambda only reads 'yes' and
# 'total', so the grouping column is excluded explicitly; this avoids the
# pandas deprecation warning about apply operating on grouping columns.
signals = (roll.groupby('bill_ID')
           .apply(lambda g: float(np.mean((g['yes']/(g['total'].replace(0, np.nan))) >= 0.5)),
                  include_groups=False)
           .reset_index().rename(columns={0:'vote_signal'}))
# Number of distinct version numbers per bill.
n_versions = (versions.assign(bill_ID=lambda d: d['bill_id'].map(bv2b))
              .dropna(subset=['bill_ID'])
              .groupby('bill_ID')['VersionNum'].nunique()
              .reset_index().rename(columns={'VersionNum':'bill_version_count'}))

bills_table = (topic_map_df
               .merge(y_df, on='bill_ID', how='left')
               .merge(bill_dates_df[['bill_ID','First_action','longevity_days']], on='bill_ID', how='left')
               .merge(signals, on='bill_ID', how='left')
               .merge(amendment_churn[['bill_ID','n_versions','median_sim']], on='bill_ID', how='left')
               .merge(n_versions, on='bill_ID', how='left'))
# Dates are exported as ISO strings.
bills_table['First_action'] = pd.to_datetime(bills_table['First_action']).dt.strftime('%Y-%m-%d')

  .apply(lambda g: float(np.mean((g['yes']/(g['total'].replace(0, np.nan))) >= 0.5)))


In [87]:
# Registry of every precomputed table built above, keyed by its export name.
precomp_outputs = dict(
    pipeline_stage_funnel=pipeline_stage_funnel,
    pipeline_timestamps_wide=pipeline_timestamps_wide,
    pipeline_stuck_candidates=pipeline_stuck_candidates,
    route_archetypes=route_archetypes,
    amendment_churn=amendment_churn,
    risk_list=risk_list,
    committee_gatekeeping=committee_gatekeeping,
    committee_workload_median=committee_workload_median,
    cross_chamber_friction=cross_chamber_friction,
    survival_curves=survival_curves,
    vote_similarity_edges=vote_similarity_edges,
    vote_communities=vote_communities,
    committee_floor_drift=committee_floor_drift,
    text_lift_top_tokens=text_lift_top_tokens,
    donor_portfolios_hhi=donor_portfolios_hhi,
    lobbying_firm_committee_heatmap=lobbying_firm_committee_heatmap,
    money_vote_alignment=money_vote_alignment,
    money_event_time_curve=money_event_time_curve,
    ca_legislator_funding_geo=ca_legislator_funding_geo,  # GeoDataFrame
    ca_legislator_funding=ca_legislator_funding,          # tabular
    bills_table=bills_table,
)