In [173]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch, datetime, hashlib, json, re, warnings, pathlib, zipfile, tempfile, pickle, unicodedata
from torch_geometric.data import HeteroData
import geopandas as gpd

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)


 ## Data

In [174]:
# Load bill_summary_vote_tbl.csv; later cells read its `year`, `motion_id` and `bill_id` columns.
summary_votes = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')


In [175]:
# Load the bill history table; the status/location columns are pinned to `str`
# so pandas does not infer a type for them.
bill_history = pd.read_csv(
    'ca_leg/legislation_data/bill_history_tbl.csv',
    dtype={
        'action_status': str,
        'primary_location': str,
        'secondary_location': str,
        'end_status': str,
    },
)


In [176]:
# Load authors.csv; later cells read its `bill_id`, `House` and `Name` columns.
authors = pd.read_csv('ca_leg/legislation_data/authors.csv')


In [177]:
# Load history.csv; later cells read its `bill_id`, `Date` and `Action` columns.
history = pd.read_csv('ca_leg/legislation_data/history.csv')


In [178]:
# Load bill_versions.csv; later cells read `bill_id`, `VersionNum`, `MeasureState`, `Title` and `GeneralSubject`.
versions = pd.read_csv('ca_leg/legislation_data/bill_versions.csv')


In [179]:
# Load the per-legislator vote table. Only `session_date` is parsed here;
# `vote_date_time` is converted separately in the cleaning section.
bill_votes = pd.read_csv('ca_leg/legislation_data/bill_detail_vote_tbl.csv', parse_dates=['session_date'])


In [180]:
# NOTE(review): this reads the same CSV that `summary_votes` was loaded from above,
# so the file is parsed twice into two independent frames — confirm both are needed.
bill_summary = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')


In [181]:
# Load bill_motion_tbl.csv; later cells read its `motion_id` and `motion_text` columns.
bill_motions = pd.read_csv('ca_leg/legislation_data/bill_motion_tbl.csv')


In [182]:
# Load committee_codes.csv; later cells read its `committee_code` and `committee_name` columns.
locations = pd.read_csv('ca_leg/legislation_data/committee_codes.csv')



In [183]:
# Load politicians.csv; columns used below include `full_name`, `Last`, `Term`, `chamber`,
# `Party`, `District No.`, `Occupation`, `committee_clean` and `position`.
politicians = pd.read_csv('ca_leg/legislation_data/politicians.csv')



In [184]:
# Load the lobbying records; the two columns listed are pinned to `str`.
lobbying = pd.read_csv(
    'calaccess/lobbying_clean2.csv',
    dtype={
        'PAYEE_NAMS': str,
        'BAKREF_TID': str,
    },
)


In [185]:
# Load the assembly expenditure rows; `TargetPropositionName` is read as `str`.
expend_assembly = pd.read_csv('calaccess/expend_assembly_matched.csv', dtype={'TargetPropositionName': str})



In [186]:
# Load the senate expenditure rows; `TargetPropositionName` is read as `str`.
expend_senate = pd.read_csv('calaccess/expend_senate_matched.csv', dtype={'TargetPropositionName': str})



In [187]:
# Load digest.csv; it is joined to the bill versions on `bill_id` and supplies `DigestText`.
digests = pd.read_csv('ca_leg/legislation_data/digest.csv')



In [188]:
# Load committee_hearing_tbl.csv; later cells read its `bill_id` and `location_code` columns.
hearings = pd.read_csv('ca_leg/legislation_data/committee_hearing_tbl.csv')


In [189]:
# Load the lobby disclosure cover-page file with every column as `str`
# (so `FILING_ID` values are strings in this frame).
disclosure = pd.read_csv('calaccess/CVR_LOBBY_DISCLOSURE_CD2.csv', dtype=str)

 ## Cleaning

In [190]:
# Truncate every vote timestamp to its calendar day, stored as a 'YYYY-MM-DD' string
# (the column is parsed back to datetimes further down).
# The vectorised `.dt.strftime` leaves missing timestamps as NaN, whereas calling
# `.strftime` element by element raises on NaT.
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time']).dt.strftime('%Y-%m-%d')


In [191]:
# Replace every character that is neither a word character nor whitespace with a space.
# The `.str` accessor passes missing names through as NaN instead of raising inside `re.sub`.
bill_votes['legislator_name'] = bill_votes['legislator_name'].str.replace(r'[^\w\s]', ' ', regex=True)


In [192]:
# Canonical motion labels, checked in list order: the first entry found inside a
# motion text wins, so longer/more specific phrases must stay ahead of their prefixes.
ACTION_KEYWORDS = [
    "Assembly Third Reading",
    "Assembly 3rd reading",
    'senate 3rd reading',
    "Senate Third Reading",
    "Concurrence - Urgency Added",
    "Concurrence in Senate Amendments",
    "Do pass as amended, and re-refer",
    "Do pass as amended, but re-refer",
    "Do pass as amended",
    "Do pass and be re-referred",
    "Concurrence",
    "Consent Calendar",
    "Urgency Clause",
    "Special Consent",
    "Motion to Reconsider",
    "Do pass",
    "Reconsideration",
    "Committee amendments",
    "W/O REF. TO FILE",
    "Be re-referred to the Committee",
    "Lay on the Table",
    "Amend by",
    "Unfinished Business",
    "Placed on Appropriations Suspense File",
]


def extract_action(motion_text):
    """Map a free-text motion description to a simplified action label.

    The text is matched case-insensitively against ACTION_KEYWORDS (first hit wins).
    If the text contains 'RECONSIDER' and the hit is not exactly 'Reconsideration',
    ' Reconsideration' is appended to the hit (this includes a 'Motion to Reconsider'
    hit); with no hit at all the label is 'Reconsideration'.

    Returns None for non-string input or when nothing matches.
    """
    if not isinstance(motion_text, str):
        return None

    haystack = motion_text.upper()

    matched = None
    for keyword in ACTION_KEYWORDS:
        if keyword.upper() in haystack:
            matched = keyword
            break

    if 'RECONSIDER' in haystack and matched != 'Reconsideration':
        matched = 'Reconsideration' if matched is None else matched + ' Reconsideration'

    return matched or None


In [193]:
# Attach the simplified action label for every motion text.
bill_motions['simplified_motion'] = bill_motions['motion_text'].map(extract_action)


In [194]:
# Build {committee_code: lower-cased "<chamber> <committee name>"} for the fuzzy
# match against the legislators' committee names.
clean_coms = {}
for _, row in locations.iterrows():
    code, name = row['committee_code'], row['committee_name']

    if code.startswith('CZ'):
        # CZ codes are left out of the mapping entirely.
        continue

    if code.startswith('CS'):
        if name.startswith('Sen.'):
            # Expand the leading abbreviation. The dot is escaped so it only matches a
            # literal period, and only the first occurrence is rewritten.
            cname = re.sub(r'Sen\. ', 'senate ', name, count=1).lower()
        elif name.startswith('Senate '):
            cname = name.lower()
        else:
            cname = 'senate ' + name.lower()
    elif code.startswith('CX'):
        if name.lower().startswith('assembly'):
            cname = name.lower()
        else:
            cname = 'assembly ' + name.lower()
    else:
        # Codes with any other prefix used to fall through and reuse the name computed
        # for the previous row (or raise NameError on the first row). Use the row's own
        # name instead.
        cname = name.lower()

    # A trailing "x<digit>" is rewritten to "no. <digit>"; this is a no-op when absent.
    cname = re.sub(r'x(?=\d$)', 'no. ', cname)
    clean_coms[code] = cname


In [195]:
from rapidfuzz import fuzz, process

# Lower-cased "<chamber> <committee>" strings, one per distinct (committee, chamber) pair.
leg_committees = [
    f"{chamber} {committee}".lower()
    for committee, chamber in politicians[['committee_clean', 'chamber']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
]

def match_committees(_names, clean_coms, threshold=92):
    """Fuzzy-match each cleaned committee name to one of `_names`.

    For every (code, cleaned name) in `clean_coms` the best candidate is looked up
    with `token_sort_ratio` and with `partial_ratio`, both cut off at `threshold`;
    the higher-scoring hit wins (the token-sort hit on a tie). If neither scorer
    produces a hit, `token_sort_ratio` is retried at `threshold - 8`.

    Returns {code: matched name or None}.
    """
    name_mapping = {}
    for code, clean in clean_coms.items():
        candidates = [
            process.extractOne(clean, _names, scorer=scorer, score_cutoff=threshold)
            for scorer in (fuzz.token_sort_ratio, fuzz.partial_ratio)
        ]
        hits = [candidate for candidate in candidates if candidate is not None]

        if hits:
            name_mapping[code] = max(hits, key=lambda hit: hit[1])[0]
            continue

        # Nothing cleared the strict cutoff: retry once with a relaxed threshold.
        relaxed = process.extractOne(
            clean,
            _names,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=threshold - 8,
        )
        name_mapping[code] = relaxed[0] if relaxed is not None else None

    return name_mapping

# {committee_code: matched "<chamber> <committee>" string, or None when no candidate cleared the cutoffs}.
committee_matches = match_committees(leg_committees, clean_coms)


In [196]:
# Codes missing from `committee_matches` (e.g. the skipped CZ codes) map to NaN here.
locations['committee_clean'] = locations['committee_code'].map(committee_matches)


In [197]:
# Manual overrides applied after the fuzzy match; applied in this order, so a later
# rule wins if two rules hit the same row.
# NOTE(review): these labels are mixed-case while the fuzzy matches are lower-case
# strings — confirm downstream lookups normalise case.
locations.loc[locations['committee_name'] == 'EDUCATION X5', 'committee_clean'] = 'Budget and Fiscal Review: Education'

_overrides_by_code = {
    'CX12': 'Budget No. 1 on Health and Human Services',
    'CS68': 'Budget No. 3 - Health and Human Services',
    'CS66': 'Senate Veterans Affairs',
    'CS56': 'Senate Public Employment and Retirement',
    'CS62': 'Senate Budget and Fiscal Review',
    'CX23': 'Assembly Utilities and Commerce',
}
for _code, _label in _overrides_by_code.items():
    locations.loc[locations['committee_code'] == _code, 'committee_clean'] = _label


In [198]:
# motion_id -> simplified motion label (the last row wins if a motion_id repeats).
motion_codes = dict(zip(bill_motions['motion_id'], bill_motions['simplified_motion']))


In [199]:
# Despite the column name, this stores the *simplified* motion label looked up by motion_id.
summary_votes['motion_text'] = summary_votes['motion_id'].map(motion_codes)


In [200]:
def repair_bill_id(id):
    """Insert the following year after a leading four-digit year.

    When the first four characters of `id` are all digits, the result is
    "<year><year + 1><rest>"; any other id is returned unchanged.
    """
    year_part = id[:4]
    # `year_part` is at most four characters long, so a full match on four digits
    # is the only way the prefix can qualify.
    if re.fullmatch(r'\d{4}', year_part) is None:
        return id
    remainder = id[4:]
    return f"{year_part}{int(year_part) + 1}{remainder}"


In [201]:
# Version-level id with the second session year inserted after the first.
versions['ID'] = versions['bill_id'].apply(repair_bill_id)


In [202]:
# FILING_ID -> FIRM_NAME, restricted to filings that have at least one row with a
# missing firm name and at least one row with a firm name present. When a filing
# carries several distinct names, the last one encountered is kept.
lob_repairs = {
    filing_id: firm_name
    for filing_id, firm_name in (
        lobbying.loc[
            (lobbying['FILING_ID'].isin(lobbying.loc[lobbying['FIRM_NAME'].isna(), 'FILING_ID'].drop_duplicates().tolist()))
            & (lobbying['FIRM_NAME'].notna()),
            ['FILING_ID', 'FIRM_NAME'],
        ]
        .drop_duplicates()
        .itertuples(index=False, name=None)
    )
}

In [203]:
# Disclosure rows that have no firm name, reduced to the filing id and the filer's two name fields.
missed_firms = disclosure.loc[disclosure['FIRM_NAME'].isna(), ['FILING_ID', 'FILER_NAMF', 'FILER_NAML']].drop_duplicates()

def firm_correction(row):
    """Build a display name from a row's FILER_NAMF / FILER_NAML fields.

    If FILER_NAMF is not a string the FILER_NAML value is returned as-is (even when
    that is missing too); if only FILER_NAML is not a string FILER_NAMF is returned;
    otherwise the two are joined with a single space.
    """
    first, last = row['FILER_NAMF'], row['FILER_NAML']
    if not isinstance(first, str):
        return last
    if not isinstance(last, str):
        return first
    return first + ' ' + last

# Derive a replacement name for every disclosure filing that lacks a firm name ...
missed_firms['correction'] = missed_firms.apply(firm_correction, axis=1)

# ... index it by filing id (string keys, because `disclosure` was read with dtype=str) ...
correct_firms = dict(zip(missed_firms['FILING_ID'], missed_firms['correction']))

# ... and fill the gaps in `lobbying`, looking filings up by their stringified id.
lobbying.loc[lobbying['FIRM_NAME'].isna(), 'FIRM_NAME'] = (
    lobbying.loc[lobbying['FIRM_NAME'].isna(), 'FILING_ID']
    .astype(str)
    .map(correct_firms)
)

In [204]:
def fix_firm_name(row):
    """Return the best available firm name for one lobbying row.

    A present FIRM_NAME is returned untouched. When it is missing:
      1. use `lob_repairs[FILING_ID]` if the filing id is a key there;
      2. otherwise strip whitespace from the filing id and parse it as an int —
         if that fails and the filing id is not NaN, the raw filing id itself is
         returned; if it succeeds and the stringified int is a key of
         `lob_repairs`, that entry is returned;
      3. otherwise fall back to the (missing) FIRM_NAME.
    """
    if not pd.isna(row['FIRM_NAME']):
        return row['FIRM_NAME']

    filing_id = row['FILING_ID']
    if filing_id in lob_repairs:
        return lob_repairs.get(filing_id)

    try:
        numeric_id = int(re.sub(r'\s+', '', str(filing_id)).strip())
    except ValueError:
        # Only a failed parse is expected here; a bare `except` would also hide
        # KeyboardInterrupt and genuine programming errors.
        numeric_id = None

    if numeric_id is None:
        if not pd.isna(filing_id):
            return filing_id
    elif str(numeric_id) in lob_repairs:
        return lob_repairs.get(str(numeric_id))

    return row['FIRM_NAME']
# Row-wise repair of missing firm names; the result is kept in a separate `FIRM` column.
lobbying['FIRM'] = lobbying.apply(fix_firm_name, axis=1)

In [205]:
import unicodedata
from rapidfuzz import fuzz

# Tokens dropped from firm names before comparison.
STOPWORDS = {"the", "and", "of", "&", "for", "to"}
LEGAL = {
    "inc", "incorporated", "corp", "corporation",
    "llc", "l.l.c", "lp", "l.p", "llp", "l.l.p",
    "co", "company", "group", "partners",
    "holdings", "association", "assn", "assoc"
}

def clean_tokens(name):
    """Normalise an organisation name into a sorted list of significant tokens.

    Steps: strip accents (NFKD, then drop non-ASCII), lower-case, collapse dotted
    single-letter abbreviations ("L.L.C." -> "llc"), turn remaining punctuation into
    spaces, split on whitespace, and drop STOPWORDS / LEGAL tokens.

    Returns [] for non-string input.
    """
    if not isinstance(name, str):
        return []

    name = unicodedata.normalize("NFKD", name)
    name = name.encode("ascii", "ignore").decode("ascii")
    name = name.lower()
    # Remove a period that directly follows a one-letter word. Without this step
    # "l.l.c." was split into the tokens "l", "l", "c", so legal suffixes written
    # with dots could never be filtered out by LEGAL.
    name = re.sub(r"(?<=\b\w)\.", "", name)
    name = re.sub(r"[^\w\s]", " ", name).strip()

    return sorted(
        t for t in name.split()
        if t not in STOPWORDS and t not in LEGAL
    )

def group_similar_names(canonicals, threshold=93):
    """Greedily cluster names by `fuzz.token_set_ratio`.

    Names are scanned in order. Each name that has no group yet starts a new group
    and absorbs every later, still-ungrouped name scoring at least `threshold`
    against it. Comparisons are only ever made against the group's first member.

    Returns a list of group ids aligned with `canonicals`.
    """
    total = len(canonicals)
    unassigned = -1
    group_ids = [unassigned] * total
    next_gid = 0

    for seed_idx, seed in tqdm(enumerate(canonicals), total=total):
        if group_ids[seed_idx] != unassigned:
            continue

        group_ids[seed_idx] = next_gid

        for other_idx in range(seed_idx + 1, total):
            if group_ids[other_idx] != unassigned:
                continue
            if fuzz.token_set_ratio(seed, canonicals[other_idx]) >= threshold:
                group_ids[other_idx] = next_gid

        next_gid += 1

    return group_ids



In [206]:
# Tokenise each firm name, rebuild a canonical string from the sorted tokens,
# and cluster near-identical canonical strings.
lobbying['tokens'] = lobbying['FIRM'].apply(clean_tokens)
lobbying['canonical'] = lobbying['tokens'].apply(" ".join)
lobbying['name_group'] = group_similar_names(lobbying['canonical'].tolist())


100%|██████████| 121019/121019 [01:34<00:00, 1279.49it/s] 


In [207]:
# Distinct expender names across both chambers, clustered the same way as the lobbying firms.
donor_n = (
    pd.concat([expend_assembly['ExpenderName'], expend_senate['ExpenderName']])
    .drop_duplicates()
    .to_frame()
)
donor_n['tokens'] = donor_n['ExpenderName'].apply(clean_tokens)
donor_n['canonical'] = donor_n['tokens'].apply(" ".join)
donor_n['name_group'] = group_similar_names(donor_n['canonical'].tolist())


100%|██████████| 888/888 [00:01<00:00, 846.75it/s] 


In [208]:
# Representative name per cluster: the first non-null FIRM seen in each name_group.
lob_names = lobbying.groupby('name_group')['FIRM'].first().to_dict()
# Collapse every firm onto its cluster's representative name.
lobbying['FIRM'] = lobbying['name_group'].map(lob_names)

In [209]:
# name_group -> representative expender name (first non-null name in the group).
don_names = donor_n.groupby('name_group')['ExpenderName'].first().to_dict()

# raw expender name -> name_group.
don_name_group_map = dict(
    donor_n[['ExpenderName', 'name_group']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)

# Raw name -> group -> representative name; names absent from either lookup become None.
expend_assembly['Expender'] = expend_assembly['ExpenderName'].apply(
    lambda raw_name: don_names.get(don_name_group_map.get(raw_name))
)
expend_senate['Expender'] = expend_senate['ExpenderName'].apply(
    lambda raw_name: don_names.get(don_name_group_map.get(raw_name))
)


In [210]:
def _base_bill_id(bill_id, version_num, measure_state):
    """Derive the bill-level id from a version-level id.

    The "<VersionNum><MeasureState>" suffix is removed, the second session year is
    inserted via `repair_bill_id`, and the final four characters are parsed as an
    int so that leading zeros are dropped.
    """
    # The suffix comes from data, so escape it: it must be matched literally.
    tail = re.escape(f"{version_num}{measure_state}")
    repaired = repair_bill_id(re.sub(tail, '', bill_id))
    return f"{repaired[:-4]}{int(repaired[-4:])}"


# Work on an explicit copy: adding a column to a bare `.loc` slice of `versions`
# only worked because SettingWithCopyWarning is silenced at the top of the notebook.
bill_vers = versions.loc[versions['bill_id'].str.startswith('2')].copy()
# One pass over the three input columns instead of one `.loc` write per row.
bill_vers['bill_ID'] = [
    _base_bill_id(bill_id, version_num, measure_state)
    for bill_id, version_num, measure_state in zip(
        bill_vers['bill_id'], bill_vers['VersionNum'], bill_vers['MeasureState']
    )
]


In [211]:
# Hand-entered first names for rows that have a last name but no full_name.
_missing_first_names = {
    'Horton': 'Shirley',
    'Calderon': 'Ron',
    'Berryhill': 'Tom',
    'Stone': 'Mark',
    'Rubio': 'Susan',
    'Rivas': 'Robert',
    'Nguyen': 'Janet',
}
for k, v in _missing_first_names.items():
    _needs_name = politicians['full_name'].isna() & (politicians['Last'] == k)
    politicians.loc[_needs_name, 'full_name'] = f"{k}, {v}"


In [212]:
# (full_name, chamber, Term) -> {'party', 'district_id'} for every seat that has at
# least one row without a district number.
pol_fixes = {}
# Fallback values as [district, party], used when no sibling row carries a district.
manual = {
    'Steinberg, Darrell': [6, 'D'],
    'Calderon, Ron': [30, 'D'],
    'Stone, Mark': [29, 'D'],
    'Rubio, Susan': [22, 'D'],
    'Nguyen, Janet': [36, 'R'],
    'Berryhill, Tom': [8, 'R'],
}

_seats_missing_district = (
    politicians.loc[politicians['District No.'].isna(), ['full_name', 'chamber', 'Term']]
    .drop_duplicates()
)
for _, row in _seats_missing_district.iterrows():
    seat = (row['full_name'], row['chamber'], row['Term'])
    # Rows for the same person/chamber/term that do carry a district number.
    f = politicians.loc[
        (politicians['chamber'] == row['chamber'])
        & (politicians['full_name'] == row['full_name'])
        & (politicians['Term'] == row['Term'])
    ].dropna(subset='District No.')

    if f.shape[0] > 0:
        pol_fixes[seat] = {
            'party': f['Party'].values[0],
            'district_id': re.sub(r'\s+', '', f['District No.'].values[0]),
        }
    else:
        # Raises TypeError if the name has no manual entry.
        district, party = manual.get(row['full_name'])
        pol_fixes[seat] = {'party': party, 'district_id': district}


In [213]:
def party_district_fix(rows):
    """Look up the repaired party and district for each row of `rows`.

    Every row is resolved through `pol_fixes` using its (full_name, chamber, Term)
    key. Returns two lists, `(parties, districts)`, in row order. A row without an
    entry in `pol_fixes` raises TypeError.
    """
    fixes = [
        pol_fixes.get((row['full_name'], row['chamber'], row['Term']))
        for _, row in rows.iterrows()
    ]
    party = [fix['party'] for fix in fixes]
    district = [fix['district_id'] for fix in fixes]
    return party, district

# Rows lacking a district number receive both the repaired party and district.
# The mask is taken once, before either column is written.
_needs_district = politicians['District No.'].isna()
parties, districts = party_district_fix(politicians.loc[_needs_district])
politicians.loc[_needs_district, 'Party'] = parties
politicians.loc[_needs_district, 'District No.'] = districts


In [214]:
# Integer id -> entity lookups. Ids follow first-appearance order in the source frames.
legislators = dict(enumerate(politicians['full_name'].unique().tolist()))

# Attribute lookups keyed by full_name; when a name has several distinct values the last row wins.
leg_parties = dict(
    politicians[['full_name', 'Party']].drop_duplicates().itertuples(index=False, name=None)
)
leg_occupations = dict(
    politicians[['full_name', 'Occupation']].drop_duplicates().itertuples(index=False, name=None)
)
committees = dict(enumerate(politicians['committee_clean'].unique().tolist()))
lobby_firms = dict(enumerate(lobbying['FIRM'].unique().tolist()))

# De-duplicate with pandas rather than `list(set(...))`: set iteration order for
# strings changes between interpreter sessions, which made donor ids non-reproducible.
donor_names = pd.concat([expend_assembly['Expender'], expend_senate['Expender']]).unique().tolist()
donors = dict(enumerate(donor_names))


In [215]:
# bill_ID -> Title; when a bill has several distinct titles the last distinct pair wins.
bill_titles = dict(
    bill_vers[['bill_ID', 'Title']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)


In [216]:
# bill_ID -> GeneralSubject, using only rows whose subject is an actual string.
bill_subjects = dict(
    bill_vers.loc[
        bill_vers['GeneralSubject'].apply(lambda subject: isinstance(subject, str)),
        ['bill_ID', 'GeneralSubject'],
    ]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)


In [217]:
# Every bill_id beginning with '2' that occurs in either vote table.
# The list comes out of a set, so its order is arbitrary.
bill_ids = list(
    set(
        bill_votes.loc[bill_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist()
        + summary_votes.loc[summary_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist()
    )
)


In [218]:
# version-level bill_id -> bill-level bill_ID.
bill_id_codes = dict(
    bill_vers.drop_duplicates(subset=['bill_id', 'bill_ID'])[['bill_id', 'bill_ID']]
    .itertuples(index=False, name=None)
)
# History rows whose bill_id is not a known version get NaN.
history['bill_ID'] = history['bill_id'].map(bill_id_codes)


In [219]:
# Parse the history dates in place; unparseable values raise (no errors='coerce').
history['Date'] = pd.to_datetime(history['Date'])


In [220]:
# Per bill_ID: the distinct history dates and the list of actions.
# 'Dates' keeps the order in which dates first appear in the group (it is not sorted),
# while 'Actions' is sorted by ascending Date and de-duplicated on (Action, Date),
# so the two lists are not index-aligned.
# NOTE(review): despite the name, this holds the bill's full history, not only its introduction.
introduction_dates = {}
for v, group in history.loc[history['bill_ID'].isin(bill_ids)].groupby('bill_ID'):
    introduction_dates[v] = {'Dates': group['Date'].unique().tolist(), 'Actions': group.sort_values('Date', ascending=True).drop_duplicates(subset=['Action', 'Date'])['Action'].tolist()}


In [221]:
# bill_ID -> list of the version-level `ID`s that belong to that bill.
version_id_mapping = {
    base_id: list(version_ids.values)
    for base_id, version_ids in bill_vers.groupby('bill_ID')['ID']
}


In [222]:
# Final outcome per bill, taken from the most recent history row.
# After the descending sort, `groupby(...).first()` returns the first non-null Action per bill.
# NOTE(review): rows sharing the same latest Date have no defined tie-break — confirm this is acceptable.
outcomes = history.loc[history['bill_ID'].notna()].sort_values('Date', ascending=False).groupby('bill_ID').first().reset_index()[['bill_ID', 'Action']]
# Encoding: 1 for the four listed actions, -1 for VETOED, 0 for everything else.
outcomes.loc[outcomes['Action'].isin(['CHAPTERED', 'ENROLLED', 'FILED', 'APPROVED']), 'Outcome'] = 1
outcomes.loc[outcomes['Action'] == 'VETOED', 'Outcome'] = -1
outcomes.loc[outcomes['Outcome'].isna(), 'Outcome'] = 0


In [223]:
# bill_ID -> outcome code (1, -1 or 0) as a plain dict.
outcome = outcomes.set_index('bill_ID')['Outcome'].to_dict()


In [224]:
# (year, motion_id) -> list of the distinct bill_ids voted on under that motion.
vote_bill_ids = {}
_motion_bill_index = (
    summary_votes.loc[summary_votes['bill_id'].isin(bill_ids)]
    .groupby(['year', 'motion_id'])['bill_id']
    .value_counts()
    .index
)
for year, motion_id, bill_id in _motion_bill_index:
    vote_bill_ids.setdefault((year, motion_id), []).append(bill_id)



In [225]:
# Inner join: only bill versions that have a digest row survive.
bill_vers_dig = bill_vers.merge(digests, on='bill_id', how='inner')


In [226]:
# One-off data repair: the district value '6 7' on Tom Torlakson's rows is replaced by '6'.
politicians.loc[(politicians['full_name'] == 'Torlakson, Tom') & (politicians['District No.'] == '6 7'), 'District No.'] = '6'


In [227]:
# (chamber, lower-cased last name, term) -> full_name; the last row wins on key collisions.
legislators_last_names = {
    (chamber, last.lower(), term): full_name
    for chamber, last, term, full_name in politicians[['chamber', 'Last', 'Term', 'full_name']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
}


In [228]:
# Columns that fall back to 'No' when the value is missing.
_DEFAULT_NO_COLUMNS = ('VoteRequired', 'VersionNum', 'LocalProgram', 'FiscalCommittee', 'TaxLevy', 'Urgency')


def _value_or_no(value):
    """Return `value`, or 'No' when it is missing.

    Missing cells arrive from read_csv/merge as NaN rather than None, so an
    `is not None` test never fired; `pd.notna` covers both.
    """
    return value if pd.notna(value) else 'No'


# Version-level `ID` -> feature dict (digest text, measure state and the flag columns).
features = {
    row['ID']: {
        'digest': row['DigestText'],
        'MeasureState': row['MeasureState'],
        **{column: _value_or_no(row[column]) for column in _DEFAULT_NO_COLUMNS},
    }
    for _, row in bill_vers_dig.iterrows()
}


In [229]:
# Inverse of `legislators`: full_name -> integer id.
legislator_codes = {v: k for k, v in legislators.items()}


In [230]:
# Lower-cased committee name -> integer id. `.lower()` raises if any committee name is not a string (e.g. NaN).
committee_codes = {v.lower(): k for k, v in committees.items()}


In [231]:
def _chamber_for_location(location_code):
    """Classify a vote location as 'assembly' (AFLOOR / CX*), 'senate' (SFLOOR / CS*) or 'full'."""
    if location_code == 'AFLOOR' or location_code.startswith('CX'):
        return 'assembly'
    if location_code == 'SFLOOR' or location_code.startswith('CS'):
        return 'senate'
    return 'full'


def _term_for_vote(ts):
    """Two-year term label for a vote timestamp.

    Odd years start a term. Even years belong to the term that began the year
    before, unless the timestamp falls on or after 2 November, in which case the
    vote is assigned to the term starting the following year.
    """
    if ts.year % 2 == 1:
        return f"{ts.year}-{ts.year + 1}"
    if ts.year % 2 == 0 and ts < pd.Timestamp(year=ts.year, month=11, day=2):
        return f"{ts.year - 1}-{ts.year}"
    return f"{ts.year + 1}-{ts.year + 2}"


bill_votes['chamber'] = bill_votes['location_code'].apply(_chamber_for_location)
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time'])
bill_votes['term'] = bill_votes['vote_date_time'].apply(_term_for_vote)


In [232]:
# Authors whose House is 'UNKNOWN', limited to bills that have votes.
author_locations = authors.loc[
    (authors['House'] == 'UNKNOWN') & (authors['bill_id'].map(bill_id_codes).isin(bill_ids)),
    ['bill_id', 'Name'],
].drop_duplicates()


def _chamber_prefix(bill_id):
    """Prefix implied by the bill id: 'AB' -> Assembly, 'SB' -> Senate, anything else -> Joint."""
    if 'AB' in bill_id:
        return 'Assembly '
    if 'SB' in bill_id:
        return 'Senate '
    return 'Joint '


author_locations['name'] = [
    _chamber_prefix(bill_id) + author
    for bill_id, author in zip(author_locations['bill_id'], author_locations['Name'])
]


In [233]:
from fuzzywuzzy import fuzz

def fuzzy_strings(source_list, target_list):
    """Match each entry of `source_list` to its most similar entry of `target_list`.

    Names are normalised (lower-case, parenthesised text and 'committee on' removed,
    non-letters replaced by spaces). The score for a pair is
    0.3 * token_sort + 0.5 * token_set + 0.2 * partial, plus 10 points per shared
    topic keyword (capped at 20). Identical normalised names score 100 and an empty
    normalised name scores 0.

    Returns {source: best target, or None when the best score is below 60}.
    """
    keywords = ["education", "health", "finance", "budget", "transportation",
                "judiciary", "environment", "agriculture", "energy", "labor",
                "housing", "veterans affairs", "public safety", "insurance", "banking", "public health", "small business", "redistricting",
                "public utilities", "natural resources", "water",
                "technology", "communications", "elections", "government",
                "appropriations", "rules", "ethics", 'criminal justice', "environmental protection", "college and university", "human services", "reproductive health", "mental health", "technology", "aggriculture", "urban development", "renewable energy", "gun violence", "commerce", "privacy", "cybersecurity", "infrastructure", "disaster preparedness", "prisons", "aging"]

    def preprocess_name(name):
        if not isinstance(name, str):
            return ""
        cleaned = name.lower()
        for pattern, replacement in (
            (r'\(.*?\)', ''),
            (r'committee on', ''),
            (r'[^a-z\s]', ' '),
            (r'\s+', ' '),
        ):
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()

    def get_committee_keywords(name):
        return {kw for kw in keywords if kw in name}

    clean_source = [preprocess_name(entry) for entry in source_list]
    clean_target = [preprocess_name(entry) for entry in target_list]
    target_keywords = [get_committee_keywords(entry) for entry in clean_target]

    def calculate_similarity(source_idx, target_idx):
        source = clean_source[source_idx]
        target = clean_target[target_idx]

        if not source or not target:
            return 0
        if source == target:
            return 100

        token_sort = fuzz.token_sort_ratio(source, target)
        token_set = fuzz.token_set_ratio(source, target)
        partial = fuzz.partial_ratio(source, target)

        shared_keywords = get_committee_keywords(source).intersection(target_keywords[target_idx])
        keyword_bonus = min(20, len(shared_keywords) * 10)

        return (token_sort * 0.3) + (token_set * 0.5) + (partial * 0.2) + keyword_bonus

    matches = {}
    target_count = len(target_list)
    for source_idx, source in enumerate(source_list):
        scores = [calculate_similarity(source_idx, target_idx) for target_idx in range(target_count)]

        best_idx = np.argmax(scores) if scores else None
        if best_idx is None or scores[best_idx] < 60:
            matches[source] = None
        else:
            matches[source] = target_list[best_idx]

    return matches


In [234]:
# Prefixed author name -> best-matching "<chamber> <committee>" string, or None below the score cutoff.
author_com_matches = fuzzy_strings(author_locations['name'].unique().tolist(), leg_committees)


In [235]:
# Overwrite the prefixed author name with its matched committee string (None/NaN where no match was found).
author_locations['name'] = author_locations['name'].map(author_com_matches)


In [236]:
def _sponsor_term(bill_id):
    """Term label derived from the four-digit year prefix of a bill id.

    Odd years give "<year>-<year + 1>". Even years before 2009 give
    "<year - 1>-<year>"; even years from 2009 on give "<year>-<year + 1>".
    """
    prefix = bill_id[:4]
    start_year = int(prefix)
    if start_year % 2 == 1:
        return f"{prefix}-{start_year + 1}"
    if start_year < 2009:
        return f"{start_year - 1}-{prefix}"
    return f"{prefix}-{start_year + 1}"


# Author rows for bills that have votes.
sponsors = authors.loc[authors['bill_id'].map(bill_id_codes).isin(bill_ids)]
sponsors['term'] = sponsors['bill_id'].apply(_sponsor_term)


In [237]:
# Lobbying rows that name a beneficiary, reduced to the columns needed downstream.
lob = lobbying.loc[lobbying['clean_beneficiary'].notna(), ['FIRM', 'EXPN_DSCR', 'clean_beneficiary', 'EXPN_DATE', 'BENE_AMT']]
# Parse the expenditure date; unparseable values raise (no errors='coerce').
lob['EXPN_DATE'] = pd.to_datetime(lob['EXPN_DATE'])


In [238]:
def get_term(date):
    """Two-year term label for a timestamp, or None if `date` is not a pd.Timestamp.

    Odd years start a term ("<year>-<year + 1>"). Even years belong to the term
    begun the year before, except December, which is assigned to the term starting
    the following year.
    """
    if not isinstance(date, pd.Timestamp):
        return None

    year = date.year
    if year % 2 == 1:
        return f"{year}-{year+1}"
    if date.month >= 12:
        return f"{year+1}-{year+2}"
    return f"{year-1}-{year}"

# Term label for each lobbying expenditure; values that are not Timestamps yield None.
lob['term'] = lob['EXPN_DATE'].apply(get_term)


In [239]:
# Backfill full_name where it is still missing (a float, i.e. NaN).
# For each distinct (Term, Last, chamber) with a missing name:
#   1. borrow the name from a row with the same last name and term (any chamber);
#   2. failing that, from any row with the same last name (any term or chamber);
#   3. failing that, print the last name and term so it can be fixed by hand.
# The write targets every row matching (Term, Last, chamber), not only the rows with a missing name.
# NOTE(review): the first candidate found is used, so two legislators sharing a last name can be conflated.
for i, row in politicians.loc[politicians['full_name'].apply(lambda x: isinstance(x, float)), ['Term', 'Last', 'chamber']].drop_duplicates().iterrows():
    term, last = row['Term'], row['Last']
    a = politicians.loc[(politicians['Last'] == last) & (politicians['Term'] == term) & (politicians['full_name'].apply(lambda x: isinstance(x, str)))]
    if len(a) > 0:
        politicians.loc[(politicians['Term'] == term) & (politicians['Last'] == last) & (politicians['chamber'] == row['chamber']), 'full_name'] = a['full_name'].values[0]
        continue
    else:
        # No same-term donor row: widen the search to every term.
        a = politicians.loc[(politicians['Last'] == last) & (politicians['full_name'].apply(lambda x: isinstance(x, str)))]
    if len(a) > 0:
        politicians.loc[(politicians['Term'] == term) & (politicians['Last'] == last) & (politicians['chamber'] == row['chamber']), 'full_name'] = a['full_name'].values[0]
    else:
        print(last, term)



In [240]:
# (lower-cased "Last, First", term) -> {'chamber', 'name'}, where 'name' is "First Last".
# Names without a comma are kept as they are.
pol_names_terms = {}
for full_name, term, chamber in politicians[['full_name', 'Term', 'chamber']].drop_duplicates().itertuples(index=False, name=None):
    if ',' in full_name:
        pieces = full_name.split(',')
        # Only the first two comma-separated pieces are used.
        name = pieces[1].strip() + ' ' + pieces[0].strip()
    else:
        name = full_name
    pol_names_terms[(full_name.lower(), term)] = {'chamber': chamber, 'name': name}


In [241]:
def _first_last(full_name):
    """Turn "Last, First" into "First Last"; names without a comma pass through unchanged."""
    if ',' not in full_name:
        return full_name
    pieces = full_name.split(',')
    return pieces[1].strip() + ' ' + pieces[0].strip()


politicians['clean_full_name'] = politicians['full_name'].apply(_first_last)

# "First Last" -> original "Last, First".
name_fix = dict(
    politicians[['clean_full_name', 'full_name']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)


In [242]:
# Align the term column name with `politicians` and tag each frame with its chamber.
expend_assembly = expend_assembly.rename(columns={'term': 'Term'}).assign(chamber='assembly')
expend_senate = expend_senate.rename(columns={'term': 'Term'}).assign(chamber='senate')


In [243]:
# Expenditures with a matched target legislator from both chambers, stacked.
# Each chamber is de-duplicated separately on (Expender, Amount, target, DateEnd).
campaign_contributions = pd.concat([
    frame.loc[
        frame['matched_target_name'].notna(),
        ['Expender', 'Amount', 'matched_target_name', 'Term', 'chamber', 'DateEnd'],
    ].drop_duplicates(subset=['Expender', 'Amount', 'matched_target_name', 'DateEnd'])
    for frame in (expend_assembly, expend_senate)
])


In [244]:
# Parse the reporting-period end date; unparseable values raise (no errors='coerce').
campaign_contributions['DateEnd'] = pd.to_datetime(campaign_contributions['DateEnd'])


In [245]:
# NOTE(review): this `bill_ID` is `repair_bill_id(bill_id)` only; it does not strip the version suffix
# or leading zeros the way `bill_vers['bill_ID']` is built — confirm the two id formats are meant to match.
sponsors['bill_ID'] = sponsors['bill_id'].apply(repair_bill_id)


In [246]:
# Pair each history row with the individual votes cast on the same bill on the same day.
# Both frames carry a `bill_id` column: the history one (suffix _x) is kept as `bill_version`,
# the vote-table one (suffix _y) is dropped.
voting = history.merge(bill_votes, left_on=['bill_ID', 'Date'], right_on=['bill_id', 'vote_date_time'], how='inner').rename(columns={'bill_id_x': 'bill_version'}).drop('bill_id_y', axis=1)
# Version id with the second session year inserted.
voting['bv_id'] = voting['bill_version'].apply(repair_bill_id)


In [247]:
# Infer where each vote took place: for every (motion, term, chamber, date), find the committee
# with the most `politicians` rows among the legislators who voted.
# Voters are matched to `politicians` by last name within the same chamber and term.
# NOTE(review): a floor vote includes members of every committee, so the result is then simply the
# committee with the most rows — confirm floor votes are handled elsewhere.
voting_places = {}
for i, row in voting.groupby(['motion_id', 'term', 'chamber', 'Date']).agg({'legislator_name': lambda x: list(x)}).iterrows():
    motion_id, term, chamber, date = i
    g = politicians.loc[(politicians['chamber'] == chamber) & (politicians['Term'] == term) & (politicians['Last'].isin(row['legislator_name']))]
    voting_places[(motion_id, term, chamber, date)] = {
        # `index[0]` raises IndexError if `g` has rows but every committee_clean is missing.
        'most_common_committee': g.groupby('committee_clean').size().sort_values(ascending=False).head(1).index[0] if len(g) > 0 else None
    }
# Attach the inferred committee to every vote row; unknown keys give None.
voting['voting_place'] = voting.apply(lambda row: voting_places.get((row['motion_id'], row['term'], row['chamber'], row['Date']), {}).get('most_common_committee', None), axis=1)


In [248]:
# Distinct (bill_id, committee) hearing pairs; location codes absent from `locations` give a NaN committee (left join).
hear = hearings[['bill_id', 'location_code']].merge(locations[['committee_code', 'committee_clean']], left_on='location_code', right_on='committee_code', how='left')[['bill_id', 'committee_clean']].drop_duplicates()
# The first four characters of bill_id are parsed as the year.
hear['year'] = hear['bill_id'].apply(lambda x: int(x[:4]))


 ### Load

In [249]:
# Ordered (pattern, label) rules: the first pattern found in a raw position string decides its label.
# The whitespace-tolerant patterns accept titles with stray spaces between letters.
_POSITION_RULES = (
    (r'Democratic\s*Alternate', 'Democratic Alternate'),
    (r'V\s*i\s*c\s*e\s*-*\s*C\s*h\s*a\s*i\s*r\s*', 'Vice Chair'),
    (r'Co\s*-\s*Chair', 'Co-Chair'),
    (r'Cha\s*i\s*r', 'Chair'),
    (r'\s*Republican\s*Alternate', 'Republican Alternate'),
)


def _normalise_position(raw_position):
    """Return the canonical label for a raw position string, or the string itself if no rule matches."""
    for pattern, label in _POSITION_RULES:
        if re.search(pattern, raw_position) is not None:
            return label
    return raw_position


# raw position -> canonical position.
positions = {p: _normalise_position(p) for p in politicians['position'].unique()}


In [250]:
# Version-level `ID` -> VersionNum (the last row wins for a repeated ID).
vnums = dict(zip(bill_vers['ID'], bill_vers['VersionNum']))

# Version-level `ID` -> bill-level bill_ID (inverse of `version_id_mapping`).
vid_map = {}
for base_id, version_ids in version_id_mapping.items():
    for version_id in version_ids:
        vid_map[version_id] = base_id


In [251]:
# Lower-case the House so it is comparable with politicians['chamber']; non-strings become None.
sponsors['chamber'] = sponsors['House'].apply(lambda x: x.lower() if isinstance(x, str) else None)
# Resolve each author to a legislator by (chamber, last name, term). Left join: unmatched authors keep NaN full_name.
sponsors = sponsors.merge(politicians[['Term', 'Last', 'chamber', 'full_name']].drop_duplicates(), left_on=['chamber', 'Name', 'term'], right_on=['chamber', 'Last', 'Term'], how='left')


In [252]:
from unidecode import unidecode

def clean_text(text):
    """Normalise a comma-separated name for lookup.

    The comma-separated pieces are reversed and joined with single spaces, the result
    is lower-cased, stripped and transliterated with `unidecode`, and every character
    that is neither a word character nor whitespace is removed.
    """
    reordered = " ".join(reversed(text.split(',')))
    transliterated = unidecode(reordered.lower().strip())
    return re.sub(r'[^\w\s]', '', transliterated)

# Same lookup as `pol_names_terms`, re-keyed by the `clean_text`-normalised name.
pol_names_terms2 = {
    (clean_text(raw_name), term): info
    for (raw_name, term), info in pol_names_terms.items()
}


In [253]:
def text_clean(title):
    """Lower-case a title after dropping parenthesised text and punctuation.

    Parenthesised segments are removed, every non-alphanumeric character becomes a
    space, runs of whitespace collapse to one space, and the ends are trimmed.
    Non-string input gives ''.
    """
    if not isinstance(title, str):
        return ''

    cleaned = title
    for pattern, replacement in (
        (r'\(.*?\)', ''),
        (r'[^a-zA-Z0-9\s]', ' '),
        (r'\s+', ' '),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

# Normalised, lower-case firm key in both the full lobbying frame and the beneficiary subset.
lobbying['firm'] = lobbying['FIRM'].apply(text_clean)
lob['firm'] = lob['FIRM'].apply(text_clean)

 ## Pre-Computations

## Regions

In [254]:
def _safe_dt(s):
    return pd.to_datetime(s, errors='coerce')

def _canon_name(n):
    n = re.sub(r'[^\w\s]', ' ', str(n)).lower()
    n = re.sub(r'\s+', ' ', n).strip()
    return n

def _infer_origin_chamber_from_bill_id(bill_id):
    s = str(bill_id)
    if 'AB' in s: return 'assembly'
    if 'SB' in s: return 'senate'
    return None

def _term_from_date(ts):
    if pd.isna(ts): return np.nan
    y = ts.year
    if y % 2 == 1:
        return f"{y}-{y+1}"
    else:
        if ts.month < 11:
            return f"{y-1}-{y}"
        return f"{y+1}-{y+2}"

def _tokenize(s):
    s = str(s).lower()
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return [t for t in s.split(' ') if t]

def _jaccard(a_set, b_set):
    if not a_set and not b_set: return 1.0
    i = len(a_set & b_set)
    u = len(a_set | b_set)
    return i / u if u else 0.0

def read_zip(zip_path, crs=3857):
    """Read the first shapefile inside a zip archive as a GeoDataFrame in EPSG:3857.

    Parameters
    ----------
    zip_path : path-like
        Archive containing a shapefile (any nesting depth).
    crs : int
        EPSG code the layer is labelled with before reprojection.
        NOTE(review): `set_crs` only labels the data; per geopandas it raises if the
        file already declares a different CRS — confirm the archives carry no
        conflicting .prj.

    Returns
    -------
    (GeoDataFrame, tempfile.TemporaryDirectory)
        The layer reprojected to EPSG:3857, and the directory it was extracted into.
        The directory is handed back to the caller, who decides when it is released.

    Raises
    ------
    FileNotFoundError
        If the archive contains no ``.shp`` file.
    """
    tmp = tempfile.TemporaryDirectory()
    try:
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(tmp.name)

        shp = next(pathlib.Path(tmp.name).rglob("*.shp"), None)
        if shp is None:
            # Previously surfaced as a bare StopIteration with no hint of the cause.
            raise FileNotFoundError(f"no .shp file found inside {zip_path}")

        gdf = gpd.read_file(shp).set_crs(epsg=crs)
        gdf = gdf.to_crs(epsg=3857)
    except Exception:
        # Do not leave the extracted files behind when reading fails.
        tmp.cleanup()
        raise
    return gdf, tmp

def district_cycle(year):
    """Label of the district map in force for `year`: "2001" up to 2012, "2011" up to 2022, else "current"."""
    for last_year, label in ((2012, "2001"), (2022, "2011")):
        if year <= last_year:
            return label
    return "current"


# County population table: skip the first line of the file and keep the first 12 columns.
populations = pd.read_csv('E-4_2010-2020-Internet-Version.csv', skiprows=1).iloc[:, :12]

# Everything to text first, so separators and other non-digits can be stripped uniformly.
for column in populations.columns:
    populations[column] = populations[column].astype(str)

# Every column after the first is reduced to its digits and cast to int.
for column in populations.columns[1:]:
    populations[column] = populations[column].str.replace(r'[^0-9]', '', regex=True).astype(int)

# "<COUNTY> County" -> mean of that row's numeric columns.
pops = {
    f'{row["COUNTY"].strip()} County': np.mean(row.iloc[1:])
    for _, row in populations.iterrows()
}




In [255]:
# County polygons (read_zip returns them in EPSG:3857); the temp-dir handle is discarded.
counties_gdf, _ = read_zip('dashboard/backend/ca_counties.zip')
counties_gdf = counties_gdf[['COUNTYFP', 'NAMELSAD', 'geometry']]
# NOTE(review): EPSG:3857 (Web Mercator) is not an equal-area projection, so these "m2" values
# are inflated and vary with latitude. Ratios within one county largely cancel, but absolute
# areas and densities are not true square metres — consider an equal-area CRS.
counties_gdf['county_area_m2'] = counties_gdf.geometry.area
counties_gdf['county_id'] = counties_gdf['COUNTYFP'].astype(int)
# Counties whose NAMELSAD is not a key of `pops` get NaN population.
counties_gdf['population'] = counties_gdf['NAMELSAD'].map(pops).astype('float64')
counties_gdf['pop_density_per_m2'] = counties_gdf['population'] / counties_gdf['county_area_m2']
data_dir = pathlib.Path('dashboard/backend')
asm11_zip = data_dir / '2011_assembly_state_shp.zip'
sen11_zip = data_dir / '2011_senate_state_shp.zip'
asmcur_zip = data_dir / '2021_AD_Final_shp.zip'
sencur_zip = data_dir / '2021_SD_Final_shp.zip'
# (archive, house, cycle label, EPSG code the archive's layer is labelled with).
dist_info = [(asm11_zip, "assembly", "2011", 4019),(sen11_zip, "senate", "2011", 4019),(asmcur_zip, "assembly","current", 4269),(sencur_zip, "senate",  "current", 4269)]
# For every district map, intersect districts with counties and compute, per
# (district, county) fragment, the share of the district that falls in that county —
# once by area (w_area) and once by estimated population (w_pop).
weight_records = []
tmps = []  # keeps the TemporaryDirectory handles referenced for the rest of the session
for zp, house, cycle, crs in dist_info:
    gdf, tmp = read_zip(zp, crs)
    tmps.append(tmp)
    # assumes the first column of each shapefile is the district identifier — TODO confirm
    gdf = gdf.rename(columns={gdf.columns[0]: "district_id"})[["district_id", "geometry"]]
    gdf["house"] = house
    gdf["cycle"] = cycle
    gdf = gdf.to_crs(3857)
    gdf['dist_area_m2'] = gdf.geometry.area
    # One row per district/county overlap polygon.
    inter = gpd.overlay(gdf, counties_gdf, how="intersection")
    inter = inter[['house', 'cycle', 'district_id', 'COUNTYFP', 'NAMELSAD', 'geometry', 'dist_area_m2', 'county_area_m2', 'population', 'pop_density_per_m2']]
    inter['fragment_area_m2'] = inter.geometry.area
    # Population of a fragment, assuming the county's population is spread uniformly over its area.
    inter['est_overlay_pop'] = inter['pop_density_per_m2'] * inter["fragment_area_m2"]

    # Estimated population of the whole district (sum over its fragments).
    inter["pop_denominator"] = inter.groupby(["house","cycle","district_id"])["est_overlay_pop"].transform("sum")

    # Area of the district that overlaps any county (sum over its fragments).
    inter["area_denominator"] = inter.groupby(["house","cycle","district_id"])["fragment_area_m2"].transform("sum")
    inter["w_area"] = np.where(inter["area_denominator"] > 0,
                               inter["fragment_area_m2"] / inter["area_denominator"],
                               np.nan)

    inter["w_pop"] = inter["est_overlay_pop"] / inter["pop_denominator"]
    # Fall back to the area share where the population share is NaN, infinite or negative.
    bad = ~np.isfinite(inter["w_pop"]) | (inter["w_pop"] < 0)
    inter.loc[bad, "w_pop"] = inter.loc[bad, "w_area"]
    # Unlike w_area, this divides by the district's full area, not by the sum of its fragments.
    inter["district_share_in_county_area"] = inter["fragment_area_m2"] / inter["dist_area_m2"]
    inter["district_share_in_county_pop"] = inter["w_pop"]

    weight_records.append(
        inter[[
            "house","cycle","district_id","COUNTYFP","NAMELSAD",
            "fragment_area_m2","dist_area_m2","county_area_m2",
            "population","pop_density_per_m2","est_overlay_pop",
            "w_pop","w_area","district_share_in_county_pop","district_share_in_county_area"
        ]].reset_index(drop=True)
    )

# One long table of district-to-county weights across all houses and cycles.
weights = pd.concat(weight_records, ignore_index=True)


In [256]:
# Attach party and display name to each vote by (last name, chamber, term). Left join: unmatched voters keep NaN.
# NOTE(review): `politicians` has several rows per legislator, so this join can multiply vote rows;
# the next cell de-duplicates on the selected columns.
vote_events = voting.merge(politicians[['Party', 'Term', 'Last', 'clean_full_name', "chamber"]], left_on=['legislator_name', 'chamber', 'term'], right_on=['Last', 'chamber', 'Term'], how='left')

In [257]:
# Keep only the columns needed downstream and drop the duplicate rows produced by the join.
vote_events = vote_events[['Date', 'bill_ID', 'bv_id', 'chamber', 'term', 'voting_place', 'Party', 'clean_full_name', "vote_code", "motion_id", "location_code"]].drop_duplicates()

In [258]:
# Vote codes recognised as a yes (1) or a no (0); every other code is discarded below.
vote_map = {
    "AYE": 1,
    "YES": 1,
    1: 1,
    "NO": 0,
    0: 0,
}

vote_events = (
    vote_events
    .assign(vote=vote_events['vote_code'].map(vote_map))
    .dropna(subset='vote')
    .astype({'vote': int})
)

In [259]:
# Precomputed actor/topic table, loaded from the dashboard backend outputs.
actor_topic = pd.read_parquet('dashboard/backend/data/outs/actor_topic.parquet')


In [260]:
# Distinct legislator identity rows; `Term` is renamed to `term` to line up with the funding frames.
po = politicians[['Party', 'District No.', 'Term', 'Last', 'full_name', 'chamber', 'clean_full_name']].drop_duplicates().rename(columns={'Term': 'term'})


In [261]:
def lob_name_fix(row):
    """Look up the `name` recorded for a lobbying beneficiary in a given term.

    The key is (row['clean_beneficiary'], row['term']). `pol_names_terms` is
    consulted first and `pol_names_terms2` second; None is returned when the
    key is in neither.
    """
    key = (row['clean_beneficiary'], row['term'])
    if key in pol_names_terms:
        return pol_names_terms[key].get('name')
    if key in pol_names_terms2:
        return pol_names_terms2[key].get('name')
    return None

lob['clean_full_name'] = lob.apply(lob_name_fix, axis=1)


In [262]:
# Lobbying payments joined to the politician they name (inner join, so
# payments without a resolved politician are dropped).
lo = lob.merge(po, on=['clean_full_name', 'term']).loc[
    :, ['firm', 'BENE_AMT', 'term', 'Party', 'chamber', 'full_name', 'District No.']
]

In [263]:
# Campaign contributions joined to the recipient politician (inner join on
# matched name and term). `chamber_x` is the contribution table's own chamber.
do = campaign_contributions.merge(
    po,
    left_on=['matched_target_name', 'Term'],
    right_on=['full_name', 'term'],
).loc[:, ['Expender', 'Amount', 'Term', 'Party', 'chamber_x', 'full_name', 'District No.']]

In [264]:
# Give the donation and lobbying frames identical column names (assigned by
# position) so they can be stacked into one funding table.
for t in [do, lo]:
    t.columns = ['Firm', 'Amount', 'Term', 'Party', 'Chamber', 'full_name', 'District']

In [265]:
# Stack donations and lobbying payments, tagging each row with its source,
# and make sure the amount column is numeric.
funding = pd.concat(
    [do.assign(kind="donation"), lo.assign(kind="lobbying")],
    ignore_index=True,
).astype({"Amount": float})


In [266]:
# Rename columns (by position) to the names used by the district weights,
# then derive the first year of each "YYYY-YYYY" term.
funding = funding.set_axis(
    ['Firm', 'amount', 'term', 'party', 'house', 'full_name', 'district_id', 'kind'],
    axis=1,
)
funding['year'] = funding['term'].map(lambda term: int(term.split('-')[0]))


In [267]:
# Map each year to a districting cycle; anything other than 'current' is
# collapsed to '2011'.
funding['cycle'] = funding['year'].apply(district_cycle)
funding['cycle'] = funding['cycle'].where(funding['cycle'] == 'current', '2011')

# District ids may carry stray whitespace; strip it and cast to int via float.
funding['district_id'] = (
    funding['district_id']
    .apply(lambda x: re.sub(r'\s+', '', str(x)))
    .astype(float)
    .astype(int)
)

In [268]:
# Fan each funding row out to every county fragment of the recipient's
# district (one output row per funding row x county overlap).
# NOTE: validate="many_to_many" performs no uniqueness check; it only
# documents that duplicate keys are expected on both sides.
funding_w = funding.merge(
    weights,
    on=["house","cycle","district_id"],
    how="left",
    validate="many_to_many"
)


In [269]:
# Apportion each payment to a county by the population-based share of the
# district that falls inside that county.
funding_w["county_amount"] = funding_w["amount"].mul(funding_w["district_share_in_county_pop"])

In [270]:
# Total apportioned funding per county, keyed by an integer county id.
county_funding = (
    funding_w
    .groupby(["COUNTYFP", "NAMELSAD"], as_index=False)
    .agg(total_amount=("county_amount", "sum"))
    .rename(columns={"COUNTYFP": "county_id", "NAMELSAD": "county_name"})
    .astype({"county_id": int})
)


In [271]:
# Apportioned funding per (county, funder).
county_funders = (
    funding_w
    .groupby(["COUNTYFP", "NAMELSAD", "Firm"])["county_amount"]
    .sum()
    .reset_index(name="total_amount")
    .rename(columns={
        "COUNTYFP": "county_id",
        "NAMELSAD": "county_name",
        "Firm": "funder",
    })
)

In [272]:
# Apportioned funding per (county name, term). COUNTYFP is not a grouping
# key here, so the result carries no county id column.
county_term_funding = (
    funding_w
    .groupby(['NAMELSAD', 'term'], as_index=False)
    .agg(total_amount=('county_amount', "sum"))
    .rename(columns={"NAMELSAD": "county_name"})
)

In [273]:
# Load the per-node-type id maps and the legislator / committee lookups.
with open('node_id_map.json', 'r') as f:
    node_id_map = json.load(f)

# Context managers close the pickle files promptly; the bare
# `pickle.load(open(...))` form left the handles open.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('legislators.pkl', 'rb') as f:
    legislators = pickle.load(f)
with open('committees.pkl', 'rb') as f:
    committees = pickle.load(f)

# Invert every per-type mapping (value -> key) so entries can be looked up
# by the value stored in node_id_map.
node_ids = {
    node_type: {value: key for key, value in mapping.items()}
    for node_type, mapping in node_id_map.items()
}

In [274]:
def name_from_id(row):
    """Resolve an (actor_index, actor_type) pair to a display name.

    Donors and lobby firms are read straight from `node_ids`. For every other
    type the mapped label is expected to look like "<number>_<suffix>"; the
    number is looked up in `committees` for committees and in `legislators`
    otherwise. Returns None when the index is unmapped.
    """
    node_id, node_type = row
    if node_type in ['donor', 'lobby_firm']:
        return node_ids[node_type].get(node_id)

    label = node_ids.get(node_type, {}).get(node_id)
    if not label:
        return None

    number = int(label.split("_")[0])
    lookup = committees if node_type == 'committee' else legislators
    return lookup.get(number)

actor_topic['name'] = actor_topic[['actor_index', 'actor_type']].apply(name_from_id, axis=1)

In [275]:
with open('subject_key.json', 'r') as f:
    subject_key = json.load(f)

In [276]:
# Mean stance per (funder, topic) for donors and lobby firms.
funder_topics = (
    actor_topic
    .loc[actor_topic['actor_type'].isin(['donor', 'lobby_firm'])]
    .assign(
        actor_name=lambda d: d['name'].str.strip()
    )
    [['actor_name', 'topic_id', 'stance']]
    .dropna(subset=['actor_name', 'topic_id', 'stance'])
    .groupby(['actor_name', 'topic_id'], as_index=False)
    .agg(stance=('stance', 'mean'))
)

# Attach topic stances to each (county, funder) total; funders without any
# topic stance are dropped.
county_funder_topics = (
    county_funders
    .merge(
        funder_topics,
        left_on='funder',
        right_on='actor_name',
        how='left'
    )
    .drop(columns='actor_name')
    .dropna(subset=['topic_id', 'stance'])
)

# Money-weighted stance, unsigned and signed. The two columns used to be
# computed twice in a row; a single assignment is enough.
county_funder_topics = county_funder_topics.assign(
    abs_weight=lambda d: d['total_amount'] * d['stance'].abs(),
    signed_weight=lambda d: d['total_amount'] * d['stance'],
)

def top_topics(df, sign, n=3):
    """Return the `n` heaviest topics per (county_id, funder) on one side.

    `sign == 'support'` keeps rows with stance > 0; any other value keeps
    rows with stance < 0. Rows are ordered by `abs_weight` descending and a
    `position` column holding `sign` is added.
    """
    if sign == 'support':
        side = df['stance'] > 0
    else:
        side = df['stance'] < 0

    ranked = df.loc[side].sort_values('abs_weight', ascending=False)
    leaders = ranked.groupby(['county_id', 'funder'], as_index=False).head(n)
    return leaders.assign(position=sign)

# Top three supported and top three opposed topics per (county, funder),
# stacked into one long table and labelled with the topic name.
top_support = top_topics(county_funder_topics, 'support', n=3)
top_oppose = top_topics(county_funder_topics, 'oppose', n=3)
topics_long = pd.concat([top_support, top_oppose], ignore_index=True).assign(
    # subject_key is keyed by the topic id rendered as an integer string
    topic=lambda d: d['topic_id'].astype(int).astype(str).map(subject_key)
)

def pack_topics(df):
    """Collapse one group's long topic rows into two ordered topic lists.

    Returns a Series with `top_supported_topics` and `top_opposed_topics`,
    each a list of `topic` values sorted by `abs_weight` descending.
    """
    ordered = {}
    for side in ('support', 'oppose'):
        subset = df.loc[df['position'] == side]
        ordered[side] = subset.sort_values('abs_weight', ascending=False)['topic'].tolist()

    return pd.Series({
        'top_supported_topics': ordered['support'],
        'top_opposed_topics': ordered['oppose'],
    })


# One row per (county, funder): pack_topics turns that group's long topic
# rows into a supported-topics list and an opposed-topics list.
# total_amount is part of the group key only so that it survives into the
# result; it is constant within a (county, funder) pair.
county_top_funders = (
    topics_long
    .groupby(
        ['county_id', 'county_name', 'funder', 'total_amount'],
        as_index=False
    )
    .apply(pack_topics)
    .reset_index(drop=True)
)

  .apply(pack_topics)


In [277]:
# Keep each county's five largest funders. The rank is dense, so funders
# tied on total_amount share a rank and a county can keep more than five rows.
county_rank = (
    county_top_funders
    .groupby('county_id')['total_amount']
    .rank(method='dense', ascending=False)
)
county_top_funders = (
    county_top_funders
    .assign(rank_in_county=county_rank)
    .loc[lambda d: d['rank_in_county'].le(5)]
    .sort_values(['county_id', 'rank_in_county'])
    .drop(columns='rank_in_county')
)

In [278]:
# Fix the column order and flatten the topic lists to " / "-separated text.
county_top_funders = county_top_funders[[
    'county_id', 'county_name', 'funder', 'total_amount',
    'top_supported_topics', 'top_opposed_topics',
]].assign(
    top_opposed_topics=lambda d: d['top_opposed_topics'].apply(" / ".join),
    top_supported_topics=lambda d: d['top_supported_topics'].apply(" / ".join),
)

In [279]:
# Total amount per firm, and the share of it attributed to each county.
# NOTE(review): the de-duplication key includes NAMELSAD, so a payment to a
# district spanning k counties is counted k times in firm_total, and identical
# repeat payments are collapsed into one. Confirm this is intended; if not,
# sum `funding['amount']` by Firm instead.
firm_totals = (
    funding_w[['Firm', 'amount', 'term', 'NAMELSAD', 'full_name']]
    .drop_duplicates()
    .groupby('Firm')['amount']
    .sum()
    .to_dict()
)
county_top_funders['firm_total'] = county_top_funders['funder'].map(firm_totals)
county_top_funders['region_concentration'] = (
    county_top_funders['total_amount'].div(county_top_funders['firm_total'])
)

## Power & Influence

In [280]:
# Funding per legislator-term, split into one column per funding kind.
funding_by_kind = (
    funding
    .groupby(['full_name', 'term', 'house', 'party', 'kind'], as_index=False)
    .agg(amount=('amount', 'sum'))
)
legislator_funding = funding_by_kind.pivot_table(
    index=['full_name', 'term', 'house', 'party'],
    columns='kind',
    values='amount',
    fill_value=0,
).reset_index()
legislator_funding.columns.name = None
legislator_funding = legislator_funding.rename(columns={'donation': 'donations'})

legislator_funding['total_funding'] = (
    legislator_funding['donations'] + legislator_funding['lobbying']
)

# Percentile within the term, percentile within term and house, and a dense
# rank (1 = most funded) within term and house.
totals_by_term = legislator_funding.groupby('term')['total_funding']
totals_by_house = legislator_funding.groupby(['term', 'house'])['total_funding']
legislator_funding = legislator_funding.assign(
    funding_pct_overall=totals_by_term.rank(pct=True),
    funding_pct_house=totals_by_house.rank(pct=True),
    funding_rank_house=totals_by_house.rank(method='dense', ascending=False),
)

In [281]:
# Bucket the within-term funding percentile into four tiers:
# (0, 0.5] Low, (0.5, 0.9] Medium, (0.9, 0.97] High, (0.97, 1.0] Very High.
legislator_funding['funding_tier'] = pd.cut(
    legislator_funding['funding_pct_overall'],
    bins=[0, 0.5, 0.9, 0.97, 1.0],
    labels=['Low', 'Medium', 'High', 'Very High']
)

In [282]:
# Legislator-term rows of the actor/topic table. Take an explicit copy so the
# column assignment below writes to an independent frame, not a slice.
leg_topics = actor_topic.loc[actor_topic['actor_type'] == 'legislator_term'].copy()

# The mapped label is split on "_" and its second piece is taken as the term.
# Indices missing from the map yield None instead of crashing on
# `None.split`; such rows are removed by the groupby, which drops NaN keys.
leg_topics['term'] = leg_topics['actor_index'].apply(
    lambda x: node_ids['legislator_term'][x].split('_')[1]
    if x in node_ids['legislator_term'] else None
)

# Mean stance and influence per (legislator, term, topic).
leg_topics = (
    leg_topics
    .dropna(subset=['name', 'topic_id', 'stance'])
    [['name', 'term', 'topic_id', 'stance', "influence"]]
    .groupby(['name', 'term', 'topic_id'], as_index=False)
    .agg(stance=('stance', 'mean'), influence=("influence", "mean"))
)

In [283]:
def pack_focus_topics(df, n=5):
    """Summarise one legislator-term's topic stances.

    Returns a Series with `top_topics` (the integer topic ids of the `n` rows
    with the largest |stance|, strongest first) and `topic_concentration`
    (the sum of |stance| over all rows).
    """
    scored = df.assign(abs_i=df['stance'].abs())
    ranked = scored.sort_values('abs_i', ascending=False)
    leading_ids = ranked['topic_id'].head(n).astype(int).tolist()

    return pd.Series({
        'top_topics': leading_ids,
        'topic_concentration': scored['abs_i'].sum(),
    })

# Focus-topic summary per legislator-term, keyed by `full_name` for merging.
leg_topic_summary = leg_topics.groupby(['name', 'term']).apply(pack_focus_topics)
leg_topic_summary = leg_topic_summary.reset_index().rename(columns={'name': 'full_name'})

  .apply(pack_focus_topics)


In [284]:
# Influence per legislator-term: L1 and L2 norms of |influence| across topics
# plus the number of distinct topics.
overall_components = (
    leg_topics
    .assign(abs_impact=leg_topics['influence'].abs())
    .groupby(['name', 'term'], as_index=False)
    .agg(
        influence_l1=('abs_impact', 'sum'),
        influence_l2=('abs_impact', lambda x: np.sqrt((x**2).sum())),
        n_topics=('topic_id', 'nunique'),
    )
    .rename(columns={'name': 'full_name'})
)

# Overall score: L1 influence scaled by log(1 + topic breadth).
overall_components['overall_influence'] = (
    overall_components['influence_l1'] * np.log1p(overall_components['n_topics'])
)

# Standardise within each term (sample standard deviation).
influence_by_term = overall_components.groupby('term')['overall_influence']
overall_components['overall_influence_z'] = (
    (overall_components['overall_influence'] - influence_by_term.transform('mean'))
    / influence_by_term.transform('std')
)

In [285]:
# Funding table enriched with the topic summary and influence components;
# legislators without topic data keep NaN in those columns (left joins).
legislator_power = legislator_funding.merge(
    leg_topic_summary, how='left', on=['full_name', 'term']
)
legislator_power = legislator_power.merge(
    overall_components[[
        'full_name', 'term', 'overall_influence', 'overall_influence_z',
        'influence_l1', 'influence_l2', 'n_topics',
    ]],
    how='left',
    on=['full_name', 'term'],
)

In [286]:
# Bucket the within-term influence z-score: (-inf, -1] Low, (-1, 0] Below Avg,
# (0, 1] Above Avg, (1, inf) High. NaN z-scores stay NaN.
legislator_power['influence_tier'] = pd.cut(
    legislator_power['overall_influence_z'],
    bins=[-np.inf, -1, 0, 1, np.inf],
    labels=['Low', 'Below Avg', 'Above Avg', 'High']
)

In [287]:
# Display name: "Last, First" becomes "First Last"; names without a comma
# are kept unchanged.
legislator_power['name'] = [
    f"{x.split(',')[1].strip()} {x.split(',')[0].strip()}" if ',' in x else x
    for x in legislator_power['full_name']
]

In [288]:
# Label every vote with its motion text.
vote_events['motion'] = vote_events['motion_id'].map(motion_codes)

# Votes attributable to a known legislator, keyed by `full_name`.
leg_votes = (
    vote_events
    .dropna(subset=['clean_full_name', 'vote'])
    .rename(columns={'clean_full_name': 'full_name'})
)

# Share of yes votes per legislator-term.
yes_rate = (
    leg_votes
    .groupby(['full_name', 'term'])['vote']
    .mean()
    .reset_index(name='yes_rate')
)

In [289]:
# Motion labels counted as procedural friction.
FRICTION = {
    'Concurrence',
    'Reconsideration',
    'Urgency Clause',
    'Placed on Appropriations Suspense File',
    'W/O REF. TO FILE'
}

leg_votes['procedural'] = leg_votes['motion'].isin(FRICTION)

# Share of each legislator-term's votes that were cast on friction motions.
procedural_exposure = (
    leg_votes
    .groupby(['full_name', 'term'])['procedural']
    .mean()
    .reset_index(name='procedural_exposure')
)

In [290]:
# Bring each legislator-term's yes rate back onto the individual votes.
# NOTE: re-running this cell merges `yes_rate` again and produces
# yes_rate_x / yes_rate_y columns; run it once per kernel session.
leg_votes = leg_votes.merge(
    yes_rate.loc[:, ['full_name', 'term', 'yes_rate']],
    how='left',
    on=['full_name', 'term'],
)
leg_votes = leg_votes.assign(
    vote_deviation=leg_votes['vote'].sub(leg_votes['yes_rate']).abs()
)

# Mean absolute deviation of a legislator's votes from their own yes rate.
vote_volatility = (
    leg_votes
    .groupby(['full_name', 'term'])['vote_deviation']
    .mean()
    .reset_index(name='vote_volatility')
)

In [291]:
# Flag votes taken at the AFLOOR / SFLOOR location codes.
leg_votes['is_floor'] = leg_votes['location_code'].isin(['AFLOOR', 'SFLOOR'])

# Share of each legislator-term's votes cast anywhere other than those two.
floor_share = leg_votes.groupby(['full_name', 'term'])['is_floor'].mean()
procedural_leverage = (1 - floor_share).reset_index(name='procedural_leverage')

In [292]:
# One row per legislator-term carrying all four behaviour metrics.
legislator_behavior = yes_rate
for metric in (procedural_exposure, vote_volatility, procedural_leverage):
    legislator_behavior = legislator_behavior.merge(
        metric, on=['full_name', 'term'], how='left'
    )

## Bills Table

In [293]:
# Party head-counts per term.
# NOTE(review): the first three pivot columns are relabelled D / I / R by
# position, which assumes exactly that party ordering in the data - confirm.
party_counts = (
    politicians[['chamber', 'Term', 'Party', "full_name"]]
    .drop_duplicates()
    .pivot_table(index='Term', columns='Party', aggfunc='count')
    .iloc[:, :3]
)
party_counts.columns = ['D', 'I', 'R']
pols = party_counts.drop(columns='I').reset_index()

# Two-party shares; the 'I' column is dropped above and ignored here.
two_party_total = pols['D'] + pols['R']
pols['p_D'] = pols['D'] / two_party_total
pols['p_R'] = pols['R'] / two_party_total

# Earliest and latest recorded date for every bill that has a vote.
voted_bills = vote_events['bill_ID'].drop_duplicates()
bill_events = pd.DataFrame({
    "bill_ID": voted_bills.values,
    'First': voted_bills.apply(lambda x: min(introduction_dates.get(x)['Dates'])).values,
    "Last": voted_bills.apply(lambda x: max(introduction_dates.get(x)['Dates'])).values,
})

In [294]:
def voting_rows(row):
    """Resolve a vote row's legislator name via `legislators_last_names`.

    The lookup key is (lower-cased chamber, stripped lower-cased name, term).
    When the key is missing, a name containing a space is returned unchanged;
    a single-word name yields None.
    """
    raw_name = row['legislator_name']
    key_name = raw_name.strip().lower()
    match = legislators_last_names.get((row['chamber'].lower(), key_name, row['term']))
    if match is not None:
        return match
    return raw_name if len(key_name.split(' ')) > 1 else None

voting['full_name'] = voting.apply(voting_rows, axis=1)

In [295]:
# Attach party to each vote via (chamber, full_name); unmatched votes are kept.
vot = voting.merge(
    politicians[['chamber', 'Party', "full_name"]].drop_duplicates(),
    how='left',
    on=['chamber', 'full_name'],
)

In [296]:
# Share of 'AYE' votes per (bill, party), one column per party.
# NOTE(review): only the literal 'AYE' counts as yes here, while vote_map
# also treats "YES" and 1 as yes - confirm the codes present in `voting`.
vt = (
    vot['vote_code'].eq('AYE')
    .groupby([vot['bill_ID'], vot['Party']])
    .mean()
    .unstack()
)
# Polarisation = absolute gap between the D and R aye shares.
bill_polarization = vt["D"].sub(vt["R"]).abs().rename("polarization")

In [297]:
# A vote "event" is a distinct (date, voting place) combination.
ve = vote_events.assign(
    comb=vote_events['Date'].astype(str) + "." + vote_events['voting_place']
)

# Distinct vote events per bill version, busiest first.
vote_num = (
    ve.groupby(['bill_ID', 'bv_id'])['comb']
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
)
# Distinct versions per bill.
n_versions = ve.groupby('bill_ID')['bv_id'].nunique().reset_index()
# Mean vote per (bill, term).
# NOTE: this rebinds `yes_rate`, which earlier held per-legislator rates.
yes_rate = ve.groupby(['bill_ID', 'term'])['vote'].mean().reset_index()

In [298]:
# Democratic seat share of the bill's term, and how far the bill's mean vote
# sits from it.
yes_rate['p_D'] = yes_rate['term'].map(dict(zip(pols['Term'], pols['p_D'])))
yes_rate['vote_deviation'] = yes_rate['vote'].sub(yes_rate['p_D']).abs()

# NOTE: vote_num has one row per (bill, version), so this join repeats each
# bill's row once per version.
yes_rate = yes_rate.merge(vote_num, how='left', on='bill_ID')

In [299]:
data = torch.load('data5.pt', weights_only=False)

In [300]:
# Pull the legislator -> bill-version vote edges out of the graph.
edge_type = ('legislator_term', 'voted_on', 'bill_version')
store = data[edge_type]

# Row 0 of edge_index is unpacked as the source, row 1 as the destination.
src, dst = store.edge_index
edge_attr = store.edge_attr
# The last edge-attribute column is read as the vote value
# (presumably a signed or 0/1 encoding - TODO confirm against the graph builder).
vote_idx = -1
votes = edge_attr[:, vote_idx].float()
# Binarise: any value > 0 counts as a yes vote.
votes = (votes > 0).float()
bv_to_bill = vid_map
# Destination index -> bill-version label (via node_ids) -> bill id (via vid_map);
# entries missing from either map become None.
bill_ids = [bv_to_bill.get(node_ids['bill_version'].get(int(bv))) for bv in dst.cpu().numpy()]

In [301]:
# One row per graph vote edge; edges whose bill could not be resolved are dropped.
vote_df = pd.DataFrame({'bill_ID': bill_ids, 'vote': votes.cpu().numpy()})
vote_df = vote_df.dropna(subset=['bill_ID'])

# Yes share and number of votes per bill.
bill_yes_rate = (
    vote_df
    .groupby('bill_ID')['vote']
    .agg(yes_rate='mean', n_votes='size')
    .reset_index()
)

In [302]:
# Assemble the per-bill statistics table.
# The merge order matters: `yes_rate` and `vote_num` both carry `bv_id` and
# `comb`, and `n_versions` carries `bv_id`, so pandas adds _x/_y suffixes that
# later cells depend on (`comb_x` = vote events per version, `bv_id_y` =
# number of versions per bill).
# NOTE(review): because `vote_num` has one row per (bill, version) and is
# joined twice on bill_ID alone, a bill with k versions appears to get up to
# k*k rows here; drop_duplicates() only removes exact duplicates.
bill_stats = (
    bill_yes_rate.merge(yes_rate, on='bill_ID', how='left')
    .merge(n_versions, on='bill_ID', how='left')
    .merge(bill_events, on='bill_ID', how='left')
    .merge(vote_num, on='bill_ID', how='left')
    .merge(bill_polarization, on='bill_ID', how='left')
    .drop_duplicates()
)

# Recompute the deviation from the graph-derived yes rate (this overwrites
# the value carried over from `yes_rate`).
bill_stats['vote_deviation'] = (bill_stats['yes_rate'] - bill_stats['p_D']).abs()

# Days between the first and last recorded date, floored at one day.
bill_stats['lifespan_days'] = (
    bill_stats['Last'] - bill_stats['First']
).dt.days.clip(lower=1)

In [303]:
# Party head-counts for the term each bill belongs to.
bill_stats['n_dem'] = bill_stats['term'].map(dict(zip(pols['Term'], pols['D'])))
bill_stats['n_rep'] = bill_stats['term'].map(dict(zip(pols['Term'], pols['R'])))

In [304]:
# sqrt(D * R / (D + R)^2): 0.5 for an evenly split chamber, shrinking toward
# 0 as one party dominates.
seat_product = bill_stats["n_dem"] * bill_stats["n_rep"]
seat_total = bill_stats["n_dem"] + bill_stats["n_rep"]
bill_stats["balance_factor"] = np.sqrt(seat_product / seat_total**2)

# Polarisation discounted by how lopsided the chamber is.
bill_stats["polarization_adj"] = bill_stats["polarization"] * bill_stats["balance_factor"]

In [305]:
# 1 when the vote is split 50/50, 0 when it is unanimous either way.
bill_stats['yes_uncertainty'] = 1 - (bill_stats['yes_rate'] - 0.5).abs() * 2

# Weight of vote deviation (vs. uncertainty) in the contention score.
ALPHA = 0.7

bill_stats['contention'] = (
    ALPHA * bill_stats['vote_deviation']
    + (1 - ALPHA) * bill_stats['yes_uncertainty']
)

# vote events (comb_x) x number of versions (bv_id_y) x log(1 + lifespan in days)
bill_stats['procedural_intensity'] = (
    bill_stats['comb_x'] * bill_stats['bv_id_y'] * np.log1p(bill_stats['lifespan_days'])
)

bill_stats['controversy'] = (
    (bill_stats['polarization'] * bill_stats['contention'])
    * np.log1p(bill_stats['procedural_intensity'])
)

In [306]:
with open('bill_labels.json', 'r') as f:
    bill_labels = json.load(f)

In [307]:
bill_stats['topic'] = bill_stats['bill_ID'].map(bill_labels)

In [308]:
# Load bill outcomes with a context manager so the file handle is closed
# (the bare `pickle.load(open(...))` form left it open).
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("bill_outcomes.pkl", "rb") as f:
    bill_outcomes = pickle.load(f)

# Bills without a recorded outcome default to 1.0.
bill_stats['outcome'] = bill_stats['bill_ID'].map(bill_outcomes).fillna(1.0)

In [309]:
pol_names = politicians[['Term', 'Last', 'full_name', 'chamber', 'clean_full_name']].drop_duplicates()

def strip_accents(text):
    """Return `text` as a str with combining marks removed.

    The value is coerced with `str()`, decomposed with NFD normalisation, and
    every character in Unicode category 'Mn' is dropped. The former
    `try/except NameError` wrapper around `str()` could never trigger and has
    been removed.
    """
    decomposed = unicodedata.normalize('NFD', str(text))
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

def fix_sponsors(row):
    """Resolve a sponsor row to a politician's `full_name`.

    Returns the row's own `full_name` when it is present. Otherwise the
    sponsor is looked up in `pol_names`: a hyphenated `Name` is matched
    against `Last` with hyphens replaced by spaces, any other `Name` is
    accent-stripped and matched against `clean_full_name`. Returns None when
    `Name` is not a string or nothing matches.
    """
    # `value != np.nan` is True for every value (NaN never compares equal),
    # so the old check always returned early; pd.notna is the real test.
    if pd.notna(row['full_name']):
        return row['full_name']

    raw_name = row['Name']
    if not isinstance(raw_name, str):
        return None

    if '-' in raw_name:
        column, key = 'Last', re.sub(r'-', ' ', raw_name)
    else:
        column, key = 'clean_full_name', strip_accents(raw_name)

    # Guard the empty-match case instead of indexing [0] unconditionally.
    matches = pol_names.loc[pol_names[column] == key, 'full_name'].values
    return matches[0] if len(matches) else None

In [310]:
sponsors['name'] = sponsors.apply(fix_sponsors, axis=1)

In [311]:
# Map each sponsored bill version to its bill, then collect the distinct
# author names per bill (rows without a full_name are ignored).
sponsors['bill'] = sponsors['bill_ID'].map(bv_to_bill)
bill_authors = (
    sponsors
    .dropna(subset=['full_name'])
    .groupby('bill')
    .agg({'full_name': lambda x: list(set(x))})
    .reset_index()
)

def reverse_names(name_list):
    """Join names as a comma-separated string with the first two parts swapped.

    "Last, First" becomes "First Last"; a name without a comma is split on
    whitespace and its first two tokens are swapped (any further tokens are
    ignored, as before). Parts are stripped so the result no longer has the
    stray leading space the comma form used to produce, and a single-token
    name is returned as-is instead of raising IndexError.
    """
    display = []
    for n in name_list:
        parts = n.split(',') if ',' in n else n.split()
        if len(parts) < 2:
            display.append(n.strip())
        else:
            display.append(parts[1].strip() + ' ' + parts[0].strip())
    return ", ".join(display)
bill_authors['Name'] = bill_authors['full_name'].apply(reverse_names)

In [312]:
# Attach the formatted author string to each bill (left join keeps bills
# with no known author).
bill_authors = bill_authors.rename(columns={'bill': 'bill_ID'})
bill_stats = bill_stats.merge(
    bill_authors.loc[:, ['bill_ID', 'Name']],
    how='left',
    on='bill_ID',
)

In [313]:
bill_stats['Subject'] = bill_stats['bill_ID'].map(bill_subjects)

In [314]:
subject_embeddings = torch.load('subject_embeddings.pt')
subject_embeddings = {k: v.cpu().numpy().tolist() for k, v in subject_embeddings.items()}

In [315]:
bill_stats['embeddings'] = bill_stats['Subject'].apply(text_clean).map(subject_embeddings)

In [316]:
def author_tokens(s):
    """Return the distinct lower-cased word tokens of an author string.

    Characters that are neither word characters nor whitespace are treated as
    separators. Non-string input (e.g. NaN) yields an empty list. Token order
    is not defined because the tokens pass through a set.
    """
    if isinstance(s, str):
        cleaned = re.sub(r'[^\w\s]', ' ', s.lower())
        return list(set(cleaned.split()))
    return []

bill_stats['author_tokens'] = bill_stats['Name'].apply(author_tokens)

In [317]:
# Derive the term "YYYY-YYYY" from the first eight characters of the bill id.
bill_stats['term'] = bill_stats['bill_ID'].map(lambda bill: bill[:4] + "-" + bill[4:8])

In [377]:
# Standardise each metric within its term (sample standard deviation).
for col in ['controversy', 'procedural_intensity', 'n_votes', 'lifespan_days', 'contention']:
    metric_by_term = bill_stats.groupby('term')[col]
    bill_stats[f'{col}_z'] = (
        (bill_stats[col] - metric_by_term.transform('mean'))
        / metric_by_term.transform('std')
    )

# Weighted blend of the standardised metrics (weights sum to 1).
bill_stats["combined_index"] = (
    0.3 * bill_stats["procedural_intensity_z"]
    + 0.4 * bill_stats["controversy_z"]
    + 0.3 * bill_stats["contention_z"]
)

# Within-term percentile (0-100) of the controversy z-score.
controversy_rank = bill_stats.groupby('term')['controversy_z'].rank(pct=True)
bill_stats['controversy_pct'] = controversy_rank * 100

## Topics

In [378]:
# Independent copy of bill_stats for the topic roll-ups, with the term
# re-derived from the bill id.
bill_topic_stats = bill_stats.assign(
    term=bill_stats['bill_ID'].apply(lambda x: f"{x[:4]}-{x[4:8]}")
)

In [379]:
# Per (topic, term): bill count, mean controversy, vote / version totals,
# mean lifespan, and the share of rows whose outcome equals 1.
topic_term_sum = (
    bill_topic_stats
    .assign(_passed=bill_topic_stats['outcome'] == 1)
    .groupby(['topic', 'term'], as_index=False)
    .agg(
        n_bills=('bill_ID', 'nunique'),
        avg_controversy=('controversy', 'mean'),
        total_votes=('n_votes', 'sum'),
        total_versions=('bv_id_y', 'sum'),
        avg_lifespan_days=('lifespan_days', 'mean'),
        pass_rate=('_passed', 'mean'),
    )
)

In [380]:
# Normalise each legislator-term's stances so that the |stance| weights sum
# to 1; legislator-terms whose stances are all zero are removed.
leg_topics = leg_topics.assign(abs_impact=leg_topics['stance'].abs())
leg_topics = leg_topics.assign(
    abs_denom=leg_topics.groupby(['name', 'term'])['abs_impact'].transform('sum')
)
leg_topics = leg_topics.loc[leg_topics['abs_denom'] > 0].assign(
    w_abs=lambda d: d['abs_impact'] / d['abs_denom'],
    w_signed=lambda d: d['stance'] / d['abs_denom'],
)

In [381]:
# Spread each payment across the recipient's topics in proportion to the
# topic weights; payments to legislators without topic data are dropped.
funding_alloc = funding.merge(
    leg_topics,
    left_on=['full_name', 'term'],
    right_on=['name', 'term'],
    how='left',
    validate='many_to_many',
).dropna(subset=['topic_id', 'stance', 'w_abs'])

funding_alloc = funding_alloc.assign(
    topic_amount=funding_alloc['amount'] * funding_alloc['w_abs'],
    topic_amount_signed=funding_alloc['amount'] * funding_alloc['w_signed'],
)

In [382]:
# Allocated funding per (term, topic), plus counts of payment rows, distinct
# funders and distinct recipients, labelled with the topic name.
topic_term_funding = (
    funding_alloc
    .groupby(['term', 'topic_id'], as_index=False)
    .agg(
        topic_amount=('topic_amount', 'sum'),
        topic_amount_signed=('topic_amount_signed', 'sum'),
        n_edges=('amount', 'size'),
        n_funders=('Firm', 'nunique'),
        n_recipients=('full_name', 'nunique'),
    )
    .assign(topic=lambda d: d['topic_id'].astype(int).astype(str).map(subject_key))
)

In [383]:
# Bill statistics joined with allocated funding per (topic, term).
# Topics with no allocated funding get topic_amount = 0. The previous fillna
# targeted 'total_topic_funding', a column that does not exist in the merged
# frame, so it silently did nothing and unfunded topics stayed NaN.
topic_term_summa = topic_term_sum.merge(
    topic_term_funding,
    on=['topic', 'term'],
    how='left'
).fillna({'topic_amount': 0})

In [384]:
# Independent copy of the rows for the two most recent terms.
latest_ = topic_term_summa.loc[
    topic_term_summa['term'].isin(['2023-2024', '2025-2026'])
].copy()

In [385]:
def top_term_sum(df, year_fix=True):
    """Add attention, rank, change and quadrant columns to a topic/term table.

    Parameters
    ----------
    df : DataFrame with `topic` and `term` (as columns or index levels) plus
        `n_bills`, `total_versions`, `avg_controversy`, `topic_amount` and
        `pass_rate`.
    year_fix : when True, add an integer `year` column taken from the first
        four characters of `term`.

    Returns
    -------
    A new DataFrame sorted by (topic, term). The input is not modified;
    previously the first three columns were written into the caller's frame
    as a side effect.
    """
    df = df.copy()

    df['attention_score'] = df['n_bills'] * np.log1p(df['total_versions'])

    # Percentile ranks within each term.
    df['controversy_rank'] = df.groupby('term')['avg_controversy'].rank(pct=True)
    df['funding_rank'] = df.groupby('term')['topic_amount'].rank(pct=True)

    # Term-over-term change per topic; requires rows ordered by term.
    df = df.sort_values(['topic', 'term'])
    df['delta_controversy'] = df.groupby('topic')['avg_controversy'].diff()
    df['delta_funding'] = df.groupby('topic')['topic_amount'].diff()
    df['delta_pass_rate'] = df.groupby('topic')['pass_rate'].diff()

    df['quadrant'] = np.select(
        [
            (df['controversy_rank'] > 0.75)
            & (df['funding_rank'] > 0.75),

            (df['controversy_rank'] < 0.25)
            & (df['funding_rank'] > 0.75)
            & (df['pass_rate'] > 0.7)
        ],
        [
            'High Controversy / High Funding',
            'Low Controversy / High Funding / High Success'
        ],
        default='Other'
    )

    if year_fix:
        df['year'] = df['term'].apply(lambda x: int(x[:4]))

    return df


In [386]:
# Vote-weighted mean of the balance-adjusted polarisation per (topic, term).
# NOTE: np.average raises ZeroDivisionError when a group's n_votes sum to 0,
# and a NaN polarization_adj in a group makes that group's result NaN.
topic_polarization = (
    bill_topic_stats
    .groupby(["topic", "term"])
    .apply(
        lambda g: np.average(
            g["polarization_adj"],
            weights=g["n_votes"]
        )
    )
    .rename("topic_polarization")
    .reset_index()
)


  .apply(


In [387]:
topic_term_summary = top_term_sum(topic_term_summa)

In [388]:
# Collapse the two most recent terms into one pseudo-term, 'current', per topic.
lhc = (
    latest_
    .groupby('topic')
    .agg(
        topic_amount=('topic_amount', 'sum'),
        n_bills=('n_bills', 'sum'),
        avg_controversy=('avg_controversy', 'mean'),
        total_versions=('total_versions', 'sum'),
        pass_rate=('pass_rate', 'mean'),
    )
    .assign(term='current')
)

In [389]:
latest_high_conflict = top_term_sum(lhc, year_fix=False).reset_index()

In [390]:
# Keep only the high-controversy / high-funding topics and present them with
# display-ready column names and rounding (pass rate shown as a percentage).
display_names = {
    'topic': 'Topic',
    'topic_amount': 'Total Funding',
    'funding_rank': 'Funding Percentile',
    'avg_controversy': 'Relative Controversy',
    'controversy_rank': 'Controversy Percentile',
    'pass_rate': 'Pass Rate',
}

latest_high_conflict = (
    latest_high_conflict
    .loc[
        latest_high_conflict['quadrant'] == 'High Controversy / High Funding',
        list(display_names),
    ]
    .rename(columns=display_names)
    .assign(**{'Pass Rate': lambda d: d['Pass Rate'] * 100})
    .round({
        'Funding Percentile': 2,
        'Controversy Percentile': 2,
        'Relative Controversy': 2,
        'Pass Rate': 1,
    })
)

In [391]:
# Named actors of the four supported types with a stance and influence value.
topic_players = actor_topic.loc[
    actor_topic['actor_type'].isin(['legislator_term', 'committee', 'donor', 'lobby_firm']),
    ['topic_id', 'actor_type', 'name', 'stance', "influence"],
].dropna(subset=['name', 'stance', "influence"])

In [392]:
# NOTE(review): this import sits mid-notebook; it belongs in the top import cell.
from sklearn.preprocessing import PowerTransformer

# Add a transformed `Influence` column (capital I) next to the raw `influence`,
# using PowerTransformer with its default settings fitted on all rows at once.
topic_players[['Influence']] = PowerTransformer().fit_transform(topic_players[['influence']].values)

In [393]:
# Mean and mean-absolute transformed influence per (topic, actor type, actor).
topic_key_players = (
    topic_players
    .groupby(['topic_id', 'actor_type', 'name'], as_index=False)
    .agg(
        avg_impact=('Influence', 'mean'),
        abs_impact=('Influence', lambda x: x.abs().mean()),
    )
)

In [394]:
# Dense rank (1 = strongest) of each actor within its topic and actor type.
impact_by_group = topic_key_players.groupby(['topic_id', 'actor_type'])['abs_impact']
topic_key_players['rank_in_topic'] = impact_by_group.rank(method='dense', ascending=False)

In [395]:
def _first_last(n):
    """Swap the first two parts of a name: "Last, First" -> "First Last".

    Names without a comma are split on whitespace and their first two tokens
    swapped. Parts are stripped (the old expression left a leading space on
    comma-form names), and a single-token name is returned as-is instead of
    raising IndexError.
    """
    parts = n.split(',') if ',' in n else n.split()
    if len(parts) < 2:
        return n.strip()
    return parts[1].strip() + ' ' + parts[0].strip()

funding['Name'] = funding['full_name'].apply(_first_last)

In [396]:
topic_term_summary = topic_term_summary.merge(topic_polarization, on=['topic', 'term'], how='left')

## Final Output

Each table below is written to `dashboard/shiny-app/data/<name>.parquet`:

- `county_funding`
- `county_term_funding`
- `county_top_funders`
- `legislator_funding`
- `legislator_power`
- `legislator_behavior`
- `bill_stats`
- `topic_term_summary`
- `funding`
- `latest_high_conflict`

In [397]:
# Write every dashboard table to parquet, echoing its name and columns.
outputs = {
    'county_funding': county_funding,
    'county_term_funding': county_term_funding,
    'county_top_funders': county_top_funders,
    'legislator_funding': legislator_funding,
    'legislator_power': legislator_power,
    'legislator_behavior': legislator_behavior,
    'bill_stats': bill_stats,
    'topic_term_summary': topic_term_summary,
    'funding': funding,
    "latest_high_conflict": latest_high_conflict,
}
for n, df in outputs.items():
    print(n)
    print('\n')
    print(df.columns.tolist())
    print('\n')
    df.to_parquet(f"dashboard/shiny-app/data/{n}.parquet")

county_funding


['county_id', 'county_name', 'total_amount']


county_term_funding


['county_name', 'term', 'total_amount']


county_top_funders


['county_id', 'county_name', 'funder', 'total_amount', 'top_supported_topics', 'top_opposed_topics', 'firm_total', 'region_concentration']


legislator_funding


['full_name', 'term', 'house', 'party', 'donations', 'lobbying', 'total_funding', 'funding_pct_overall', 'funding_pct_house', 'funding_rank_house', 'funding_tier']


legislator_power


['full_name', 'term', 'house', 'party', 'donations', 'lobbying', 'total_funding', 'funding_pct_overall', 'funding_pct_house', 'funding_rank_house', 'funding_tier', 'top_topics', 'topic_concentration', 'overall_influence', 'overall_influence_z', 'influence_l1', 'influence_l2', 'n_topics', 'influence_tier', 'name']


legislator_behavior


['full_name', 'term', 'yes_rate', 'procedural_exposure', 'vote_volatility', 'procedural_leverage']


bill_stats


['bill_ID', 'yes_rate', 'n_votes', 'term', 'vote', 