In [1]:
import gc, datetime, json, pickle, re
from pathlib import Path
from torch_geometric.transforms import ToUndirected, RemoveIsolatedNodes
from torch_scatter import scatter_mean
from pathlib import Path
import torch, numpy as np, pandas as pd
import pyarrow as pa, pyarrow.parquet as pq

# Paths are relative to the notebook's working directory.
# DATA_PATH and FLOAT are not referenced by the cells visible here;
# load_and_preprocess_data repeats the same path string as its default argument.
DATA_PATH = Path("../../../data3.pt")
EXPORT_PATH = Path("../../shiny/data")
OUT_PATH = EXPORT_PATH  # alias used by the export cells below
DEVICE = torch.device("cpu")
FLOAT = torch.float32

TODO:
- Identify the topic labels and map them onto every DataFrame
- Rename columns to match what the app expects
- Add more data to the app

In [2]:
# Node-id lookup tables, keyed by node type.
with open('../../../node_id_map.json', 'r') as map_file:
    node_id_map = json.load(map_file)

# Topic-cluster label for each bill id.
with open('../../../bill_labels_updated.json', 'r') as labels_file:
    topic_cluster_labels_dict = json.load(labels_file)

In [3]:
# Directory that holds the saved model outputs (distinct from OUT_PATH, the export directory).
OUT_DIR = Path("../../../")

# NOTE(review): torch.load unpickles the file — only load model_outputs.pt from a trusted source.
# `preds` is indexed below with the keys 'controversy', 'alignment' and 'influence'.
preds = torch.load(OUT_DIR / "model_outputs.pt", map_location=DEVICE)

In [4]:
def safe_normalize_timestamps(timestamps, eps=1e-8):
    """Rescale timestamps to [0, 1] over their 5th-95th percentile window.

    Non-finite entries are replaced first (NaN -> 0, +inf -> 1e4, -inf -> -1e4).
    Values outside the percentile window are clamped to it before scaling.
    Returns an all-zero tensor when the window is narrower than ``eps``.
    """
    cleaned = torch.nan_to_num(timestamps, nan=0.0, posinf=1e4, neginf=-1e4)
    lower = torch.quantile(cleaned, 0.05)
    upper = torch.quantile(cleaned, 0.95)
    span = upper - lower

    if span < eps:
        return torch.zeros_like(cleaned)

    scaled = (torch.clamp(cleaned, lower, upper) - lower) / span
    return torch.nan_to_num(scaled, nan=0.0)

def safe_standardize_time_format(time_data):
    """Convert a heterogeneous sequence of time values into a float32 tensor.

    Per element:
      * ``datetime.datetime`` -> its POSIX timestamp;
      * an int/float in [1900, 2100], or a numeric string/float strictly inside
        (1900, 2100) -> the POSIX timestamp of June 15 of that year;
      * another value strictly between 0 and 1990 -> kept as a float;
      * a value above 17,000,000 -> kept as a float
        (presumably already epoch seconds — TODO confirm);
      * any other numeric value -> multiplied by 1e9;
      * anything that cannot be interpreted -> the timestamp of 2000-06-15.
    """
    fallback = datetime.datetime(2000, 6, 15).timestamp()
    times = []
    for t in time_data:
        try:
            if isinstance(t, datetime.datetime):
                # Must be tested before any float(t) call: float() on a datetime
                # raises, which previously sent every datetime to the fallback.
                td = t.timestamp()
            elif isinstance(t, (int, float)) and 1900 <= t <= 2100:
                td = datetime.datetime(int(t), 6, 15).timestamp()
            elif isinstance(t, (str, float)) and 1900 < float(t) < 2100:
                td = datetime.datetime(int(float(t)), 6, 15).timestamp()
            elif 0 < float(t) < 1990:
                # float() so a numeric string does not end up in the tensor input.
                td = float(t)
            elif float(t) > 17000000.0:
                td = float(t)
            else:
                td = float(t) * 1e9
        except (TypeError, ValueError, OverflowError, OSError):
            # Unparseable value (or a date the platform cannot convert).
            td = fallback
        times.append(td)
    return torch.tensor(times, dtype=torch.float32)

def pull_timestamps(data):
    """Split the trailing feature column off as a normalized ``timestamp`` attribute.

    For each listed edge type the last column of ``edge_attr``, and for each
    listed node type the last column of ``x``, is treated as a timestamp. It is
    passed through ``safe_standardize_time_format`` when it contains a value
    with magnitude above 1e8 or any negative value, rescaled with
    ``safe_normalize_timestamps``, stored as ``.timestamp`` and removed from the
    feature matrix. Only 2-D matrices with more than one column are touched.

    Returns the same ``data`` object, modified in place.
    """
    timestamp_edges = [
        ('donor', 'donated_to', 'legislator_term'),
        ('legislator_term', 'rev_donated_to', 'donor'),
        ('lobby_firm', 'lobbied', 'legislator_term'),
        ('lobby_firm', 'lobbied', 'committee'),
        ('committee', 'rev_lobbied', 'lobby_firm'),
        ('legislator_term', 'rev_lobbied', 'lobby_firm'),
        ('bill_version', 'rev_voted_on', 'legislator_term'),
        ('legislator_term', 'voted_on', 'bill_version'),
    ]
    timestamp_nodes = ['legislator_term', 'bill_version', 'bill']

    for et in timestamp_edges:
        if hasattr(data[et], 'edge_attr') and data[et].edge_attr is not None and len(data[et].edge_attr.size()) > 1:
            if data[et].edge_attr.size(1) > 1:
                edge_attr = data[et].edge_attr
                ts_col = edge_attr[:, -1]
                if ts_col.abs().max() > 1e8 or ts_col.min() < 0:
                    ts_col = safe_standardize_time_format(ts_col.tolist()).to(edge_attr.device)
                data[et].timestamp = safe_normalize_timestamps(ts_col)
                data[et].edge_attr = edge_attr[:, :-1]

    for nt in timestamp_nodes:
        if hasattr(data[nt], 'x') and data[nt].x is not None:
            try:
                if len(data[nt].x.size()) > 1:
                    if data[nt].x.size(1) > 1:
                        x = data[nt].x
                        ts_col = x[:, -1]
                        if ts_col.abs().max() > 1e8 or ts_col.min() < 0:
                            ts_col = safe_standardize_time_format(ts_col.tolist()).to(x.device)
                        # The original guarded these two lines with
                        # `nt in timestamp_nodes or ...`, which is always true here.
                        data[nt].timestamp = safe_normalize_timestamps(ts_col)
                        data[nt].x = x[:, :-1]
            except Exception as exc:
                # Still best-effort (the node type is left untouched), but no
                # longer silent, and KeyboardInterrupt/SystemExit propagate.
                print(f"pull_timestamps: skipped node type {nt!r}: {exc!r}")
    return data

def clean_features(data):
    """Standardize every node type's feature matrix, then extract timestamps.

    For each node type that has an ``x``: cast to float32, replace non-finite
    values (NaN -> 0, +inf -> 1e4, -inf -> -1e4), z-score each column (std
    floored at 1e-5) and clamp to [-10, 10]. The per-column mean and std are
    stored as ``x_mean`` / ``x_std``. Node types without ``x`` are skipped
    instead of raising, matching the guard in ``load_and_preprocess_data``.

    NOTE(review): standardization runs before ``pull_timestamps``, so the last
    column that function reads is already z-scored and clamped, and
    ``x_mean`` / ``x_std`` keep one more column than the final ``x`` — confirm
    this ordering is intended.

    Returns the same ``data`` object, modified in place.
    """
    for nt in data.node_types:
        if not hasattr(data[nt], 'x') or data[nt].x is None:
            continue
        x = torch.as_tensor(data[nt].x, dtype=torch.float32)
        x = torch.nan_to_num(x.float(), nan=0.0, posinf=1e4, neginf=-1e4)
        mean = x.mean(0, keepdim=True)
        std = x.std(0, keepdim=True).clamp(min=1e-5)
        data[nt].x = ((x - mean) / std).clamp(-10, 10)
        data[nt].x_mean = mean
        data[nt].x_std = std
    return pull_timestamps(data)

def compute_controversiality(data):
    """Attach a vote-split ``controversy`` score to bill versions and bills.

    For every bill version, incoming 'voted_on' edges whose first edge attribute
    is > 0 count as yes votes and those with a value <= 0 as no votes. The score
    is ``4 * p_yes * p_no`` clamped to [0, 1], so it is highest for an even split
    and 0 for a unanimous (or absent) vote. Bill scores are the mean over their
    versions, taken along the 'is_version' edges.

    Raises ValueError when either edge type is missing. Returns ``data``,
    modified in place.
    """
    vote_et = ('legislator_term', 'voted_on', 'bill_version')
    if vote_et not in data.edge_index_dict:
        raise ValueError("Missing 'voted_on' edges in data.")

    signal = data[vote_et].edge_attr[:, 0]
    bill_versions = data[vote_et].edge_index[1]
    n_versions = data['bill_version'].num_nodes

    # Per-bill-version tallies of yes / no votes.
    yes = torch.zeros(n_versions, device=signal.device)
    no = torch.zeros(n_versions, device=signal.device)
    yes.index_add_(0, bill_versions, (signal > 0).float())
    no.index_add_(0, bill_versions, (signal <= 0).float())

    total = yes + no + 1e-6  # epsilon avoids 0/0 for versions with no votes
    controversy_bv = (4 * (yes / total) * (no / total)).clamp(0, 1)
    data['bill_version'].controversy = controversy_bv

    version_et = ('bill_version', 'is_version', 'bill')
    if version_et not in data.edge_index_dict:
        raise ValueError("Missing 'is_version' edges for bill aggregation.")

    version_idx, bill_idx = data.edge_index_dict[version_et]
    data['bill'].controversy = scatter_mean(
        controversy_bv[version_idx], bill_idx, dim=0, dim_size=data['bill'].num_nodes
    )

    return data


def load_and_preprocess_data(path='../../../data3.pt'):
    """Load the saved heterogeneous graph and run the preprocessing pipeline.

    Steps, in order:
      1. flatten each node type's ``x`` to 2-D and set ``num_nodes`` from it;
      2. raise ``num_nodes`` wherever an edge index refers past it;
      3. zero negative bill labels and cast them to float32;
      4. add reverse edges (``ToUndirected(merge=False)``) and drop isolated nodes;
      5. standardize features, extract timestamps, compute controversy;
      6. attach ``node_id = arange(num_nodes)`` to every node type;
      7. cast any remaining float64 tensors to float32.

    Returns the processed graph object.
    """
    # NOTE(review): weights_only=False unpickles arbitrary objects — only load trusted files.
    full_data = torch.load(path, weights_only=False)
    for nt in full_data.node_types:
        if hasattr(full_data[nt], 'x') and full_data[nt].x is not None:
            flat = torch.as_tensor(full_data[nt].x).flatten(start_dim=1)
            full_data[nt].x = flat
            full_data[nt].num_nodes = flat.size(0)

    for edge_type, edge_index in full_data.edge_index_dict.items():
        src_type, _, dst_type = edge_type
        max_src_idx = edge_index[0].max().item() if edge_index.size(1) > 0 else -1
        max_dst_idx = edge_index[1].max().item() if edge_index.size(1) > 0 else -1
        if max_src_idx >= full_data[src_type].num_nodes:
            print(f"Fixing {src_type} node count: {full_data[src_type].num_nodes} -> {max_src_idx + 1}")
            full_data[src_type].num_nodes = max_src_idx + 1

        if max_dst_idx >= full_data[dst_type].num_nodes:
            print(f"Fixing {dst_type} node count: {full_data[dst_type].num_nodes} -> {max_dst_idx + 1}")
            full_data[dst_type].num_nodes = max_dst_idx + 1
    # Negative labels are mapped to 0 before the cast.
    full_data['bill'].y[np.where(full_data['bill'].y < 0)[0]] = 0
    full_data['bill'].y = torch.as_tensor(full_data['bill'].y, dtype=torch.float32)

    data = ToUndirected(merge=False)(full_data)
    del full_data
    gc.collect()
    data = RemoveIsolatedNodes()(data)
    data = compute_controversiality(clean_features(data))

    for nt in data.node_types:
        # Created on the default device. The original hard-coded device='mps',
        # which raises on machines without that backend and is the only tensor
        # in this pipeline placed there.
        data[nt].node_id = torch.arange(data[nt].num_nodes)
    for store in data.stores:
        for key, value in store.items():
            if isinstance(value, torch.Tensor) and value.dtype == torch.float64:
                store[key] = value.float()

    return data

# Build the preprocessed graph from the function's default path.
data = load_and_preprocess_data()

In [5]:
# `key` maps each bill's n_id to its node_id; the topic-cluster labels are then
# re-keyed by node_id for every label whose bill id is present in the graph.
key1 = data['bill'].n_id.tolist()
key2 = data['bill'].node_id.tolist()
key = dict(zip(key1, key2))

_matched = [
    (key[bill_nid], lab)
    for bill_nid, lab in topic_cluster_labels_dict.items()
    if bill_nid in key
]
cluster_bill = dict(_matched)
nids = [node for node, _ in _matched]

## Bills

In [6]:
# The context manager closes the file handle; the original left it open until
# garbage collection.
# NOTE(review): pickle executes code on load — only read trusted files.
with open('../../../bill_dates_map.pkl', 'rb') as dates_file:
    bv_ts = pickle.load(dates_file)

In [7]:
# Invert node_id_map['bill_version'] so that its values become the lookup keys.
bv_ids = {v: k for k, v in node_id_map['bill_version'].items()}

In [8]:
# First edge type that points from bill versions to bills.
v2b_edge = [
    et for et in data.edge_types
    if et[0] == "bill_version" and et[2] == "bill"
][0]
src, dst = data[v2b_edge].edge_index.numpy()

# One row per edge: bill-version node index, the parent bill's n_id, and the
# version's identifier recovered through bv_ids.
bv_df = pd.DataFrame({"bill_version": src, "bill_id": data['bill'].n_id[dst]})
bv_df = bv_df.assign(bill_version_id=bv_df['bill_version'].map(bv_ids))

In [9]:
# Transpose the dates map so each key becomes a row, name the key column
# bill_id, and keep only bills that appear in bv_df.
bill_dates = pd.DataFrame(bv_ts).T.reset_index().rename(columns={'index': 'bill_id'})
bill_dates = bill_dates.loc[bill_dates['bill_id'].isin(bv_df['bill_id'].unique())]

In [10]:
# Predicted controversy, one row per key of preds['controversy'] (stored as bill_node);
# bill_id is recovered by inverting `key` (n_id -> node_id).
controversy_df = pd.DataFrame.from_dict(preds['controversy'], orient='index', columns=['controversy']).reset_index().rename(columns={'index': 'bill_node'})
controversy_df['bill_id'] = controversy_df['bill_node'].map({v: k for k, v in key.items()})

In [11]:
# Label tensor `y` of every bill node, paired with the bill's n_id.
outcome_df = pd.DataFrame({
    'bill_id': data['bill'].n_id,
    'outcome': data['bill'].y
})

In [12]:
# Right join: every bill in outcome_df is kept, with NaN controversy where no prediction exists.
bills = controversy_df.merge(outcome_df, on='bill_id', how='right')

In [13]:
# Attach the topic-cluster label by bill id (NaN where the bill has no label).
bills['topic_cluster'] = bills['bill_id'].map(topic_cluster_labels_dict)

In [14]:
# Time between first and last action; a later cell reads `.dt.days`, so the
# two columns are expected to be datetimes — TODO confirm.
bill_dates['longevity'] = bill_dates['Last_action'] - bill_dates['First_action']

In [15]:
# Left join keeps every bill, with missing dates where bill_dates has no row.
bill_df = bills.merge(bill_dates, on='bill_id', how='left')

In [16]:
# bill_labels_updated.json was already parsed into topic_cluster_labels_dict in
# an earlier cell; reuse it instead of reading and decoding the file again.
bill_subjects = np.array(list(topic_cluster_labels_dict.keys()))

In [17]:
# The context manager closes the file handle, which the original left open.
# NOTE(review): pickle executes code on load — only read trusted files.
with open('../../../subjects_original.pkl', 'rb') as subjects_file:
    subject_originals = pickle.load(subjects_file)

In [18]:
# Per-bill subject keys; the values are looked up in subject_originals below.
with open('../../../bill_subjects.json', 'r') as f:
    bill_subjects_dict = json.load(f)

In [240]:
# Map each key of bill_subjects_dict to subject_originals[value], skipping
# values that have no entry in subject_originals.
so = {k: subject_originals[v] for k, v in bill_subjects_dict.items() if v in subject_originals}

In [27]:
# Cluster id -> human-readable label, taken from the 'cluster' and 'Label' columns.
sampled_labels = pd.read_csv('../../../sampled_labels - sampled_labels.csv')
sol = dict(zip(sampled_labels['cluster'], sampled_labels['Label']))

In [29]:
# Keep the cluster id as topic_id and add its readable label as topic.
bill_df = bill_df.rename(columns={'topic_cluster': 'topic_id'})
bill_df['topic'] = bill_df['topic_id'].map(sol)

In [31]:
# Bills without a controversy prediction are given 0.
bill_df['controversy'] = bill_df['controversy'].fillna(0)

## Topics

In [32]:
# Bills that have a topic; `term` is the first four characters of the bill id, as an int.
topics = bill_df[bill_df['topic_id'].notna()].copy()
topics['term'] = topics['bill_id'].str[:4].astype(int)

In [33]:
# Per (term, topic): share of bills with outcome == 1, mean controversy,
# number of distinct bills, and mean longevity.
topics_df = (
    topics.groupby(['term', 'topic'])
    .agg({
        'outcome': lambda x: (x == 1).sum() / len(x),
        'controversy': 'mean',
        'bill_id': 'nunique',
        'longevity': 'mean',
    })
    .reset_index()
)

In [47]:
# Distinct (topic, topic_id) pairs ordered by topic_id; `tid` is the row position.
# topic_columns supplies the column names for the alignment tables built below.
# NOTE(review): bills without a topic contribute a NaN pair that drop_duplicates
# keeps, so topic_columns may contain NaN — confirm.
tids = bill_df[['topic', 'topic_id']].drop_duplicates().sort_values(by='topic_id').reset_index(drop=True).reset_index(names='tid')
topic_columns = tids['topic'].values.tolist() + ['Miscellaneous']

In [48]:
def actor_table(actor, key):
    """Turn ``preds[key][actor]`` into a DataFrame with an ``<actor>_id`` column.

    For ``key == 'alignment'`` the value columns are named after
    ``topic_columns``; for any other key there is a single ``influence`` column.
    """
    value_columns = topic_columns if key == 'alignment' else ['influence']
    table = pd.DataFrame.from_dict(preds[key][actor], orient='index', columns=value_columns)
    return table.reset_index().rename(columns={'index': f'{actor}_id'})

# Alignment and influence tables for each of the four actor types in preds.
leg_align = actor_table('legislator', 'alignment')
leg_inf = actor_table('legislator', 'influence')
donor_align = actor_table('donor', 'alignment')
donor_inf = actor_table('donor', 'influence')
lobby_align = actor_table('lobby_firm', 'alignment')
lobby_inf = actor_table('lobby_firm', 'influence')
committee_align = actor_table('committee', 'alignment')
committee_inf = actor_table('committee', 'influence')

In [54]:
def add_top_n(df, n=5):
    """Return, for each row of ``df``, the names of its ``n`` largest topic columns.

    The columns considered are ``topic_columns`` minus 'Miscellaneous' and
    'Education Finance'. The result is a Series of NumPy arrays of column names.
    """
    excluded = ('Miscellaneous', 'Education Finance')
    ranked_columns = [topic for topic in topic_columns if topic not in excluded]

    def _top_labels(row):
        return row.nlargest(n).index.values

    return df[ranked_columns].apply(_top_labels, axis=1)
# Add the top-5 topic names to every alignment table.
for _align in (leg_align, donor_align, lobby_align, committee_align):
    _align['top_topics'] = add_top_n(_align, 5)

In [None]:
def edge_year(ts_tensor):
    """Interpret a tensor as epoch seconds and return the calendar years as int16."""
    seconds = ts_tensor.cpu().numpy()
    years = pd.to_datetime(seconds, unit="s").year
    return years.astype(np.int16)

In [77]:
# NOTE(review): legislator_terms.csv is also *written* by a later cell of this
# notebook, so this read depends on the output of a previous run.
politicians = pd.read_csv(OUT_PATH / 'legislator_terms.csv')
lobbying = pd.read_csv('../../../calaccess/lobbying_clean2.csv', dtype={'PAYEE_NAMS': str, 'BAKREF_TID': str})
expend_assembly = pd.read_csv('../../../calaccess/expend_assembly_matched.csv', dtype={'TargetPropositionName': str})
expend_senate = pd.read_csv('../../../calaccess/expend_senate_matched.csv', dtype={'TargetPropositionName': str})
lobbying['expn_date'] = pd.to_datetime(lobbying['EXPN_DATE'])
lobbying['term'] = lobbying['expn_date'].dt.year.astype(int)
# Manual correction for rows dated after 2025.
# NOTE(review): assigning a two-element list only works when exactly two rows match.
lobbying.loc[lobbying['expn_date'].dt.year > 2025, 'term'] = [2022, 2014]

In [78]:
# Even years 2000-2024 are moved onto odd years: expenditures before November
# go to the previous year, those from November onward to the next year.
_even_years = list(range(2000, 2026, 2))
_month = lobbying['expn_date'].dt.month
lobbying.loc[lobbying['term'].isin(_even_years) & (_month < 11), 'term'] -= 1
lobbying.loc[lobbying['term'].isin(_even_years) & (_month >= 11), 'term'] += 1

In [79]:
# Total lobbying AMOUNT per (beneficiary, term).
lob = lobbying.groupby(['clean_beneficiary', 'term']).agg({'AMOUNT': 'sum'}).reset_index()

In [80]:
# Keep only rows whose term is a string; .copy() so the column assignments
# below act on an independent frame rather than on a slice.
expend_assembly = expend_assembly.loc[expend_assembly['term'].apply(lambda x: isinstance(x, str))].copy()
# `year` is the part of the term string before the first '-'.
expend_assembly['year'] = expend_assembly['term'].apply(lambda x: int(str(x).split('-')[0]))
# Move even years back to the preceding odd year. The original tested
# `year // 2 == 0` (floor division), which is only true for years 0 and 1, so
# the adjustment never fired; a parity test needs the modulo operator.
_even_year = expend_assembly['year'] % 2 == 0
expend_assembly.loc[_even_year, 'year'] -= 1
# NOTE(review): identical (Amount, year, target) rows are dropped before summing
# here, but the senate aggregation does not deduplicate — confirm which is intended.
exp_as = expend_assembly[['Amount', 'year', 'matched_target_name']].drop_duplicates().groupby(['matched_target_name', 'year']).agg({'Amount': 'sum'}).reset_index().rename(columns={'year': 'term'})

In [81]:
# `year` is the part of the term string before the first '-'.
expend_senate['year'] = expend_senate['term'].apply(lambda x: int(x.split('-')[0]))
# Move even years back to the preceding odd year. The original tested
# `year // 2 == 0` (floor division), which is only true for years 0 and 1, so
# the adjustment never fired; a parity test needs the modulo operator.
_even_year = expend_senate['year'] % 2 == 0
expend_senate.loc[_even_year, 'year'] -= 1
exp_sen = expend_senate.groupby(['matched_target_name', 'year']).agg({'Amount': 'sum'}).reset_index().rename(columns={"year": 'term'})

In [84]:
# Lower-cased full name, used for matching against lobbying beneficiaries.
politicians['lower'] = politicians['full_name'].str.lower()

In [92]:
def name_swap(name):
    """Lower-case ``name`` and move its last space-separated word to the front.

    'John Smith' -> 'smith john'. A single-word name comes back followed by a
    trailing space, because the (empty) remainder is still joined on.
    """
    *leading, last = name.lower().split(' ')
    return f"{last} {' '.join(leading)}"

politicians['name2'] = politicians['full_name'].apply(name_swap)
# Where the swapped name equals a lower-cased full name that also occurs among
# the lobbying beneficiaries, use the row's own lower-cased name instead.
# The beneficiary set and the row mask are each computed once; the original
# rebuilt unique() for every name and evaluated the whole mask twice.
_beneficiaries = set(lob['clean_beneficiary'].unique())
_known_lower = [p for p in politicians['lower'].unique() if p in _beneficiaries]
_swap_back = politicians['name2'].isin(_known_lower)
politicians.loc[_swap_back, 'name2'] = politicians.loc[_swap_back, 'lower']

In [None]:
# Attach lobbying totals by (term, name); the trailing underscore keeps the new
# column distinct from the total_lobbying column dropped in a later cell.
pl = politicians.merge(lob, left_on=['term', 'name2'], right_on=['term', 'clean_beneficiary'], how='left').rename(columns={'AMOUNT': 'total_lobbying_'})

In [97]:
# Match key: the target name lower-cased with commas removed.
exp_as['name2'] = exp_as['matched_target_name'].str.lower().str.replace(',', '', regex=False)
pld = (
    pl.merge(exp_as, on=['term', 'name2'], how='left')
    .rename(columns={'Amount': 'total_donations_'})
)

In [100]:
# Match key: the target name lower-cased with commas removed.
exp_sen['name2'] = exp_sen['matched_target_name'].str.lower().str.replace(',', '', regex=False)

In [101]:
# Add the senate totals, then combine assembly and senate donations (NaN counts as 0).
pldd = pld.merge(exp_sen, on=['term', 'name2'], how='left')
pldd['total_donations_'] = pldd[['total_donations_', 'Amount']].sum(skipna=True, axis=1)
# Replace the totals that came in with the politicians CSV by the freshly computed ones.
pldd = pldd.drop(columns=['total_donations', 'total_lobbying', 'Amount', 'total_received']).rename(columns={'total_donations_': 'total_donations', 'total_lobbying_': 'total_lobbying'})

In [102]:
# Treat a missing component as 0. total_lobbying is NaN for legislators with no
# lobbying match after the left merge; the original sum propagated that NaN and
# the fillna(0) in the next cell then reported a total of 0 despite donations.
pldd['total_received'] = pldd['total_donations'].fillna(0) + pldd['total_lobbying'].fillna(0)

In [103]:
# Missing totals become 0.0.
_money_cols = ['total_donations', 'total_lobbying', 'total_received']
pldd[_money_cols] = pldd[_money_cols].fillna(0).astype(float)

In [105]:
# Alignment scores plus influence, one row per legislator_id.
leg_df = leg_align.merge(leg_inf, on='legislator_id', how='left')

In [106]:
# The context manager closes the file handle, which the original left open.
# NOTE(review): pickle executes code on load — only read trusted files.
with open('../../../legislators.pkl', 'rb') as legislators_file:
    legislators = pickle.load(legislators_file)

# Invert node_id_map['legislator_term'] so that its values become the lookup keys.
leg_ids = {v: k for k, v in node_id_map['legislator_term'].items()}

def leg_term_to_name(leg_term_id):
    """Look up the legislator for an ``'<number>_<term>'`` key.

    The part before the first underscore is converted to int and looked up in
    ``legislators``. Returns None for non-string input or an unknown number.
    """
    if not isinstance(leg_term_id, str):
        return None
    legislator_num = int(leg_term_id.split('_')[0])
    return legislators.get(legislator_num, None)

def leg_term_to_term(leg_term_id):
    """Extract the first year of the term from an ``'<number>_<term>'`` key.

    The second underscore-separated field is split on '-' and its first part is
    returned as an int. Returns None for non-string input, for a key without an
    underscore (the original raised IndexError) and for an empty term field.
    """
    if not isinstance(leg_term_id, str):
        return None
    parts = leg_term_id.split('_')
    if len(parts) < 2 or not parts[1]:
        return None
    return int(parts[1].split('-')[0])

# Resolve each legislator_id to its key once, then derive name and term from it.
_leg_keys = leg_df['legislator_id'].astype(int).map(leg_ids)
leg_df['legislator'] = _leg_keys.apply(leg_term_to_name)
leg_df['term'] = _leg_keys.apply(leg_term_to_term)

In [107]:
# NOTE(review): this rebinds `politicians`, which until here held the frame read
# from legislator_terms.csv; re-running earlier cells after this one will see a
# different table.
politicians = pd.read_csv("../../../ca_leg/legislation_data/politicians.csv")

In [108]:
# Distinct (name, term) pairs with a missing district number, filled in by hand.
# NOTE(review): the list must have exactly as many entries as `fix` has rows
# (12) and in the same order; a change in the CSV will break or misassign this.
fix = politicians.loc[politicians['District No.'].isna(), ['full_name', 'Term']].drop_duplicates()
fix['District No.'] = [51, 51, 51, 51, 57, 57, 57, 57, 36, 36, 36, 6]

In [109]:
# Write each hand-entered district number back to every matching (name, term) row.
for i, row in fix.iterrows():
    politicians.loc[(politicians['full_name'] == row['full_name']) & (politicians['Term'] == row['Term']), 'District No.'] = row['District No.']

In [145]:
# One row per distinct (district, term, name, chamber, party).
pol = politicians[['District No.', 'Term', 'full_name', 'chamber', 'Party']].drop_duplicates()
# `term` is the part of Term before the first '-', as an int.
pol['term'] = pol['Term'].apply(lambda x: x.split('-')[0]).astype(int)
# "Last, First" -> "First Last"; names without a comma are left unchanged.
pol['full_name'] = pol['full_name'].apply(lambda x: (x.split(',')[1] + ' ' + x.split(',')[0]).strip() if ',' in x else x)

In [147]:
# Attach the funding totals to every politician row by (full_name, term).
lfund = pol.merge(pldd, on=['full_name', 'term'], how='left')

In [149]:
import geopandas as gpd
import tempfile, zipfile, pathlib

In [150]:
def read_zip(zip_path, crs=3857):
    """Extract a zipped shapefile and load it reprojected to EPSG:3857.

    The archive is unpacked into a temporary directory, the first ``*.shp``
    found (searching recursively) is read, labelled with EPSG ``crs`` via
    ``set_crs`` and converted to EPSG:3857.

    Returns ``(gdf, tmp)`` where ``tmp`` is the TemporaryDirectory object, handed
    back so the caller decides how long the extracted files stay around.
    """
    tmp = tempfile.TemporaryDirectory()
    extract_root = pathlib.Path(tmp.name)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(tmp.name)
    shapefile = next(extract_root.rglob("*.shp"))
    labelled = gpd.read_file(shapefile).set_crs(epsg=crs)
    return labelled.to_crs(epsg=3857), tmp

def district_cycle(year):
    """Return the redistricting-cycle label for ``year``.

    Years up to 2012 give "2001", years up to 2022 give "2011", later years
    give "current".
    """
    for last_year, label in ((2012, "2001"), (2022, "2011")):
        if year <= last_year:
            return label
    return "current"

# read_zip is called with its default crs=3857, so the county coordinates are
# labelled EPSG:3857 as they are — TODO confirm the file really uses that CRS.
counties_gdf, _ = read_zip('../data/ca_counties.zip')
counties_gdf = counties_gdf[['COUNTYFP', 'NAMELSAD', 'geometry']]
# Area in squared CRS units; used as the denominator of the county weights below.
counties_gdf['county_area'] = counties_gdf.geometry.area
counties_gdf['county_id'] = counties_gdf['COUNTYFP'].astype(int)

In [151]:
# Export the county shapes as GeoJSON, converted to WGS84 and with null properties dropped.
cgdf = counties_gdf.to_json(na='drop', to_wgs84=True)
with open(OUT_PATH / 'counties.geojson', 'w') as f:
    f.write(cgdf)

In [152]:
data_dir = pathlib.Path('../data')

# Zipped district shapefiles for the two cycles used below.
asm11_zip = data_dir / '2011_assembly_state_shp.zip'
sen11_zip = data_dir / '2011_senate_state_shp.zip'
asmcur_zip = data_dir / '2021_AD_Final_shp.zip'
sencur_zip = data_dir / '2021_SD_Final_shp.zip'

# (zip path, house, cycle label, EPSG code passed to read_zip as the source CRS)
dist_info = [
    (asm11_zip, "assembly", "2011", 4019),
    (sen11_zip, "senate",   "2011", 4019),
    (asmcur_zip, "assembly","current", 4269),
    (sencur_zip, "senate",  "current", 4269)
]

weight_records = []
tmps = []  # holds the TemporaryDirectory objects returned by read_zip
for zp, house, cycle, crs in dist_info:
    gdf, tmp = read_zip(zp, crs)
    tmps.append(tmp)
    # NOTE(review): assumes the first column of every shapefile is the district number.
    gdf = gdf.rename(columns={gdf.columns[0]: "district_id"})[["district_id", "geometry"]]
    gdf["house"] = house
    gdf["cycle"] = cycle
    gdf["dist_area"] = gdf.geometry.area

    # One row per district/county overlap polygon.
    inter = gpd.overlay(gdf, counties_gdf, how="intersection")
    inter["fragment_area"] = inter.geometry.area

    weight_records.append(
        inter[["house", "cycle", "district_id", "county_id", "fragment_area", 'county_area', 'dist_area']].reset_index(drop=True)
    )

weights = pd.concat(weight_records, ignore_index=True)

In [153]:
# Share of each county's area covered by the district fragment.
# NOTE(review): the funding columns later multiplied by this weight are district
# totals; splitting a district total across counties would normally use
# fragment_area / dist_area (dist_area is carried along but never used) —
# confirm which denominator is intended.
weights['weight'] = weights['fragment_area'] / weights['county_area']

In [154]:
# Normalise district numbers to int: stringify, strip whitespace, parse as float, cast.
lfund['District No.'] = lfund['District No.'].astype(str).apply(lambda x: re.sub(r'\s', '', x)).astype(float).astype(int)

In [155]:
from statistics import mode
from collections import defaultdict
import ast

In [156]:
# Collect every parsed top topic per (Term, district, chamber), then keep the
# most frequent one.
term_topics = defaultdict(list)
for _, row in lfund.iterrows():
    try:
        parsed_topics = ast.literal_eval(row['top_topics'])
        for t in parsed_topics:
            term_topics[(row['Term'], row['District No.'], row['chamber_x'])].append(t)
    except (ValueError, SyntaxError, TypeError):
        # top_topics is missing or not a parseable/iterable literal: skip the row.
        # Only these parsing errors are swallowed; the original bare `except`
        # would also have hidden e.g. a missing column.
        pass
term_topics_ = {k: mode(v) for k, v in term_topics.items()}

In [157]:
# Funding totals per (Term, district, chamber); top_topics becomes the list of
# the group's individual values.
lfund_ = (
    lfund.groupby(['Term', 'District No.', 'chamber_x'])
    .agg({
        'total_donations': 'sum',
        'total_lobbying': 'sum',
        'total_received': 'sum',
        'top_topics': lambda x: list(x),
    })
    .reset_index()
)
# NOTE(review): this cutoff ('2011' up to 2012, otherwise 'current') differs from
# district_cycle(), which returns "2001" up to 2012 and "2011" up to 2022 —
# confirm which mapping the weights merge should use.
_start_year = lfund_['Term'].str.split('-').str[0].astype(int)
lfund_['cycle'] = np.where(_start_year <= 2012, '2011', 'current')

In [158]:
# One row per (district, county fragment); districts without a weights match
# keep NaN in the county columns.
reg_funds = lfund_.merge(
    weights,
    left_on=['cycle', 'District No.', 'chamber_x'],
    right_on=['cycle', 'district_id', 'house'],
    how='left',
)

# Scale each district total by the fragment weight.
for _col in ('total_donations', 'total_lobbying', 'total_received'):
    reg_funds[_col] = reg_funds[_col] * reg_funds['weight']

In [159]:
# Most frequent top topic per county. Only the first top_topics entry of each
# district group is parsed.
county_topics = defaultdict(list)
for _, row in reg_funds.iterrows():
    first_entry = row['top_topics'][0]
    # Missing entries (None / NaN) are skipped explicitly. The original compared
    # against `[np.nan]` and otherwise relied on literal_eval raising on a
    # float, hidden by a bare `except`.
    if not isinstance(first_entry, str):
        continue
    try:
        parsed_topics = ast.literal_eval(first_entry)
        for t in parsed_topics:
            if t not in ['Extraordinary Sessions', 'Health Facilities']:
                county_topics[row['county_id']].append(t)
    except (ValueError, SyntaxError, TypeError):
        # Not a parseable/iterable literal: skip this row.
        pass
county_topics_ = {k: mode(v) for k, v in county_topics.items()}

In [161]:
# Weighted funding summed per (county, house).
reg_funds_ = reg_funds.groupby(['county_id', 'house']).agg({
    'total_donations': 'sum',
    'total_lobbying': 'sum',
    'total_received': 'sum'
}).reset_index()

In [164]:
# Attach each county's most frequent topic (NaN where none was collected).
reg_funds_['topic'] = reg_funds_['county_id'].map(county_topics_)

In [165]:
# Join the county geometry back on and export it.
# NOTE(review): counties_gdf is in EPSG:3857 and nothing here reprojects it,
# whereas counties.geojson is written with to_wgs84=True — confirm the app
# expects this file in projected coordinates.
co_cal = reg_funds_.merge(counties_gdf, on='county_id', how='left')
gpd.GeoDataFrame(co_cal, geometry='geometry').to_file(OUT_PATH / 'ca_legislator_funding.geojson', driver='GeoJSON')

In [166]:
# Same table without geometry, as CSV.
reg_funds_.to_csv(OUT_PATH / 'ca_legislator_funding.csv', index=False)

## Legislators

In [169]:
# Authorship edges: legislator-term node -> bill-version node, with the first
# edge attribute as the author type code.
ei = data[("legislator_term","wrote","bill_version")].edge_index.numpy()
ea = data[("legislator_term","wrote","bill_version")].edge_attr.numpy()
# Note: the column named bill_id holds bill-*version* node indices (ei[1]).
author_edge = pd.DataFrame({"legterm_id": ei[0], "bill_id": ei[1], "type": ea[:,0]})
# NOTE(review): `timestamp` comes out of safe_normalize_timestamps, i.e. it is
# scaled to [0, 1]; reading it as epoch seconds below gives dates at the start
# of 1970 — confirm this is the intended date source.
author_edge['date'] = data["bill_version"].timestamp.numpy()[author_edge.bill_id]
author_edge.loc[author_edge.date == 0, 'date'] = datetime.datetime(2000, 6, 15).timestamp()
author_edge['date'] = pd.to_datetime(author_edge['date'], unit='s')

# Version -> bill edges with the parent bill's outcome label.
eib = data[('bill_version','is_version', 'bill')].edge_index.numpy()
eib = pd.DataFrame({"src": eib[0], "dst": eib[1], 'outcome': data['bill'].y[eib[1]]})
eib['src'] = eib['src'].astype(int)
eib['dst'] = eib['dst'].astype(int)
author_edge['bill_id'] = author_edge['bill_id'].astype(int)

# Inner join drops authorship edges whose version has no parent bill edge.
author_edge = author_edge.merge(eib, left_on='bill_id', right_on='src', how='inner')
# Binary outcome: 1 only where the label equals 1.
author_edge['outcome'] = (author_edge['outcome'] == 1).astype(int)
author_levels = {1: 'COAUTHOR', 2: 'PRINCIPAL_COAUTHOR', 3: 'LEAD_AUTHOR'}
author_edge['author_type'] = author_edge['type'].map(author_levels)

In [171]:
# Vote edges (bill version -> legislator term) with the first edge attribute as
# the vote signal, joined to the parent bill's outcome and to the version/bill ids.
ve = data[('bill_version', 'rev_voted_on', 'legislator_term')].edge_index.numpy()
va = data[('bill_version', 'rev_voted_on', 'legislator_term')].edge_attr.numpy()
vote_edge = pd.DataFrame({'bill_version': ve[0], 'legislator_term': ve[1], 'vote_signal': va[:, 0]})
vote_edge = vote_edge.merge(eib, left_on='bill_version', right_on='src', how='left').merge(bv_df, on='bill_version', how='left')

In [172]:
# Resolve each legislator-term node to the legislator's name and the term's first year.
vote_edge['full_name'] = vote_edge['legislator_term'].map(leg_ids).apply(leg_term_to_name)
vote_edge['term'] = vote_edge['legislator_term'].map(leg_ids).apply(leg_term_to_term)

In [173]:
# Per bill: the maximum outcome and the share of votes with a positive signal.
signals = vote_edge.groupby('bill_id').agg({'outcome': 'max', 'vote_signal': lambda x: (x > 0).sum() / len(x)})
# Override: a bill with outcome 0 but a 100% positive share has its share reset to 0.
signals.loc[(signals['outcome'] == 0.0) & (signals['vote_signal'] == 1.0), 'vote_signal'] = 0.0

In [181]:
# Per legislator term: mean outcome of authored versions, number of lead-author
# edges, and number of distinct bill versions.
a3 = (
    author_edge.merge(bv_df, left_on='bill_id', right_on='bill_version', how='left')
    .groupby('legterm_id')
    .agg({
        'outcome': 'mean',
        'author_type': lambda x: (x == 'LEAD_AUTHOR').sum(),
        'bill_version': 'nunique',
    })
    .reset_index()
)

_a3_keys = a3['legterm_id'].map(leg_ids)
a3['full_name'] = _a3_keys.apply(leg_term_to_name)
a3['term'] = _a3_keys.apply(leg_term_to_term)

In [182]:
# "Last, First" -> "First Last"; non-strings and names without a comma pass through.
a3['full_name'] = a3['full_name'].apply(lambda x: (x.split(',')[1] + ' ' + x.split(',')[0]).strip() if isinstance(x, str) and ',' in x else x)

In [183]:
# Attach funding and party data by (full_name, term); overlapping column names get _x/_y suffixes.
a4 = a3.merge(lfund, on=['full_name', 'term'], how='left')

In [185]:
# Export the per-legislator-term table, taking the a3 (left, `_x`) version of every overlapping column.
# NOTE(review): this is the same file the funding section reads at its start, so
# the notebook consumes its own previous output.
a4[['outcome_x', 'author_type_x', 'bill_version_x', 'top_topics',  'full_name', 'term', 'total_donations', 'total_lobbying', 'total_received', 'Party_x', 'chamber_x']].rename(columns={'outcome_x': 'outcome', 'author_type_x': 'author_type', 'bill_version_x': 'bill_versions', 'Party_x': 'Party', 'chamber_x': 'chamber'}).to_csv(OUT_PATH / 'legislator_terms.csv', index=False)

## Donations and Lobbying

In [197]:
# Total Amount per ExpenderName, summed across the senate and assembly tables.
donations = pd.concat([expend_senate.groupby('ExpenderName')['Amount'].sum().reset_index(), expend_assembly.groupby('ExpenderName')['Amount'].sum().reset_index()]).groupby('ExpenderName').sum().reset_index()

In [219]:
# Total AMOUNT per lobbying firm; firm names are lower-cased after aggregation,
# so names differing only in case stay as separate rows.
lobby = lobbying.groupby('FIRM_NAME')['AMOUNT'].sum().reset_index()
lobby['FIRM_NAME'] = lobby['FIRM_NAME'].str.lower()

In [220]:
def combine_don_df(don_align, don_inf, type):
    """Build one table of top topics, influence, name and total spend per actor.

    ``don_align`` and ``don_inf`` are joined on ``'<type>_id'``; the name comes
    from the inverse of ``node_id_map[type]``. For ``type == 'donor'`` the spend
    is taken from ``donations`` (matched on ExpenderName), otherwise from
    ``lobby`` (matched on FIRM_NAME). Actors without a match get total_spent 0.

    NOTE(review): ``lobby`` firm names are lower-cased while ``name`` is used as
    stored in node_id_map — confirm both sides share the same casing.
    """
    id_col = f'{type}_id'
    id_to_name = {node: name for name, node in node_id_map[type].items()}

    combined = don_align[[id_col, 'top_topics']].merge(don_inf, on=id_col)
    combined['name'] = combined[id_col].map(id_to_name)

    if type == 'donor':
        spend, spend_key, amount_col = donations, 'ExpenderName', 'Amount'
    else:
        spend, spend_key, amount_col = lobby, 'FIRM_NAME', 'AMOUNT'

    combined = combined.merge(spend, left_on='name', right_on=spend_key, how='left')
    combined = combined.rename(columns={amount_col: 'total_spent'})
    combined['total_spent'] = combined['total_spent'].fillna(0)
    return combined

# Combined topic / influence / spend tables for donors and lobbying firms.
donor_df = combine_don_df(donor_align, donor_inf, 'donor')
lobby_df = combine_don_df(lobby_align, lobby_inf, 'lobby_firm')

In [221]:
# Stack donors and lobbying firms into one table with a `type` marker.
# Built without the intermediate names `don` / `lob`: `lob` already holds the
# per-beneficiary lobbying totals used by the funding cells, and rebinding it
# here made those cells fail when re-run.
_shared_cols = ['name', 'influence', 'total_spent', 'top_topics']
donor_lobby = pd.concat(
    [
        donor_df[_shared_cols].assign(type='donor'),
        lobby_df[_shared_cols].assign(type='lobby_firm'),
    ],
    ignore_index=True,
)

In [224]:
# NOTE(review): unlike the other exports in this notebook, this one does not
# pass index=False, so the file gets an extra unnamed index column — confirm
# whether the app relies on it.
donor_lobby.to_csv(OUT_PATH / 'donor_lobby_topics.csv')

## Bills

In [225]:
# n_id of the parent bill (dst is the bill node index from the is_version edge).
author_edge['bill'] = data['bill'].n_id[author_edge['dst'].values]

In [226]:
# For each bill keep only the authorship rows at its latest date, then join the
# distinct legislator-term ids of those rows into one comma-separated string.
authors = author_edge.groupby('bill').agg({'date': 'max'}).reset_index().merge(author_edge, on=['bill', 'date'], how='inner').groupby('bill').agg({
    'legterm_id': lambda x: ', '.join(x.astype(str).unique())}).reset_index()

In [227]:
def terms_to_names(terms):
    """Translate a ', '-separated string of legislator-term ids into names.

    Each id is looked up in ``leg_ids`` and resolved with ``leg_term_to_name``;
    ids that fail either step are left out. Returns the names joined by ', '.
    """
    resolved = []
    for token in terms.split(', '):
        leg_key = leg_ids.get(int(token.strip()), None)
        if leg_key is None:
            continue
        name = leg_term_to_name(leg_key)
        if name is not None:
            resolved.append(name)
    return ', '.join(resolved)

In [228]:
# Readable author names for each bill.
authors['authors'] = authors['legterm_id'].apply(terms_to_names)

In [229]:
# Inner join: bills without any resolved authorship row are dropped.
b = bill_df.merge(authors, left_on='bill_id', right_on='bill', how='inner')

In [230]:
# Longevity as whole days; not re-runnable as-is, because the column no longer
# has a `.dt` accessor after the first run.
b['longevity'] = b['longevity'].dt.days

In [231]:
# Add the number of distinct versions per bill (in column bill_version_id).
# First_action is merged in a second time, so the result carries
# First_action_x / First_action_y; the next cell reads First_action_x.
bi = b.merge(bv_df.groupby('bill_id')['bill_version_id'].nunique().reset_index(), on='bill_id', how='left').merge(bill_dates[['bill_id', 'First_action']], on='bill_id', how='left')

In [234]:
# `term` is the first four characters of the bill id; First_action becomes a YYYY-MM-DD string.
bi['term'] = bi['bill_id'].apply(lambda x: x[:4]).astype(int)
bi['First_action'] = bi['First_action_x'].dt.strftime('%Y-%m-%d')

In [235]:
# Attach the per-bill vote share; bills without recorded votes get 0.0.
bil = bi.merge(signals, on='bill_id', how='left')
bil['vote_signal'] = bil['vote_signal'].fillna(0.0)

In [242]:
# Overwrites the cluster-label topic with the value from `so` (NaN where the bill is absent from `so`).
bil['topic'] = bil['bill_id'].map(so)

In [244]:
# Export the bill table as Parquet.
pq.write_table(pa.Table.from_pandas(bil), OUT_PATH / 'bills.parquet')

In [245]:
# Read the Parquet file back; this rebinds `bills`, which earlier held the controversy/outcome merge.
bills = pq.read_table(OUT_PATH / 'bills.parquet').to_pandas()

In [246]:
# Same table as CSV.
bills.to_csv(OUT_PATH / 'bills.csv', index=False)

## Topics pt. 2

In [249]:
# Per (topic, term): mean outcome, controversy and longevity, number of distinct
# bills, and mean of bill_version_id (which holds the per-bill version count).
t = bi.groupby(['topic', 'term']).agg({
    'outcome': 'mean',
    'controversy': 'mean',
    'longevity': 'mean',
    'bill_id': 'nunique',
    'bill_version_id': 'mean'
}).reset_index()

In [271]:
# Resolve each legislator_id to its key once.
_leg_keys = leg_align['legislator_id'].apply(lambda x: leg_ids.get(int(x), None))
# "Last, First" -> "First Last". leg_term_to_name returns None for unknown ids,
# so the value is checked to be a string before testing for a comma (the
# original raised TypeError on None).
leg_align['full_name'] = _leg_keys.apply(leg_term_to_name).apply(
    lambda x: (x.split(',')[1] + ' ' + x.split(',')[0]).strip() if isinstance(x, str) and ',' in x else x
)
leg_align['term'] = _leg_keys.apply(leg_term_to_term)

In [266]:
# Distinct (name, party) pairs; a name listed under more than one party yields
# several rows and will duplicate that legislator in the merge below.
parties = pol[['full_name', 'Party']].drop_duplicates()

In [274]:
leg_party_align = leg_align.merge(parties, on='full_name', how='left')

# {term: {topic: scores}} comparing the mean alignment of parties 'D' and 'R'.
partisanship_by_term = {}

for topic in topic_columns:
    if topic == 'Miscellaneous':
        continue

    # Rows: term, columns: party, values: mean alignment (0 where a party is absent).
    term_party_means = leg_party_align.groupby(['term', 'Party'])[topic].mean().unstack(fill_value=0)
    has_two_parties = len(term_party_means.columns) >= 2

    for term in term_party_means.index:
        term_scores = partisanship_by_term.setdefault(term, {})
        if not has_two_parties:
            continue

        dem_mean_term = term_party_means.loc[term, 'D'] if 'D' in term_party_means.columns else 0
        rep_mean_term = term_party_means.loc[term, 'R'] if 'R' in term_party_means.columns else 0

        term_scores[topic] = {
            'party_difference': abs(dem_mean_term - rep_mean_term),
            'dem_alignment': dem_mean_term,
            'rep_alignment': rep_mean_term,
            # Small epsilon in the denominator guards against division by zero.
            'alignment_ratio': rep_mean_term / (dem_mean_term + 1e-6)
        }

# Flatten to one record per (term, topic). A comprehension is used so that no
# loop variable is left behind: the original loop rebound `topics`, overwriting
# the DataFrame of that name built in the Topics section.
term_partisanship_data = [
    {'term': term_key, 'topic': topic_name, **scores}
    for term_key, topic_scores in partisanship_by_term.items()
    for topic_name, scores in topic_scores.items()
]

term_alignments = pd.DataFrame(term_partisanship_data)

In [278]:
# Add the partisanship scores to the per-(topic, term) aggregates.
t_ = t.merge(term_alignments, on=['term', 'topic'], how='left')

In [279]:
# Export the topic aggregates.
t_.to_csv(OUT_PATH / 'topics_agg.csv', index=False)

In [289]:
# The context manager closes the file handle, which the original left open.
# NOTE(review): pickle executes code on load — only read trusted files.
with open('../../../committees.pkl', 'rb') as committees_file:
    committees = pickle.load(committees_file)

In [290]:
def top5_actors(align, id):
    """Return ``{topic: five highest-scoring actors}`` for every topic except 'Miscellaneous'.

    ``align`` is an alignment table with one column per topic and ``id`` names
    the column that identifies an actor. For ``id == 'full_name'`` the raw
    column values are returned. Otherwise the ids are translated — committee
    ids through ``committees``, all others through the inverse of
    ``node_id_map[<node type>]`` — and ids without a translation are dropped.
    """
    # The translation table does not depend on the topic, so build it once
    # instead of once per topic.
    if id == 'full_name':
        codes = None
    elif id == 'committee_id':
        codes = committees
    else:
        name = 'lobby_firm' if id == 'lobby_firm_id' else id.split('_')[0]
        codes = {v: k for k, v in node_id_map[name].items()}

    top5 = defaultdict(list)
    for topic in topic_columns:
        if topic == 'Miscellaneous':
            continue
        # nlargest returns index *labels*, so rows are selected with .loc. The
        # original passed them to .iloc, which is only correct while the index
        # happens to be 0..n-1.
        top_labels = align[topic].nlargest(5).index
        results = align.loc[top_labels, id].tolist()
        if codes is not None:
            results = [codes[r] for r in results if r in codes]
        top5[topic] = results

    return top5
# Top-5 actors per topic for each actor type.
# top_committee is computed but not included in the table built in the next cell.
top_leg = top5_actors(leg_align, 'full_name')
top_donors = top5_actors(donor_align, 'donor_id')
top_lobby = top5_actors(lobby_align, 'lobby_firm_id')
top_committee = top5_actors(committee_align, 'committee_id')

In [292]:
# One row per topic. The three dicts are filled by iterating topic_columns in
# the same order, so their values line up positionally.
topents = pd.DataFrame({
    'topic': top_donors.keys(),
    'top_donors': top_donors.values(),
    'top_lobby': top_lobby.values(),
    'top_legislators': top_leg.values()
})

In [294]:
# Export the top entities per topic.
topents.to_csv(OUT_PATH / 'top_entities.csv', index=False)