In [1]:
import math, gc, warnings, json, datetime, torch, pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
from torch.nn import functional as F
from torch_geometric.transforms import ToUndirected, RemoveIsolatedNodes
from torch_geometric.utils import to_networkx
import networkx as nx

In [2]:
def herfindahl(shares):
    return float(np.square(shares).sum())

def entropy(shares, eps=1e-9):
    shares = shares.clip(min=eps)
    return float(-(shares * np.log(shares)).sum())

def jsd(p, q, eps=1e-9):
    m = 0.5 * (p + q)
    return 0.5 * (
        F.kl_div(m.log(), p, reduction='none').sum(-1) +
        F.kl_div(m.log(), q, reduction='none').sum(-1))

def slope(y):
    if y.size < 2:
        return 0.0
    x = np.arange(y.size, dtype=np.float32)
    return float(np.polyfit(x, y, 1)[0])

def safe_ratio(n, d):
    return 0.0 if d == 0 else n / d

In [3]:
def union_parquet(glob_pat):
    files = Path('../data').glob(glob_pat)
    if not files:
        return pd.DataFrame()
    df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)
    return df

bills = union_parquet('bills_kpis_*.parquet')
leg_term_kpi = union_parquet('legislator_kpis_*.parquet')
committee_kpi = union_parquet('committee_kpis_*.parquet')
donor_kpi = union_parquet('donor_kpis_*.parquet')
lobby_kpi = union_parquet('lobby_firm_kpis_*.parquet')
topic_snapshot = union_parquet('topic_snapshot_*.parquet')

In [4]:
def explode_topic_probs(df, actor_type):
    if "topic_probs" not in df.columns:
        return pd.DataFrame()
    out = (
        df[["node_id", "topic_probs"]]
        .explode("topic_probs")
        .reset_index(drop=True)
    )
    out["topic_id"]   = out.groupby("node_id").cumcount()
    out["topic_prob"] = out["topic_probs"].astype(float)
    out = out.drop(columns="topic_probs")
    out["actor_type"] = actor_type
    return out
SRC = Path('../data')
topic_prob_dfs = []
for nt in ["legislator", "committee", "donor", "lobby_firm"]:
    df = union_parquet(f"*{nt}_topic_probs_*.parquet")
    if not df.empty:
        topic_prob_dfs.append(explode_topic_probs(df, nt))
actor_topic_long = (
    pd.concat(topic_prob_dfs, ignore_index=True) if topic_prob_dfs
    else pd.DataFrame(columns=["node_id","topic_id","topic_prob","actor_type"])
)
actor_topic_long.to_parquet(SRC / "actor_topic_relevance.parquet", index=False)


In [2]:
def compute_controversiality(data):
    edge_type = ('legislator_term', 'voted_on', 'bill_version')
    if edge_type not in data.edge_index_dict:
        raise ValueError("Missing 'voted_on' edges in data.")

    ei = data[edge_type].edge_index
    ea = data[edge_type].edge_attr

    vote_signal = ea[:, 0]

    src_nodes = ei[0]
    tgt_nodes = ei[1]

    num_bills = data['bill_version'].num_nodes
    device = tgt_nodes.device

    yes_votes = torch.zeros(num_bills, device=device)
    no_votes = torch.zeros(num_bills, device=device)

    yes_votes.index_add_(0, tgt_nodes, (vote_signal > 0).float())
    no_votes.index_add_(0, tgt_nodes, (vote_signal < 0).float())

    total_votes = yes_votes + no_votes + 1e-6

    yes_ratio = yes_votes / total_votes
    no_ratio = no_votes / total_votes

    controversy = 4 * yes_ratio * no_ratio
    controversy = controversy.clamp(0, 1)
    data['bill_version'].controversy = controversy

    return data

def safe_normalize_timestamps(timestamps: torch.Tensor) -> torch.Tensor:
    timestamps = torch.nan_to_num(timestamps, nan=0.0, posinf=1e4, neginf=-1e4)
    min_time = timestamps.min()
    max_time = timestamps.max()
    if (max_time - min_time) < 1e-4:
        return torch.zeros_like(timestamps)
    return (timestamps - min_time) / (max_time - min_time)

def safe_standardize_time_format(time_data) -> torch.Tensor:
    times = []
    for t in time_data:
        try:
            if isinstance(t, (int, float)) and 1900 <= t  and t <= 2100:
                td = datetime.datetime(int(t), 6, 15).timestamp()
            elif (isinstance(t, str) or (isinstance(t, float))) and (float(t) < 2100 and float(t) > 1900):
                td = datetime.datetime(int(float(t)), 6, 15).timestamp()
            elif float(t) > 0 and float(t) < 1990:
                td = t
            elif float(t) > 17000000.0:
                td = float(t)
            elif isinstance(t, datetime.datetime):
                td = t.timestamp()
            else:
                td = float(t) * 1e9
        except:
            td = datetime.datetime(2000, 6, 15).timestamp()
        times.append(td)
    return torch.tensor(times, dtype=torch.float32)

def pull_timestamps(data):
    timestamp_edges = [
        ('donor', 'donated_to', 'legislator_term'),
        ('legislator_term', 'rev_donated_to', 'donor'),
        ('lobby_firm', 'lobbied', 'legislator_term'),
        ('lobby_firm', 'lobbied', 'committee'),
        ('committee', 'rev_lobbied', 'lobby_firm'),
        ('legislator_term', 'rev_lobbied', 'lobby_firm'),
        ('bill_version', 'rev_voted_on', 'legislator_term'),
        ('legislator_term', 'voted_on', 'bill_version'),
    ]
    timestamp_nodes = ['legislator_term', 'bill_version', 'bill']

    for et in timestamp_edges:
        if hasattr(data[et], 'edge_attr') and data[et].edge_attr is not None and len(data[et].edge_attr.size()) > 1:
            if data[et].edge_attr.size(1) > 1:
                edge_attr = data[et].edge_attr
                ts_col = edge_attr[:, -1]
                data[et].timestamp = safe_normalize_timestamps(ts_col)
                data[et].time = ts_col
                data[et].edge_attr = edge_attr[:, :-1]

    for nt in timestamp_nodes:
        if hasattr(data[nt], 'x') and data[nt].x is not None:
            try:
                if len(data[nt].x.size()) > 1:
                    if data[nt].x.size(1) > 1:
                        x = data[nt].x
                        ts_col = x[:, -1]
                        data[nt].timestamp = safe_normalize_timestamps(ts_col)
                        data[nt].x = x[:, :-1]
                        data[nt].time = ts_col
            except:
                pass
    return data
def clean_features(data):
    data = pull_timestamps(data)
    for nt in data.node_types:
        x = data[nt].x
        if not isinstance(x, torch.Tensor) or x.numel() == 0:
            data[nt].x = torch.from_numpy(np.vstack(x)).float()
            x = data[nt].x
        x = torch.nan_to_num(x.float(), nan=0.0, posinf=1e4, neginf=-1e4)
        if x.size(0) < 2 or torch.all(x == x[0]):
            mean = x.clone()
            std = torch.ones_like(x)
            x_clean = x.clone()
        else:
            mean = x.mean(dim=0, keepdim=True)
            std = x.std(dim=0, keepdim=True).clamp(min=1e-5)
            x_clean = (x - mean) / std
            x_clean = x_clean.clamp(-10.0, 10.0)
        data[nt].x = x_clean
        data[nt].x_mean = mean
        data[nt].x_std = std
    return data

def load_and_preprocess_data(path='../../../GNN/data2.pt'):
    full_data = torch.load(path, weights_only=False)
    for nt in full_data.node_types:
        if hasattr(full_data[nt], 'x') and full_data[nt].x is not None:
            full = torch.from_numpy(full_data[nt].x)
            s = full.size()
            full = torch.flatten(full, start_dim=1, end_dim=-1)
            full_data[nt].x = full
            full_data[nt].num_nodes = full.size(0)

    # Check and fix edge indices before transformation
    for edge_type, edge_index in full_data.edge_index_dict.items():
        src_type, _, dst_type = edge_type

        # Get max node indices
        max_src_idx = edge_index[0].max().item() if edge_index.size(1) > 0 else -1
        max_dst_idx = edge_index[1].max().item() if edge_index.size(1) > 0 else -1

        # Ensure node counts are sufficient
        if max_src_idx >= full_data[src_type].num_nodes:
            print(f"Fixing {src_type} node count: {full_data[src_type].num_nodes} -> {max_src_idx + 1}")
            full_data[src_type].num_nodes = max_src_idx + 1

        if max_dst_idx >= full_data[dst_type].num_nodes:
            print(f"Fixing {dst_type} node count: {full_data[dst_type].num_nodes} -> {max_dst_idx + 1}")
            full_data[dst_type].num_nodes = max_dst_idx + 1

    data = ToUndirected(merge=False)(full_data)
    del full_data
    gc.collect()
    data = RemoveIsolatedNodes()(data)
    data = compute_controversiality(clean_features(data))
    for store in data.stores:
        for key, value in store.items():
            if isinstance(value, torch.Tensor) and value.dtype == torch.float64:
                store[key] = value.float()
    return data

data = load_and_preprocess_data()

Fixing bill node count: 13164 -> 45350
Fixing legislator node count: 478 -> 508


In [6]:
for nt in data.node_types:
    data[nt].node_id = torch.arange(data[nt].num_nodes, dtype=torch.long)

In [5]:
data[('legislator_term', 'voted_on', 'bill_version')].edge_attr.size(1)

385

In [6]:
v2b_edge = next(et for et in data.edge_types
                if et[0] == "bill_version" and et[2] == "bill")
src, dst = data[v2b_edge].edge_index.numpy()
bv_ts  = data["bill_version"].time.numpy()
bv_df  = pd.DataFrame({"bill_version": src, "bill_id": dst, "ts": bv_ts[src]})
bill_dates = (
    bv_df.groupby("bill_id")["ts"]
         .agg(intro_date="min", last_action="max")
         .reset_index()
         .assign(
            intro_date=lambda d: pd.to_datetime(d.intro_date, unit='s'),
            last_action=lambda d: pd.to_datetime(d.last_action, unit='s'))
)
bill = bills.merge(bill_dates, on="bill_id", how="left")

NameError: name 'data' is not defined

In [8]:
bill["week"] = bill["intro_date"].dt.to_period("W").dt.start_time
bill["bill_velocity_days"] = (bill["last_action"] - bill["intro_date"]).dt.days

In [9]:
with open("../../../node_id_map.json", "r") as f:
    node_id_map = json.load(f)

In [10]:
with open("../../../committees.pkl", "rb") as f:
    committees = pickle.load(f)

In [11]:
com_names = {v:k for k,v in node_id_map["committee"].items()}
committee_ids = {}
for d in data['committee'].node_id.numpy():
    name = com_names.get(int(d), None)
    if name is not None:
        committee_ids[d] = name
committee_ids = pd.DataFrame(committee_ids.items(), columns=["committee_id", "committee_name"])
committee_ids['term'] = committee_ids['committee_name'].str.split('_').str[1].astype(int)
committee_ids['com_id'] = committee_ids['committee_name'].str.split('_').str[0].astype(int)
committee_ids['name'] = committee_ids['com_id'].map(committees)

In [12]:
committee = committee_ids.join(committee_kpi, on='committee_id', how='right')

In [13]:
policy_embs = torch.load("../../../GNN/policy_embeddings.pt", map_location='cpu', weights_only=False)
T = policy_embs.size(0)

In [14]:
bill_node_id_rep = pd.DataFrame.from_dict({v:k for k,v in node_id_map['bill'].items()}, orient='index').reset_index().rename(columns={0:'bill_id', 'index': 'original_node_id'}).sort_values('original_node_id').reset_index(names='node_id')

In [15]:
topics = ["K-12 Education",
    "Public Universities",
    "Technical Education & Job Readiness",
    "Affordable Housing",
    "Tenants\' Rights",
    "Homelessness",
    "Drought Management",
    "Wildfire Prevention",
    "Electricity Grid Reliability",
    "Energy Efficiency",
    "Air Quality",
    "Sea Level Rise & Coastal Resilience",
    "Public Transit Infrastructure",
    "Highways & Road Maintenance",
    "Broadband Internet Access",
    "Mental Health Services & Crisis Intervention",
    "Substance Use Disorder & Harm Reduction",
    "Child Welfare & Foster Care",
    "Elder Care & Aging Services",
    "Disability Rights & Accessibility",
    "Healthcare Access & Medi-Cal",
    "Public Health & Disease Control",
    "Food Insecurity & Nutrition Assistance",
    "Environmental Justice",
    "Water Pollution",
    "Agricultural Regulation",
    "Coastal Protection",
    "Parks and Public Lands",
    "Criminal Justice Reform",
    "Police Accountability",
    "Firearm Regulations",
    "Emergency Management & Disaster Response",
    "Human Trafficking Prevention",
    "Budget Reserves & Fiscal Stabilization",
    "Local Government Finance & Property Taxes",
    "State Tax Policy",
    "Cannabis Regulation",
    "Insurance Oversight & Consumer Protection",
    "Small Business Development",
    "Technology Regulation",
    "Political Transparency",
    "Voting Rights & Election Security",
    "Government Transparency & Public Records",
    "Immigration Protections & Reform",
    "Gender Equity & Reproductive Rights",
    "LGBTQ+ Rights",
    "Racial Equity & Anti-Discrimination",
    "Veterans Services & Support",
    "Labor Rights & Minimum Wage",
    "Paid Leave",
    "Workforce Development & Job Training",
    "State Employee Pensions & Public Retirement",
    "Public Transportation Safety & Accessibility",
    "Affordable Childcare",
    "Consumer Protection",
    "Fair Housing",
    "Wildlife Conservation & Endangered Species",
    "Renewable Energy",
    "Natural Gas Regulation",
    "Vehicle Emissions",
    "Oil Drilling & Fracking",
    "Tribal Affairs",
    "Military & Veterans Affairs",
    "Public Health & Disease Control",
    "Information Technology",
    "Prisons & Corrections",
    "Child Support",
    "Public Libraries",
    "Utilities Oversight",
    "Regional Investment & Job Creation",
    "Public Employee Relations",
    "Manufactured Housing Tenant Protections & Park Regulations",
    "Short Term Rental Regulation",
    "Public Utility Wildfire Mitigation",
    "Energy Storage",
    "Water Recycling",
    "Urban Heat Island Mitigation",
    "Stormwater Capture",
    "Salton Sea Restoration",
    "Public Employee Health Benefits",
    "State Procurement of Goods & Services",
    "Correctional Officer Rights",
    "Traffic Enforcement",
    "Public Charter Schools",
    "Ethnic Studies Curriculum",
    "Library Construction",
    "Toxic Chemical Disclosure",
    "Warehouse Labor Standards",
    "Worker Classification & Independent Contractors",
    "Flood Control",
    "Unclaimed Property",
    "Alcohol Beverage Control",
    "Youth Diversion Programs",
    "Sexual Harassment Prevention",
    "Unemployment Insurance",
    "Sea Level Rise Adaptation"
    ]

topic_ids = {i: t for i, t in enumerate(topics)}

In [16]:
bill['bill'] = bill['bill_id'].map(bill_node_id_rep.set_index('node_id')['bill_id'])

In [17]:
from torch_geometric.data import HeteroData

In [18]:
keep_nt = {"bill", "legislator_term", "committee", "donor", "lobby_firm"}
proj = HeteroData()
for nt in keep_nt: proj[nt].num_nodes = data[nt].num_nodes
proj_edges = [et for et in data.edge_types if et[0] in keep_nt and et[2] in keep_nt and hasattr(data[et], 'edge_index')]
for et in proj_edges:
    proj[et].edge_index = data[et].edge_index
G = to_networkx(proj)
pagerank = nx.pagerank(G, alpha=0.9)
degree = dict(G.degree())

def add_centrality(df, ntype):
    df = df.copy()
    df["pagerank"] = df.node_id.map(lambda i: pagerank.get((ntype,int(i)),0.0))
    df["degree"] = df.node_id.map(lambda i: degree.get((ntype,int(i)),0))
    return df

committee = add_centrality(committee, "committee")
donor = add_centrality(donor_kpi, "donor")
lobby = add_centrality(lobby_kpi, "lobby_firm")
leg_term = add_centrality(leg_term_kpi, "legislator_term")

In [19]:
committee['topic'] = committee['top_topic'].map(topic_ids)
donor['topic'] = donor['top_topic'].map(topic_ids)
lobby['topic'] = lobby['top_topic'].map(topic_ids)
leg_term['topic'] = leg_term['top_topic'].map(topic_ids)

In [20]:
src, dst = data[("bill_version","rev_wrote","legislator_term")].edge_index.numpy()
sponsor_map = pd.DataFrame({"bill_id": src, "node_id": dst})
sponsor_infl = leg_term.set_index("node_id")["influence"]
valid_sponsor_map = sponsor_map[sponsor_map["node_id"].isin(sponsor_infl.index)]
sponsor_power = (valid_sponsor_map.groupby("bill_id")["node_id"]
                                 .agg(lambda ids: sponsor_infl.loc[ids].mean())
                                 .rename("sponsor_power"))
bill = bill.join(sponsor_power, on="bill_id", how="left")

In [21]:
bill['outcome'] = data['bill'].y[bill['bill_id'].values]

In [22]:
def attach_topk_topics(df, actor_type, K=3):
    if actor_topic_long.empty: return df
    tk = (actor_topic_long[actor_topic_long.actor_type == actor_type]
            .sort_values(["node_id","topic_prob"], ascending=[True,False])
            .groupby("node_id").head(K))
    wide = (tk.set_index(["node_id", tk.groupby("node_id").cumcount() + 1])
              .unstack(level=1))
    wide.columns = [f"{col[0]}_{col[1]}" for col in wide.columns]
    return df.merge(wide.reset_index(), on="node_id", how="left")

committee = attach_topk_topics(committee, "committee")
donor = attach_topk_topics(donor, "donor")
lobby = attach_topk_topics(lobby, "lobby_firm")
leg_term = attach_topk_topics(leg_term, "legislator_term")

In [23]:
def edge_df(et):
    ei = data[et].edge_index.numpy()
    ea = data[et].edge_attr.numpy() if data[et].edge_attr is not None else None
    if ea is None or ea.shape[1] == 0:
        return pd.DataFrame(columns=["src","dst","amount"])
    amt = ea[:,0]
    return pd.DataFrame({"src": ei[0], "dst": ei[1], "amount": amt})

don_edge = edge_df(("donor","donated_to","legislator_term"))
don_edge["donor_id"] = don_edge.src
don_edge["legterm_id"] = don_edge.dst
don_edge["type"] = "donor"

lob_edge = edge_df(("lobby_firm","lobbied","legislator_term"))
lob_edge["lobby_id"] = lob_edge.src
lob_edge["legterm_id"] = lob_edge.dst
lob_edge["type"] = "lobby"

fund_edge = pd.concat([don_edge, lob_edge], ignore_index=True)
term_ts = data["legislator_term"].time.numpy()
leg_term["session_start"] = term_ts[leg_term.node_id].astype(int)

leg_map = leg_term[["node_id","topic","session_start"]].rename(
           columns={"node_id":"legterm_id"})

fund_edge = fund_edge.merge(leg_map, on="legterm_id", how="left")

In [None]:
ei = data[("legislator_term","wrote","bill_version")].edge_index.numpy()
ea = data[("legislator_term","wrote","bill_version")].edge_attr.numpy()
author_edge = pd.DataFrame({"legterm_id": ei[0], "bill_id": ei[1], "type": ea[:,0]})
author_edge['date'] = data["bill_version"].time.numpy()[author_edge.bill_id]
author_edge.loc[author_edge.date == 0, 'date'] = datetime.datetime(2000, 6, 15).timestamp()
author_edge['date'] = pd.to_datetime(author_edge['date'], unit='s')

eib = data[('bill_version','is_version', 'bill')].edge_index.numpy()
eib = pd.DataFrame({"src": eib[0], "dst": eib[1], 'outcome': data['bill'].y[eib[1]]})
eib['src'] = eib['src'].astype(int)
eib['dst'] = eib['dst'].astype(int)
author_edge['bill_id'] = author_edge['bill_id'].astype(int)

author_edge = author_edge.merge(eib, left_on='bill_id', right_on='src', how='inner')
author_edge['outcome'] = (author_edge['outcome'] == 1).astype(int)
author_levels = {1: 'COAUTHOR', 3: 'PRINCIPAL_COAUTHOR', 4: 'LEAD_AUTHOR'}
author_edge['author_type'] = author_edge['type'].map(author_levels)

In [25]:
author_edge = author_edge.merge(bill[['dominant_topic', 'bill_id']], left_on='dst', right_on='bill_id', how='left')

In [26]:
a_edge = author_edge.groupby('legterm_id').agg({
    'dst': 'count',
    'outcome': ['mean', 'sum'],
    'author_type': lambda x: x.mode().iat[0] if not x.empty and len(x.mode()) > 0 else None,
    'dominant_topic': lambda x: x.mode().iat[0] if not x.empty and len(x.mode()) > 0 else None
}).reset_index()

In [27]:
a_edge.columns = ['node_id', 'num_authored_bills', 'pct_passed', 'num_passed', 'mode_author_type', 'topic_focus']

In [28]:
a_edge['topic_focus'] = a_edge['topic_focus'].map(topic_ids)
a_edge['total_funding'] = a_edge['node_id'].map(fund_edge.drop_duplicates(subset=['src', 'dst', 'amount', 'session_start', 'type']).groupby('dst')['amount'].sum().to_dict())

In [29]:
leg_term = leg_term.merge(a_edge, on='node_id', how='left')

In [30]:
bill['year'] = bill.intro_date.dt.year
bill['session'] = bill.apply(lambda x: (x.year - 1 if x.year % 2 == 0 else x.year) if pd.notnull(x.year) else np.nan, axis=1).astype('Int64')

In [31]:
bill['topic'] = bill['dominant_topic'].map(topic_ids)

In [33]:
leg_ids = {v: k for k, v in node_id_map['legislator_term'].items()}

In [34]:
import re

leg_swap = fund_edge['legterm_id'].map(leg_ids)
fund_edge['leg_id'] = leg_swap.apply(lambda x: x.split('_')[0])
fund_edge['leg_term'] = leg_swap.apply(lambda x: re.search(r'(?<=_)\d+(?=-)', x).group(0) if re.search(r'(?<=_)\d+(?=-)', x) else None)

In [35]:
don_swap = don_edge['legterm_id'].map(leg_ids)
don_edge['leg_id'] = don_swap.apply(lambda x: x.split('_')[0])
don_edge['leg_term'] = don_swap.apply(lambda x: re.search(r'(?<=_)\d+(?=-)', x).group(0) if re.search(r'(?<=_)\d+(?=-)', x) else None)

In [36]:
fund_edge['leg_id'] = fund_edge['leg_id'].astype(int)
don_edge['leg_id'] = don_edge['leg_id'].astype(int)
fund_edge['leg_term'] = fund_edge['leg_term'].astype(float)
don_edge['leg_term'] = don_edge['leg_term'].astype(float)

In [37]:
fund_edge = fund_edge.merge(leg_term[['node_id', 'topic_focus_y']], left_on='leg_id', right_on='node_id', how='left').drop_duplicates()
don_edge = don_edge.merge(leg_term[['node_id', 'topic_focus_y']], left_on='leg_id', right_on='node_id', how='left').drop_duplicates()

In [38]:
donor = donor.merge(don_edge.groupby(['donor_id', 'topic_focus_y']).agg({'amount': 'sum'}).reset_index(), left_on='node_id', right_on='donor_id', how='left').drop_duplicates()

In [39]:
lobby = lobby.merge(fund_edge.groupby(['lobby_id', 'topic_focus_y']).agg({'amount': 'sum'}).reset_index(), left_on='node_id', right_on='lobby_id', how='left').drop_duplicates()

In [40]:
policy_weekly = (
    bill.groupby(["topic","session"])
        .agg(n_bills=("bill_id","size"),
             avg_polar=("polarisation_score","mean"))
        .reset_index()
)

fund_weekly = (fund_edge.groupby(["topic","session_start"])
                        .agg(total_funding=("amount","sum"))
                        .reset_index()
                        .rename(columns={'session_start':"session"}))

policy_weekly = policy_weekly.merge(fund_weekly, on=["topic","session"], how="left")
policy_weekly["total_funding"] = policy_weekly.total_funding.fillna(0.0)
policy_weekly["fund_volatility"] = (policy_weekly.sort_values("session")
                                      .groupby("topic")["total_funding"]
                                      .transform(lambda s: s.rolling(4,min_periods=2).std()))

policy_weekly.to_parquet(SRC/"policy_session.parquet", index=False)

In [41]:
topic_base = (bill.groupby("topic")
                 .agg(n_bills=("bill_id","size"),
                      avg_success=("success_risk","mean"),
                      avg_polar=("polarisation_score","mean"),
                      avg_velocity=("bill_velocity_days","mean"),
                      avg_sponsor_power=("sponsor_power","mean"))
                 .reset_index()
                 )

In [42]:
topic_funding = fund_edge.drop_duplicates().groupby(['topic', 'session_start']).agg({'amount': 'sum'}).reset_index()

In [43]:
def slope(y):
    if len(y)<2: return 0.0
    x = np.arange(len(y)); return np.polyfit(x,y,1)[0]

pol_slope = (policy_weekly.groupby("topic")["avg_polar"]
                        .apply(slope).reset_index()
                        .rename(columns={"avg_polar":"polarisation_slope"}))

In [44]:
actor_infl = pd.concat([
    donor[["topic_focus_y","influence"]],
    lobby[["topic_focus_y","influence"]],
    committee[["topic","influence"]].rename(columns={"topic":"topic_focus_y"}),
    leg_term[["topic_focus_y","influence"]]],
    ignore_index=True).dropna().rename(columns={"topic_focus_y":"topic"})

power_conc = (actor_infl.groupby("topic")
                        .agg(power_concentration=("influence", herfindahl))
                        .reset_index())

In [45]:
topic_snapshot['topic'] = topic_snapshot['topic_id'].map(topic_ids)

In [46]:
bip_gap = (bill.assign(bipart = lambda d: d.polarisation_score<0.25)
               .groupby("topic")["bipart"]
               .agg(lambda s: s.mean()*2 -1)
               .reset_index()
               .rename(columns={"bipart":"bipartisan_gap"}))
topic_summary = (topic_base
                 .merge(topic_funding.groupby('topic')['amount'].sum().reset_index(), on="topic", how="left")
                 .merge(power_conc, on="topic", how="left")
                 .merge(bip_gap, on="topic", how="left")
                 .merge(pol_slope, on="topic", how="left")
                 .fillna({"total_dollars":0.0}))

In [47]:
topic_summary.to_parquet(SRC/"topic_summary.parquet", index=False)

In [48]:
legislators = pickle.load(open('../../../legislators.pkl', 'rb'))

In [49]:
bill_version_ids = {v: k for k, v in node_id_map['bill_version'].items()}

In [50]:
def leg_term_to_name(leg_term_id):
    if isinstance(leg_term_id, str):
        num = int(leg_term_id.split('_')[0])
        return legislators.get(num, None)
    else:
        return None

In [51]:
def legislator_node_matching(node_id):
    n = node_id_map['legislator'].get(str(node_id), None)
    if n is not None:
        name = legislators.get(n, None)
        if name is not None:
            return name
    return None

In [52]:
leg_term['name'] = leg_term['node_id'].map(leg_ids).apply(leg_term_to_name)

In [56]:
donor_ids = {v: k for k, v in node_id_map['donor'].items()}

In [63]:
donor['donor'] = donor['donor_id'].map(donor_ids)
donor['topic_id_1'] = donor['topic_id_1'].map(topic_ids)
donor['topic_id_2'] = donor['topic_id_2'].map(topic_ids)
donor['topic_id_3'] = donor['topic_id_3'].map(topic_ids)

In [66]:
lobby_ids = {v: k for k, v in node_id_map['lobby_firm'].items()}
lobby['lobby'] = lobby['node_id'].map(lobby_ids)
lobby['topic_id_1'] = lobby['topic_id_1'].map(topic_ids)
lobby['topic_id_2'] = lobby['topic_id_2'].map(topic_ids)
lobby['topic_id_3'] = lobby['topic_id_3'].map(topic_ids)

In [67]:
leg_term.to_parquet(SRC/"legislator_term.parquet", index=False)
donor.to_parquet(SRC/"donor.parquet", index=False)
lobby.to_parquet(SRC/"lobby_firm.parquet", index=False)
committee.to_parquet(SRC/"committee.parquet", index=False)
bill.to_parquet(SRC/"bill.parquet", index=False)

In [68]:
leg_term

Unnamed: 0,node_id,top_topic,topic_focus_x,influence,leverage,bipartisan_score,pagerank,degree,topic,session_start,num_authored_bills,pct_passed,num_passed,mode_author_type,topic_focus_y,total_funding,name
0,406,57,0.011050,-0.001432,-0.000016,1.000000,0.0,0,Renewable Energy,2011,,,,,,,Wilmer Amina Carter
1,264,70,0.011031,0.005450,0.000060,0.999999,0.0,0,Public Employee Relations,2003,204.0,0.235294,48.0,PRINCIPAL_COAUTHOR,Wildfire Prevention,3.370040e+03,Kevin McCarthy
2,382,10,0.011038,0.005624,0.000062,1.000000,0.0,0,Air Quality,2007,1644.0,0.481752,792.0,COAUTHOR,Air Quality,1.253607e+06,Jim Beall
3,173,88,0.011033,-0.002348,-0.000026,1.000000,0.0,0,Worker Classification & Independent Contractors,2007,840.0,0.357143,300.0,PRINCIPAL_COAUTHOR,Air Quality,,Joe Simitian
4,445,57,0.011021,0.001375,0.000015,0.999999,0.0,0,Renewable Energy,2011,,,,,,,Cathleen Galgiani
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,381,10,0.010916,-0.002687,-0.000029,1.000000,0.0,0,Air Quality,2017,432.0,0.444444,192.0,COAUTHOR,Wildfire Prevention,1.005180e+03,Joel Anderson
1306,51,77,0.011030,0.002996,0.000033,0.999999,0.0,0,Stormwater Capture,2005,588.0,0.183673,108.0,PRINCIPAL_COAUTHOR,Oil Drilling & Fracking,7.732700e+02,Lynn Daucher
1307,131,78,0.011012,0.006329,0.000070,1.000000,0.0,0,Salton Sea Restoration,2001,864.0,0.277778,240.0,PRINCIPAL_COAUTHOR,Air Quality,2.535134e+05,Carole Migden
1308,338,77,0.011041,-0.002070,-0.000023,1.000000,0.0,0,Stormwater Capture,2005,1008.0,0.238095,240.0,PRINCIPAL_COAUTHOR,Air Quality,7.569554e+04,Johan Klehs
