In [6]:
import gc, datetime, json, pickle, re, collections
from torch_geometric.transforms import ToUndirected, RemoveIsolatedNodes
from pathlib import Path
import torch, numpy as np, pandas as pd
import pyarrow as pa, pyarrow.parquet as pq
from torch_scatter import scatter_add
DATA_PATH = Path("../../../data3.pt")
EXPORT_PATH = Path("../data")
OUT_PATH = EXPORT_PATH
DEVICE = torch.device("cpu")
FLOAT = torch.float32

In [7]:
with open('../../../node_id_map.json', 'r') as f:
        node_id_map = json.load(f)

with open('../../../bill_labels.json', 'r') as f:
    topic_cluster_labels_dict = json.load(f)

In [8]:
actor_topic_prob = torch.load('../../raw_model_outputs/actor_topic_prob.pt', weights_only=False)

In [9]:
num_topics = 375

In [10]:
def safe_normalize_timestamps(timestamps):
    timestamps = torch.nan_to_num(timestamps, nan=0.0, posinf=1e4, neginf=-1e4)
    min_time = timestamps.min()
    max_time = timestamps.max()
    if (max_time - min_time) < 1e-4:
        return torch.zeros_like(timestamps)
    return (timestamps - min_time) / (max_time - min_time)

def safe_standardize_time_format(time_data):
    times = []
    for t in time_data:
        try:
            if isinstance(t, (int, float)) and 1900 <= t  and t <= 2100:
                td = datetime.datetime(int(t), 6, 15).timestamp()
            elif (isinstance(t, str) or (isinstance(t, float))) and (float(t) < 2100 and float(t) > 1900):
                td = datetime.datetime(int(float(t)), 6, 15).timestamp()
            elif float(t) > 0 and float(t) < 1990:
                td = t
            elif float(t) > 17000000.0:
                td = float(t)
            elif isinstance(t, datetime.datetime):
                td = t.timestamp()
            else:
                td = float(t) * 1e9
        except:
            td = datetime.datetime(2000, 6, 15).timestamp()
        times.append(td)
    return torch.tensor(times, dtype=torch.float32)

def pull_timestamps(data):
    timestamp_edges = [
        ('donor', 'donated_to', 'legislator_term'),
        ('legislator_term', 'rev_donated_to', 'donor'),
        ('lobby_firm', 'lobbied', 'legislator_term'),
        ('lobby_firm', 'lobbied', 'committee'),
        ('committee', 'rev_lobbied', 'lobby_firm'),
        ('legislator_term', 'rev_lobbied', 'lobby_firm'),
        ('bill_version', 'rev_voted_on', 'legislator_term'),
        ('legislator_term', 'voted_on', 'bill_version'),
    ]
    timestamp_nodes = ['legislator_term', 'bill_version', 'bill']

    for et in timestamp_edges:
        if hasattr(data[et], 'edge_attr') and data[et].edge_attr is not None and len(data[et].edge_attr.size()) > 1:
            if data[et].edge_attr.size(1) > 1:
                edge_attr = data[et].edge_attr
                ts_col = edge_attr[:, -1]
                if ts_col.abs().max() > 1e8 or ts_col.min() < 0:
                    ts_col = safe_standardize_time_format(ts_col.tolist()).to(edge_attr.device)
                data[et].timestamp = safe_normalize_timestamps(ts_col)
                data[et].time = ts_col
                data[et].edge_attr = edge_attr[:, :-1]

    for nt in timestamp_nodes:
        if hasattr(data[nt], 'x') and data[nt].x is not None:
            try:
                if len(data[nt].x.size()) > 1:
                    if data[nt].x.size(1) > 1:
                        x = data[nt].x
                        ts_col = x[:, -1]
                        if ts_col.abs().max() > 1e8 or ts_col.min() < 0:
                            ts_col = safe_standardize_time_format(ts_col.tolist()).to(x.device)
                        if nt in timestamp_nodes or ts_col.abs().max() > 1e6:
                            data[nt].timestamp = safe_normalize_timestamps(ts_col)
                            data[nt].time = ts_col
                            data[nt].x = x[:, :-1]
            except:
                pass
    return data

def clean_features(data):
    for nt in data.node_types:
        x = data[nt].x
        x = torch.as_tensor(x, dtype=torch.float32)
        x = torch.nan_to_num(x.float(), nan=0.0, posinf=1e4, neginf=-1e4)
        mean = x.mean(0, keepdim=True)
        std = x.std(0, keepdim=True).clamp(min=1e-5)
        x = ((x - mean) / std).clamp(-10, 10)
        data[nt].x = x
        data[nt].x_mean = mean
        data[nt].x_std = std
    data = pull_timestamps(data)
    return data

def compute_controversiality(data):
    edge_type = ('legislator_term', 'voted_on', 'bill_version')
    if edge_type not in data.edge_index_dict:
        raise ValueError("Missing 'voted_on' edges in data.")

    ei = data[edge_type].edge_index
    ea = data[edge_type].edge_attr

    vote_signal = ea[:, 0]
    tgt_nodes = ei[1]

    num_bills = data['bill_version'].num_nodes
    device = tgt_nodes.device

    yes_votes = torch.zeros(num_bills, device=device)
    no_votes = torch.zeros(num_bills, device=device)

    yes_votes.index_add_(0, tgt_nodes, (vote_signal > 0).float())
    no_votes.index_add_(0, tgt_nodes, (vote_signal < 0).float())

    total_votes = yes_votes + no_votes + 1e-6

    yes_ratio = yes_votes / total_votes
    no_ratio = no_votes / total_votes

    controversy = 4 * yes_ratio * no_ratio
    controversy = controversy.clamp(0, 1)
    data['bill_version'].controversy = controversy

    return data

def load_and_preprocess_data(path='../../../data3.pt'):
    full_data = torch.load(path, weights_only=False)
    for nt in full_data.node_types:
        if hasattr(full_data[nt], 'x') and full_data[nt].x is not None:
            flat = torch.as_tensor(full_data[nt].x).flatten(start_dim=1)
            full_data[nt].x = flat
            full_data[nt].num_nodes = flat.size(0)

    for edge_type, edge_index in full_data.edge_index_dict.items():
        src_type, _, dst_type = edge_type
        max_src_idx = edge_index[0].max().item() if edge_index.size(1) > 0 else -1
        max_dst_idx = edge_index[1].max().item() if edge_index.size(1) > 0 else -1
        if max_src_idx >= full_data[src_type].num_nodes:
            print(f"Fixing {src_type} node count: {full_data[src_type].num_nodes} -> {max_src_idx + 1}")
            full_data[src_type].num_nodes = max_src_idx + 1

        if max_dst_idx >= full_data[dst_type].num_nodes:
            print(f"Fixing {dst_type} node count: {full_data[dst_type].num_nodes} -> {max_dst_idx + 1}")
            full_data[dst_type].num_nodes = max_dst_idx + 1

    full_data['bill'].y[np.where(full_data['bill'].y < 0)[0]] = 0
    data = ToUndirected(merge=False)(full_data)
    del full_data
    gc.collect()
    data = RemoveIsolatedNodes()(data)
    data = compute_controversiality(clean_features(data))

    for store in data.stores:
        for key, value in store.items():
            if isinstance(value, torch.Tensor) and value.dtype == torch.float64:
                store[key] = value.float()
    for nt in data.node_types:
        data[nt].node_id = torch.tensor([i for i in range(data[nt].num_nodes)], dtype=torch.int64, device=DEVICE)
    return data

data = load_and_preprocess_data()

## Bills

In [11]:
bv_ts = pickle.loads(open('../../../bill_dates_map.pkl', 'rb').read())

In [12]:
bv_ids = {v: k for k, v in node_id_map['bill_version'].items()}

In [13]:
v2b_edge = tuple([et for et in data.edge_types
                if et[0] == "bill_version" and et[2] == "bill"])[0]
src, dst = data[v2b_edge].edge_index.numpy()

bv_df = pd.DataFrame({"bill_version": src, "bill_id": data['bill'].n_id[dst]})
bv_df['bill_version_id'] = bv_df['bill_version'].map(bv_ids)

In [14]:
bill_dates = pd.DataFrame(bv_ts).T.reset_index().rename(columns={'index': 'bill_id'})
bill_dates = bill_dates.loc[bill_dates['bill_id'].isin(bv_df['bill_id'].unique())]

In [15]:
controversy_df = pd.DataFrame({
    'controversy': data['bill_version'].controversy[bv_df['bill_version'].unique()].numpy(),
    'bill_version': bv_df['bill_version'].unique()
})

In [16]:
outcome_df = pd.DataFrame({
    'bill_id': data['bill'].n_id,
    'outcome': data['bill'].y
})

In [17]:
bills = bv_df.merge(controversy_df, on='bill_version', how='left').merge(outcome_df, on='bill_id', how='left')
bills['topic_cluster'] = bills['bill_id'].map(topic_cluster_labels_dict)

In [18]:
bill_dates['longevity'] = bill_dates['Last_action'] - bill_dates['First_action']

In [19]:
bill_df = bills.groupby('bill_id').agg({'outcome': 'max', 'controversy': 'max', 'topic_cluster': 'max'}).merge(bill_dates[['bill_id', 'longevity']], on='bill_id')

In [20]:
with open('../../../bill_subjects.json', 'r') as f:
    bill_subjects = np.array(list(json.load(f).keys()))

## Topics

In [21]:
groupings = pd.read_csv('../../../groupings.csv')
groupings.loc[groupings['label'] == 0, 'label_name'] =  'Native American resources and heritage commission'
groupings.loc[groupings['label'] == 1, 'label_name'] =  'School Districts'

cluster_groups = pd.read_csv('../../../groupings_clean.csv')
cluster_groups['label_name'] = cluster_groups['label_name'].apply(lambda x: x.strip())
groupings['label_name'] = groupings['label_name'].apply(lambda x: x.strip())

cluster_groups = cluster_groups.merge(groupings[['group_150', 'label_name', 'label']].drop_duplicates(), on=['group_150', 'label_name'],  how='inner')

def aggregate_by_group(df_alignment, level, agg='sum'):
    topic_cols = [c for c in df_alignment.columns if c.startswith('topic_')]
    label_to_group = cluster_groups.copy().set_index('label')[level].to_dict()
    col_to_group = {col: label_to_group[int(col.split('_')[1])] for col in topic_cols}
    transposed = df_alignment[topic_cols].T
    transposed.index = transposed.index.map(lambda x: col_to_group[x])
    aggregated = transposed.groupby(level=0).agg(agg).T
    return pd.concat([df_alignment.drop(columns=topic_cols), aggregated], axis=1)

In [22]:
topics = bill_df.loc[bill_df['topic_cluster'].notna()].copy()
topics['term'] = topics['bill_id'].apply(lambda x: x[:4]).astype(int)
topics = topics.merge(cluster_groups[['label', 'C150', 'C75']], left_on='topic_cluster', right_on='label', how='left')

In [23]:
topics_df = topics.groupby(['C150', 'term', 'label']).agg({'outcome': lambda x: len(x.loc[x == 1]) / len(x), 'controversy': lambda x: np.mean(x.loc[x > 0]), 'bill_id': 'nunique', 'longevity': 'mean'}).reset_index()

In [24]:
bv_topics = bills[['bill_version', 'topic_cluster']].loc[bills['topic_cluster'].notna()].drop_duplicates().set_index('bill_version').to_dict()

In [25]:
bv_cluster = torch.full(
    (data["bill_version"].num_nodes,),
    -1, dtype=torch.long)

for bv_id, topic_id in bv_topics['topic_cluster'].items():
    bv_cluster[bv_id] = int(topic_id)

In [26]:
num_lt = data["legislator_term"].num_nodes
num_cm  = data["committee"].num_nodes
leg_topic_cnt = torch.zeros(num_lt, num_topics, dtype=FLOAT)
vote_et = ('legislator_term','voted_on','bill_version')
vi, va  = data[vote_et].edge_index, data[vote_et].edge_attr
mask_yes = va[:,0]>0
src_lt, dst_bv = vi[0][mask_yes], vi[1][mask_yes]
clusters = bv_cluster[dst_bv]; ok = clusters>=0
flat_idx = src_lt[ok]*num_topics + clusters[ok]
leg_topic_cnt_flat = torch.zeros(num_lt*num_topics, dtype=FLOAT)
leg_topic_cnt_flat.index_add_(0, flat_idx, torch.ones_like(flat_idx,dtype=FLOAT))
leg_topic_cnt = leg_topic_cnt_flat.view(num_lt, num_topics)
write_et = ('legislator_term','wrote','bill_version')
if write_et in data.edge_index_dict:
    wi = data[write_et].edge_index
    src_w, dst_w = wi
    clusters = bv_cluster[dst_w]; ok = clusters>=0
    flat_idx = src_w[ok]*num_topics + clusters[ok]
    leg_topic_cnt_flat.index_add_(0, flat_idx, torch.ones_like(flat_idx,dtype=FLOAT)*10)
    leg_topic_cnt = leg_topic_cnt_flat.view(num_lt, num_topics)

leg_topic_dist = leg_topic_cnt / leg_topic_cnt.sum(1,keepdim=True).clamp(min=1e-8)

comm_topic_cnt = torch.zeros(num_cm, num_topics, dtype=FLOAT)
cm_edge = ('committee','rev_member_of','legislator_term')
ci = data[cm_edge].edge_index
scatter_add(leg_topic_dist[ci[1]],
            ci[0].unsqueeze(-1).expand(-1,num_topics),
            dim=0, out=comm_topic_cnt)
comm_topic_dist = comm_topic_cnt / comm_topic_cnt.sum(1,keepdim=True).clamp(min=1e-8)

In [27]:
vote_et = ('legislator_term','voted_on','bill_version')
vi, va  = data[vote_et].edge_index, data[vote_et].edge_attr

voting_df = pd.DataFrame({
    'legislator_term': vi[0].numpy(),
    'bill_version': vi[1].numpy(),
    'vote_signal': va[:, 0].numpy()
})
voting_df['version_id'] = voting_df['bill_version'].map(bv_ids)
voting_df['topic_cluster'] = voting_df['bill_version'].map(bv_topics['topic_cluster'])

In [28]:
vdf = voting_df.merge(bv_df[['bill_id', 'bill_version_id']].rename(columns={'bill_version_id': 'version_id'}), on='version_id', how='inner').merge(bill_dates)

In [29]:
vdf['term'] = vdf['First_action'].dt.year
vdf['term'] = vdf['term'].apply(lambda x: x - 1  if x % 2 == 0 else x)
vdf = vdf.merge(cluster_groups[['label', 'C150', 'C75']], left_on='topic_cluster', right_on='label', how='left')

tops_df = vdf.groupby(['term', 'C150']).agg({
    'vote_signal': lambda x: len(x[x > 0]) / len(x),
    'bill_id': 'nunique',
    'longevity': 'mean'
}).reset_index()

In [30]:
leg_topics = pd.DataFrame(leg_topic_dist.numpy())
leg_topics.columns = [f'topic_{i}' for i in range(num_topics)]
leg_topics['legislator_term'] = data['legislator_term'].node_id.numpy()

In [31]:
def edge_year(ts_tensor):
    return pd.to_datetime(ts_tensor.cpu().numpy(), unit="s").year.astype(np.int16)

def money_by_topic(edge_key, tgt_topic_mat, src_size):
    ei = data.edge_index_dict[edge_key]
    amt = data[edge_key].edge_attr[:, 0].abs().to(FLOAT)
    years = torch.as_tensor(edge_year(data[edge_key].time), dtype=torch.int16)
    src, dst = ei
    total = scatter_add(amt, src, dim=0, dim_size=src_size)
    topic_mat = torch.zeros(src_size, num_topics, dtype=FLOAT)

    for t in range(num_topics):
        scatter_add(amt * tgt_topic_mat[dst, t], src, dim=0,
                    dim_size=src_size, out=topic_mat[:, t])

    term_ids, term_idx = years.unique(return_inverse=True)
    term_topic = torch.zeros(len(term_ids), num_topics, dtype=FLOAT)
    for t in range(num_topics):
        scatter_add(amt * tgt_topic_mat[dst, t], term_idx, dim=0,
                    dim_size=len(term_ids), out=term_topic[:, t])

    return total, topic_mat, term_ids.numpy(), term_topic

In [32]:
don_total, donor_topic, don_term_ids, don_term_topic = money_by_topic(
    ('donor', 'donated_to', 'legislator_term'),
    leg_topic_dist,
    src_size=data['donor'].num_nodes
)
lob_lt = money_by_topic(
    ('lobby_firm', 'lobbied', 'legislator_term'),
    leg_topic_dist,
    src_size=data['lobby_firm'].num_nodes
)
lob_cm = money_by_topic(
    ('lobby_firm', 'lobbied', 'committee'),
    comm_topic_dist,
    src_size=data['lobby_firm'].num_nodes
)
lobby_total = lob_lt[0] + lob_cm[0]
lobby_topic = lob_lt[1] + lob_cm[1]

In [33]:
donor_topics = pd.DataFrame(donor_topic.numpy())
donor_topics.columns = [f'topic_{i}' for i in range(num_topics)]
donor_topics = donor_topics.convert_dtypes()
donor_topics['donor'] = data['donor'].node_id.numpy()
donor_topics['total'] = don_total.numpy()
lobby_topics = pd.DataFrame(lobby_topic.numpy())
lobby_topics.columns = [f'topic_{i}' for i in range(num_topics)]
lobby_topics = lobby_topics.convert_dtypes()
lobby_topics['lobby_firm'] = data['lobby_firm'].node_id.numpy()
lobby_topics['total'] = lobby_total.numpy()

In [34]:
donor_ids = {v: k for k, v in node_id_map['donor'].items()}
donor_topics['name'] = donor_topics['donor'].map(donor_ids)
lobby_ids = {v: k for k, v in node_id_map['lobby_firm'].items()}
lobby_topics['name'] = lobby_topics['lobby_firm'].map(lobby_ids)

In [35]:
def aggregate_top_10(df):
    df2 = aggregate_by_group(df, 'C75')
    top_10s = []
    if 'lobby_firm' in df2.columns:
        name = 'lobby_firm'
    else:
        name = 'donor'
    for i, row in df2.iterrows():
        numeric_values = row[[c for c in row.index if c not in [name, 'total', 'name']]].astype(float)
        tops = numeric_values.nlargest(10).sort_values(ascending=False).index.tolist()
        top_10s.append(tops)
    top = pd.DataFrame(np.stack(top_10s))
    top.columns = [f'topic_{i}' for i in range(10)]
    top['type'] = name
    top['name'] = df2['name']
    top['total'] = df2['total']
    return top

In [36]:
donor_topics = aggregate_top_10(donor_topics)
lobby_topics = aggregate_top_10(lobby_topics)

In [37]:
topics_donors = pd.concat([donor_topics, lobby_topics])

In [None]:
topics_donors.to_csv(OUT_PATH / 'donor_lobby_topics.csv', index=False)

In [38]:
all_terms = np.unique(np.concatenate([don_term_ids, lob_lt[2], lob_cm[2]]))
row_of = {int(y): i for i, y in enumerate(all_terms)}

def remap(ids, mat):
    out = torch.zeros(len(all_terms), num_topics)
    for i, y in enumerate(ids):
        out[row_of[int(y)]] += mat[i]
    return out

don_term_topic = remap(don_term_ids, don_term_topic)
lob_term_topic = remap(lob_lt[2], lob_lt[3]) + remap(lob_cm[2], lob_cm[3])

In [39]:
topics_df["donations_term"] = topics_df.apply(
    lambda r: float(don_term_topic[int(row_of.get(r.term, -1)), int(r.label)])
              if r.term in row_of else 0.0,
    axis=1
)
topics_df["lobbying_term"] = topics_df.apply(
    lambda r: float(lob_term_topic[int(row_of.get(r.term, -1)), int(r.label)])
              if r.term in row_of else 0.0,
    axis=1
)

In [40]:
topics = topics_df.merge(tops_df, on=['term', 'C150'], how='inner')

In [252]:
topics.to_csv(OUT_PATH / 'topics.csv', index=False)

In [41]:
leg_don_in_total = scatter_add(
    data[('donor','donated_to','legislator_term')].edge_attr[:,0].abs(),
    data.edge_index_dict[('donor','donated_to','legislator_term')][1],
    dim=0, dim_size=data['legislator_term'].num_nodes)
leg_lob_in_total = scatter_add(
    data[('lobby_firm','lobbied','legislator_term')].edge_attr[:,0].abs(),
    data.edge_index_dict[('lobby_firm','lobbied','legislator_term')][1],
    dim=0, dim_size=data['legislator_term'].num_nodes)

cm_lob_in_total = scatter_add(
    data[('lobby_firm','lobbied','committee')].edge_attr[:,0].abs(),
    data.edge_index_dict[('lobby_firm','lobbied','committee')][1],
    dim=0, dim_size=data['committee'].num_nodes)

In [42]:
leg_funding = pd.DataFrame({
    'legislator_term': data['legislator_term'].node_id,
    'donations_total': leg_don_in_total.numpy(),
    'lobbying_total': leg_lob_in_total.numpy()
})

In [43]:
legislators = pickle.load(open('../../../legislators.pkl', 'rb'))

leg_ids = {v: k for k, v in node_id_map['legislator_term'].items()}
def leg_term_to_name(leg_term_id):
    if isinstance(leg_term_id, str):
        num = int(leg_term_id.split('_')[0])
        return legislators.get(num, None)
    else:
        return None

leg_funding['name'] = leg_funding['legislator_term'].map(leg_ids).apply(leg_term_to_name)
leg_funding['term'] = leg_funding['legislator_term'].map(leg_ids).apply(lambda x: x.split('_')[1] if x else None).apply(lambda x: x.split('-')[0] if x else None).astype(int)

In [44]:
leg_topics['name'] = leg_topics['legislator_term'].map(leg_ids).apply(leg_term_to_name)
leg_topics['term'] = leg_topics['legislator_term'].map(leg_ids).apply(lambda x: x.split('_')[1] if x else None).apply(lambda x: x.split('-')[0] if x else None).astype(int)

In [45]:
politicians = pd.read_csv('../../../ca_leg/legislation_data/politicians.csv')

In [46]:
fix = politicians.loc[politicians['District No.'].isna(), ['full_name', 'Term']].drop_duplicates()
fix['District No.'] = [51, 58, 8, 58, 58, 29, 39, 48, 43, 48, 10, 43, 48, 48, 6]

In [47]:
for i, row in fix.iterrows():
    politicians.loc[(politicians['full_name'] == row['full_name']) & (politicians['Term'] == row['Term']), 'District No.'] = row['District No.']

In [48]:
pol = politicians[['District No.', 'Term', 'full_name', 'chamber', 'Party']].drop_duplicates()
pol['term'] = pol['Term'].apply(lambda x: x.split('-')[0]).astype(int)

In [49]:
lfund = leg_funding.merge(pol, left_on=['term', 'name'], right_on=['term', 'full_name'], how='left')

In [50]:
ltopic = leg_topics.merge(pol, left_on=['term', 'name'], right_on=['term', 'full_name'], how='left')

In [51]:
import geopandas as gpd
import tempfile, zipfile, pathlib

In [52]:
def read_zip(zip_path, crs=3857):
    tmp = tempfile.TemporaryDirectory()
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(tmp.name)
    shp = next(pathlib.Path(tmp.name).rglob("*.shp"))
    gdf = gpd.read_file(shp).set_crs(epsg=crs)
    gdf = gdf.to_crs(epsg=3857)
    return gdf, tmp

def district_cycle(year):
    if year <= 2012: return "2001"
    if year <= 2022: return "2011"
    return "current"

counties_gdf, _ = read_zip('../data/ca_counties.zip')
counties_gdf = counties_gdf[['COUNTYFP', 'NAMELSAD', 'geometry']]
counties_gdf['county_area'] = counties_gdf.geometry.area
counties_gdf['county_id'] = counties_gdf['COUNTYFP'].astype(int)

In [53]:
data_dir = pathlib.Path('../data')

asm11_zip = data_dir / '2011_assembly_state_shp.zip'
sen11_zip = data_dir / '2011_senate_state_shp.zip'
asmcur_zip = data_dir / '2021_AD_Final_shp.zip'
sencur_zip = data_dir / '2021_SD_Final_shp.zip'

dist_info = [
    (asm11_zip, "assembly", "2011", 4019),
    (sen11_zip, "senate",   "2011", 4019),
    (asmcur_zip, "assembly","current", 4269),
    (sencur_zip, "senate",  "current", 4269)
]

weight_records = []
tmps = []
for zp, house, cycle, crs in dist_info:
    gdf, tmp = read_zip(zp, crs)
    tmps.append(tmp)
    gdf = gdf.rename(columns={gdf.columns[0]: "district_id"})[["district_id", "geometry"]]
    gdf["house"] = house
    gdf["cycle"] = cycle
    gdf["dist_area"] = gdf.geometry.area

    inter = gpd.overlay(gdf, counties_gdf, how="intersection")
    inter["fragment_area"] = inter.geometry.area

    weight_records.append(
        inter[["house", "cycle", "district_id", "county_id", "fragment_area", 'county_area', 'dist_area']].reset_index(drop=True)
    )

weights = pd.concat(weight_records, ignore_index=True)

In [54]:
lfund['District No.'] = lfund['District No.'].astype(str).apply(lambda x: re.sub(r'\s', '', x)).astype(float).astype(int)

In [55]:
ltopic['District No.'] = ltopic['District No.'].astype(str).apply(lambda x: re.sub(r'\s', '', x)).astype(float).astype(int)

In [56]:
ltopic_ = ltopic.groupby(['District No.', 'Term', 'chamber'])[[f'topic_{i}' for i in range(num_topics)]].sum().reset_index()
ltopic_['cycle'] = ltopic_['Term'].apply(lambda x: '2011' if int(x.split('-')[0]) <= 2012 else 'current')

In [57]:
lfund_ = lfund.groupby(['Term', 'District No.', 'chamber']).agg({
    'donations_total': 'sum',
    'lobbying_total': 'sum'
}).reset_index()
lfund_['Total'] = lfund_['donations_total'] + lfund_['lobbying_total']
lfund_['cycle'] = lfund_['Term'].apply(lambda x: '2011' if int(x.split('-')[0]) <= 2012 else 'current')

In [58]:
agg_w = weights.groupby(["house", "cycle", "district_id", "county_id"], as_index=False).agg({"fragment_area": "sum", "dist_area": "first"})
agg_w['weight'] = agg_w['fragment_area'] / agg_w['dist_area']

In [59]:
reg_funds = lfund_.merge(agg_w, left_on=['cycle', 'District No.', 'chamber'], right_on=['cycle', 'district_id', 'house'], how='left')
reg_funds = reg_funds.groupby(['house', 'Term', 'county_id']).agg({
    'donations_total': lambda x: sum(x * reg_funds.loc[x.index, 'weight']),
    'lobbying_total': lambda x: sum(x * reg_funds.loc[x.index, 'weight']),
    'Total': lambda x: sum(x * reg_funds.loc[x.index, 'weight'])
}).reset_index().merge(counties_gdf[['county_id', 'NAMELSAD', 'geometry']], on='county_id', how='left')

In [126]:
reg_funds.to_csv(OUT_PATH / 'ca_legislator_funding.csv', index=False)

In [60]:
reg_topics = aggregate_by_group(ltopic_.merge(agg_w, left_on=['cycle', 'District No.', 'chamber'], right_on=['cycle', 'district_id', 'house'], how='inner'), 'C75')

In [61]:
topic_cols = reg_topics.columns.tolist()[10:]

In [62]:
reg_topics = reg_topics.groupby(['house', 'Term', 'county_id']).agg(
    {top: lambda x: sum(x * reg_topics.loc[x.index, 'weight']) for top in topic_cols}
).reset_index().merge(counties_gdf[['county_id', 'NAMELSAD']], on='county_id', how='left')

In [63]:
grouping_cols = ['house', 'Term', 'county_id', 'NAMELSAD']
value_cols = [col for col in reg_topics.columns if col not in grouping_cols]
reg_topics['max_topic_column'] = reg_topics[value_cols].idxmax(axis=1)

In [152]:
reg_topics[['house', 'Term', 'county_id', 'NAMELSAD', 'max_topic_column']].to_csv(
    OUT_PATH / 'ca_legislator_topics.csv', index=False
)

In [64]:
leg_money_topic = leg_topic_dist * (leg_don_in_total + leg_lob_in_total).unsqueeze(-1)
cm_money_topic  = comm_topic_dist * cm_lob_in_total.unsqueeze(-1)

In [65]:
ei = data[("legislator_term","wrote","bill_version")].edge_index.numpy()
ea = data[("legislator_term","wrote","bill_version")].edge_attr.numpy()
author_edge = pd.DataFrame({"legterm_id": ei[0], "bill_id": ei[1], "type": ea[:,0]})
author_edge['date'] = data["bill_version"].time.numpy()[author_edge.bill_id]
author_edge.loc[author_edge.date == 0, 'date'] = datetime.datetime(2000, 6, 15).timestamp()
author_edge['date'] = pd.to_datetime(author_edge['date'], unit='s')

eib = data[('bill_version','is_version', 'bill')].edge_index.numpy()
eib = pd.DataFrame({"src": eib[0], "dst": eib[1], 'outcome': data['bill'].y[eib[1]]})
eib['src'] = eib['src'].astype(int)
eib['dst'] = eib['dst'].astype(int)
author_edge['bill_id'] = author_edge['bill_id'].astype(int)

author_edge = author_edge.merge(eib, left_on='bill_id', right_on='src', how='inner')
author_edge['outcome'] = (author_edge['outcome'] == 1).astype(int)
author_levels = {1: 'COAUTHOR', 2: 'PRINCIPAL_COAUTHOR', 3: 'LEAD_AUTHOR'}
author_edge['author_type'] = author_edge['type'].map(author_levels)

In [66]:
a1 = aggregate_by_group(ltopic, 'C75')
a2 = a1.merge(lfund[['legislator_term', 'donations_total', 'lobbying_total', 'name', 'term', 'District No.', 'Term', 'full_name', 'chamber', 'Party']], how='left')
a2 = a2[list(set(a2.columns))]
a3 = author_edge.merge(bv_df, left_on='bill_id', right_on='bill_version', how='left').groupby('legterm_id').agg({
    'outcome': 'mean',
    'author_type': lambda x: sum(x == 'LEAD_AUTHOR'),
    'bill_version': 'nunique'
}).reset_index()
a4 = a2.merge(a3, left_on='legislator_term', right_on='legterm_id', how='left').reset_index(drop=True)
a4 = a4.loc[a4['legterm_id'].notna()]

In [67]:
top_10s = []
for _, row in a4.iterrows():
    numeric_values = row[[c for c in row.index if c in cluster_groups['C75'].drop_duplicates().values]].astype(float)
    tops = numeric_values.nlargest(10).sort_values(ascending=False).index.tolist()
    top_10s.append(tops)
    top = pd.DataFrame(np.stack(top_10s))

In [68]:
top = pd.DataFrame(np.stack(top_10s))
top.columns = [f'top_{i}' for i in range(10)]
a5 = pd.concat([a3, top], axis=1)

In [69]:
a5['lid'] = a5['legterm_id'].astype(int).map(leg_ids)
a5['legislator'] = a5['lid'].apply(leg_term_to_name)
a5['term'] = a5['lid'].apply(lambda x: x.split('_')[1] if x else None).apply(lambda x: x.split('-')[0] if x else None).astype(int)
a5 = a5.merge(a4[['donations_total', 'lobbying_total', 'legterm_id', 'Party', 'chamber']], on='legterm_id', how='left')

In [70]:
leg_terms = a5[['outcome', 'author_type', 'bill_version', 'top_0', 'top_1', 'top_2', 'top_3', 'top_4', 'top_5', 'top_6', 'top_7', 'top_8', 'top_9', 'legislator', 'term', 'donations_total', 'lobbying_total', 'Party', 'chamber']].copy()

leg_totals = a5[['outcome', 'author_type', 'bill_version', 'top_0', 'top_1', 'top_2', 'top_3', 'top_4', 'top_5', 'top_6', 'top_7', 'top_8', 'top_9', 'legislator', 'term', 'donations_total', 'lobbying_total', 'Party', 'chamber']].copy().groupby('legislator').agg({
    'outcome': 'mean',
    'author_type': 'sum',
    'bill_version': 'sum',
    'donations_total': 'sum',
    'lobbying_total': 'sum',
    'term': 'nunique',
    'Party': 'first',
    'chamber': 'first'
}).reset_index()

In [71]:
leg_topics_top = {}

for i, g in a5[['top_0', 'top_1', 'top_2', 'top_3', 'top_4', 'top_5', 'top_6', 'top_7', 'top_8', 'top_9', 'legislator']].groupby('legislator'):
    values = g[['top_0', 'top_1', 'top_2', 'top_3', 'top_4', 'top_5', 'top_6', 'top_7', 'top_8', 'top_9']].values.flatten()
    u, c = np.unique(values, return_counts=True)
    leg_topics_top[i] = u[np.argmax(c)]

In [72]:
leg_totals['top_topic'] = leg_totals['legislator'].map(leg_topics_top)

In [409]:
leg_totals.to_csv(OUT_PATH / 'legislator_totals.csv', index=False)
leg_terms.to_csv(OUT_PATH / 'legislator_terms.csv', index=False)

In [78]:
l = (leg_terms
    .melt(
        id_vars=[col for col in leg_terms.columns if not col.startswith('top_')],
        value_vars=['top_0', 'top_1', 'top_2', 'top_3', 'top_4', 'top_5', 'top_6', 'top_7', 'top_8', 'top_9'],
        var_name='topic_rank',
        value_name='topic_name'
    )
    .assign(Party=lambda df: (df['Party'] == 'D').astype(int))
    .groupby(['topic_name', 'term'])
    .agg({'Party': 'mean'})
    .reset_index()
    .rename(columns={'Party': 'partisanship', 'topic_name': 'C75'})
)

In [73]:
t = topics.merge(cluster_groups[['label', 'C150', 'C75']], on=['label', 'C150'])[['C150', 'term', 'outcome', 'donations_term', 'lobbying_term', 'vote_signal', 'bill_id_y', 'longevity_y', 'C75']].rename(columns={'longevity_y': 'longevity', 'bill_id_y': 'bill_id'})
t['longevity'] = t['longevity'].dt.days

In [80]:
t = t.merge(l, on=['term', 'C75'], how='left')

In [81]:
t.to_csv(OUT_PATH / 'topics_aggregated.csv', index=False)

In [85]:
author_edge['bill'] = data['bill'].n_id[author_edge['dst'].values]

In [86]:
bi = vdf.groupby('bill_id').agg({
    'vote_signal': 'mean',
    'topic_cluster': 'first',
    'C150': 'first',
    'C75': 'first',
    'term': 'first'
}).reset_index().merge(bill_dates.merge(bills[['bill_id', 'outcome']], on='bill_id', how='left').drop_duplicates(), on='bill_id', how='left')

In [87]:
authors = author_edge.groupby('bill').agg({'date': 'max'}).reset_index().merge(author_edge, on=['bill', 'date'], how='inner').groupby('bill').agg({
    'legterm_id': lambda x: ', '.join(x.astype(str).unique())}).reset_index()

In [88]:
def terms_to_names(terms):
    term_names = []
    for t in terms.split(', '):
        l = leg_ids.get(int(t.strip()), None)
        if l is not None:
            term_names.append(leg_term_to_name(l))
    return ', '.join([n for n in term_names if n is not None])

In [89]:
authors['authors'] = authors['legterm_id'].apply(terms_to_names)

In [90]:
b = bi.merge(authors, left_on='bill_id', right_on='bill', how='left')

In [106]:
b['longevity'] = b['longevity'].dt.days

In [107]:
pq.write_table(pa.Table.from_pandas(b[['bill_id', 'vote_signal', 'topic_cluster', 'C150', 'C75', 'term', 'outcome', 'longevity', 'authors', 'First_action']]), OUT_PATH / 'bills.parquet')

In [108]:
leg_terms

Unnamed: 0,outcome,author_type,bill_version,top_0,top_1,top_2,top_3,top_4,top_5,top_6,top_7,top_8,top_9,legislator,term,donations_total,lobbying_total,Party,chamber
0,0.000000,182,1809,Education Regulations,Community College and University Costs,Resource Management and Oversight,State Employee Benefits,"Health care enrollment, cost sharing, and cove...",Peace Officers and Law Enforcement,Emergency Medical Services,Consumer Privacy and Protections,Alternative and Supportive Education Programs,Sexual Assault and Domestic Violence Programs,Elaine Alquist,2001,3.297494e+05,2065.010498,D,assembly
1,0.719219,217,666,Sexual Assault and Domestic Violence Programs,Education Regulations,"Health care enrollment, cost sharing, and cove...",Elections and Voting,Resource Management and Oversight,Criminal Justice System,Emergency Medical Services,Early Childhood Education and Developmental Se...,Communicable disease and bloodborne infections,Vehicle Ownership and Traffic Violations,Elaine Alquist,2005,0.000000e+00,519.170044,D,senate
2,0.825160,194,469,"Health care enrollment, cost sharing, and cove...",Labor and Unemployment,Elderly and Dependent Adults,Early Childhood Education and Developmental Se...,Education Regulations,Resource Management and Oversight,Communicable disease and bloodborne infections,Criminal Justice System,Coastal Resource Management,Sexual Assault and Domestic Violence Programs,Elaine Alquist,2007,0.000000e+00,2699.600098,D,senate
3,0.852761,227,489,"Health care enrollment, cost sharing, and cove...",Medi-Cal Eligibility and Costs,Resource Management and Oversight,Education Regulations,Water Resources and Safety,Consumer Privacy and Protections,Cancer Screening and Research,Alternative and Supportive Education Programs,Community College and University Costs,Finances of Local Agencies,Elaine Alquist,2009,0.000000e+00,1002.379761,D,senate
4,0.000000,243,1470,"Health care enrollment, cost sharing, and cove...",State Employee Benefits,Mental Health Services,Resource Management and Oversight,Peace Officers and Law Enforcement,Education Regulations,Climate Standards for State Projects,Labor and Unemployment,Community College and University Costs,Coastal Resource Management,Dion Aroner,2001,0.000000e+00,1486.240112,D,assembly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1279,0.000000,0,1,State Budget,Emergency Medical Services,Telecommunications Infrastructure,CalWORKs and CalFresh,Consumer Privacy and Protections,Telehealth services,Personal Income Tax Exemptions,Animal Safety,Physical Education,Body Modification,Jerry McNerney,2025,0.000000e+00,0.000000,D,senate
1280,0.714286,3,7,State Budget,Emergency Medical Services,Telecommunications Infrastructure,CalWORKs and CalFresh,Consumer Privacy and Protections,Telehealth services,Personal Income Tax Exemptions,Animal Safety,Physical Education,Body Modification,Laura Richardson,2025,7.097700e+03,0.000000,D,senate
1281,0.666667,0,3,Emergency Medical Services,Telecommunications Infrastructure,CalWORKs and CalFresh,Consumer Privacy and Protections,Telehealth services,Personal Income Tax Exemptions,Animal Safety,Physical Education,Body Modification,Transportation Regulations,Suzette Martinez Valladares,2025,2.513790e+06,0.000000,R,senate
1282,0.000000,1,2,State Budget,Emergency Medical Services,Telecommunications Infrastructure,CalWORKs and CalFresh,Consumer Privacy and Protections,Telehealth services,Personal Income Tax Exemptions,Animal Safety,Physical Education,Body Modification,Dr Aisha Wahab,2025,0.000000e+00,0.000000,D,senate


In [109]:
bi

Unnamed: 0,bill_id,vote_signal,topic_cluster,C150,C75,term,First_action,Last_action,longevity,outcome
0,200120020AB1,0.981308,,,,1999,2000-12-04,2001-09-06,276 days,0
1,200120020AB10,0.409091,,,,1999,2000-12-04,2001-03-19,105 days,0
2,200120020AB1000,0.936709,152.0,Public Contracts,Public Contracts,2001,2001-02-23,2002-08-30,553 days,0
3,200120020AB1003,0.965217,,,,2001,2001-02-23,2001-07-21,148 days,0
4,200120020AB1004,1.000000,,,,2001,2001-02-23,2001-09-12,201 days,0
...,...,...,...,...,...,...,...,...,...,...
28243,202520261ACR1,0.642202,,,,2025,2025-01-29,2025-02-06,8 days,1
28244,202520261SB1,0.552147,,,,2023,2024-12-02,2025-02-07,67 days,1
28245,202520261SB2,0.480712,84.0,State Budget,State Budget,2025,2025-01-08,2025-02-07,30 days,1
28246,202520261SB3,0.972727,,,,2025,2025-01-20,2025-01-23,3 days,0
