In [3]:
import gc, datetime, json, pickle, re
from pathlib import Path
from torch_geometric.transforms import ToUndirected, RemoveIsolatedNodes
from pathlib import Path
import torch, numpy as np, pandas as pd
import pyarrow as pa, pyarrow.parquet as pq

# Serialized graph file; same location as the default `path` of load_and_preprocess_data.
DATA_PATH = Path("../../../data3.pt")
# Directory that receives the exported CSV / parquet / GeoJSON files.
EXPORT_PATH = Path("../../shiny/data")
OUT_PATH = EXPORT_PATH  # alias; the export cells below write through OUT_PATH
# Device handed to torch.load(map_location=...) when the saved tensors are read.
DEVICE = torch.device("cpu")
# Float dtype used for tensors allocated directly in the notebook cells.
FLOAT = torch.float32

In [2]:
# Load the legislator table and the raw lobbying / expenditure extracts.
# NOTE(review): legislator_terms.csv is read from OUT_PATH here but is also written by a
# later cell of this notebook, so this cell depends on the output of a previous run.
politicians = pd.read_csv(OUT_PATH / 'legislator_terms.csv')
lobbying = pd.read_csv('../../../calaccess/lobbying_clean2.csv', dtype={'PAYEE_NAMS': str, 'BAKREF_TID': str})
expend_assembly = pd.read_csv('../../../calaccess/expend_assembly_matched.csv', dtype={'TargetPropositionName': str})
expend_senate = pd.read_csv('../../../calaccess/expend_senate_matched.csv', dtype={'TargetPropositionName': str})
# Parse the expenditure date and start from its calendar year as the `term`.
lobbying['expn_date'] = pd.to_datetime(lobbying['EXPN_DATE'])
lobbying['term'] = lobbying['expn_date'].dt.year.astype(int)
# Rows dated after 2025 are overwritten with hand-entered years.
# NOTE(review): the recorded run of this cell stopped here with "ValueError: Must have
# equal len keys and value when setting with an iterable" — the 7-element list apparently
# does not match the number of selected rows; confirm the intended corrections.
lobbying.loc[lobbying['expn_date'].dt.year > 2025, 'term'] = [2022, 2014, 2018, 2018, 2018, 2018, 2018]

ValueError: Must have equal len keys and value when setting with an iterable

In [None]:
# Fold even calendar years onto the neighbouring odd year: months before November move
# back one year, November and December move forward one. The second statement re-reads
# `term`, so it only sees the even years that the first statement left untouched.
lobbying.loc[
    lobbying['term'].isin(list(range(2000, 2026, 2))) & (lobbying['expn_date'].dt.month < 11),
    'term',
] = lobbying['term'] - 1
lobbying.loc[
    lobbying['term'].isin(list(range(2000, 2026, 2))) & (lobbying['expn_date'].dt.month >= 11),
    'term',
] = lobbying['term'] + 1

In [98]:
lob = lobbying.groupby(['clean_beneficiary', 'term']).agg({'AMOUNT': 'sum'}).reset_index()

In [99]:
# Keep only rows whose `term_y` is a string (other values such as NaN cannot be parsed).
# `.copy()` makes the result an independent frame so the column assignment below does not
# write into a slice of the original.
expend_assembly = expend_assembly.loc[expend_assembly['term_y'].apply(lambda x: isinstance(x, str))].copy()
# First component of the dash-separated term string, as an integer year.
expend_assembly['year'] = expend_assembly['term_y'].apply(lambda x: int(str(x).split('-')[0]))
# Shift even years back to the preceding odd year. Parity has to be tested with `%`:
# the previous `// 2 == 0` was only true for the years 0 and 1, so the shift never ran.
expend_assembly.loc[expend_assembly['year'] % 2 == 0, 'year'] -= 1
# Total amount per (full_name, term).
# NOTE(review): exact duplicate (Amount, year, full_name) rows are dropped before summing,
# which the Senate aggregation does not do — confirm this is intended de-duplication.
exp_as = expend_assembly[['Amount', 'year', 'full_name']].drop_duplicates().groupby(['full_name', 'year']).agg({'Amount': 'sum'}).reset_index().rename(columns={'year': 'term'})

In [100]:
# First component of the dash-separated `term` string, as an integer year.
expend_senate['year'] = expend_senate['term'].apply(lambda x: int(x.split('-')[0]))
# Shift even years back to the preceding odd year. Parity has to be tested with `%`:
# the previous `// 2 == 0` was only true for the years 0 and 1, so the shift never ran.
expend_senate.loc[expend_senate['year'] % 2 == 0, 'year'] -= 1
# Total amount per (full_name, term).
exp_sen = expend_senate.groupby(['full_name', 'year']).agg({'Amount': 'sum'}).reset_index().rename(columns={"year": 'term'})

In [101]:
# Lower-cased name used as the join key against `clean_beneficiary`.
politicians['lower'] = politicians['full_name'].str.lower()
# Left-join the lobbying totals per (term, name). The trailing underscore keeps the new
# column apart from a pre-existing `total_lobbying` column, which a later cell drops.
pl = politicians.merge(lob, left_on=['term', 'lower'], right_on=['term', 'clean_beneficiary'], how='left').rename(columns={'AMOUNT': 'total_lobbying_'})
# Left-join the Assembly expenditure totals per (term, full_name).
pld = pl.merge(exp_as, on=['term', 'full_name'], how='left').rename(columns={'Amount': 'total_donations_'})

In [102]:
# Left-join the Senate expenditure totals; the incoming column is named `Amount`.
pldd = pld.merge(exp_sen, on=['full_name', 'term'], how='left')
# Row-wise sum of the Assembly and Senate amounts; with skipna=True a row where both are
# missing sums to 0 rather than NaN.
pldd['total_donations_'] = pldd[['total_donations_', 'Amount']].sum(skipna=True, axis=1)
# Drop the pre-existing totals and the helper column, then give the freshly computed
# columns their final names.
pldd = pldd.drop(columns=['total_donations', 'total_lobbying', 'Amount', 'total_received']).rename(columns={'total_donations_': 'total_donations', 'total_lobbying_': 'total_lobbying'})

In [103]:
# Total received = donations + lobbying, with a missing component counted as 0.
# A plain `+` yields NaN whenever `total_lobbying` is NaN (no lobbying row matched in the
# left merge); the following fillna(0) would then report 0 received even for rows that
# do have donations.
pldd['total_received'] = pldd['total_donations'].fillna(0) + pldd['total_lobbying'].fillna(0)

In [105]:
pldd[['total_donations', 'total_lobbying', 'total_received']] = pldd[['total_donations', 'total_lobbying', 'total_received']].fillna(0)

  pldd[['total_donations', 'total_lobbying', 'total_received']] = pldd[['total_donations', 'total_lobbying', 'total_received']].fillna(0)


In [106]:
pldd.to_csv(OUT_PATH / 'politicians2.csv', index=False)

TODO:
- identify and map topic labels to all dfs
- change column names to fit app
- add more data to the app

In [4]:
# Node-id lookup tables, keyed by node type.
node_id_map = json.loads(Path('../../../node_id_map.json').read_text())

# Topic-cluster label for each bill id.
topic_cluster_labels_dict = json.loads(Path('../../../bill_labels_updated.json').read_text())

In [5]:
# Directory (two levels up) that holds the saved embeddings and predictions.
OUT_DIR = Path("../..")

# Load both saved objects onto DEVICE.
embeddings = torch.load(OUT_DIR / "node_embeddings.pt", map_location=DEVICE)
preds = torch.load(OUT_DIR / "predictions.pt", map_location=DEVICE)


# Despite the name, `bill_logits` holds softmax-normalised values over the last dimension.
bill_logits = preds["bill_logits"].softmax(-1)
# Sigmoid of the saved success logit.
bill_success_p = preds["success_logit"].sigmoid()
# Indexed by node type in later cells, e.g. actor_align["legislator"].
actor_align = preds["actor_align"]
actor_influence = preds["actor_influence"]
# Number of topic columns = size of the second dimension of the bill topic tensor.
K_TOPICS = bill_logits.size(1)

In [6]:
def safe_normalize_timestamps(timestamps, eps=1e-8):
    """Rescale timestamps to [0, 1] between their 5th and 95th percentiles.

    NaN becomes 0 and +/-inf become +/-1e4 before the percentiles are taken. Values
    outside the percentile band are clamped to it. If the band is narrower than
    ``eps`` an all-zero tensor of the same shape is returned.
    """
    cleaned = torch.nan_to_num(timestamps, nan=0.0, posinf=1e4, neginf=-1e4)
    lower = torch.quantile(cleaned, 0.05)
    upper = torch.quantile(cleaned, 0.95)

    # Degenerate spread: nothing meaningful to scale.
    if (upper - lower) < eps:
        return torch.zeros_like(cleaned)

    clipped = torch.clamp(cleaned, lower, upper)
    scaled = (clipped - lower) / (upper - lower)
    return torch.nan_to_num(scaled, nan=0.0)

def safe_standardize_time_format(time_data):
    """Convert a sequence of mixed time values into a float32 tensor.

    Per element:
      * ``datetime.datetime``                -> its POSIX timestamp
      * int/float in [1900, 2100]            -> timestamp of June 15 of that year
      * str (or float) strictly in (1900, 2100) -> timestamp of June 15 of that year
      * value in (0, 1990)                   -> kept as is (as a float)
      * value above 17,000,000               -> kept as is (as a float)
      * anything else numeric                -> value * 1e9
      * anything that cannot be converted    -> timestamp of 2000-06-15

    Args:
        time_data: Iterable of numbers, numeric strings or datetimes.

    Returns:
        1-D ``torch.float32`` tensor with one entry per input element.
    """
    fallback = datetime.datetime(2000, 6, 15).timestamp()
    times = []
    for t in time_data:
        try:
            # Checked first: float(datetime) raises TypeError, so when this test sat
            # after the float() comparisons it could never be reached and datetimes
            # silently became the 2000-06-15 fallback.
            if isinstance(t, datetime.datetime):
                td = t.timestamp()
            elif isinstance(t, (int, float)) and 1900 <= t <= 2100:
                td = datetime.datetime(int(t), 6, 15).timestamp()
            elif isinstance(t, (str, float)) and 1900 < float(t) < 2100:
                td = datetime.datetime(int(float(t)), 6, 15).timestamp()
            elif 0 < float(t) < 1990:
                # Always store a float so a numeric string cannot leak into the list
                # handed to torch.tensor.
                td = float(t)
            elif float(t) > 17000000.0:
                td = float(t)
            else:
                td = float(t) * 1e9
        except (TypeError, ValueError, OverflowError, OSError):
            # Unparseable input (None, text, out-of-range dates): use the fixed default.
            td = fallback
        times.append(td)
    return torch.tensor(times, dtype=torch.float32)

def pull_timestamps(data):
    """Move the last feature column of selected stores into ``timestamp`` / ``time``.

    For every listed edge type (via ``edge_attr``) and node type (via ``x``) whose
    tensor is 2-D with more than one column, the last column is treated as a
    timestamp: it is optionally converted with ``safe_standardize_time_format``,
    rescaled with ``safe_normalize_timestamps``, stored as ``.timestamp`` and
    ``.time``, and removed from the feature tensor.

    ``data`` is modified in place and also returned.
    """
    timestamp_edges = [
        ('donor', 'donated_to', 'legislator_term'),
        ('legislator_term', 'rev_donated_to', 'donor'),
        ('lobby_firm', 'lobbied', 'legislator_term'),
        ('lobby_firm', 'lobbied', 'committee'),
        ('committee', 'rev_lobbied', 'lobby_firm'),
        ('legislator_term', 'rev_lobbied', 'lobby_firm'),
        ('bill_version', 'rev_voted_on', 'legislator_term'),
        ('legislator_term', 'voted_on', 'bill_version'),
    ]
    timestamp_nodes = ['legislator_term', 'bill_version', 'bill']

    for et in timestamp_edges:
        if hasattr(data[et], 'edge_attr') and data[et].edge_attr is not None and len(data[et].edge_attr.size()) > 1:
            if data[et].edge_attr.size(1) > 1:
                edge_attr = data[et].edge_attr
                ts_col = edge_attr[:, -1]
                # Only columns with a magnitude above 1e8 or with negative entries go
                # through the format conversion.
                if ts_col.abs().max() > 1e8 or ts_col.min() < 0:
                    ts_col = safe_standardize_time_format(ts_col.tolist()).to(edge_attr.device)
                data[et].timestamp = safe_normalize_timestamps(ts_col)
                data[et].time = data[et].timestamp
                # The timestamp column is no longer part of the edge features.
                data[et].edge_attr = edge_attr[:, :-1]

    for nt in timestamp_nodes:
        if hasattr(data[nt], 'x') and data[nt].x is not None:
            try:
                if len(data[nt].x.size()) > 1:
                    if data[nt].x.size(1) > 1:
                        x = data[nt].x
                        ts_col = x[:, -1]
                        if ts_col.abs().max() > 1e8 or ts_col.min() < 0:
                            ts_col = safe_standardize_time_format(ts_col.tolist()).to(x.device)
                        # `nt` always comes from `timestamp_nodes`, so this condition
                        # is always true.
                        if nt in timestamp_nodes or ts_col.abs().max() > 1e6:
                            data[nt].timestamp = safe_normalize_timestamps(ts_col)
                            data[nt].time = data[nt].timestamp
                            data[nt].x = x[:, :-1]
            # NOTE(review): any error for this node type is silently discarded, leaving
            # the store without `timestamp` / `time`.
            except:
                pass
    return data
def clean_features(data):
    """Standardise every node type's feature matrix, then extract timestamps.

    For each node type, ``x`` is cast to float32, NaN/inf are replaced, and each column
    is z-scored (std floored at 1e-5) and clipped to [-10, 10]. The column means and
    stds used are saved as ``x_mean`` / ``x_std``. ``pull_timestamps`` runs afterwards
    on the standardised features. ``data`` is modified in place and returned.
    """
    for node_type in data.node_types:
        feats = torch.as_tensor(data[node_type].x, dtype=torch.float32)
        feats = torch.nan_to_num(feats.float(), nan=0.0, posinf=1e4, neginf=-1e4)
        col_mean = feats.mean(0, keepdim=True)
        col_std = feats.std(0, keepdim=True).clamp(min=1e-5)
        data[node_type].x = ((feats - col_mean) / col_std).clamp(-10, 10)
        data[node_type].x_mean = col_mean
        data[node_type].x_std = col_std
    return pull_timestamps(data)

def compute_controversiality(data):
    """Attach a per-bill-version controversy score in [0, 1] to ``data``.

    Uses the ('legislator_term', 'voted_on', 'bill_version') edges: the first edge
    attribute column is the vote signal, counted as "yes" when > 0 and "no" when < 0.
    The score is ``4 * yes_ratio * no_ratio`` (largest for an even split) and is
    stored as ``data['bill_version'].controversy``.

    Raises:
        ValueError: if the vote edge type is absent from ``data.edge_index_dict``.
    """
    vote_edges = ('legislator_term', 'voted_on', 'bill_version')
    if vote_edges not in data.edge_index_dict:
        raise ValueError("Missing 'voted_on' edges in data.")

    store = data[vote_edges]
    bill_idx = store.edge_index[1]
    signal = store.edge_attr[:, 0]
    n_versions = data['bill_version'].num_nodes

    # Per-bill-version tallies, accumulated on the same device as the edge index.
    ayes = torch.zeros(n_versions, device=bill_idx.device)
    nays = torch.zeros(n_versions, device=bill_idx.device)
    ayes.index_add_(0, bill_idx, (signal > 0).float())
    nays.index_add_(0, bill_idx, (signal < 0).float())

    # The small constant avoids division by zero for versions without votes.
    denom = ayes + nays + 1e-6
    score = 4 * (ayes / denom) * (nays / denom)
    data['bill_version'].controversy = score.clamp(0, 1)

    return data

def load_and_preprocess_data(path='../../../data3.pt'):
    """Load the saved graph and run the notebook's preprocessing pipeline.

    Steps, in order: flatten node features to 2-D; raise ``num_nodes`` wherever an
    edge index refers past it; set negative bill labels to 0; apply
    ``ToUndirected(merge=False)`` and ``RemoveIsolatedNodes``; run ``clean_features``
    and ``compute_controversiality``; attach a ``node_id`` range to every node type;
    cast float64 tensors to float32.

    Args:
        path: Location of the saved graph object.

    Returns:
        The preprocessed graph object.
    """
    # weights_only=False unpickles arbitrary Python objects — only load trusted files.
    full_data = torch.load(path, weights_only=False)
    for nt in full_data.node_types:
        if hasattr(full_data[nt], 'x') and full_data[nt].x is not None:
            flat = torch.as_tensor(full_data[nt].x).flatten(start_dim=1)
            full_data[nt].x = flat
            full_data[nt].num_nodes = flat.size(0)

    # Make sure every node referenced by an edge exists in its node store.
    for edge_type, edge_index in full_data.edge_index_dict.items():
        src_type, _, dst_type = edge_type
        max_src_idx = edge_index[0].max().item() if edge_index.size(1) > 0 else -1
        max_dst_idx = edge_index[1].max().item() if edge_index.size(1) > 0 else -1
        if max_src_idx >= full_data[src_type].num_nodes:
            print(f"Fixing {src_type} node count: {full_data[src_type].num_nodes} -> {max_src_idx + 1}")
            full_data[src_type].num_nodes = max_src_idx + 1

        if max_dst_idx >= full_data[dst_type].num_nodes:
            print(f"Fixing {dst_type} node count: {full_data[dst_type].num_nodes} -> {max_dst_idx + 1}")
            full_data[dst_type].num_nodes = max_dst_idx + 1
    # Negative labels are replaced by 0 before the labels are cast to float32.
    full_data['bill'].y[np.where(full_data['bill'].y < 0)[0]] = 0
    full_data['bill'].y = torch.as_tensor(full_data['bill'].y, dtype=torch.float32)

    data = ToUndirected(merge=False)(full_data)
    # Release the pre-transform graph before the remaining steps allocate more memory.
    del full_data
    gc.collect()
    data = RemoveIsolatedNodes()(data)
    data = compute_controversiality(clean_features(data))

    for nt in data.node_types:
        # Previously hard-coded to device='mps', which raises on machines without the
        # Apple MPS backend and disagrees with the notebook-wide DEVICE setting.
        data[nt].node_id = torch.arange(data[nt].num_nodes, device=DEVICE)
    for store in data.stores:
        for key, value in store.items():
            if isinstance(value, torch.Tensor) and value.dtype == torch.float64:
                store[key] = value.float()

    return data

data = load_and_preprocess_data()

In [7]:
for nt in data.node_types:
    print(f"{nt}: {data[nt].num_nodes} nodes, {data[nt].x.size(1) if hasattr(data[nt], 'x') else 0} features")

bill: 46054 nodes, 769 features
bill_version: 198160 nodes, 389 features
legislator: 509 nodes, 385 features
legislator_term: 1434 nodes, 2 features
committee: 1537 nodes, 385 features
lobby_firm: 321 nodes, 384 features
donor: 234 nodes, 384 features


In [None]:
# Stand-alone repeat of the computation in compute_controversiality, with one difference:
# "no" votes are counted with `<= 0` here, whereas the function uses `< 0`, so a zero
# vote signal counts as "no" only in this cell.
# The result is kept in local variables and is not written back onto `data`.
edge_type = ('legislator_term', 'voted_on', 'bill_version')
if edge_type not in data.edge_index_dict:
    raise ValueError("Missing 'voted_on' edges in data.")

ei = data[edge_type].edge_index
ea = data[edge_type].edge_attr

# First edge-attribute column is the vote signal; row 1 of the edge index is the target.
vote_signal = ea[:, 0]
tgt_nodes = ei[1]

num_bills = data['bill_version'].num_nodes
device = tgt_nodes.device

yes_votes = torch.zeros(num_bills, device=device)
no_votes = torch.zeros(num_bills, device=device)

yes_votes.index_add_(0, tgt_nodes, (vote_signal > 0).float())
no_votes.index_add_(0, tgt_nodes, (vote_signal <= 0).float())

# The small constant avoids division by zero for bill versions without votes.
total_votes = yes_votes + no_votes + 1e-6

yes_ratio = yes_votes / total_votes
no_ratio = no_votes / total_votes

# 4 * p * (1 - p)-style score: largest when yes and no are evenly split.
controversy = 4 * yes_ratio * no_ratio
controversy = controversy.clamp(0, 1)

tensor([1.1500e+02, 1.0000e-06, 1.0000e-06,  ..., 7.4000e+01, 1.0000e-06,
        1.0000e-06])

In [22]:
no_ratio

tensor([0.0174, 0.0000, 0.0000,  ..., 0.4459, 0.0000, 0.0000])

In [5]:
# Map each bill's `n_id` to its positional `node_id`.
key1 = data['bill'].n_id.tolist()
key2 = data['bill'].node_id.tolist()
key = dict(zip(key1, key2))

# Keep only labelled bills that are present in the graph, re-keyed by node_id.
cluster_bill = {}
nids = []
for bill_nid, lab in topic_cluster_labels_dict.items():
    if bill_nid not in key:
        continue
    node_idx = key[bill_nid]
    cluster_bill[node_idx] = lab
    nids.append(node_idx)

## Bills

In [6]:
# Pickled bill-date lookup. Opened in a context manager so the file handle is closed
# (the previous bare open(...).read() never closed it).
# NOTE: unpickling runs arbitrary code — only load files from a trusted source.
with open('../../../bill_dates_map.pkl', 'rb') as f:
    bv_ts = pickle.load(f)

In [7]:
bv_ids = {v: k for k, v in node_id_map['bill_version'].items()}

In [8]:
# First edge type that goes from 'bill_version' to 'bill' (IndexError if there is none).
v2b_edge = tuple([et for et in data.edge_types
                if et[0] == "bill_version" and et[2] == "bill"])[0]
src, dst = data[v2b_edge].edge_index.numpy()

# One row per version->bill edge: version index, the bill's `n_id`, and the version's
# external id looked up through the inverted node_id_map.
bv_df = pd.DataFrame({"bill_version": src, "bill_id": data['bill'].n_id[dst]})
bv_df['bill_version_id'] = bv_df['bill_version'].map(bv_ids)

In [9]:
bill_dates = pd.DataFrame(bv_ts).T.reset_index().rename(columns={'index': 'bill_id'})
bill_dates = bill_dates.loc[bill_dates['bill_id'].isin(bv_df['bill_id'].unique())]

In [10]:
controversy_df = pd.DataFrame({
    'controversy': data['bill_version'].controversy[bv_df['bill_version'].unique()].numpy(),
    'bill_version': bv_df['bill_version'].unique()
})

In [11]:
outcome_df = pd.DataFrame({
    'bill_id': data['bill'].n_id,
    'outcome': data['bill'].y
})

In [12]:
bills = bv_df.merge(controversy_df, on='bill_version', how='left').merge(outcome_df, on='bill_id', how='left')
bills['topic_cluster'] = bills['bill_id'].map(topic_cluster_labels_dict)

In [13]:
bill_dates['longevity'] = bill_dates['Last_action'] - bill_dates['First_action']

In [14]:
# Collapse versions to one row per bill (maximum of each column across the bill's
# versions), then attach `longevity`. The merge uses the default inner join, so bills
# without an entry in `bill_dates` are dropped.
bill_df = bills.groupby('bill_id').agg({'outcome': 'max', 'controversy': 'max', 'topic_cluster': 'max'}).merge(bill_dates[['bill_id', 'longevity']], on='bill_id')

In [15]:
with open('../../../bill_labels_updated.json', 'r') as f:
    bill_subjects = np.array(list(json.load(f).keys()))

In [16]:
with open('../../../bill_labels_updated.json', 'r') as f:
    bill_labels = json.load(f)

In [17]:
# Opened in a context manager so the file handle is closed after loading.
# NOTE: unpickling runs arbitrary code — only load files from a trusted source.
with open('../../../subjects_original.pkl', 'rb') as f:
    subject_originals = pickle.load(f)

In [18]:
with open('../../../bill_subjects.json', 'r') as f:
    bill_subjects_dict = json.load(f)

In [19]:
so = {k: subject_originals[v] for k, v in bill_subjects_dict.items() if v in subject_originals}

## Topics

In [20]:
topics = bill_df.loc[bill_df['topic_cluster'].notna()].copy()
topics['term'] = topics['bill_id'].apply(lambda x: x[:4]).astype(int)

In [21]:
# Per (term, topic_cluster): share of bills with outcome == 1, mean controversy over the
# strictly positive values only (NaN when a group has none), number of distinct bills,
# and mean longevity.
topics_df = topics.groupby(['term', 'topic_cluster']).agg({'outcome': lambda x: len(x.loc[x == 1]) / len(x), 'controversy': lambda x: np.mean(x.loc[x > 0]), 'bill_id': 'nunique', 'longevity': 'mean'}).reset_index()

In [22]:
bv_topics = bills[['bill_version', 'topic_cluster']].loc[bills['topic_cluster'].notna()].drop_duplicates().set_index('bill_version').to_dict()

In [23]:
# Topic cluster per bill-version index; -1 marks versions without a cluster.
bv_cluster = torch.full(
    (data["bill_version"].num_nodes,),
    -1, dtype=torch.long)

for bv_id, topic_id in bv_topics['topic_cluster'].items():
    bv_cluster[bv_id] = int(topic_id)

In [24]:
# Edge index linking each legislator (row 0) to its legislator_term nodes (row 1).
lt_to_leg = data.edge_index_dict[('legislator', 'samePerson', 'legislator_term')]
leg_of_lt, lt_idx = lt_to_leg

leg_align = actor_align["legislator"]
# NOTE(review): allocated as all zeros; no statement in the cells shown here writes into
# it before it is exported through legislator_term_topic_df / lt_df — verify that the
# cell which filled it has not been deleted.
leg_topic_prob = torch.zeros(
    data['legislator_term'].num_nodes, K_TOPICS, dtype=FLOAT
)

In [25]:
def topic_cols(K):
    """Return the column names ``topic_0`` ... ``topic_{K-1}`` as a list."""
    names = []
    for k in range(K):
        names.append("topic_" + str(k))
    return names

In [26]:
def legislator_term_topic_df(id_name_map):
    """Build the per-legislator_term topic / influence table.

    Reads the notebook globals ``actor_align``, ``actor_influence``, ``data``,
    ``leg_topic_prob``, ``lt_idx`` and ``leg_of_lt``.

    Args:
        id_name_map: Mapping applied to the row position (0..n-1) to fill ``name``.

    Returns:
        ``(df, influence)`` — the DataFrame with ``topic_k`` columns, ``influence``,
        ``legislator_term`` and ``name``, plus the influence values as a NumPy array.
    """
    K = actor_align["legislator"].size(1)
    leg_infl = actor_influence["legislator"]

    # Give every legislator_term the influence of the legislator it is linked to.
    infl_term = torch.zeros(data['legislator_term'].num_nodes, dtype=FLOAT)
    infl_term.index_copy_(0, lt_idx, leg_infl[leg_of_lt])

    # NOTE(review): the topic values come straight from `leg_topic_prob`; confirm it
    # has been populated before this is called.
    df = pd.DataFrame(
        leg_topic_prob.numpy(),
        columns=[f"topic_{k}" for k in range(K)]
    )
    df["influence"] = infl_term.numpy()
    df["legislator_term"] = range(len(df))
    df["name"] = df["legislator_term"].map(id_name_map)
    return df, infl_term.numpy()

leg_topics_df, infl_term = legislator_term_topic_df(node_id_map["legislator_term"])

In [27]:
def actor_topic_df(nt):
    """Return a DataFrame of topic values and influence for node type ``nt``.

    Columns: ``topic_0..topic_{K_TOPICS-1}`` from ``actor_align[nt]``, a column named
    after ``nt`` holding the row position, ``name`` from ``data[nt].n_id`` and
    ``influence`` from ``actor_influence[nt]``.
    """
    prob = actor_align[nt]
    infl = actor_influence[nt]
    df = pd.DataFrame(prob.numpy(), columns=topic_cols(K_TOPICS))
    df[nt] = np.arange(len(df))
    df["name"] = data[nt].n_id
    df["influence"] = infl.numpy()
    return df

donor_df = actor_topic_df("donor")
lobby_df = actor_topic_df("lobby_firm")
comm_df = actor_topic_df("committee")
leg_df = actor_topic_df("legislator")
# legislator_term table, built from `leg_topic_prob` and the per-term influence array.
lt_df = pd.DataFrame(leg_topic_prob.numpy(), columns=topic_cols(K_TOPICS))
lt_df["influence"] = infl_term

In [28]:
N_LEG = data['legislator'].num_nodes
N_LT = data['legislator_term'].num_nodes
N_BILL = data['bill'].num_nodes
N_COMM = data['committee'].num_nodes
N_LOB = data['lobby_firm'].num_nodes
N_DON = data['donor'].num_nodes

In [29]:
# Total outgoing amount per donor: sum of |edge_attr[:, 0]| over its donation edges.
# presumably column 0 of edge_attr is the monetary amount — verify against the graph build.
src_don, _ = data.edge_index_dict[('donor','donated_to','legislator_term')]
don_out = torch.zeros(N_DON)
don_out.index_add_(0, src_don,
                   data[('donor','donated_to','legislator_term')].edge_attr[:,0].abs())
donor_df["total_spent"] = don_out.numpy()

src_lo1, _ = data.edge_index_dict[('lobby_firm','lobbied','legislator_term')]
src_lo2, _ = data.edge_index_dict[('lobby_firm','lobbied','committee')]

# Total outgoing amount per lobby firm, accumulated over both of its edge types.
lob_out = torch.zeros(N_LOB)
for src_lo, et in [(src_lo1, ('lobby_firm','lobbied','legislator_term')),
                   (src_lo2, ('lobby_firm','lobbied','committee'))]:
    lob_out.index_add_(0, src_lo,
        data[et].edge_attr[:,0].abs())
lobby_df["total_spent"] = lob_out.numpy()

In [30]:
don_src, don_dst = data.edge_index_dict[('donor','donated_to','legislator_term')]
lob_src1, lob_dst1 = data.edge_index_dict[('lobby_firm','lobbied','legislator_term')]

# Incoming amount per legislator_term: donations first (snapshotted as `donations`),
# then lobbying added on top.
lt_in = torch.zeros(N_LT)
lt_in.index_add_(0, don_dst,
    data[('donor','donated_to','legislator_term')].edge_attr[:,0].abs())
donations = lt_in.clone()
lt_in.index_add_(0, lob_dst1,
    data[('lobby_firm','lobbied','legislator_term')].edge_attr[:,0].abs())

# Roll the per-term amounts up to the legislator through the samePerson edges.
leg_in = torch.zeros(N_LEG)
leg_in.index_add_(0, leg_of_lt, lt_in[lt_idx])
don_in = torch.zeros(N_LEG)
don_in.index_add_(0, leg_of_lt, donations[lt_idx])
leg_df["total_received"] = leg_in.numpy()
leg_df["total_donations"] = don_in.numpy()
# Lobbying is recovered as the difference of the two totals.
leg_df['total_lobbying'] = leg_df['total_received'] - leg_df['total_donations']

# Incoming lobbying amount per committee.
_, com_dst = data.edge_index_dict[('lobby_firm','lobbied','committee')]
com_in = torch.zeros(N_COMM)
com_in.index_add_(0, com_dst,
    data[('lobby_firm','lobbied','committee')].edge_attr[:,0].abs())
comm_df["total_received"] = com_in.numpy()

In [31]:
def edge_year(ts_tensor):
    """Return the calendar year (int16) of each POSIX-seconds value in ``ts_tensor``."""
    seconds = ts_tensor.cpu().numpy()
    stamps = pd.to_datetime(seconds, unit="s")
    return stamps.year.astype(np.int16)

def money_by_topic(edge_key, src_df):
    """Sum edge amounts per topic, weighted by the source's topic value and influence.

    Args:
        edge_key: Edge type whose ``edge_index`` and ``edge_attr`` are read from the
            notebook-global ``data``.
        src_df: Table for the edge's source node type; must contain the ``topic_k``
            columns and ``influence``, with rows in node-index order.

    Returns:
        NumPy array of length ``K_TOPICS``.
    """
    # `dst_idx` is unpacked but not used below.
    src_idx, dst_idx = data.edge_index_dict[edge_key]
    dollars  = data[edge_key].edge_attr[:,0].abs().cpu()

    prob_src = torch.from_numpy(src_df[topic_cols(K_TOPICS)].to_numpy())
    infl_src = torch.from_numpy(src_df["influence"].to_numpy())

    # Per-edge weight for every topic: source topic value times source influence.
    w = prob_src[src_idx] * infl_src[src_idx,None]
    topic_dollars = torch.zeros(K_TOPICS)
    # The flattened (edge, topic) products line up with 0..K-1 repeated once per edge,
    # so this accumulates each topic's column total.
    topic_dollars.index_add_(0, torch.arange(K_TOPICS).repeat(len(w)),
                             (w*dollars[:,None]).flatten())
    return topic_dollars.numpy()


In [32]:
sl = pd.read_csv('../../../sampled_labels - sampled_labels.csv')
big_labels = {row['cluster']: row['Label'] for _, row in sl.iterrows()}

In [33]:
def add_top_n(df, n=5):
    """Add a ``top_topics`` column listing the labels of each row's ``n`` largest topics.

    ``df`` is modified in place and also returned. Labels are looked up in the
    notebook-global ``big_labels``; an id missing from it raises ``KeyError``.
    """
    top = (
        df[topic_cols(K_TOPICS)]
          # Column names of the n largest values, with the 6-character "topic_" prefix removed.
          .apply(lambda r: r.nlargest(n).index.str[6:], axis=1)
    )
    topics = top.apply(lambda r: [big_labels[int(t)] for t in r])
    df['top_topics'] = topics
    return df

In [34]:
for _df in (donor_df, lobby_df, comm_df, leg_df):
    add_top_n(_df)

In [35]:
# Opened in a context manager so the file handle is closed after loading.
# NOTE: unpickling runs arbitrary code — only load files from a trusted source.
with open('../../../legislators.pkl', 'rb') as f:
    legislators = pickle.load(f)

# Inverse of node_id_map['legislator_term']: node index -> legislator-term id string.
leg_ids = {v: k for k, v in node_id_map['legislator_term'].items()}

def leg_term_to_name(leg_term_id):
    """Look up the legislator for an id string whose part before the first '_' is numeric.

    Returns ``None`` for non-string input or when the number is not in ``legislators``.
    """
    if not isinstance(leg_term_id, str):
        return None
    prefix = leg_term_id.split('_')[0]
    return legislators.get(int(prefix), None)

def leg_term_to_term(leg_term_id):
    """Return the first year of the term encoded after the first '_' of the id string.

    Returns ``None`` for non-string input or when the part after '_' is empty.
    """
    if not isinstance(leg_term_id, str):
        return None
    span = leg_term_id.split('_')[1]
    if not span:
        return None
    return int(span.split('-')[0])

# Resolve each row's `name` to a legislator name and term start year via `leg_ids`.
# NOTE(review): `leg_df` is built from the 'legislator' node type, while `leg_ids` is the
# inverse of the 'legislator_term' id map — confirm these two id spaces really match.
leg_df['legislator'] = leg_df['name'].astype(int).map(leg_ids).apply(leg_term_to_name)
leg_df['term'] = leg_df['name'].astype(int).map(leg_ids).apply(leg_term_to_term)

In [107]:
politicians = pd.read_csv('../../../ca_leg/legislation_data/politicians.csv')

In [108]:
# Distinct (full_name, Term) pairs that have no district number.
fix = politicians.loc[politicians['District No.'].isna(), ['full_name', 'Term']].drop_duplicates()
# Hand-entered district numbers, matched to the rows above purely by position.
# NOTE(review): this breaks (length mismatch) or silently mis-assigns if the input file
# or its row order changes — the list must have exactly one entry per row of `fix`.
fix['District No.'] = [51, 58, 8, 58, 58, 29, 39, 48, 43, 48, 10, 43, 48, 48, 6]

In [109]:
for i, row in fix.iterrows():
    politicians.loc[(politicians['full_name'] == row['full_name']) & (politicians['Term'] == row['Term']), 'District No.'] = row['District No.']

In [110]:
pol = politicians[['District No.', 'Term', 'full_name', 'chamber', 'Party']].drop_duplicates()
pol['term'] = pol['Term'].apply(lambda x: x.split('-')[0]).astype(int)

In [113]:
lfund = pol.merge(pldd, on=['full_name', 'term'], how='left')

In [115]:
import geopandas as gpd
import tempfile, zipfile, pathlib

In [116]:
def read_zip(zip_path, crs=3857):
    """Read the first shapefile found in a zip archive and return it in EPSG:3857.

    Args:
        zip_path: Path of the zip archive.
        crs: EPSG code declared for the shapefile before reprojecting.

    Returns:
        ``(gdf, tmp)`` — the GeoDataFrame and the ``TemporaryDirectory`` the archive
        was extracted into; the directory is returned so the caller controls its lifetime.
    """
    tmp = tempfile.TemporaryDirectory()
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(tmp.name)
    # StopIteration is raised here if the archive contains no .shp file.
    shp = next(pathlib.Path(tmp.name).rglob("*.shp"))
    # NOTE(review): set_crs only labels the data with `crs`; per the GeoPandas docs it
    # raises if the file already declares a different CRS — confirm the codes passed in.
    gdf = gpd.read_file(shp).set_crs(epsg=crs)
    gdf = gdf.to_crs(epsg=3857)
    return gdf, tmp

def district_cycle(year):
    """Return the district-map label for ``year``: "2001", "2011" or "current"."""
    # (last year covered, label), checked in ascending order.
    for last_year, label in ((2012, "2001"), (2022, "2011")):
        if year <= last_year:
            return label
    return "current"

# County polygons; the temporary-directory handle returned by read_zip is discarded.
counties_gdf, _ = read_zip('../data/ca_counties.zip')
counties_gdf = counties_gdf[['COUNTYFP', 'NAMELSAD', 'geometry']]
# Area in the units of EPSG:3857 (the CRS read_zip reprojects to).
# NOTE(review): Web Mercator is not an equal-area projection; consider whether the area
# weights derived from this need an equal-area CRS.
counties_gdf['county_area'] = counties_gdf.geometry.area
counties_gdf['county_id'] = counties_gdf['COUNTYFP'].astype(int)

In [117]:
cgdf = counties_gdf.to_json(na='drop', to_wgs84=True)
with open(OUT_PATH / 'counties.geojson', 'w') as f:
    f.write(cgdf)

In [118]:
data_dir = pathlib.Path('../data')

asm11_zip = data_dir / '2011_assembly_state_shp.zip'
sen11_zip = data_dir / '2011_senate_state_shp.zip'
asmcur_zip = data_dir / '2021_AD_Final_shp.zip'
sencur_zip = data_dir / '2021_SD_Final_shp.zip'

# (archive, chamber, district-map cycle, EPSG code passed to read_zip)
dist_info = [
    (asm11_zip, "assembly", "2011", 4019),
    (sen11_zip, "senate",   "2011", 4019),
    (asmcur_zip, "assembly","current", 4269),
    (sencur_zip, "senate",  "current", 4269)
]

weight_records = []
# Keeps the TemporaryDirectory handles referenced while the loop runs.
tmps = []
for zp, house, cycle, crs in dist_info:
    gdf, tmp = read_zip(zp, crs)
    tmps.append(tmp)
    # The first column of each shapefile is taken to be the district id.
    # NOTE(review): relies on column order in the source files — verify per archive.
    gdf = gdf.rename(columns={gdf.columns[0]: "district_id"})[["district_id", "geometry"]]
    gdf["house"] = house
    gdf["cycle"] = cycle
    gdf["dist_area"] = gdf.geometry.area

    # One row per district/county overlap, with the overlap's own area.
    inter = gpd.overlay(gdf, counties_gdf, how="intersection")
    inter["fragment_area"] = inter.geometry.area

    weight_records.append(
        inter[["house", "cycle", "district_id", "county_id", "fragment_area", 'county_area', 'dist_area']].reset_index(drop=True)
    )

weights = pd.concat(weight_records, ignore_index=True)

In [119]:
weights['weight'] = weights['fragment_area'] / weights['county_area']

In [120]:
lfund['District No.'] = lfund['District No.'].astype(str).apply(lambda x: re.sub(r'\s', '', x)).astype(float).astype(int)

In [141]:
from statistics import mode
from collections import defaultdict
import ast

In [199]:
# Collect, per (Term, district, chamber), every topic listed in the rows' `top_topics`
# strings, then keep the most frequent one.
term_topics = defaultdict(list)
for _, row in lfund.iterrows():
    raw_topics = row['top_topics']
    # `top_topics` is parsed as a stringified list; rows without a string (e.g. NaN)
    # have nothing to contribute.
    if not isinstance(raw_topics, str):
        continue
    try:
        parsed = list(ast.literal_eval(raw_topics))
    except (ValueError, SyntaxError, TypeError):
        # Not a parseable literal, or not iterable: skip this row only. Other errors
        # (such as a missing column) are no longer hidden by a bare `except`.
        continue
    for t in parsed:
        term_topics[(row['Term'], row['District No.'], row['chamber_x'])].append(t)
term_topics_ = {k: mode(v) for k, v in term_topics.items()}

In [193]:
# Sum the money columns per (Term, district, chamber) and keep every row's `top_topics`
# value in a list.
lfund_ = lfund.groupby(['Term', 'District No.', 'chamber_x']).agg({
    'total_donations': 'sum',
    'total_lobbying': 'sum',
    'total_received': 'sum',
    'top_topics': lambda x: list(x)
}).reset_index()
# Terms starting in 2012 or earlier use the '2011' maps, all later terms use 'current'.
# NOTE(review): this differs from district_cycle(), which assigns "2011" up to 2022 —
# confirm which boundary is intended.
lfund_['cycle'] = lfund_['Term'].apply(lambda x: '2011' if int(x.split('-')[0]) <= 2012 else 'current')

In [195]:
# Attach every county fragment of each district (left join: districts without a match
# keep one row with NaN weight).
reg_funds = lfund_.merge(weights, left_on=['cycle', 'District No.', 'chamber_x'], right_on=['cycle', 'district_id', 'house'], how='left')

# Scale the district totals by the fragment's weight (fragment_area / county_area).
reg_funds['total_donations'] *= reg_funds['weight']
reg_funds['total_lobbying'] *= reg_funds['weight']
reg_funds['total_received'] *= reg_funds['weight']

In [224]:
# Collect, per county, the topics from the first `top_topics` entry of every matching
# row, then keep the most frequent one. Two labels are excluded from the count.
county_topics = defaultdict(list)
for _, row in reg_funds.iterrows():
    first_entry = row['top_topics'][0]
    # Only a string can be parsed. This replaces the `== [np.nan]` / `is None` checks:
    # NaN never compares equal by value, so that test could not be relied on and missing
    # entries were in practice only skipped through the bare `except`.
    if not isinstance(first_entry, str):
        continue
    try:
        parsed = list(ast.literal_eval(first_entry))
    except (ValueError, SyntaxError, TypeError):
        # Not a parseable literal, or not iterable: skip this row only.
        continue
    for t in parsed:
        if t not in ['Extraordinary Sessions', 'Health Facilities']:
            county_topics[row['county_id']].append(t)
county_topics_ = {k: mode(v) for k, v in county_topics.items()}

In [226]:
reg_funds_ = reg_funds.groupby(['county_id', 'house']).agg({
    'total_donations': 'sum',
    'total_lobbying': 'sum',
    'total_received': 'sum'
}).reset_index()

In [None]:
# NOTE(review): this cell has no execution count, and it reads two CSV files that are
# written by cells further down in this notebook — it only works after a previous run.
ca_legislator_topics = pd.read_csv('../../shiny/data/ca_legislator_topics.csv')
ca_legislator_funding = pd.read_csv('../../shiny/data/ca_legislator_funding.csv')
# Keep only the rows whose Term starts in 2025, then join them to the funding table.
last_topics = ca_legislator_topics.loc[ca_legislator_topics['Term'].apply(lambda x: int(x.split('-')[0]) == 2025)]
cal = ca_legislator_funding.merge(last_topics, on=['county_id', 'house'])


In [230]:
reg_funds_['topic'] = reg_funds_['county_id'].map(county_topics_)

In [233]:
co_cal = reg_funds_.merge(counties_gdf, on='county_id', how='left')
gpd.GeoDataFrame(co_cal, geometry='geometry').to_file(OUT_PATH / 'ca_legislator_funding.geojson', driver='GeoJSON')

In [51]:
reg_funds_.to_csv(OUT_PATH / 'ca_legislator_funding.csv', index=False)

In [52]:
# NOTE(review): `ltopic_` is not defined in any cell shown here; this cell depends on
# kernel state from a cell that is missing — restore its definition.
# Right join: every weight row is kept, with NaN topics where no district matches.
reg_topics = ltopic_.merge(weights, left_on=['cycle', 'District No.', 'chamber'], right_on=['cycle', 'district_id', 'house'], how='right')

# Scale each topic column by the fragment weight.
for i in range(K_TOPICS):
    reg_topics[f'topic_{i}'] *= reg_topics['weight']

In [53]:
reg_topics_ = reg_topics.groupby(['house', 'Term', 'county_id'])[topic_cols(K_TOPICS)].sum().reset_index().merge(counties_gdf[['county_id', 'NAMELSAD']], on='county_id', how='left')

In [54]:
reg_topics_ = add_top_n(reg_topics_)

In [56]:
reg_topics_[['house', 'Term', 'county_id', 'NAMELSAD', 'top_topics']].to_csv(
    OUT_PATH / 'ca_legislator_topics.csv', index=False
)

In [57]:
ei = data[("legislator_term","wrote","bill_version")].edge_index.numpy()
ea = data[("legislator_term","wrote","bill_version")].edge_attr.numpy()
# One row per authorship edge. Note that `bill_id` here holds the bill_version index
# (the edge target), not a bill identifier.
author_edge = pd.DataFrame({"legterm_id": ei[0], "bill_id": ei[1], "type": ea[:,0]})
author_edge['date'] = data["bill_version"].time.numpy()[author_edge.bill_id]
author_edge.loc[author_edge.date == 0, 'date'] = datetime.datetime(2000, 6, 15).timestamp()
# NOTE(review): pull_timestamps stores `.time` as the output of
# safe_normalize_timestamps (a 0..1 scale); interpreting it as seconds here would put
# every non-zero date at the start of 1970 — verify.
author_edge['date'] = pd.to_datetime(author_edge['date'], unit='s')

# Version -> bill edges with the bill's outcome label.
eib = data[('bill_version','is_version', 'bill')].edge_index.numpy()
eib = pd.DataFrame({"src": eib[0], "dst": eib[1], 'outcome': data['bill'].y[eib[1]]})
eib['src'] = eib['src'].astype(int)
eib['dst'] = eib['dst'].astype(int)
author_edge['bill_id'] = author_edge['bill_id'].astype(int)

# Inner join: authorship rows whose version has no bill edge are dropped.
author_edge = author_edge.merge(eib, left_on='bill_id', right_on='src', how='inner')
author_edge['outcome'] = (author_edge['outcome'] == 1).astype(int)
# Numeric edge attribute -> author role; values outside 1..3 become NaN.
author_levels = {1: 'COAUTHOR', 2: 'PRINCIPAL_COAUTHOR', 3: 'LEAD_AUTHOR'}
author_edge['author_type'] = author_edge['type'].map(author_levels)

In [58]:
# One row per vote edge (bill_version -> legislator_term) with the first edge attribute
# as the vote signal, joined to the version's bill/outcome and to the version table.
ve = data[('bill_version', 'rev_voted_on', 'legislator_term')].edge_index.numpy()
va = data[('bill_version', 'rev_voted_on', 'legislator_term')].edge_attr.numpy()
vote_edge = pd.DataFrame({'bill_version': ve[0], 'legislator_term': ve[1], 'vote_signal': va[:, 0]})
vote_edge = vote_edge.merge(eib, left_on='bill_version', right_on='src', how='left').merge(bv_df, on='bill_version', how='left')

In [59]:
vote_edge['full_name'] = vote_edge['legislator_term'].map(leg_ids).apply(leg_term_to_name)
vote_edge['term'] = vote_edge['legislator_term'].map(leg_ids).apply(leg_term_to_term)

In [60]:
signals = vote_edge.groupby('bill_id').agg({'outcome': 'max', 'vote_signal': lambda x: (x > 0).sum() / len(x)})
signals.loc[(signals['outcome'] == 0.0) & (signals['vote_signal'] == 1.0), 'vote_signal'] = 0.0

In [61]:
# Per legislator_term: mean outcome of authored versions, number of rows authored as
# LEAD_AUTHOR (stored under `author_type`), and number of distinct versions.
a3 = author_edge.merge(bv_df, left_on='bill_id', right_on='bill_version', how='left').groupby('legterm_id').agg({
    'outcome': 'mean',
    'author_type': lambda x: sum(x == 'LEAD_AUTHOR'),
    'bill_version': 'nunique'
}).reset_index()

# Resolve the node index to a legislator name and term start year.
a3['full_name'] = a3['legterm_id'].map(leg_ids).apply(leg_term_to_name)
a3['term'] = a3['legterm_id'].map(leg_ids).apply(leg_term_to_term)

In [62]:
a4 = a3.merge(lfund, on=['full_name', 'term'], how='left')

In [63]:
a4[['outcome', 'author_type', 'bill_version', 'top_topics',  'full_name', 'term', 'total_donations', 'total_lobbying', 'total_received', 'Party', 'chamber']].copy().to_csv(OUT_PATH / 'legislator_terms.csv', index=False)

In [64]:
don = donor_df[['name', 'influence', 'total_spent', 'top_topics']].copy()
don['type'] = 'donor'
lob = lobby_df[['name', 'influence', 'total_spent', 'top_topics']].copy()
lob['type'] = 'lobby_firm'
donor_lobby = pd.concat([don, lob], ignore_index=True)

In [65]:
# NOTE(review): unlike the other CSV exports in this notebook, this one omits
# index=False, so the row index is written as an extra unnamed first column — confirm
# whether the consumer expects that.
donor_lobby.to_csv(OUT_PATH / 'donor_lobby_topics.csv')

In [66]:
author_edge['bill'] = data['bill'].n_id[author_edge['dst'].values]

In [67]:
# For each bill keep only the authorship rows carrying the bill's latest date, and join
# the distinct legislator_term ids of those rows into one ", "-separated string.
authors = author_edge.groupby('bill').agg({'date': 'max'}).reset_index().merge(author_edge, on=['bill', 'date'], how='inner').groupby('bill').agg({
    'legterm_id': lambda x: ', '.join(x.astype(str).unique())}).reset_index()

In [68]:
def terms_to_names(terms):
    """Translate a ", "-separated string of legislator_term indices into names.

    Indices missing from ``leg_ids`` and ids that resolve to no name are left out.
    Returns the names joined with ", ".
    """
    names = []
    for token in terms.split(', '):
        term_key = leg_ids.get(int(token.strip()), None)
        if term_key is None:
            continue
        resolved = leg_term_to_name(term_key)
        if resolved is not None:
            names.append(resolved)
    return ', '.join(names)

In [69]:
authors['authors'] = authors['legterm_id'].apply(terms_to_names)

In [70]:
b = bill_df.merge(authors, left_on='bill_id', right_on='bill', how='inner')

In [71]:
b['longevity'] = b['longevity'].dt.days

In [72]:
bi = b.merge(bv_df.groupby('bill_id')['bill_version_id'].nunique().reset_index(), on='bill_id', how='left').merge(bill_dates[['bill_id', 'First_action']], on='bill_id', how='left')

In [73]:
bi['term'] = bi['bill_id'].apply(lambda x: x[:4]).astype(int)
bi['First_action'] = bi['First_action'].dt.strftime('%Y-%m-%d')

In [74]:
bil = bi.merge(signals, on='bill_id', how='left')
bil['vote_signal'] = bil['vote_signal'].fillna(0.0)

In [93]:
bil['topic'] = bil['bill_id'].map(so)

In [95]:
pq.write_table(pa.Table.from_pandas(bil), OUT_PATH / 'bills.parquet')

In [2]:
bills = pq.read_table(OUT_PATH / 'bills.parquet').to_pandas()

In [4]:
bills.to_csv(OUT_PATH / 'bills.csv', index=False)

In [77]:
# Per (topic_cluster, term): mean outcome, controversy and longevity, number of distinct
# bills, and mean of `bill_version_id` (which at this point holds the per-bill count of
# distinct versions).
t = bi.groupby(['topic_cluster', 'term']).agg({
    'outcome': 'mean',
    'controversy': 'mean',
    'longevity': 'mean',
    'bill_id': 'nunique',
    'bill_version_id': 'mean'
}).reset_index()

In [78]:
# Mean topic values per (chamber, term, Party), plus the top-topic labels.
# NOTE(review): the cells shown here never add `topic_k` columns (or a plain `chamber`
# column) to `lfund`; this relies on kernel state from a cell that is missing.
f = lfund.groupby(['chamber', 'term', 'Party'])[topic_cols(K_TOPICS)].mean().reset_index()
f = add_top_n(f)

In [79]:
f_ = f.pivot_table(
    index='term',
    columns='Party',
    values=topic_cols(K_TOPICS),
    aggfunc='mean'
)
f_.columns = [f"{col[0]}_{col[1]}" for col in f_.columns]

f_ = f_[[c for c in f_.columns if not c.endswith('_I')]]
f_ = f_.fillna(0).reset_index()

In [80]:
def partisan_split(row):
    """Return ``{topic_k: row[topic_k_D] - row[topic_k_R]}`` for every topic column."""
    splits = {}
    for t in topic_cols(K_TOPICS):
        d = row[f"{t}_D"]
        r = row[f"{t}_R"]
        splits[t] = d - r
    return splits

# Adds one `<topic>_split` column per topic; inserting them in a single multi-column
# assignment is what produces the repeated pandas warning captured in the output below.
f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')

  f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')
  f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')
  f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')
  f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')
  f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')
  f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')
  f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')
  f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')
  f_[[f"{t}_split" for t in topic_cols(K_TOPICS)]] = f_.apply(partisan_split, axis=1, result_type='expand')
  f_[[f"{t}_split" for t in 

In [81]:
# Long format: one row per (term, topic split column).
fg = f_.melt(
    id_vars=['term'],
    value_vars=[f"{t}_split" for t in topic_cols(K_TOPICS)],
    var_name='topic',
    value_name='partisan_split'
)
# Extract the numeric topic id from names such as "topic_12_split"; stored as float so it
# can be merged against a float-typed `topic_cluster` column.
fg['topic_cluster'] = fg['topic'].apply(lambda x: re.search(r'_(\d+)', x).group(1) if '_' in x else x).astype(float)

In [82]:
t_ = t.merge(fg[['term', 'topic_cluster', 'partisan_split']], on=['term', 'topic_cluster'], how='left')

In [83]:
t_['topic'] = t_['topic_cluster'].map(big_labels)

In [90]:
t_.to_csv(OUT_PATH / 'topics_agg.csv', index=False)

In [99]:
t_.columns

Index(['topic_cluster', 'term', 'outcome', 'controversy', 'longevity',
       'bill_id', 'bill_version_id', 'partisan_split', 'topic'],
      dtype='object')

In [107]:
def top5(df, col_name):
    """Map each topic id to the five ``col_name`` values with the largest topic score.

    Only columns whose name starts with ``topic_`` are considered; the id is the
    integer after the first underscore. ``df`` itself is left unchanged.
    """
    indexed = df.copy().set_index(col_name)
    result = {}
    for column in indexed.columns:
        if not column.startswith('topic_'):
            continue
        topic_id = int(column.split('_')[1])
        result[topic_id] = indexed[column].nlargest(5).index.to_list()
    return result

top_donors = top5(donor_df, 'name')
top_lobby = top5(lobby_df, 'name')
top_leg = top5(leg_df, 'legislator')

In [110]:
# One row per topic id. The three value columns are aligned by position, which relies on
# all three dicts listing the topics in the same order (each is built by iterating the
# `topic_` columns of its source frame).
topents = pd.DataFrame({
    'topic': top_donors.keys(),
    'top_donors': top_donors.values(),
    'top_lobby': top_lobby.values(),
    'top_legislators': top_leg.values()
})

In [114]:
topents['subject'] = topents['topic'].map(big_labels)

In [117]:
topents.to_csv(OUT_PATH / 'top_entities.csv', index=False)