In [1]:
import pandas as pd
import re
import numpy as np
import warnings
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

## Data

In [2]:
summary_votes = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')

In [3]:
bill_history = pd.read_csv('ca_leg/legislation_data/bill_history_tbl.csv', dtype={'action_status': str, 'primary_location': str, 'secondary_location': str, 'end_status': str})

In [4]:
authors = pd.read_csv('ca_leg/legislation_data/authors.csv')

In [5]:
history = pd.read_csv('ca_leg/legislation_data/history.csv')

In [6]:
versions = pd.read_csv('ca_leg/legislation_data/bill_versions.csv')

In [7]:
bill_votes = pd.read_csv('ca_leg/legislation_data/bill_detail_vote_tbl.csv', parse_dates=['session_date'])

In [8]:
bill_summary = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')

In [9]:
bill_motions = pd.read_csv('ca_leg/legislation_data/bill_motion_tbl.csv')

In [10]:
locations = pd.read_csv('ca_leg/legislation_data/committee_codes.csv')


In [11]:
politicians = pd.read_csv('ca_leg/legislation_data/politicians.csv')


In [12]:
lobbying = pd.read_csv('calaccess/lobbying_clean2.csv', dtype={'PAYEE_NAMS': str, 'BAKREF_TID': str})


In [13]:
expend_assembly = pd.read_csv('calaccess/expend_assembly_matched.csv', dtype={'TargetPropositionName': str})


In [14]:
expend_senate = pd.read_csv('calaccess/expend_senate_matched.csv', dtype={'TargetPropositionName': str})


In [15]:
digests = pd.read_csv('ca_leg/legislation_data/digest.csv')


In [16]:
hearings = pd.read_csv('ca_leg/legislation_data/committee_hearing_tbl.csv')

## Cleaning

In [17]:
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time']).apply(lambda x: x.strftime('%Y-%m-%d'))

In [18]:
bill_votes['legislator_name'] = bill_votes['legislator_name'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

In [19]:
ACTION_KEYWORDS = [
    "Assembly Third Reading", "Assembly 3rd reading", 'senate 3rd reading', "Senate Third Reading","Concurrence - Urgency Added", "Concurrence in Senate Amendments", "Do pass as amended, and re-refer", "Do pass as amended, but re-refer", "Do pass as amended", "Do pass and be re-referred",
    "Concurrence", "Consent Calendar", "Urgency Clause", "Special Consent",
    "Motion to Reconsider", "Do pass", "Reconsideration", "Committee amendments",
    "W/O REF. TO FILE", "Be re-referred to the Committee",
    "Lay on the Table", "Amend by", "Unfinished Business", "Placed on Appropriations Suspense File",
]

def extract_action(motion_text):
    if not isinstance(motion_text, str) or motion_text is None:
        return None
    motion = motion_text.upper()

    action = next((act for act in ACTION_KEYWORDS if act.upper() in motion), None)
    if action != 'Reconsideration' and 'RECONSIDER' in motion:
        if action is not None:
            action += ' Reconsideration'
        else:
            action = 'Reconsideration'


    return action if action else None

In [20]:
bill_motions['simplified_motion'] = bill_motions['motion_text'].apply(extract_action)

In [21]:
clean_coms = {}
for i, row in locations.iterrows():
    if row['committee_code'].startswith('CZ'):
        continue
    name = row['committee_name']
    if row['committee_code'].startswith('CS'):
        if name.startswith('Sen.'):
            cname = re.sub(r'Sen. ', 'senate ', name).lower()
        elif name.startswith('Senate '):
            cname = name.lower()
        else:
            cname = 'senate ' + name.lower()
    elif row['committee_code'].startswith('CX'):
        if name.lower().startswith('assembly'):
            cname = name.lower()
        else:
            cname = 'assembly ' + name.lower()
    if re.search(r'x\d$', cname) is not None:
        cname = re.sub(r'x(?=\d$)', 'no. ', cname)
    clean_coms[row['committee_code']] = cname

In [22]:
from rapidfuzz import fuzz, process

leg_committees = [f"{row['chamber']} {row['committee_clean']}".lower() for _, row in politicians[['committee_clean', 'chamber']].drop_duplicates().iterrows()]

def match_committees(_names, clean_coms, threshold=92):
    clean_c = list(clean_coms.values())
    clean_codes = list(clean_coms.keys())
    name_mapping = {}
    for i, clean in enumerate(clean_c):
        code = clean_codes[i]
        matches = []
        matches.append(process.extractOne(
            clean,
            _names,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=threshold
        ))
        matches.append(process.extractOne(
            clean,
            _names,
            scorer=fuzz.partial_ratio,
            score_cutoff=threshold
        ))
        valid_matches = [m for m in matches if m is not None]
        if len(valid_matches) > 0:
            best_match = max(valid_matches, key=lambda x: x[1])
            name_mapping[code] = best_match[0]
        else:
            fall_back = process.extractOne(
                clean,
                _names,
                scorer=fuzz.token_sort_ratio,
                score_cutoff=threshold - 8
            )
            if fall_back is not None:
                name_mapping[code] = fall_back[0]
            else:
                name_mapping[code] = None
    return name_mapping

committee_matches = match_committees(leg_committees, clean_coms)

In [23]:
locations['committee_clean'] = locations['committee_code'].map(committee_matches)

In [24]:
locations.loc[locations['committee_name'] == 'EDUCATION X5', 'committee_clean'] = 'Budget and Fiscal Review: Education'
locations.loc[locations['committee_code'] == 'CX12', 'committee_clean'] = 'Budget No. 1 on Health and Human Services'
locations.loc[locations['committee_code'] == 'CS68', 'committee_clean'] = 'Budget No. 3 - Health and Human Services'
locations.loc[locations['committee_code'] == 'CS66', 'committee_clean'] = 'Senate Veterans Affairs'
locations.loc[locations['committee_code'] == 'CS56', 'committee_clean'] = 'Senate Public Employment and Retirement'
locations.loc[locations['committee_code'] == 'CS62', 'committee_clean'] = 'Senate Budget and Fiscal Review'
locations.loc[locations['committee_code'] == 'CX23', 'committee_clean'] = 'Assembly Utilities and Commerce'

In [25]:
motion_codes = {
    row['motion_id']: row['simplified_motion']
    for _, row in bill_motions.iterrows()
}

In [26]:
summary_votes['motion_text'] = summary_votes['motion_id'].map(motion_codes)

In [27]:
def repair_bill_id(id):
    front, end = id[:4], id[4:]
    if re.search(r'\d{4}$', front):
        return f"{front}{int(front) + 1}{end}"
    else:
        return id

In [28]:
versions['ID'] = versions['bill_id'].apply(lambda x: repair_bill_id(x))

In [29]:
bill_vers = versions.loc[versions['bill_id'].str.startswith('2')]
for i, row in bill_vers.iterrows():
    tail = f"{row['VersionNum']}{row['MeasureState']}"
    repaired = repair_bill_id(re.sub(tail, '', row['bill_id']))
    end = int(repaired[-4:])

    bill_vers.loc[i, 'bill_ID'] = f"{repaired[:-4]}{end}"

In [30]:
legislators = {i: pol for i, pol in enumerate(politicians['full_name'].unique().tolist())}

leg_parties = {row['full_name']: row['Party'] for _, row in politicians[['full_name', 'Party']].drop_duplicates().iterrows()}
leg_occupations = {row['full_name']: row['Occupation'] for _, row in politicians[['full_name', 'Occupation']].drop_duplicates().iterrows()}
committees = {i: com for i, com in enumerate(politicians['committee_clean'].unique().tolist())}
lobby_firms = {i: firm for i, firm in enumerate(lobbying['FIRM_NAME'].unique().tolist())}

donor_names = list(set(expend_assembly['ExpenderName'].unique().tolist() + expend_senate['ExpenderName'].unique().tolist()))
donors = {i: donor for i, donor in enumerate(donor_names)}

In [31]:
import pickle

def save_dict(d, filename):
    with open(filename, 'wb') as f:
        pickle.dump(d, f)

In [32]:
for name, d in zip(
    ['legislators', 'committees', 'lobby_firms', 'donors'],
    [legislators, committees, lobby_firms, donors]
):
    save_dict(d, f'{name}.pkl')

In [33]:
bill_titles = {row['bill_ID']: row['Title'] for _, row in bill_vers[['bill_ID', 'Title']].drop_duplicates().iterrows()}

In [34]:
bill_subjects = {row['bill_ID']: row['GeneralSubject'] for _, row in bill_vers.loc[bill_vers['GeneralSubject'].apply(lambda x: x is not None and isinstance(x, str)), ['bill_ID', 'GeneralSubject']].drop_duplicates().iterrows()}

In [35]:
bill_ids = list(set(bill_votes.loc[bill_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist() + summary_votes.loc[summary_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist()))

In [36]:
bill_id_codes = {row['bill_id']: row['bill_ID'] for _, row in bill_vers.drop_duplicates(subset=['bill_id', 'bill_ID']).iterrows()}
history['bill_ID'] = history['bill_id'].map(bill_id_codes)

In [37]:
history['Date'] = pd.to_datetime(history['Date'])

In [38]:
introduction_dates = {}
for v, group in history.loc[history['bill_ID'].isin(bill_ids)].groupby('bill_ID'):
    introduction_dates[v] = {'Dates': group['Date'].unique().tolist(), 'Actions': group.sort_values('Date', ascending=True).drop_duplicates(subset=['Action', 'Date'])['Action'].tolist()}

In [39]:
version_id_mapping = {i: list(group.values) for i, group in bill_vers.groupby('bill_ID')['ID']}

In [40]:
save_dict(version_id_mapping, 'version_id_mapping.pkl')

In [41]:
version_id_mapping2 = {i: list(group.values) for i, group in bill_vers.groupby('bill_ID')['bill_id']}
bv2b = {v: k for k, val in version_id_mapping2.items() for v in val}
history['bill_ID'] = history['bill_id'].map(bv2b)
save_dict(bv2b, 'bill_id_mapping.pkl')

In [42]:
date_ranges = {}

for k, v in introduction_dates.items():
    first, last = min(v['Dates']), max(v['Dates'])
    date_ranges[k] = {'First_action': first, 'Last_action': last}

save_dict(date_ranges, 'bill_dates_map.pkl')

In [43]:
outcomes = history.loc[history['bill_ID'].notna()].sort_values('Date', ascending=False).groupby('bill_ID').first().reset_index()[['bill_ID', 'Action']]
outcomes.loc[outcomes['Action'].isin(['CHAPTERED', 'ENROLLED', 'FILED', 'APPROVED']), 'Outcome'] = 1
outcomes.loc[outcomes['Action'] == 'VETOED', 'Outcome'] = -1
outcomes.loc[outcomes['Outcome'].isna(), 'Outcome'] = 0

In [44]:
outcome = outcomes.set_index('bill_ID')['Outcome'].to_dict()

In [45]:
save_dict(outcome, 'bill_outcomes.pkl')

In [46]:
vote_bill_ids = {}
for i in summary_votes.loc[summary_votes['bill_id'].isin(bill_ids)].groupby(['year', 'motion_id'])['bill_id'].value_counts().index:
    year, motion_id, bill_id = i
    if (year, motion_id) not in vote_bill_ids.keys():
        vote_bill_ids[(year, motion_id)] = [bill_id]
    else:
        vote_bill_ids[(year, motion_id)].append(bill_id)


In [47]:
bill_vers_dig = bill_vers.merge(digests, on='bill_id', how='inner')

In [48]:
legislators_last_names = {}
for _, row in politicians[['chamber', 'Last', 'Term', 'full_name']].drop_duplicates().iterrows():
    legislators_last_names[(row['chamber'], row['Last'].lower(), row['Term'])] = row['full_name']

In [49]:
features = {row['ID']: {
    'digest': row['DigestText'],
    'MeasureState': row['MeasureState'],
    'VoteRequired': row['VoteRequired'] if row['VoteRequired'] is not None else 'No',
    'VersionNum': row['VersionNum'] if row['VersionNum'] is not None else 'No',
    'LocalProgram': row['LocalProgram'] if row['LocalProgram'] is not None else 'No',
    'FiscalCommittee': row['FiscalCommittee'] if row['FiscalCommittee'] is not None else 'No',
    'TaxLevy': row['TaxLevy'] if row['TaxLevy'] is not None else 'No',
    'Urgency': row['Urgency'] if row['Urgency'] is not None else 'No'} for _, row in bill_vers_dig.iterrows()}

In [50]:
legislator_codes = {v: k for k, v in legislators.items()}

In [51]:
committee_codes = {v.lower(): k for k, v in committees.items()}

In [52]:
bill_votes['chamber'] = bill_votes['location_code'].apply(lambda x: 'assembly' if x == 'AFLOOR' or x.startswith('CX') else 'senate' if x == 'SFLOOR' or x.startswith('CS') else 'full')
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time'])
bill_votes['term'] = bill_votes['vote_date_time'].apply(lambda x: f"{x.year}-{x.year + 1}" if x.year % 2 == 1  else f"{x.year - 1}-{x.year}" if x.year % 2 == 0 and x < pd.Timestamp(year=x.year, month=11, day=2) else f"{x.year + 1}-{x.year + 2}")

In [53]:
author_locations = authors.loc[(authors['House'] == 'UNKNOWN') & (authors['bill_id'].map(bill_id_codes).isin(bill_ids)), ['bill_id', 'Name']].drop_duplicates()
for i, row in author_locations.iterrows():
    if 'AB' in row['bill_id']:
        author_locations.loc[i, 'name'] = 'Assembly ' + row['Name']
    elif 'SB' in row['bill_id']:
        author_locations.loc[i, 'name'] = 'Senate ' + row['Name']
    else:
        author_locations.loc[i, 'name'] = 'Joint ' + row['Name']

In [54]:
from fuzzywuzzy import fuzz

def fuzzy_strings(source_list, target_list):
    def preprocess_name(name):
        if not isinstance(name, str):
            return ""
        name = name.lower()
        name = re.sub(r'\(.*?\)', '', name)
        name = re.sub(r'committee on', '', name)
        name = re.sub(r'[^a-z\s]', ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        return name

    clean_source = [preprocess_name(c) for c in source_list]
    clean_target = [preprocess_name(c) for c in target_list]

    keywords = ["education", "health", "finance", "budget", "transportation",
                "judiciary", "environment", "agriculture", "energy", "labor",
                "housing", "veterans affairs", "public safety", "insurance", "banking", "public health", "small business", "redistricting",
                "public utilities", "natural resources", "water",
                "technology", "communications", "elections", "government",
                "appropriations", "rules", "ethics", 'criminal justice', "environmental protection", "college and university", "human services", "reproductive health", "mental health", "technology", "aggriculture", "urban development", "renewable energy", "gun violence", "commerce", "privacy", "cybersecurity", "infrastructure", "disaster preparedness", "prisons", "aging"]

    def get_committee_keywords(name):
        return set(kw for kw in keywords if kw in name)

    target_keywords = [get_committee_keywords(name) for name in clean_target]

    def calculate_similarity(source_idx, target_idx):
        source = clean_source[source_idx]
        target = clean_target[target_idx]

        if not source or not target:
            return 0

        if source == target:
            return 100

        token_sort = fuzz.token_sort_ratio(source, target)
        token_set = fuzz.token_set_ratio(source, target)
        partial = fuzz.partial_ratio(source, target)

        source_kw = get_committee_keywords(source)
        keyword_overlap = len(source_kw.intersection(target_keywords[target_idx]))
        keyword_bonus = min(20, keyword_overlap * 10)
        weighted_score = (token_sort * 0.3) + (token_set * 0.5) + (partial * 0.2) + keyword_bonus

        return weighted_score

    matches = {}
    for i, source in enumerate(source_list):
        scores = [calculate_similarity(i, j) for j in range(len(target_list))]

        if not scores or max(scores) < 60:
            matches[source] = None
        else:
            best_idx = np.argmax(scores)
            confidence = scores[best_idx]

            if confidence >= 60:
                matches[source] = target_list[best_idx]
            else:
                matches[source] = None

    return matches

In [55]:
author_com_matches = fuzzy_strings(author_locations['name'].unique().tolist(), leg_committees)

In [56]:
author_locations['name'] = author_locations['name'].map(author_com_matches)

In [57]:
sponsors = authors.loc[authors['bill_id'].map(bill_id_codes).isin(bill_ids)]
sponsors['term'] = sponsors['bill_id'].apply(lambda x: f"{x[:4]}-{int(x[:4]) + 1}" if int(x[:4]) % 2 == 1 else f"{int(x[:4]) - 1}-{x[:4]}" if int(x[:4]) % 2 == 0 and int(x[:4]) < 2009 else f"{x[:4]}-{int(x[:4]) + 1}")

In [58]:
lob = lobbying.loc[lobbying['clean_beneficiary'].notna(), ['FIRM_NAME', 'EXPN_DSCR', 'clean_beneficiary', 'EXPN_DATE', 'BENE_AMT']]
lob['EXPN_DATE'] = pd.to_datetime(lob['EXPN_DATE'])
lob['term'] = lob['EXPN_DATE'].apply(lambda x: f"{x.year}-{x.year + 1}" if x.year % 2 == 1  else f"{x.year - 1}-{x.year}" if x.year % 2 == 0 and x < pd.Timestamp(year=x.year, month=11, day=2) else f"{x.year}-{x.year + 1}")

In [59]:
for i, row in politicians.loc[politicians['full_name'].apply(lambda x: isinstance(x, float)), ['Term', 'Last', 'chamber']].drop_duplicates().iterrows():
    term, last = row['Term'], row['Last']
    a = politicians.loc[(politicians['Last'] == last) & (politicians['Term'] == term) & (politicians['full_name'].apply(lambda x: isinstance(x, str)))]
    if len(a) > 0:
        politicians.loc[(politicians['Term'] == term) & (politicians['Last'] == last) & (politicians['chamber'] == row['chamber']), 'full_name'] = a['full_name'].values[0]
        continue
    else:
        a = politicians.loc[(politicians['Last'] == last) & (politicians['full_name'].apply(lambda x: isinstance(x, str)))]
    if len(a) > 0:
        politicians.loc[(politicians['Term'] == term) & (politicians['Last'] == last) & (politicians['chamber'] == row['chamber']), 'full_name'] = a['full_name'].values[0]
    else:
        print(last, term)


In [60]:
pol_names_terms = {}
for _, row in politicians[['full_name', 'Term', 'chamber']].drop_duplicates().iterrows():
    if ',' in row['full_name']:
        name = row['full_name'].split(',')[1].strip() + ' ' + row['full_name'].split(',')[0].strip()
    else:
        name = row['full_name']
    pol_names_terms[(row['full_name'].lower(), row['Term'])] = {'chamber': row['chamber'], 'name': name}

In [61]:
expend_assembly = expend_assembly.rename(columns={'term': 'Term'})
expend_assembly['chamber'] = 'assembly'
expend_senate = expend_senate.rename(columns={'term': 'Term'})
expend_senate['chamber'] = 'senate'

In [62]:
campaign_contributions = pd.concat([expend_assembly.loc[expend_assembly['matched_target_name'].notna(), ['ExpenderName', 'Amount', 'matched_target_name', 'Term', 'chamber', 'DateEnd']].drop_duplicates(subset=['ExpenderName', 'Amount', 'matched_target_name', 'DateEnd']), expend_senate.loc[expend_senate['matched_target_name'].notna(), ['ExpenderName', 'Amount', 'matched_target_name', 'Term', 'chamber', 'DateEnd']].drop_duplicates(subset=['ExpenderName', 'Amount', 'matched_target_name', 'DateEnd'])])

In [63]:
campaign_contributions['DateEnd'] = pd.to_datetime(campaign_contributions['DateEnd'])

In [64]:
sponsors['bill_ID'] = sponsors['bill_id'].apply(repair_bill_id)

In [65]:
voting = history.merge(bill_votes, left_on=['bill_ID', 'Date'], right_on=['bill_id', 'vote_date_time'], how='inner').rename(columns={'bill_id_x': 'bill_version'}).drop('bill_id_y', axis=1)
voting['bv_id'] = voting['bill_version'].apply(repair_bill_id)

In [66]:
voting_places = {}
for i, row in voting.groupby(['motion_id', 'term', 'chamber', 'Date']).agg({'legislator_name': lambda x: list(x)}).iterrows():
    motion_id, term, chamber, date = i
    g = politicians.loc[(politicians['chamber'] == chamber) & (politicians['Term'] == term) & (politicians['Last'].isin(row['legislator_name']))]
    voting_places[(motion_id, term, chamber, date)] = {
        'most_common_committee': g.groupby('committee_clean').size().sort_values(ascending=False).head(1).index[0] if len(g) > 0 else None
    }
voting['voting_place'] = voting.apply(lambda row: voting_places.get((row['motion_id'], row['term'], row['chamber'], row['Date']), {}).get('most_common_committee', None), axis=1)

## Embeddings

### Saved

In [67]:
model = SentenceTransformer('all-MiniLM-L6-v2', device='mps')
REFRESH_FLAG = False

In [68]:
def text_clean(title):
    if not isinstance(title, str):
        return ''
    title = re.sub(r'\(.*?\)', '', title)
    title = re.sub(r'[^a-zA-Z0-9\s]', ' ', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return title.lower()

def batched_embeddings(values, output_dims=384):
    vals = [text_clean(v) for v in values if isinstance(v, str)]
    vals = list(set([v for v in vals if v != '']))
    embeddings = model.encode(vals, batch_size=64, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True, num_workers=4, output_dims=output_dims)
    embs = {v: e for v, e in zip(vals, embeddings)}
    return embs

In [69]:
subjects = list(set([t for t in versions.loc[versions['bill_id'].str.startswith('2')]['GeneralSubject'].tolist() if (isinstance(t, str) and t is not None)]))

In [70]:
subjects_original = {text_clean(t): t for t in subjects}
if REFRESH_FLAG == True:
    save_dict(subjects_original, 'subjects_original.pkl')
    subject_embeddings = batched_embeddings(subjects)
    torch.save(subject_embeddings, 'subject_embeddings.pt')
else:
    subject_embeddings = torch.load('subject_embeddings.pt')

In [71]:
occ_embeddings = {}
for occ in list(set(list(leg_occupations.values()))):
    if isinstance(occ, str) and len(occ) > 0 and text_clean(occ) != '':
        occ_embeddings[occ] = model.encode(
            text_clean(occ),
            convert_to_tensor=True,
            normalize_embeddings=True,
            output_dim=32)

In [72]:
titles = [t for t in bill_vers['Title'].unique().tolist() if (isinstance(t, str) and t not in [None, '', np.nan])]
title_embeddings = batched_embeddings(titles, output_dims=64)

Batches:   0%|          | 0/1319 [00:00<?, ?it/s]

In [73]:
lobbying_firms_embeddings = batched_embeddings([firm for firm in lobbying.loc[(lobbying['clean_beneficiary'].notna()) & (lobbying['FIRM_NAME'])]['FIRM_NAME'].unique().tolist() if isinstance(firm, str)], output_dims=128)

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

In [74]:
committee_embeddings = {}
for committee in politicians['committee_clean'].unique().tolist():
    co = re.sub(r'assembly|senate|committee|subcommittee', '', committee.lower())
    committee_embeddings[committee.lower()] = model.encode(co,  convert_to_tensor=True, normalize_embeddings=True, truncate_dim=64)

In [75]:
donor_embeddings = {}

for donor in tqdm(donor_names):
    donor_embeddings[donor] = model.encode(donor, convert_to_tensor=True, normalize_embeddings=True, truncate_dim=64)

100%|██████████| 506/506 [00:35<00:00, 14.29it/s]


In [76]:
motion_embeddings = {}
for motion in tqdm([t for t in pd.DataFrame.from_dict(motion_codes, orient='index').reset_index().rename({'index': 'motion_id', 0: 'motion_text'}, axis=1)['motion_text'].drop_duplicates().tolist() if t is not None]):
    motion_embeddings[motion] = model.encode(motion, convert_to_tensor=True, normalize_embeddings=True, truncate_dim=16)

100%|██████████| 33/33 [00:00<00:00, 37.06it/s]


In [77]:
digest_embeddings = torch.load('digests.pt')
if (len([a for a in (list(set([t if (isinstance(t, str) and t is not None) else '' for t in digests.loc[digests['bill_id'].str.startswith('2')]['DigestText'].tolist()]))) if a not in digest_embeddings.keys()]) == 0) & (REFRESH_FLAG == False):
    pass
else:
    for digest in tqdm(list(set([t if (isinstance(t, str) and t is not None) else '' for t in digests.loc[digests['bill_id'].str.startswith('2')]['DigestText'].tolist()]))):
        if digest not in digest_embeddings.keys():
            digest_embeddings[digest] = model.encode(digest, convert_to_tensor=True, normalize_embeddings=True, truncate_dim=64)
    torch.save(digest_embeddings, 'digests.pt')

In [78]:
hear = hearings[['bill_id', 'location_code']].merge(locations[['committee_code', 'committee_clean']], left_on='location_code', right_on='committee_code', how='left')[['bill_id', 'committee_clean']].drop_duplicates()
hear['year'] = hear['bill_id'].apply(lambda x: int(x[:4]))

### Load

In [79]:
positions = {p: 'Democratic Alternate' if re.search(r'Democratic\s*Alternate', p) is not None else 'Vice Chair' if re.search(r'V\s*i\s*c\s*e\s*-*\s*C\s*h\s*a\s*i\s*r\s*', p) is not None else 'Co-Chair' if re.search(r'Co\s*-\s*Chair', p) is not None else 'Chair' if re.search(r'Cha\s*i\s*r', p) is not None else 'Republican Alternate' if re.search(r'\s*Republican\s*Alternate', p) is not None else p for p in politicians['position'].unique()}

In [80]:
vnums = bill_vers.set_index('ID')['VersionNum'].to_dict()
vid_map = {v: k for k, val in version_id_mapping.items() for v in val}

In [81]:
sponsors['chamber'] = sponsors['House'].apply(lambda x: x.lower() if isinstance(x, str) else None)
sponsors = sponsors.merge(politicians[['Term', 'Last', 'chamber', 'full_name']].drop_duplicates(), left_on=['chamber', 'Name', 'term'], right_on=['chamber', 'Last', 'Term'], how='left')

## OOP Graph-Construction

In [82]:
from collections import defaultdict
import json
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import HeteroData
import numpy as np

author_type_map = {
    'LEAD_AUTHOR': 'LEAD_AUTHOR',
    'PRINCIPAL_COAUTHOR': 'PRINCIPAL_COAUTHOR',
    'COAUTHOR': 'COAUTHOR',
    'data="COAUTHOR"': 'COAUTHOR',
    'data="LEAD_AUTHOR"': 'LEAD_AUTHOR',
    'data="PRINCIPAL_COAUTHOR"': 'PRINCIPAL_COAUTHOR',
    'nan': 'AUTHOR'
}
author_levels = {
    'AUTHOR': 1,
    'COAUTHOR': 1,
    'PRINCIPAL_COAUTHOR': 2,
    'LEAD_AUTHOR': 3
}

def features_vote_required(vr):
    if not isinstance(vr, str):
        return 'MAJORITY'
    if "FOUR_FIFTHS" in vr:
        return '80_PCT'
    elif "THREE_FOURTHS" in vr:
        return '75_PCT'
    elif "SEVENTY_PERCENT" in vr or "70%" in vr:
        return '70_PCT'
    elif "TWO_THIRDS" in vr:
        return '66-67_PCT'
    elif "55%" in vr:
        return '55_PCT'
    else:
        return 'MAJORITY'

vote_required_codes = {
    'MAJORITY': 0,
    '55_PCT': 1,
    '66-67_PCT': 2,
    '70_PCT': 3,
    '75_PCT': 4,
    '80_PCT': 5
}

def bool_correction(val):
    if not isinstance(val, str):
        return 0
    if 'YES' in val:
        return 1
    else:
        return 0

measure_types = bill_vers['MeasureType'].unique()
parties = politicians['Party'].unique()
chambers = politicians['chamber'].unique()
outcome_mapping = {'CHAPTERED': 1, 'VETOED': 0, 'FAILED': -1, 'ENROLLED': 1}
measure_encoder = LabelEncoder()
measure_encoder.fit(measure_types)
party_encoder = LabelEncoder()
party_encoder.fit(parties)
chamber_encoder = LabelEncoder()
chamber_encoder.fit(chambers)
pos = list(positions.values()) + ['member']
pos_encoder = LabelEncoder()
pos_encoder.fit(pos)

class Node:
    def __init__(self, id, type, features=None):
        self.id = id
        self.type = type
        self.features = features or {}

class Edge:
    def __init__(self, source, target, relation, attributes=None):
        self.source = source
        self.target = target
        self.relation = relation
        self.attributes = attributes or {}

class Bill(Node):
    def __init__(self, bill_id, title, subject, measure_type):
        measure_type = measure_encoder.transform([measure_type])[0] if measure_type in measure_encoder.classes_ else -1
        features = {
            'title': title,
            'subject': subject,
            'measure_type': measure_type,
            'date': None,
        }
        super().__init__(bill_id, "bill", features)
        self.actions = None
        self.order_df = None
        self.outcome = None

    def add_actions(self, actions):
        if self.actions is None:
            self.actions = actions
        else:
            self.actions = pd.concat([self.actions, actions], ignore_index=True).drop_duplicates()

    def add_date(self, date):
        self.features['date'] = date

    def add_order_df(self, order_df):
        if self.order_df is None:
            self.order_df = order_df
        else:
            self.order_df = pd.concat([self.order_df, order_df], ignore_index=True).drop_duplicates()

    def add_outcome(self, outcome):
        outcome = outcome_mapping.get(outcome, -1)
        self.outcome = outcome

    def align_actions_versions(self, bill, versions_, dates):
        dates = pd.Series(dates).sort_values(ascending=True).drop_duplicates().tolist()
        actions = [i for i in introduction_dates.get(bill, {}).get('Actions', []) if i != 'FILED']
        if len(actions) > len(dates):
            if actions[-2:] == ['ENROLLED', 'CHAPTERED'] or actions[-2:] == ['APPROVED', 'CHAPTERED'] or len(actions) <= 4 and actions[-1] == 'ENROLLED' or abs(len(dates) - len(actions)) >= 2 and actions[-1] == 'ENROLLED' or actions == ['INTRODUCED', 'ENROLLED', 'AMENDED_SENATE'] or actions[-1] == 'APPROVED' or len(actions) == 3 and all(a.startswith('PASSED_') for a in actions[-2:]) or actions == ['ENROLLED', 'INTRODUCED'] or actions == ['INTRODUCED', 'ENROLLED'] or actions == ['INTRODUCED', 'REVISED'] or len(actions) > 3 and actions[-3:] == ['ENROLLED', 'CORRECTED', 'CHAPTERED'] or len(actions) > 3 and actions[-3:] == ['PASSED_SENATE', 'PASSED_ASSEMBLY', 'AMENDED_SENATE'] or list(set(['INTRODUCED', 'PASSED_SENATE', 'PASSED_ASSEMBLY', 'AMENDED_SENATE'])) == list(set(actions)) or list(set(['INTRODUCED', 'PASSED_SENATE', 'PASSED_ASSEMBLY', 'AMENDED_ASSEMBLY'])) == list(set(actions)) or actions[-2] == 'ENROLLED' and actions[-1].startswith('PASSED_') or len(actions) > 5 and actions[-4] == 'CHAPTERED' and actions[-1].startswith('PASSED_') or actions[-2:] == ['PASSED_SENATE', 'PASSED_ASSEMBLY'] or actions == ['INTRODUCED', 'AMENDED_SENATE', 'PASSED_SENATE', 'PASSED_ASSEMBLY', 'AMENDED_ASSEMBLY'] or len(actions) == 9 and actions[-6:] == ['APPROVED', 'CHAPTERED', 'ENROLLED', 'AMENDED_SENATE', 'PASSED_ASSEMBLY', 'PASSED_SENATE'] or len(actions) > 5 and actions[-4:] == ['ENROLLED', 'PASSED_SENATE', 'APPROVED', 'PASSED_ASSEMBLY'] or dates == [pd.Timestamp('2008-12-08 00:00:00'), pd.Timestamp('2008-12-18 00:00:00')]:
                dates.append(dates[-1])
                if len(actions) > len(dates):
                    dates.append(dates[-1])
                    if len(actions) > len(dates):
                        dates.append(dates[-1])
                        if len(actions) > len(dates):
                            dates.append(dates[-1])
                            if len(actions) > len(dates):
                                dates.append(dates[-1])
            if len(dates) == 1 and len(actions) > 1:
                for _ in range(len(actions) - len(dates)):
                    dates.append(dates[0])
            if actions[-2:] == ['INTRODUCED', 'PASSED_ASSEMBLY']:
                dates = [dates[0]] + dates
            if len(actions) >= 6 and actions[:3] == ['INTRODUCED', 'AMENDED_ASSEMBLY', 'ENROLLED'] and actions[-3:] == ['CHAPTERED', 'APPROVED', 'CORRECTED']:
                dates = dates[:2] + [dates[2]] + dates[2:-2] + [dates[-2]] + dates[-2:]
            elif ('PASSED_ASSEMBLY' in actions and 'AMENDED_ASSEMBLY' in actions) or ('PASSED_SENATE' in actions and 'AMENDED_SENATE' in actions):
                if len(dates) == 3:
                    dates = dates[:1] + [dates[1]] + dates[1:]
                    if all(a for a in ['PASSED_ASSEMBLY', 'AMENDED_ASSEMBLY', 'PASSED_SENATE', 'AMENDED_SENATE'] if a in actions):
                        dates = dates[:1] + [dates[1]] + dates[1:]
                elif 'PROPOSED_CONFERENCE_REPORT_1' in actions and len(actions) - len(dates) == 2:
                    dates = dates[:2] + [dates[2]] + [dates[2]] + dates[2:]
                elif len(dates) > 3 and len(actions) - len(dates) > 0 and not (actions[-4:] == ['PASSED_ASSEMBLY', 'ENROLLED', 'PASSED_SENATE', 'CHAPTERED'] and 'AMENDED_SENATE' in actions):
                    dates = dates[:2] + [dates[2]] + dates[2:]
                if len(actions) > 4 and actions[-3:] == ['ENROLLED', 'PASSED_SENATE', 'CHAPTERED']:
                    dates = dates[:-2] + [dates[-2]] + dates[-2:]

            if len(actions) - len(dates) == 1:
                if 'CORRECTED' in actions:
                    actions.remove('CORRECTED')
                elif 'RESCIND' in actions:
                    actions.remove('RESCIND')

            if actions[-1] == 'CORRECTED' and len(actions) - len(dates) == 2:
                if len(dates) >= 5:
                    dates = dates[:3] + [dates[3]] + [dates[3]] + [dates[4]] + dates[4:]
                elif len(dates) == 2:
                    dates = [dates[0]] + [dates[0]] + dates[0:]
                else:
                    dates = dates[:2] + [dates[2]] + [dates[2]] + dates[2:]
            if actions[-1] == 'CHAPTERED' and len(actions) - len(dates) == 3:
                dates = dates + [dates[-1]] + [dates[-1]] + [dates[-1]]
            if actions[-2:] == ['ENROLLED', 'VETOED'] and len(actions) - len(dates) > 0 :
                dates = dates[:-4] + [dates[-4]]  + [dates[-3]] + dates[-3:]

            if len(dates) < len(actions) and'ENROLLED' in actions and actions.index('ENROLLED') < len(actions) - 1:
                for i in range(len(actions) - actions.index('ENROLLED')):
                    dates = dates + [dates[-1]]
        if len(actions) + 1 == len(dates):
            dates = dates[:-1]
        try:
            action_df = pd.DataFrame({'date': dates, 'action': actions})
        except:
            return None, None, None
        action_df['date'] = pd.to_datetime(action_df['date'], errors='coerce')
        order_df = action_df.loc[~action_df['action'].isin(['FILED', 'PASSED_ASSEMBLY', 'PASSED_SENATE', 'APPROVED'])]
        repair_flag = False
        if order_df.shape[0] > len(versions_):
            version_ends = [re.search(r'INT|AMD|ENR|CHP|PRO', v).group() for v in versions_]
            if 'ENR' in version_ends:
                v_enr = version_ends.index('ENR')
                extension = [versions_[v_enr - 1] if v_enr - 1 != 0 else versions_[v_enr] for _ in range(len(order_df) - len(versions_))]
                versions_ = versions_[:v_enr] + extension + versions_[v_enr:]
            else:
                repair_flag = True
        vr = pd.DataFrame({'version': versions_})
        if vr.shape[0] == 0:
            return None, None, None
        try:
            vr['v_num'] = vr['version'].apply(lambda x: re.search(r'\d{2}(?=INT|AMD|ENR|CHP|PRO)', x).group()).astype(int)
        except:
            return None, None, None
        vr = vr.sort_values('v_num', ascending=False).reset_index(drop=True)
        if repair_flag:
            last_v = vr.loc[vr['version'].notna()].iloc[-1]['version']
            last_v_num = float(re.search(r'\d{2}(?=INT|AMD|ENR|CHP|PRO)', last_v).group())
            for i in range(len(order_df) - len(versions_)):
                vr.loc[len(vr) + i, 'version'] = last_v
                vr.loc[len(vr) + i, 'v_num'] = last_v_num

        order_df['version'] = vr['version']
        order_df['order'] = range(1, len(order_df) + 1)
        outcomes = order_df['action'].tolist()
        if 'CHAPTERED' in outcomes or 'FILED' in outcomes:
            if 'VETOED' in outcomes:
                outcome = 'VETOED'
            else:
                outcome = 'CHAPTERED'
        else:
            outcome = 'FAILED'
        return action_df, order_df, outcome

class BillVersion(Node):
    def __init__(self, bill_id, version_id, digest, vote_required, local_program, fiscal_com, tax_levy, urgency):
        vote_required = vote_required_codes.get(features_vote_required(vote_required), 0)
        local_program = bool_correction(local_program)
        fiscal_com = bool_correction(fiscal_com)
        tax_levy = bool_correction(tax_levy)
        urgency = bool_correction(urgency)
        features = {
            'digest': digest,
            'VoteRequired': vote_required,
            'LocalProgram': local_program,
            'FiscalCommittee': fiscal_com,
            'TaxLevy': tax_levy,
            'Urgency': urgency,
            'date': None
        }
        super().__init__(version_id, "bill_version", features)
        self.bill_id = bill_id
        self.actions = {}

    def add_actions(self, location, date):
        if location not in self.actions:
            self.actions[location] = []
        self.actions[location].append(date)

    def add_date(self, date):
        self.features['date'] = date

class Legislator(Node):
    def __init__(self, legislator_id, party, occupation):
        party = party_encoder.transform([party])[0] if party in party_encoder.classes_ else -1
        features = {
            'party': party,
            'occupation': occupation
        }
        super().__init__(legislator_id, "legislator", features)
        self.terms = []

class LegislatorTerm(Node):
    def __init__(self, term, legislator_id, chamber, district):
        chamber = chamber_encoder.transform([chamber])[0] if chamber in chamber_encoder.classes_ else -1
        features = {
            'chamber': chamber,
            'district': district,
            'term': term
        }
        node_id = f"{legislator_id}_{term}_{chamber}"
        super().__init__(node_id, "legislator_term", features)
        self.committees = []
        self.committee_positions = []

    def add_committee(self, committee_id):
        self.committees.append(committee_id)

    def add_committee_position(self, committee_id, position):
        self.committee_positions.append((committee_id, position))

class Committee(Node):
    def __init__(self, committee_id, name, chamber, term):
        chamber = chamber_encoder.transform([chamber])[0] if chamber in chamber_encoder.classes_ else -1
        features = {
            'name': name,
            'chamber': chamber
        }
        term_ = term.split('-')[0]
        id = f"{committee_id}_{term_}"
        super().__init__(id, "committee", features)
        self.members = []

    def add_member(self, legislator_id):
        self.members.append(legislator_id)

class LobbyFirm(Node):
    def __init__(self, firm_id, name):
        features = {
            'name': name
        }
        super().__init__(firm_id, "lobby_firm", features)
        self.total_donations = 0

    def add_donation(self, amount):
        self.total_donations += amount

class Donor(Node):
    def __init__(self, donor_id, name):
        features = {
            'name': name
        }
        super().__init__(donor_id, "donor", features)
        self.total_donations = 0

    def add_donation(self, amount):
        self.total_donations += amount

class Vote(Edge):
    def __init__(self, legislator, bill_version, vote, motion, date, direction):
        attributes = {
            'vote': 1 if vote == 'AYE' else -1 if (vote == 'NOE' or vote == 'NO') else 0,
            'motion': motion,
            'date': date
        }
        if direction == 1:
            super().__init__(legislator, bill_version, 'voted_on', attributes)
        else:
            super().__init__(bill_version, legislator, 'vote_from', attributes)

class CommitteeMembership(Edge):
    def __init__(self, legislator, committee, position, direction):
        position = pos_encoder.transform([position])[0] if position in pos_encoder.classes_ else pos_encoder.transform(['member'])[0]
        attributes = {
            'position': position
        }
        if direction == 1:
            super().__init__(legislator, committee, 'member_of', attributes)
        else:
            super().__init__(committee, legislator, 'has_member', attributes)
        committee.add_member(legislator)

class Sponsorship(Edge):
    def __init__(self, legislator, bill_version, author_type, direction):
        author_type = author_levels.get(author_type_map.get(author_type, 'AUTHOR'))
        attributes = {
            'author_type': author_type
        }
        if direction == 1:
            super().__init__(legislator, bill_version, 'wrote', attributes)
        else:
            super().__init__(bill_version, legislator, 'written_by', attributes)

class Reading(Edge):
    def __init__(self, bill, committee, date, direction):
        attributes = {
            'date': date
        }
        if direction == 1:
            super().__init__(committee, bill, 'read', attributes)
        else:
            super().__init__(bill, committee, 'read_by', attributes)

class Donation(Edge):
    def __init__(self, donor, recipient, amount, date, type, direction=1):
        attributes = {
            'amount': amount,
            'date': date
        }
        if type == 'CampaignContribution':
            if direction == 1:
                super().__init__(donor, recipient, 'donated_to', attributes)
            else:
                super().__init__(recipient, donor, 'has_donation', attributes)
        else:
            if direction == 1:
                super().__init__(donor, recipient, 'lobbied', attributes)
            else:
                super().__init__(recipient, donor, 'has_lobbying', attributes)

class Version(Edge):
    def __init__(self, bill_version, bill, direction):
        if direction == 1:
            super().__init__(bill_version, bill, 'is_version')
        else:
            super().__init__(bill, bill_version, 'has_version')

class siblingVersion(Edge):
    def __init__(self, version1, version2, direction):
        if direction == 1:
            super().__init__(version1, version2, 'priorVersion')
        else:
            super().__init__(version2, version1, 'nextVersion')

class samePerson(Edge):
    def __init__(self, node1, node2):
        super().__init__(node1, node2, 'samePerson')

In [83]:
class GraphBuilder:
    def __init__(self):
        self.nodes = {}
        self.edges = []
        self.versions = []
        self.nodes_by_type = defaultdict(list)
        self.edges_by_type = defaultdict(list)
        self._type_counters = defaultdict(int)

    def add_version(self, version):
        self.versions.append(version)

    def add_node(self, node):
        key = (node.type, node.id)
        if key not in self.nodes:
            self.nodes[key] = node
        idx = self._type_counters[node.type]
        self._type_counters[node.type] += 1
        self.nodes_by_type[node.type].append(node)

    def get_node(self, type_, id_):
        return self.nodes.get((type_, id_))

    def add_edge(self, edge):
        self.edges.append(edge)
        etype = (edge.source.type, edge.relation, edge.target.type)
        self.edges_by_type[etype].append(edge)

    def build(self):
        return {
            "nodes": list(self.nodes.values()),
            "edges": self.edges,
            "nodes_by_type": self.nodes_by_type,
            "edges_by_type": self.edges_by_type
        }

    def add_bills(self, bill_ids, titles, subjects, titles_embs, subjects_embs, features):
        def process_single_bill(bill):
            try:
                title = text_clean(titles.get(bill, ''))
                subject = text_clean(subjects.get(bill, ''))
                title_emb = titles_embs.get(title, None)
                subject_emb = subjects_embs.get(subject, None)
                measure_type = re.search(r'[A-Za-z]+', bill).group()
                bill_node = Bill(bill, title_emb, subject_emb, measure_type)

                versions = version_id_mapping.get(bill, [])
                versions_ = []
                dates = introduction_dates.get(bill, {}).get('Dates', [])
                try:
                    fd = sorted(list(set(dates)))[0]
                except IndexError:
                    y = int(bill[:4])
                    fd = pd.Timestamp(year=y, month=2, day=1)
                bill_node.add_date(fd)

                if not versions:
                    return

                self.add_node(bill_node)

                for version in versions:
                    digest = features[version]['digest']
                    if str(digest) == 'nan' or version.endswith('VETO'):
                        continue

                    digest_emb = digest_embeddings.get(digest, None)
                    if digest_emb is None:
                        continue

                    version_node = BillVersion(
                        bill, version, digest_emb,
                        features[version]['VoteRequired'],
                        features[version]['LocalProgram'],
                        features[version]['FiscalCommittee'],
                        features[version]['TaxLevy'],
                        features[version]['Urgency']
                    )

                    self.add_node(version_node)
                    if version not in self.versions:
                        self.versions.append(version)
                    versions_.append(version)


                orders = [vnums.get(v) for v in versions]
                sorted_versions = [s for _, s in sorted(zip(orders, versions))]
                for i, s in enumerate(sorted_versions):
                    v = self.get_node('bill_version', s)
                    if v is None or bill_node is None:
                        continue
                    self.add_edge(Version(v, bill_node, 1))
                    if i > 0:
                        prev_v = self.get_node('bill_version', sorted_versions[i - 1])
                        if prev_v is not None:
                            self.add_edge(siblingVersion(prev_v, v, 1))
                o = outcome.get(bill, 0)
                bill_node.add_outcome(o)
            except Exception as e:
                raise ValueError(f"Error processing bill {bill}: {e}")

        for bill in tqdm(bill_ids):
            process_single_bill(bill)

    def add_legislators(self, legislators_):
        for legislator in tqdm(legislators_):
            leg_name = legislators[legislator]
            party = leg_parties.get(leg_name)
            occupation = leg_occupations.get(leg_name)
            occ_embedding = occ_embeddings.get(occupation, None)
            legislator_node = Legislator(legislator, party, occ_embedding)
            self.add_node(legislator_node)
            terms = politicians.loc[politicians['full_name'] == leg_name, ['Term', 'District No.', 'chamber']].drop_duplicates()
            for _, term in terms.iterrows():
                term_node = LegislatorTerm(term['Term'], legislator, term['chamber'], term['District No.'])
                self.add_node(term_node)
                self.add_edge(samePerson(legislator_node, term_node))

    def add_committees(self, committees_df):
        for _, row in tqdm(committees_df[['committee_clean', 'Term', 'chamber']].drop_duplicates().iterrows(), total=committees_df[['committee_clean', 'Term', 'chamber']].drop_duplicates().shape[0]):
            committee_name = committee_embeddings.get(row['committee_clean'].lower(), None)
            committee_id = committee_codes.get(row['committee_clean'].lower(), None)
            chamber = chamber_encoder.transform([row['chamber']])[0] if row['chamber'] in chamber_encoder.classes_ else -1
            committee_node = Committee(committee_id, committee_name, chamber, row['Term'])
            self.add_node(committee_node)
            term = row['Term']
            members = politicians.loc[(politicians['committee_clean'] == row['committee_clean']) & (politicians['Term'] == row['Term']), ['position', 'full_name', 'chamber']].drop_duplicates()
            for _, member in members.iterrows():
                leg_id = legislator_codes[member['full_name']]
                chamber = chamber_encoder.transform([member['chamber']])[0] if member['chamber'] in chamber_encoder.classes_ else -1
                leg_node_id = f"{leg_id}_{term}_{chamber}"
                leg_node = self.get_node('legislator_term', leg_node_id)
                if (leg_node is not None) & (committee_node is not None):
                    self.add_edge(CommitteeMembership(leg_node, committee_node, member['position'], 1))
                    committee_node.add_member(leg_node_id)
                    leg_node.add_committee(committee_name)

    def add_votes(self):
        for _, row in tqdm(voting.loc[voting['bill_ID'].isin(bill_ids)].iterrows(), total=voting.loc[voting['bill_ID'].isin(bill_ids)].shape[0]):
            bv_id = row['bv_id']
            v_node = self.get_node('bill_version', bv_id)
            if v_node is None:
                continue
            last = row['legislator_name'].strip().lower()
            house = chamber_encoder.transform([row['chamber']])[0] if row['chamber'] in chamber_encoder.classes_ else -1
            legislator = legislators_last_names.get((row['chamber'].lower(), last, row['term']), None)
            if legislator is None:
                if len(last.split(' ')) > 1:
                    legislator = row['legislator_name']
                else:
                    continue
            legislator_id = legislator_codes.get(legislator, None)
            leg_term_node = self.get_node('legislator_term', f"{legislator_id}_{row['term']}_{house}")
            if leg_term_node is None:
                continue
            vote = row['vote_code']
            motion_id = row['motion_id']
            motion_text = motion_codes.get(motion_id, None)
            if motion_text is None:
                continue
            if row['location_code'] not in ['AFLOOR', 'SFLOOR']:
                actions = v_node.actions
                if row['location_code'] in actions:
                    if row['vote_date_time'] not in actions[row['location_code']]:
                        actions[row['location_code']].append(row['vote_date_time'])
            if motion_text is None:
                motion_embedding = ''
            else:
                motion_embedding = motion_embeddings.get(motion_text, None)
            self.add_edge(Vote(leg_term_node, v_node, vote, motion_embedding, row['vote_date_time'], 1))


    def add_readings(self):
        for _, row in tqdm(hear.loc[hear['bill_id'].isin(bill_ids)].iterrows(), total=hear.loc[hear['bill_id'].isin(bill_ids)].shape[0]):
            b_node = self.get_node('bill', row['bill_id'])
            if b_node is None:
                continue
            location = row['committee_clean']
            if location is None or location == '':
                continue
            term = row['year']
            committee_id = committee_codes.get(str(location).lower(), None)
            if committee_id is None:
                continue
            committee_node = self.get_node('committee', f"{committee_id}_{term}")
            if committee_node is None:
                continue
            self.add_edge(Reading(committee_node, b_node, term, 1))
        for _, row in tqdm(voting.loc[(voting['bv_id'].notna()) & (voting['voting_place'].notna()) & (voting['bill_ID'].isin(bill_ids)), ['bv_id', 'Date', 'voting_place']].drop_duplicates().iterrows()):
            bv_node = self.get_node('bill_version', row['bv_id'])
            if bv_node is None:
                continue
            location = row['voting_place']
            if location is None or location == '':
                continue
            date = pd.Timestamp(row['Date'])
            committee_id = committee_codes.get(str(location).lower(), None)
            if committee_id is None:
                continue
            year = int(date.year)
            committee_node = self.get_node('committee', f"{committee_id}_{year}")
            if committee_node is None:
                continue
            self.add_edge(Reading(committee_node, bv_node, date, 1))


    def add_sponsorships(self, sponsors):
        for _, row in tqdm(sponsors.loc[sponsors['Name'].apply(lambda x: isinstance(x, str))].iterrows(), total=sponsors.shape[0]):
            version = row['bill_ID']
            version_node = self.get_node('bill_version', version)
            if version_node is None:
                continue
            if row['House'] == 'UNKNOWN':
                com = author_com_matches.get(row['Name'], None)
                year = row['term'].split('-')[0]
                com_node = self.get_node('committee', f"{com}_{year}")
                if com_node is None:
                   continue
                self.add_edge(Sponsorship(com_node, version_node, row['Contribution'], 1))
                continue
            else:
                if row['Name'].strip() in ['Mark Stone', 'Cristina Garcia', 'John Campbell', 'Bill Campbell', 'Eduardo Garcia']:
                    leg_name = row['Name']
                else:
                    leg_name = row['full_name']
                    if leg_name is None or leg_name == '':
                        continue
                house = chamber_encoder.transform([row['House'].lower()])[0] if row['House'].lower() in chamber_encoder.classes_ else -1
                leg_id = legislator_codes.get(leg_name)
                leg_node = self.get_node('legislator_term', f"{leg_id}_{row['term']}_{house}")
                if leg_node is None:
                    continue

                self.add_edge(Sponsorship(leg_node, version_node, row['Contribution'], 1))


    def add_lobbyists(self, lobbyists):
        for key in tqdm(lobbyists.keys(), total=len(lobbyists.keys())):
            lobbyist = LobbyFirm(text_clean(key), lobbyists[key])
            self.add_node(lobbyist)

    def add_donations(self, donations):
        for _, row in tqdm(donations.iterrows(), total=donations.shape[0]):
            firm = row['FIRM_NAME']
            firm_node = self.get_node('lobby_firm', text_clean(firm))
            if firm_node is None:
                continue
            if row['clean_beneficiary'] in committee_codes:
                com = committee_codes.get(row['clean_beneficiary'])
                year = row['term'].split('-')[0]
                com_node = self.get_node('committee', f"{com}_{year}")
                if com_node is None:
                    continue
                self.add_edge(Donation(firm_node, com_node, row['BENE_AMT'], row['EXPN_DATE'], 'Lobbying', 1))
                firm_node.add_donation(row['BENE_AMT'])

            else:
                dicti = pol_names_terms.get((row['clean_beneficiary'], row['term']), None)
                chamber = dicti['chamber'] if dicti is not None else None
                name = dicti['name'] if dicti is not None else None
                if chamber is None or name is None:
                    continue
                chamber = chamber_encoder.transform([chamber])[0] if chamber in chamber_encoder.classes_ else -1
                leg_id = legislator_codes.get(name)
                leg_node = self.get_node('legislator_term', f"{leg_id}_{row['term']}_{chamber}")
                if leg_node is None:
                    continue
                self.add_edge(Donation(firm_node, leg_node, row['BENE_AMT'], row['EXPN_DATE'], 'Lobbying', 1))
                firm_node.add_donation(row['BENE_AMT'])

    def add_donors(self, donors):

        for donor in tqdm(donors.keys(), total=len(donors.keys())):
            donor_embedding = donors[donor]
            donor = Donor(donor, donor_embedding)
            self.add_node(donor)

    def add_contributions(self, contributions):
        for _, row in tqdm(contributions.iterrows(), total=contributions.shape[0]):
            expender = row['ExpenderName']
            expender_node = self.get_node('donor', expender)
            if expender_node is None:
                continue
            recipient = row['matched_target_name']
            recipient_id = legislator_codes.get(recipient)
            chamber = chamber_encoder.transform([row['chamber']])[0] if row['chamber'] in chamber_encoder.classes_ else -1
            recipient_node = self.get_node('legislator_term', f"{recipient_id}_{row['Term']}_{chamber}")
            if recipient_node is None:
                continue
            self.add_edge(Donation(expender_node, recipient_node, row['Amount'], row['DateEnd'], 'CampaignContribution', 1))
            expender_node.add_donation(row['Amount'])

In [84]:
builder = GraphBuilder()
builder.add_bills(bill_ids, bill_titles, bill_subjects, title_embeddings, subject_embeddings, features)
builder.add_legislators(legislators)
builder.add_sponsorships(sponsors)

100%|██████████| 47669/47669 [07:19<00:00, 108.53it/s]
100%|██████████| 506/506 [00:02<00:00, 239.80it/s]
100%|█████████▉| 702154/702157 [01:18<00:00, 8979.93it/s] 


In [85]:
builder.add_committees(politicians)
builder.add_lobbyists(lobbying_firms_embeddings)
builder.add_donations(lob)
builder.add_donors(donor_embeddings)
builder.add_contributions(campaign_contributions)

100%|██████████| 1699/1699 [00:09<00:00, 187.06it/s]
100%|██████████| 1165/1165 [00:00<00:00, 215258.33it/s]
100%|██████████| 122317/122317 [00:04<00:00, 24704.74it/s]
100%|██████████| 506/506 [00:00<00:00, 379595.39it/s]
100%|██████████| 9151/9151 [00:00<00:00, 10447.89it/s]


In [86]:
builder.add_votes()
builder.add_readings()

100%|██████████| 5965170/5965170 [3:04:11<00:00, 539.76it/s]   
100%|██████████| 120596/120596 [00:02<00:00, 46107.76it/s]
99460it [02:31, 658.37it/s]  


In [87]:
graph_data = builder.build()

In [88]:
nodes = graph_data['nodes']
edges = graph_data['edges']
nodes_by_type = graph_data['nodes_by_type']
edges_by_type = graph_data['edges_by_type']

In [89]:
rev_subject_embeddings = {v: k for k, v in subject_embeddings.items()}
bill_subjects = {}

for node in nodes_by_type['bill']:
    if node.features['subject'] in rev_subject_embeddings:
        subj = rev_subject_embeddings[node.features['subject']]
        bill_subjects[node.id] = subj

with open('bill_subjects.json', 'w') as f:
    json.dump(bill_subjects, f)

In [90]:
old = [g for g in globals() if g not in ['builder', 'graph_data', 'nodes', 'edges', 'nodes_by_type', 'edges_by_type', 'rev_subject_embeddings', 'bill_subjects', 'author_type_map', 'author_levels', 'measure_types', 'parties', 'chambers', 'outcome_mapping', 'measure_encoder', 'party_encoder', 'chamber_encoder', 'pos_encoder', 'feature_encoders', 'positions']]

In [91]:
for o in old:
    if o in globals():
        del globals()[o]

In [92]:
import torch
data = torch.load('data4.pt', weights_only=False)

In [94]:
import datetime
import hashlib
import json
import re
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from torch_geometric.data import HeteroData


timestamp_edges = [
    ('donor', 'donated_to', 'legislator_term'),
    ('legislator_term', 'rev_donated_to', 'donor'),
    ('lobby_firm', 'lobbied', 'legislator_term'),
    ('lobby_firm', 'lobbied', 'committee'),
    ('committee', 'rev_lobbied', 'lobby_firm'),
    ('legislator_term', 'rev_lobbied', 'lobby_firm'),
    ('bill_version', 'rev_voted_on', 'legislator_term'),
    ('legislator_term', 'voted_on', 'bill_version'),
    ('committee', 'read', 'bill'),
    ('bill', 'rev_read', 'committee'),
]
timestamp_nodes = ['legislator_term', 'bill_version', 'bill']

def _to_ts(t: Any) -> float:
    try:
        if isinstance(t, (int, float)) and 1900 <= t and t <= 2100:
            return datetime.datetime(int(t), 6, 15).timestamp()
        elif (isinstance(t, str) or isinstance(t, float)) and (float(t) < 2100 and float(t) > 1900):
            return datetime.datetime(int(float(t)), 6, 15).timestamp()
        elif float(t) > 0 and float(t) < 1990:
            return float(t)
        elif float(t) > 17000000.0:
            return float(t)
        elif isinstance(t, datetime.datetime):
            return t.timestamp()
        else:
            return float(t) * 1e9
    except:
        return datetime.datetime(2000, 6, 15).timestamp()

def _is_time_key(k: Any) -> bool:
    if k is None:
        return False
    s = str(k).lower()
    return ('date' in s) or ('time' in s) or (s in ('term', 'year'))

def _parse_term_value(v: Any) -> Any:
    if v is None:
        return None
    if isinstance(v, str):
        m = re.search(r'\d{4}', v)
        if m:
            return int(m.group(0))
    if isinstance(v, (list, tuple)) and len(v) > 0:
        return _parse_term_value(v[0])
    return v

def _looks_like_datetime_string(s: str) -> bool:
    s = s.strip()
    if not s:
        return False
    if re.search(r'\d{4}-\d{1,2}-\d{1,2}', s):
        return True
    if re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', s):
        return True
    if re.search(r'(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)', s.lower()) and re.search(r'\d{4}', s):
        return True
    return False

def _stable_hash_to_int(s: str) -> int:
    return int(hashlib.blake2s(s.encode('utf-8'), digest_size=4).hexdigest(), 16)

# —— vector parsing helpers ——

_VECTOR_DICT_KEYS = ('embedding', 'vector', 'values', 'data', 'array')

def _maybe_unwrap_dict_container(x: Any) -> Any:
    if isinstance(x, dict):
        for k in _VECTOR_DICT_KEYS:
            if k in x:
                return x[k]
    return x

def _parse_numeric_string_vector(s: str) -> np.ndarray | None:
    # Try JSON list first
    s2 = s.strip()
    if not s2:
        return None
    if s2.startswith('[') and s2.endswith(']'):
        try:
            arr = json.loads(s2)
            return _to_float_array(arr)
        except Exception:
            pass
    # Try comma-separated or whitespace-separated floats
    if (',' in s2) or (re.search(r'\s', s2) and len(s2.split()) > 1):
        tokens = [t for t in re.split(r'[,\s]+', s2) if t]
        try:
            vals = [float(t) for t in tokens]
            return np.asarray(vals, dtype=np.float32)
        except Exception:
            return None
    return None

def _to_float_array(x: Any) -> np.ndarray | None:
    x = _maybe_unwrap_dict_container(x)

    # torch.Tensor
    if isinstance(x, torch.Tensor):
        try:
            arr = x.detach().cpu().numpy()
            return arr.astype(np.float32).reshape(-1)
        except Exception:
            return None

    # pandas Series
    if isinstance(x, pd.Series):
        try:
            arr = x.to_numpy()
            return np.asarray(arr, dtype=np.float32).reshape(-1)
        except Exception:
            return None

    # numpy array
    if isinstance(x, np.ndarray):
        if x.dtype.kind in 'iuf':
            return x.astype(np.float32).reshape(-1)
        # object/other → try elementwise float cast
        try:
            flat = x.reshape(-1)
            vals = [float(v) for v in flat]
            return np.asarray(vals, dtype=np.float32)
        except Exception:
            return None

    # list/tuple (possibly nested)
    if isinstance(x, (list, tuple)):
        try:
            arr = np.asarray(x, dtype=np.float32)
            if arr.dtype.kind in 'iuf':
                return arr.reshape(-1)
            # if dtype not numeric, try elementwise coercion
            flat = np.array([float(v) for v in _flatten_once(x)], dtype=np.float32)
            return flat.reshape(-1)
        except Exception:
            # last resort: try manual elementwise
            try:
                vals = [float(v) for v in _flatten_once(x)]
                return np.asarray(vals, dtype=np.float32).reshape(-1)
            except Exception:
                return None

    # numeric scalar (keep as length-1 vector)
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        return np.asarray([float(x)], dtype=np.float32)

    # datetime-like strings → leave to time path elsewhere
    if isinstance(x, str):
        if _looks_like_datetime_string(x):
            return None
        # vector-like string?
        vec = _parse_numeric_string_vector(x)
        if vec is not None:
            return vec.reshape(-1)
        return None

    return None

def _flatten_once(seq):
    for el in seq:
        if isinstance(el, (list, tuple, np.ndarray)):
            for sub in (el if isinstance(el, (list, tuple)) else el.tolist()):
                yield sub
        else:
            yield el

def _as1d_numeric_or_time(x: Any) -> np.ndarray | None:
    if x is None:
        return None
    if isinstance(x, bool):
        return np.array([1.0 if x else 0.0], dtype=np.float32)
    if isinstance(x, (pd.Timestamp, np.datetime64)):
        return np.array([pd.to_datetime(x, errors='coerce').value / 1e9], dtype=np.float32)
    # try general numeric/vector coercion
    return _to_float_array(x)

def _collect_keys_allow_all(dicts: List[Dict[str, Any]]) -> List[str]:
    keys = set()
    for d in dicts:
        if not d:
            continue
        keys.update(d.keys())
    return sorted(keys)

def _key_dims_with_default(dicts: List[Dict[str, Any]], keys: List[str], default_dim: int = 1) -> Dict[str, int]:
    dims: Dict[str, int] = {}
    for k in keys:
        m = 0
        for d in dicts:
            v = d.get(k, None)
            arr = _as1d_numeric_or_time(v)
            if arr is not None:
                m = max(m, int(arr.shape[0]))
        dims[k] = m if m > 0 else default_dim
    return dims

def _fit_categorical_maps(
    dicts: List[Dict[str, Any]],
    keys: List[str],
    dims: Dict[str, int],
    max_card_per_key: int = 50000
) -> Dict[str, Dict[str, int]]:
    cat_maps: Dict[str, Dict[str, int]] = {}
    for k in keys:
        if dims[k] != 1:
            continue
        seen: Dict[str, int] = {}
        next_id = 1  # 0 reserved for missing/unknown
        for d in dicts:
            v = d.get(k, None)
            if v is None:
                continue
            if isinstance(v, str) and not _looks_like_datetime_string(v) and _parse_numeric_string_vector(v) is None:
                if v not in seen:
                    if len(seen) < max_card_per_key:
                        seen[v] = next_id
                        next_id += 1
        if seen:
            cat_maps[k] = seen
    return cat_maps

def _encode_scalar_from_category(v: Any, k: str, cat_maps: Dict[str, Dict[str, int]]) -> float:
    if v is None:
        return 0.0
    if isinstance(v, bool):
        return 1.0 if v else 0.0
    if isinstance(v, (int, float, np.integer, np.floating)) and not isinstance(v, bool):
        return float(v)
    if isinstance(v, str) and not _looks_like_datetime_string(v) and _parse_numeric_string_vector(v) is None:
        mapping = cat_maps.get(k, None)
        if mapping is not None:
            return float(mapping.get(v, 0))
        return float((_stable_hash_to_int(v) % 65535) + 1)
    return 0.0

def _stack_with_categories(
    dicts: List[Dict[str, Any]],
    keys: List[str],
    dims: Dict[str, int],
    cat_maps: Dict[str, Dict[str, int]]
) -> torch.Tensor:
    if not keys:
        return torch.empty((len(dicts), 0), dtype=torch.float32)
    rows: List[np.ndarray] = []
    for d in dicts:
        parts: List[np.ndarray] = []
        for k in keys:
            v = d.get(k, None)
            arr = _as1d_numeric_or_time(v)
            if arr is None:
                if dims[k] == 1:
                    parts.append(np.array([_encode_scalar_from_category(v, k, cat_maps)], dtype=np.float32))
                else:
                    parts.append(np.zeros(dims[k], dtype=np.float32))
            else:
                if arr.shape[0] < dims[k]:
                    pad = np.zeros(dims[k], dtype=np.float32)
                    pad[:arr.shape[0]] = arr
                    arr = pad
                elif arr.shape[0] > dims[k]:
                    arr = arr[:dims[k]]
                parts.append(arr.astype(np.float32))
        rows.append(np.concatenate(parts, axis=0))
    mat = np.vstack(rows).astype(np.float32)
    mat = np.nan_to_num(mat, nan=0.0, posinf=0.0, neginf=0.0)
    return torch.from_numpy(mat)


def _prep_node_dicts(ntype: str, nlist: List[Any]) -> List[Dict[str, Any]]:
    dicts: List[Dict[str, Any]] = []
    use_time = ntype in timestamp_nodes
    for n in nlist:
        d = dict((n.features or {}))
        if use_time and d:
            for k in list(d.keys()):
                if _is_time_key(k):
                    v = d[k]
                    if k == 'term':
                        v = _parse_term_value(v)
                    d[k] = _to_ts(v)
        dicts.append(d)
    return dicts

def _prep_edge_dicts_filtered(
    etype: Tuple[str, str, str],
    elist: List[Any],
    id_map_src: Dict[Any, int],
    id_map_dst: Dict[Any, int],
) -> Tuple[List[int], List[int], List[Dict[str, Any]]]:
    use_time = etype in timestamp_edges
    src_idx: List[int] = []
    dst_idx: List[int] = []
    dicts: List[Dict[str, Any]] = []
    for e in elist:
        si = id_map_src.get(e.source.id, None)
        di = id_map_dst.get(e.target.id, None)
        if si is None or di is None:
            continue
        d = dict((e.attributes or {}))
        if use_time and d:
            for k in list(d.keys()):
                if _is_time_key(k):
                    d[k] = _to_ts(d[k])
        src_idx.append(si)
        dst_idx.append(di)
        dicts.append(d)
    return src_idx, dst_idx, dicts


def _load_bill_labels(path: str):
    with open(path, 'r') as f:
        raw = json.load(f)
    vals = list(raw.values())
    if all(isinstance(v, (int, np.integer)) for v in vals):
        label_map = {}
        inv = None
    else:
        uniq = sorted(set(map(str, vals)))
        label_map = {s: i for i, s in enumerate(uniq)}
        inv = {i: s for s, i in label_map.items()}
    return raw, label_map, inv


def build_heterodata_compact_with_time_and_labels(gb, labels_json_path: str):
    data = HeteroData()

    # map original IDs to 0..N-1 per node type
    id_maps: Dict[str, Dict[Any, int]] = {
        ntype: {n.id: i for i, n in enumerate(nlist)}
        for ntype, nlist in gb.nodes_by_type.items()
    }

    # — Nodes
    for ntype, nlist in gb.nodes_by_type.items():
        if not nlist:
            continue
        dicts = _prep_node_dicts(ntype, nlist)
        keys = _collect_keys_allow_all(dicts)
        dims = _key_dims_with_default(dicts, keys, default_dim=1)
        cat_maps = _fit_categorical_maps(dicts, keys, dims)
        x = _stack_with_categories(dicts, keys, dims, cat_maps)
        if x.numel() == 0:
            data[ntype].num_nodes = len(nlist)
        else:
            data[ntype].x = x

    # — Edges
    for etype, elist in gb.edges_by_type.items():
        st, rel, dt = etype
        if not elist or st not in id_maps or dt not in id_maps:
            continue

        id_map_src = id_maps[st]
        id_map_dst = id_maps[dt]

        src, dst, dicts = _prep_edge_dicts_filtered(etype, elist, id_map_src, id_map_dst)
        if not src:
            continue

        edge_index = torch.tensor([src, dst], dtype=torch.long)
        keys = _collect_keys_allow_all(dicts)
        dims = _key_dims_with_default(dicts, keys, default_dim=1)
        cat_maps = _fit_categorical_maps(dicts, keys, dims)
        edge_attr = _stack_with_categories(dicts, keys, dims, cat_maps)

        data[etype].edge_index = edge_index
        if edge_attr.numel() > 0:
            data[etype].edge_attr = edge_attr

    # — Bill labels & outcomes
    if 'bill' in gb.nodes_by_type and gb.nodes_by_type['bill']:
        raw_labels, label_map, _ = _load_bill_labels(labels_json_path)
        bills = gb.nodes_by_type['bill']
        clusters: List[int] = []
        ys: List[int] = []
        for n in bills:
            bid = n.id
            v = raw_labels.get(bid, None)
            if v is None:
                clusters.append(-1)
            else:
                if label_map:
                    clusters.append(label_map.get(str(v), -1))
                else:
                    try:
                        clusters.append(int(v))
                    except:
                        clusters.append(-1)
            yv = getattr(n, 'outcome', None)
            ys.append(-1 if yv is None else int(yv))
        data['bill'].cluster = torch.tensor(clusters, dtype=torch.long)
        data['bill'].y = torch.tensor(ys, dtype=torch.long)

    # — Final numeric cleanup
    for store in (list(data.node_stores) + list(data.edge_stores)):
        for k, v in list(store.items()):
            if isinstance(v, torch.Tensor) and v.dtype.is_floating_point:
                torch.nan_to_num_(v, nan=0.0, posinf=0.0, neginf=0.0)

    return data, id_maps

data, id_maps = build_heterodata_compact_with_time_and_labels(builder, 'bill_labels.json')

In [96]:
torch.save(data, 'data4.pt', _use_new_zipfile_serialization=True)

In [97]:
import json
with open('node_id_map.json', 'w') as f:
    json.dump(id_maps, f)

In [98]:
for ntype in data.node_types:
    print(f"Node type '{ntype}': {data[ntype].num_nodes} nodes, feature shape: {data[ntype].x.shape if 'x' in data[ntype] else 'N/A'}")
for etype in data.edge_types:
    print(f"Edge type '{etype}': {data[etype].edge_index.shape[1]} edges, feature shape: {data[etype].edge_attr.shape if 'edge_attr' in data[etype] else 'N/A'}")

Node type 'bill': 47658 nodes, feature shape: torch.Size([47658, 770])
Node type 'bill_version': 200955 nodes, feature shape: torch.Size([200955, 390])
Node type 'legislator': 506 nodes, feature shape: torch.Size([506, 385])
Node type 'legislator_term': 1448 nodes, feature shape: torch.Size([1448, 4])
Node type 'committee': 1699 nodes, feature shape: torch.Size([1699, 65])
Node type 'lobby_firm': 1165 nodes, feature shape: torch.Size([1165, 384])
Node type 'donor': 506 nodes, feature shape: torch.Size([506, 64])
Edge type '('bill_version', 'is_version', 'bill')': 200955 edges, feature shape: N/A
Edge type '('bill_version', 'priorVersion', 'bill_version')': 154342 edges, feature shape: N/A
Edge type '('legislator', 'samePerson', 'legislator_term')': 1448 edges, feature shape: N/A
Edge type '('legislator_term', 'wrote', 'bill_version')': 582740 edges, feature shape: torch.Size([582740, 1])
Edge type '('legislator_term', 'member_of', 'committee')': 17563 edges, feature shape: torch.Size([