In [1]:
import pandas as pd
import re
import numpy as np
import warnings
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

## Data

In [2]:
summary_votes = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')
summary_votes.columns

Index(['bill_id', 'location_code', 'vote_date_time', 'motion_id', 'ayes',
       'noes', 'abstain', 'vote_result', 'year'],
      dtype='object')

In [3]:
bill_history = pd.read_csv('ca_leg/legislation_data/bill_history_tbl.csv', dtype={'action_status': str, 'primary_location': str, 'secondary_location': str, 'end_status': str})
bill_history.columns

Index(['bill_id', 'bill_history_id', 'action_date', 'action_',
       'action_sequence', 'action_code', 'action_status', 'primary_location',
       'secondary_location', 'end_status', 'year'],
      dtype='object')

In [4]:
authors = pd.read_csv('ca_leg/legislation_data/authors.csv')
authors.columns

Index(['bill_id', 'Contribution', 'House', 'Name'], dtype='object')

In [5]:
history = pd.read_csv('ca_leg/legislation_data/history.csv')
history.columns

Index(['bill_id', 'Action', 'Date'], dtype='object')

In [6]:
versions = pd.read_csv('ca_leg/legislation_data/bill_versions.csv')
versions.columns

Index(['bill_id', 'MeasureType', 'Urgency', 'MeasureNum', 'GeneralSubject',
       'VersionNum', 'Appropriation', 'SessionYear', 'SessionNum',
       'VoteRequired', 'LocalProgram', 'FiscalCommittee', 'MeasureState',
       'TaxLevy', 'Title'],
      dtype='object')

In [7]:
bill_votes = pd.read_csv('ca_leg/legislation_data/bill_detail_vote_tbl.csv', parse_dates=['session_date'])
bill_votes.columns

Index(['bill_id', 'location_code', 'legislator_name', 'vote_date_time',
       'vote_date_seq', 'vote_code', 'motion_id', 'member_order',
       'session_date', 'year'],
      dtype='object')

In [8]:
bill_summary = pd.read_csv('ca_leg/legislation_data/bill_summary_vote_tbl.csv')
bill_summary.columns

Index(['bill_id', 'location_code', 'vote_date_time', 'motion_id', 'ayes',
       'noes', 'abstain', 'vote_result', 'year'],
      dtype='object')

In [9]:
bill_motions = pd.read_csv('ca_leg/legislation_data/bill_motion_tbl.csv')
bill_motions.columns

Index(['motion_id', 'motion_text', 'year'], dtype='object')

In [10]:
locations = pd.read_csv('ca_leg/legislation_data/committee_codes.csv')
locations.columns

Index(['committee_code', 'committee_name'], dtype='object')

In [11]:
politicians = pd.read_csv('ca_leg/legislation_data/politicians.csv')
politicians.columns

Index(['committee_clean', 'position', 'Occupation', 'Party', 'District No.',
       'Seat No.', 'Term', 'Last', 'full_name', 'chamber'],
      dtype='object')

In [12]:
lobbying = pd.read_csv('calaccess/lobbying_clean2.csv', dtype={'PAYEE_NAMS': str, 'BAKREF_TID': str})
lobbying.columns

Index(['FILING_ID', 'FIRM_NAME', 'AMEND_ID', 'LINE_ITEM', 'REC_TYPE',
       'FORM_TYPE', 'TRAN_ID', 'RECSUBTYPE', 'ENTITY_CD', 'PAYEE_NAML',
       'PAYEE_NAMF', 'PAYEE_NAMT', 'PAYEE_NAMS', 'PAYEE_CITY', 'PAYEE_ST',
       'PAYEE_ZIP4', 'CREDCARDCO', 'BENE_NAME', 'BENE_POSIT', 'BENE_AMT',
       'EXPN_DSCR', 'EXPN_DATE', 'AMOUNT', 'MEMO_CODE', 'MEMO_REFNO',
       'BAKREF_TID', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27',
       'Unnamed: 28', 'clean_beneficiary'],
      dtype='object')

In [13]:
expend_assembly = pd.read_csv('calaccess/expend_assembly_matched.csv', dtype={'TargetPropositionName': str})
expend_assembly.columns

Index(['TargetCandidateName', 'TargetCandidateOffice', 'TargetPropositionName',
       'ExpenderPosition', 'ExpenderName', 'ExpenderID', 'Amount',
       'ExpenditureDscr', 'PayeeName', 'DateStart', 'DateEnd', 'DateRange',
       'year', 'term_x', 'matched_target_name', 'term_y', 'politician',
       'committee', 'position', 'committee_clean', 'Occupation', 'Party',
       'District No.', 'Seat No.', 'Term', 'First', 'Last', 'Position',
       'full_name', 'chamber', 'target_name'],
      dtype='object')

In [14]:
expend_senate = pd.read_csv('calaccess/expend_senate_matched.csv', dtype={'TargetPropositionName': str})
expend_senate.columns

Index(['TargetCandidateName', 'TargetCandidateOffice', 'TargetPropositionName',
       'ExpenderPosition', 'ExpenderName', 'ExpenderID', 'Amount',
       'ExpenditureDscr', 'PayeeName', 'DateStart', 'DateEnd', 'DateRange',
       'matched_target_name', 'term', 'politician', 'committee', 'position',
       'committee_clean', 'Name', 'Occupation', 'Party', 'District No.',
       'Seat No.', 'Phone', 'Counties', 'pages', 'Last', 'Term', 'full_name',
       'chamber'],
      dtype='object')

In [15]:
digests = pd.read_csv('ca_leg/legislation_data/digest.csv')
digests.columns

Index(['bill_id', 'DigestText'], dtype='object')

## Cleaning

In [16]:
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time']).apply(lambda x: x.strftime('%Y-%m-%d'))

In [17]:
bill_votes['legislator_name'] = bill_votes['legislator_name'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

In [18]:
ACTION_KEYWORDS = [
    "Assembly Third Reading", "Assembly 3rd reading", 'senate 3rd reading', "Senate Third Reading","Concurrence - Urgency Added", "Concurrence in Senate Amendments", "Do pass as amended, and re-refer", "Do pass as amended, but re-refer", "Do pass as amended", "Do pass and be re-referred",
    "Concurrence", "Consent Calendar", "Urgency Clause", "Special Consent",
    "Motion to Reconsider", "Do pass", "Reconsideration", "Committee amendments",
    "W/O REF. TO FILE", "Be re-referred to the Committee",
    "Lay on the Table", "Amend by", "Unfinished Business", "Placed on Appropriations Suspense File",
]

def extract_action(motion_text):
    if not isinstance(motion_text, str) or motion_text is None:
        return None
    motion = motion_text.upper()

    action = next((act for act in ACTION_KEYWORDS if act.upper() in motion), None)
    if action != 'Reconsideration' and 'RECONSIDER' in motion:
        if action is not None:
            action += ' Reconsideration'
        else:
            action = 'Reconsideration'


    return action if action else None

In [19]:
bill_motions['simplified_motion'] = bill_motions['motion_text'].apply(extract_action)

In [20]:
clean_coms = {}
for i, row in locations.iterrows():
    if row['committee_code'].startswith('CZ'):
        continue
    name = row['committee_name']
    if row['committee_code'].startswith('CS'):
        if name.startswith('Sen.'):
            cname = re.sub(r'Sen. ', 'senate ', name).lower()
        elif name.startswith('Senate '):
            cname = name.lower()
        else:
            cname = 'senate ' + name.lower()
    elif row['committee_code'].startswith('CX'):
        if name.lower().startswith('assembly'):
            cname = name.lower()
        else:
            cname = 'assembly ' + name.lower()
    if re.search(r'x\d$', cname) is not None:
        cname = re.sub(r'x(?=\d$)', 'no. ', cname)
    clean_coms[row['committee_code']] = cname

In [21]:
from rapidfuzz import fuzz, process

leg_committees = [f"{row['chamber']} {row['committee_clean']}".lower() for _, row in politicians[['committee_clean', 'chamber']].drop_duplicates().iterrows()]

def match_committees(_names, clean_coms, threshold=92):
    clean_c = list(clean_coms.values())
    clean_codes = list(clean_coms.keys())
    name_mapping = {}
    for i, clean in enumerate(clean_c):
        code = clean_codes[i]
        matches = []
        matches.append(process.extractOne(
            clean,
            _names,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=threshold
        ))
        matches.append(process.extractOne(
            clean,
            _names,
            scorer=fuzz.partial_ratio,
            score_cutoff=threshold
        ))
        valid_matches = [m for m in matches if m is not None]
        if len(valid_matches) > 0:
            best_match = max(valid_matches, key=lambda x: x[1])
            name_mapping[code] = best_match[0]
        else:
            fall_back = process.extractOne(
                clean,
                _names,
                scorer=fuzz.token_sort_ratio,
                score_cutoff=threshold - 8
            )
            if fall_back is not None:
                name_mapping[code] = fall_back[0]
            else:
                name_mapping[code] = None
    return name_mapping

committee_matches = match_committees(leg_committees, clean_coms)

In [22]:
locations['committee_clean'] = locations['committee_code'].map(committee_matches)

In [23]:
locations.loc[locations['committee_name'] == 'EDUCATION X5', 'committee_clean'] = 'Budget and Fiscal Review: Education'
locations.loc[locations['committee_code'] == 'CX12', 'committee_clean'] = 'Budget No. 1 on Health and Human Services'
locations.loc[locations['committee_code'] == 'CS68', 'committee_clean'] = 'Budget No. 3 - Health and Human Services'
locations.loc[locations['committee_code'] == 'CS66', 'committee_clean'] = 'Senate Veterans Affairs'
locations.loc[locations['committee_code'] == 'CS56', 'committee_clean'] = 'Senate Public Employment and Retirement'
locations.loc[locations['committee_code'] == 'CS62', 'committee_clean'] = 'Senate Budget and Fiscal Review'
locations.loc[locations['committee_code'] == 'CX23', 'committee_clean'] = 'Assembly Utilities and Commerce'

In [24]:
motion_codes = {
    row['motion_id']: row['simplified_motion']
    for _, row in bill_motions.iterrows()
}

In [25]:
summary_votes['motion_text'] = summary_votes['motion_id'].map(motion_codes)

In [26]:
def repair_bill_id(id):
    front, end = id[:4], id[4:]
    if re.search(r'\d{4}$', front):
        return f"{front}{int(front) + 1}{end}"
    else:
        return id

In [27]:
versions['ID'] = versions['bill_id'].apply(lambda x: repair_bill_id(x))

In [28]:
bill_vers = versions.loc[versions['bill_id'].str.startswith('2')]
for i, row in bill_vers.iterrows():
    tail = f"{row['VersionNum']}{row['MeasureState']}"
    repaired = repair_bill_id(re.sub(tail, '', row['bill_id']))
    end = int(repaired[-4:])

    bill_vers.loc[i, 'bill_ID'] = f"{repaired[:-4]}{end}"

In [29]:
legislators = {i: pol for i, pol in enumerate(politicians['full_name'].unique().tolist())}

leg_parties = {row['full_name']: row['Party'] for _, row in politicians[['full_name', 'Party']].drop_duplicates().iterrows()}
leg_occupations = {row['full_name']: row['Occupation'] for _, row in politicians[['full_name', 'Occupation']].drop_duplicates().iterrows()}
committees = {i: com for i, com in enumerate(politicians['committee_clean'].unique().tolist())}
lobby_firms = {i: firm for i, firm in enumerate(lobbying['FIRM_NAME'].unique().tolist())}

donor_names = list(set(expend_assembly['ExpenderName'].unique().tolist() + expend_senate['ExpenderName'].unique().tolist()))
donors = {i: donor for i, donor in enumerate(donor_names)}

In [30]:
import pickle

def save_dict(d, filename):
    with open(filename, 'wb') as f:
        pickle.dump(d, f)

for name, d in zip(
    ['legislators', 'committees', 'lobby_firms', 'donors'],
    [legislators, committees, lobby_firms, donors]
):
    save_dict(d, f'{name}.pkl')

In [30]:
bill_titles = {row['bill_ID']: row['Title'] for _, row in bill_vers[['bill_ID', 'Title']].drop_duplicates().iterrows()}

In [31]:
bill_subjects = {row['bill_ID']: row['GeneralSubject'] for _, row in bill_vers.loc[bill_vers['GeneralSubject'].apply(lambda x: x is not None and isinstance(x, str)), ['bill_ID', 'GeneralSubject']].drop_duplicates().iterrows()}

In [32]:
bill_ids = list(set(bill_votes.loc[bill_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist() + summary_votes.loc[summary_votes['bill_id'].str.startswith('2'), 'bill_id'].unique().tolist()))

In [33]:
bill_id_codes = {row['bill_id']: row['bill_ID'] for _, row in bill_vers.drop_duplicates(subset=['bill_id', 'bill_ID']).iterrows()}
history['bill_ID'] = history['bill_id'].map(bill_id_codes)

In [34]:
history['Date'] = pd.to_datetime(history['Date'])

In [35]:
introduction_dates = {}
for v, group in history.loc[history['bill_ID'].isin(bill_ids)].groupby('bill_ID'):
    introduction_dates[v] = {'Dates': group['Date'].unique().tolist(), 'Actions': group.sort_values('Date', ascending=True).drop_duplicates(subset=['Action', 'Date'])['Action'].tolist()}

In [36]:
version_id_mapping = {i: list(group.values) for i, group in bill_vers.groupby('bill_ID')['ID']}

In [37]:
vote_bill_ids = {}
for i in summary_votes.loc[summary_votes['bill_id'].isin(bill_ids)].groupby(['year', 'motion_id'])['bill_id'].value_counts().index:
    year, motion_id, bill_id = i
    if (year, motion_id) not in vote_bill_ids.keys():
        vote_bill_ids[(year, motion_id)] = [bill_id]
    else:
        vote_bill_ids[(year, motion_id)].append(bill_id)


In [38]:
bill_vers_dig = bill_vers.merge(digests, on='bill_id', how='inner')

In [39]:
legislators_last_names = {}
for _, row in politicians[['chamber', 'Last', 'Term', 'full_name']].drop_duplicates().iterrows():
    legislators_last_names[(row['chamber'], row['Last'].lower(), row['Term'])] = row['full_name']

In [40]:
features = {row['ID']: {
    'digest': row['DigestText'],
    'MeasureState': row['MeasureState'],
    'VoteRequired': row['VoteRequired'] if row['VoteRequired'] is not None else 'No',
    'VersionNum': row['VersionNum'] if row['VersionNum'] is not None else 'No',
    'LocalProgram': row['LocalProgram'] if row['LocalProgram'] is not None else 'No',
    'FiscalCommittee': row['FiscalCommittee'] if row['FiscalCommittee'] is not None else 'No',
    'TaxLevy': row['TaxLevy'] if row['TaxLevy'] is not None else 'No',
    'Urgency': row['Urgency'] if row['Urgency'] is not None else 'No'} for _, row in bill_vers_dig.iterrows()}

In [41]:
legislator_codes = {v: k for k, v in legislators.items()}

In [42]:
committee_codes = {v.lower(): k for k, v in committees.items()}

In [43]:
bill_votes['chamber'] = bill_votes['location_code'].apply(lambda x: 'assembly' if x == 'AFLOOR' or x.startswith('CX') else 'senate' if x == 'SFLOOR' or x.startswith('CS') else 'full')
bill_votes['vote_date_time'] = pd.to_datetime(bill_votes['vote_date_time'])
bill_votes['term'] = bill_votes['vote_date_time'].apply(lambda x: f"{x.year}-{x.year + 1}" if x.year % 2 == 1  else f"{x.year - 1}-{x.year}" if x.year % 2 == 0 and x < pd.Timestamp(year=x.year, month=11, day=2) else f"{x.year + 1}-{x.year + 2}")

In [44]:
author_locations = authors.loc[(authors['House'] == 'UNKNOWN') & (authors['bill_id'].map(bill_id_codes).isin(bill_ids)), ['bill_id', 'Name']].drop_duplicates()
for i, row in author_locations.iterrows():
    if 'AB' in row['bill_id']:
        author_locations.loc[i, 'name'] = 'Assembly ' + row['Name']
    elif 'SB' in row['bill_id']:
        author_locations.loc[i, 'name'] = 'Senate ' + row['Name']
    else:
        author_locations.loc[i, 'name'] = 'Joint ' + row['Name']

In [45]:
from fuzzywuzzy import fuzz

def fuzzy_strings(source_list, target_list):
    def preprocess_name(name):
        if not isinstance(name, str):
            return ""
        name = name.lower()
        name = re.sub(r'\(.*?\)', '', name)
        name = re.sub(r'committee on', '', name)
        name = re.sub(r'[^a-z\s]', ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        return name

    clean_source = [preprocess_name(c) for c in source_list]
    clean_target = [preprocess_name(c) for c in target_list]

    keywords = ["education", "health", "finance", "budget", "transportation",
                "judiciary", "environment", "agriculture", "energy", "labor",
                "housing", "veterans affairs", "public safety", "insurance", "banking", "public health", "small business", "redistricting",
                "public utilities", "natural resources", "water",
                "technology", "communications", "elections", "government",
                "appropriations", "rules", "ethics", 'criminal justice', "environmental protection", "college and university", "human services", "reproductive health", "mental health", "technology", "aggriculture", "urban development", "renewable energy", "gun violence", "commerce", "privacy", "cybersecurity", "infrastructure", "disaster preparedness", "prisons", "aging"]

    def get_committee_keywords(name):
        return set(kw for kw in keywords if kw in name)

    target_keywords = [get_committee_keywords(name) for name in clean_target]

    def calculate_similarity(source_idx, target_idx):
        source = clean_source[source_idx]
        target = clean_target[target_idx]

        if not source or not target:
            return 0

        if source == target:
            return 100

        token_sort = fuzz.token_sort_ratio(source, target)
        token_set = fuzz.token_set_ratio(source, target)
        partial = fuzz.partial_ratio(source, target)

        source_kw = get_committee_keywords(source)
        keyword_overlap = len(source_kw.intersection(target_keywords[target_idx]))
        keyword_bonus = min(20, keyword_overlap * 10)
        weighted_score = (token_sort * 0.3) + (token_set * 0.5) + (partial * 0.2) + keyword_bonus

        return weighted_score

    matches = {}
    for i, source in enumerate(source_list):
        scores = [calculate_similarity(i, j) for j in range(len(target_list))]

        if not scores or max(scores) < 60:
            matches[source] = None
        else:
            best_idx = np.argmax(scores)
            confidence = scores[best_idx]

            if confidence >= 60:
                matches[source] = target_list[best_idx]
            else:
                matches[source] = None

    return matches

In [46]:
author_com_matches = fuzzy_strings(author_locations['name'].unique().tolist(), leg_committees)

In [47]:
author_locations['name'] = author_locations['name'].map(author_com_matches)

In [48]:
sponsors = authors.loc[authors['bill_id'].map(bill_id_codes).isin(bill_ids)]
sponsors['term'] = sponsors['bill_id'].apply(lambda x: f"{x[:4]}-{int(x[:4]) + 1}" if int(x[:4]) % 2 == 1 else f"{int(x[:4]) - 1}-{x[:4]}" if int(x[:4]) % 2 == 0 and int(x[:4]) < 2009 else f"{x[:4]}-{int(x[:4]) + 1}")

In [49]:
lob = lobbying.loc[lobbying['clean_beneficiary'].notna(), ['FIRM_NAME', 'EXPN_DSCR', 'clean_beneficiary', 'EXPN_DATE', 'BENE_AMT']]
lob['EXPN_DATE'] = pd.to_datetime(lob['EXPN_DATE'])
lob['term'] = lob['EXPN_DATE'].apply(lambda x: f"{x.year}-{x.year + 1}" if x.year % 2 == 1  else f"{x.year - 1}-{x.year}" if x.year % 2 == 0 and x < pd.Timestamp(year=x.year, month=11, day=2) else f"{x.year}-{x.year + 1}")

In [50]:
pol_names_terms = {}
for _, row in politicians[['full_name', 'Term', 'chamber']].drop_duplicates().iterrows():
    pol_names_terms[(row['full_name'].lower(), row['Term'])] = {'chamber': row['chamber'], 'name': row['full_name']}

In [51]:
campaign_contributions = pd.concat([expend_assembly.loc[expend_assembly['full_name'].notna(), ['ExpenderName', 'Amount', 'full_name', 'Term', 'chamber', 'DateEnd']].drop_duplicates(subset=['ExpenderName', 'Amount', 'full_name', 'DateEnd']), expend_senate.loc[expend_senate['full_name'].notna(), ['ExpenderName', 'Amount', 'full_name', 'Term', 'chamber', 'DateEnd']].drop_duplicates(subset=['ExpenderName', 'Amount', 'full_name', 'DateEnd'])])

In [52]:
campaign_contributions['DateEnd'] = pd.to_datetime(campaign_contributions['DateEnd'])

In [53]:
sponsors['bill_ID'] = sponsors['bill_id'].apply(repair_bill_id)

## Embeddings

### Saved

In [54]:
model = SentenceTransformer('all-MiniLM-L6-v2', device='mps')

In [55]:
subjects = list(set([t for t in versions.loc[versions['bill_id'].str.startswith('2')]['GeneralSubject'].tolist() if (isinstance(t, str) and t is not None)]))
embeddings = model.encode(
    subjects,
    batch_size=64,
    show_progress_bar=True,
    convert_to_tensor=True,
    num_workers=4,
    normalize_embeddings=True,
    truncate_dim=64
)

Batches:   0%|          | 0/354 [00:00<?, ?it/s]

In [56]:
subject_embeddings = {subjects[i]: embeddings[i] for i in range(len(subjects))}

In [57]:
occ_embeddings = {}
for occ in list(set(list(leg_occupations.values()))):
    if isinstance(occ, str) and len(occ) > 0:
        occ_embeddings[occ] = model.encode(
            occ,
            convert_to_tensor=True,
            normalize_embeddings=True,
            truncate_dim=32)

In [58]:
titles = [t for t in bill_vers['Title'].unique().tolist() if (isinstance(t, str) and t is not None and t != '')]
title_embeddings = model.encode(
    titles,
    batch_size=32,
    show_progress_bar=True,
    convert_to_tensor=True,
    num_workers=4,
    normalize_embeddings=True,
    truncate_dim=64
)
title_embeddings = {titles[i]: title_embeddings[i] for i in range(len(titles))}

Batches:   0%|          | 0/1448 [00:00<?, ?it/s]

In [59]:
lobbying_firms_embeddings = {}
for firm in [firm for firm in lobbying.loc[(lobbying['clean_beneficiary'].notna()) & (lobbying['FIRM_NAME'])]['FIRM_NAME'].unique().tolist() if isinstance(firm, str)]:
    lobbying_firms_embeddings[firm] = model.encode(firm, convert_to_tensor=True, normalize_embeddings=True, truncate_dim=64)

In [None]:
lobbying_expense_embeddings = {}
for expense in tqdm([expense for expense in lobbying.loc[(lobbying['clean_beneficiary'].notna())]['EXPN_DSCR'].unique().tolist() if isinstance(expense, str)]):
    lobbying_expense_embeddings[expense] = model.encode(expense, convert_to_tensor=True, normalize_embeddings=True, truncate_dim=32)

 49%|████▉     | 2720/5541 [01:22<01:25, 32.93it/s]


KeyboardInterrupt: 

In [60]:
committee_embeddings = {}
for committee in politicians['committee_clean'].unique().tolist():
    co = re.sub(r'assembly|senate|committee|subcommittee', '', committee.lower())
    committee_embeddings[committee.lower()] = model.encode(co,  convert_to_tensor=True, normalize_embeddings=True, truncate_dim=64)

In [61]:
donor_embeddings = {}

for donor in tqdm(donor_names):
    donor_embeddings[donor] = model.encode(donor, convert_to_tensor=True, normalize_embeddings=True, truncate_dim=64)

100%|██████████| 1136/1136 [00:42<00:00, 26.44it/s]


In [62]:
motion_embeddings = {}
for motion in tqdm([t for t in pd.DataFrame.from_dict(motion_codes, orient='index').reset_index().rename({'index': 'motion_id', 0: 'motion_text'}, axis=1)['motion_text'].drop_duplicates().tolist() if t is not None]):
    motion_embeddings[motion] = model.encode(motion, convert_to_tensor=True, normalize_embeddings=True, truncate_dim=16)

100%|██████████| 33/33 [00:00<00:00, 63.04it/s]


```# digest_embeddings = {}
# for digest in tqdm(list(set([t if (isinstance(t, str) and t is not None) else '' for t in digests.loc[digests['bill_id'].str.startswith('2')]['DigestText'].tolist()]))):
    # digest_embeddings[digest] = model.encode(digest, convert_to_tensor=True, normalize_embeddings=True)
# torch.save(digest_embeddings, 'digests.pt')
```

### Load

In [63]:
digest_embeddings = torch.load('digests.pt')

In [64]:
positions = {p: 'Democratic Alternate' if re.search(r'Democratic\s*Alternate', p) is not None else 'Vice Chair' if re.search(r'V\s*i\s*c\s*e\s*-*\s*C\s*h\s*a\s*i\s*r\s*', p) is not None else 'Co-Chair' if re.search(r'Co\s*-\s*Chair', p) is not None else 'Chair' if re.search(r'Cha\s*i\s*r', p) is not None else 'Republican Alternate' if re.search(r'\s*Republican\s*Alternate', p) is not None else p for p in politicians['position'].unique()}

## OOP Graph-Construction

In [65]:
class Node:
    def __init__(self, id, type, features=None):
        self.id = id
        self.type = type
        self.features = features or {}

class Edge:
    def __init__(self, source, target, relation, attributes=None):
        self.source = source
        self.target = target
        self.relation = relation
        self.attributes = attributes or {}

class Bill(Node):
    def __init__(self, bill_id, title, subject, measure_type):
        features = {
            'title': title,
            'subject': subject,
            'measure_type': measure_type,
            'date': None,
        }
        super().__init__(bill_id, "bill", features)
        self.actions = None
        self.order_df = None
        self.outcome = None

    def add_actions(self, actions):
        if self.actions is None:
            self.actions = actions
        else:
            self.actions = pd.concat([self.actions, actions], ignore_index=True).drop_duplicates()

    def add_date(self, date):
        self.features['date'] = date

    def add_order_df(self, order_df):
        if self.order_df is None:
            self.order_df = order_df
        else:
            self.order_df = pd.concat([self.order_df, order_df], ignore_index=True).drop_duplicates()

    def add_outcome(self, outcome):
        self.outcome = outcome

    def align_actions_versions(self, bill, versions_, dates):
        dates = pd.Series(dates).sort_values(ascending=True).drop_duplicates().tolist()
        actions = [i for i in introduction_dates.get(bill, {}).get('Actions', []) if i != 'FILED']
        if len(actions) > len(dates):
            if actions[-2:] == ['ENROLLED', 'CHAPTERED'] or actions[-2:] == ['APPROVED', 'CHAPTERED'] or len(actions) <= 4 and actions[-1] == 'ENROLLED' or abs(len(dates) - len(actions)) >= 2 and actions[-1] == 'ENROLLED' or actions == ['INTRODUCED', 'ENROLLED', 'AMENDED_SENATE'] or actions[-1] == 'APPROVED' or len(actions) == 3 and all(a.startswith('PASSED_') for a in actions[-2:]) or actions == ['ENROLLED', 'INTRODUCED'] or actions == ['INTRODUCED', 'ENROLLED'] or actions == ['INTRODUCED', 'REVISED'] or len(actions) > 3 and actions[-3:] == ['ENROLLED', 'CORRECTED', 'CHAPTERED'] or len(actions) > 3 and actions[-3:] == ['PASSED_SENATE', 'PASSED_ASSEMBLY', 'AMENDED_SENATE'] or list(set(['INTRODUCED', 'PASSED_SENATE', 'PASSED_ASSEMBLY', 'AMENDED_SENATE'])) == list(set(actions)) or list(set(['INTRODUCED', 'PASSED_SENATE', 'PASSED_ASSEMBLY', 'AMENDED_ASSEMBLY'])) == list(set(actions)) or actions[-2] == 'ENROLLED' and actions[-1].startswith('PASSED_') or len(actions) > 5 and actions[-4] == 'CHAPTERED' and actions[-1].startswith('PASSED_') or actions[-2:] == ['PASSED_SENATE', 'PASSED_ASSEMBLY'] or actions == ['INTRODUCED', 'AMENDED_SENATE', 'PASSED_SENATE', 'PASSED_ASSEMBLY', 'AMENDED_ASSEMBLY'] or len(actions) == 9 and actions[-6:] == ['APPROVED', 'CHAPTERED', 'ENROLLED', 'AMENDED_SENATE', 'PASSED_ASSEMBLY', 'PASSED_SENATE'] or len(actions) > 5 and actions[-4:] == ['ENROLLED', 'PASSED_SENATE', 'APPROVED', 'PASSED_ASSEMBLY'] or dates == [pd.Timestamp('2008-12-08 00:00:00'), pd.Timestamp('2008-12-18 00:00:00')]:
                dates.append(dates[-1])
                if len(actions) > len(dates):
                    dates.append(dates[-1])
                    if len(actions) > len(dates):
                        dates.append(dates[-1])
                        if len(actions) > len(dates):
                            dates.append(dates[-1])
                            if len(actions) > len(dates):
                                dates.append(dates[-1])
            if len(dates) == 1 and len(actions) > 1:
                for _ in range(len(actions) - len(dates)):
                    dates.append(dates[0])
            if actions[-2:] == ['INTRODUCED', 'PASSED_ASSEMBLY']:
                dates = [dates[0]] + dates
            if len(actions) >= 6 and actions[:3] == ['INTRODUCED', 'AMENDED_ASSEMBLY', 'ENROLLED'] and actions[-3:] == ['CHAPTERED', 'APPROVED', 'CORRECTED']:
                dates = dates[:2] + [dates[2]] + dates[2:-2] + [dates[-2]] + dates[-2:]
            elif ('PASSED_ASSEMBLY' in actions and 'AMENDED_ASSEMBLY' in actions) or ('PASSED_SENATE' in actions and 'AMENDED_SENATE' in actions):
                if len(dates) == 3:
                    dates = dates[:1] + [dates[1]] + dates[1:]
                    if all(a for a in ['PASSED_ASSEMBLY', 'AMENDED_ASSEMBLY', 'PASSED_SENATE', 'AMENDED_SENATE'] if a in actions):
                        dates = dates[:1] + [dates[1]] + dates[1:]
                elif 'PROPOSED_CONFERENCE_REPORT_1' in actions and len(actions) - len(dates) == 2:
                    dates = dates[:2] + [dates[2]] + [dates[2]] + dates[2:]
                elif len(dates) > 3 and len(actions) - len(dates) > 0 and not (actions[-4:] == ['PASSED_ASSEMBLY', 'ENROLLED', 'PASSED_SENATE', 'CHAPTERED'] and 'AMENDED_SENATE' in actions):
                    dates = dates[:2] + [dates[2]] + dates[2:]
                if len(actions) > 4 and actions[-3:] == ['ENROLLED', 'PASSED_SENATE', 'CHAPTERED']:
                    dates = dates[:-2] + [dates[-2]] + dates[-2:]

            if len(actions) - len(dates) == 1:
                if 'CORRECTED' in actions:
                    actions.remove('CORRECTED')
                elif 'RESCIND' in actions:
                    actions.remove('RESCIND')

            if actions[-1] == 'CORRECTED' and len(actions) - len(dates) == 2:
                if len(dates) >= 5:
                    dates = dates[:3] + [dates[3]] + [dates[3]] + [dates[4]] + dates[4:]
                elif len(dates) == 2:
                    dates = [dates[0]] + [dates[0]] + dates[0:]
                else:
                    dates = dates[:2] + [dates[2]] + [dates[2]] + dates[2:]
            if actions[-1] == 'CHAPTERED' and len(actions) - len(dates) == 3:
                dates = dates + [dates[-1]] + [dates[-1]] + [dates[-1]]
            if actions[-2:] == ['ENROLLED', 'VETOED'] and len(actions) - len(dates) > 0 :
                dates = dates[:-4] + [dates[-4]]  + [dates[-3]] + dates[-3:]

            if len(dates) < len(actions) and'ENROLLED' in actions and actions.index('ENROLLED') < len(actions) - 1:
                for i in range(len(actions) - actions.index('ENROLLED')):
                    dates = dates + [dates[-1]]
        if len(actions) + 1 == len(dates):
            dates = dates[:-1]
        try:
            action_df = pd.DataFrame({'date': dates, 'action': actions})
        except:
            return None, None, None
        action_df['date'] = pd.to_datetime(action_df['date'], errors='coerce')
        order_df = action_df.loc[~action_df['action'].isin(['FILED', 'PASSED_ASSEMBLY', 'PASSED_SENATE', 'APPROVED'])]
        repair_flag = False
        if order_df.shape[0] > len(versions_):
            version_ends = [re.search(r'INT|AMD|ENR|CHP|PRO', v).group() for v in versions_]
            if 'ENR' in version_ends:
                v_enr = version_ends.index('ENR')
                extension = [versions_[v_enr - 1] if v_enr - 1 != 0 else versions_[v_enr] for _ in range(len(order_df) - len(versions_))]
                versions_ = versions_[:v_enr] + extension + versions_[v_enr:]
            else:
                repair_flag = True
        vr = pd.DataFrame({'version': versions_})
        if vr.shape[0] == 0:
            return None, None, None
        try:
            vr['v_num'] = vr['version'].apply(lambda x: re.search(r'\d{2}(?=INT|AMD|ENR|CHP|PRO)', x).group()).astype(int)
        except:
            return None, None, None
        vr = vr.sort_values('v_num', ascending=False).reset_index(drop=True)
        if repair_flag:
            last_v = vr.loc[vr['version'].notna()].iloc[-1]['version']
            last_v_num = float(re.search(r'\d{2}(?=INT|AMD|ENR|CHP|PRO)', last_v).group())
            for i in range(len(order_df) - len(versions_)):
                vr.loc[len(vr) + i, 'version'] = last_v
                vr.loc[len(vr) + i, 'v_num'] = last_v_num

        order_df['version'] = vr['version']
        order_df['order'] = range(1, len(order_df) + 1)
        outcomes = order_df['action'].tolist()
        if 'CHAPTERED' in outcomes or 'FILED' in outcomes:
            if 'VETOED' in outcomes:
                outcome = 'VETOED'
            else:
                outcome = 'CHAPTERED'
        else:
            outcome = 'FAILED'
        return action_df, order_df, outcome

class BillVersion(Node):
    def __init__(self, bill_id, version_id, digest, vote_required, local_program, fiscal_com, tax_levy, urgency):
        features = {
            'digest': digest,
            'VoteRequired': vote_required,
            'LocalProgram': local_program,
            'FiscalCommittee': fiscal_com,
            'TaxLevy': tax_levy,
            'Urgency': urgency,
            'date': None
        }
        super().__init__(version_id, "bill_version", features)
        self.bill_id = bill_id
        self.actions = {}

    def add_actions(self, location, date):
        if location not in self.actions:
            self.actions[location] = []
        self.actions[location].append(date)

    def add_date(self, date):
        self.features['date'] = date

class Legislator(Node):
    def __init__(self, legislator_id, party, occupation):
        features = {
            'party': party,
            'occupation': occupation
        }
        super().__init__(legislator_id, "legislator", features)
        self.terms = []

class LegislatorTerm(Node):
    def __init__(self, term, legislator_id, chamber, district):
        features = {
            'chamber': chamber,
            'district': district,
            'term': term
        }
        node_id = f"{legislator_id}_{term}_{chamber}"
        super().__init__(node_id, "legislator_term", features)
        self.committees = []
        self.committee_positions = []

    def add_committee(self, committee_id):
        self.committees.append(committee_id)

    def add_committee_position(self, committee_id, position):
        self.committee_positions.append((committee_id, position))

class Committee(Node):
    def __init__(self, committee_id, name, chamber, term):
        features = {
            'name': name,
            'chamber': chamber
        }
        term_ = term.split('-')[0]
        id = f"{committee_id}_{term_}"
        super().__init__(id, "committee", features)
        self.members = []

    def add_member(self, legislator_id):
        self.members.append(legislator_id)

class LobbyFirm(Node):
    def __init__(self, firm_id, name):
        features = {
            'name': name
        }
        super().__init__(firm_id, "lobby_firm", features)
        self.total_donations = 0

    def add_donation(self, amount):
        self.total_donations += amount

class Donor(Node):
    def __init__(self, donor_id, name):
        features = {
            'name': name
        }
        super().__init__(donor_id, "donor", features)
        self.total_donations = 0

    def add_donation(self, amount):
        self.total_donations += amount

class Vote(Edge):
    def __init__(self, legislator, bill_version, vote, motion, date, direction):
        attributes = {
            'vote': 1 if vote == 'AYE' else -1 if vote == 'NOE' else 0,
            'motion': motion,
            'date': date
        }
        if direction == 1:
            super().__init__(legislator, bill_version, 'voted_on', attributes)
        else:
            super().__init__(bill_version, legislator, 'vote_from', attributes)

class CommitteeMembership(Edge):
    def __init__(self, legislator, committee, position, direction):
        attributes = {
            'position': position
        }
        if direction == 1:
            super().__init__(legislator, committee, 'member_of', attributes)
        else:
            super().__init__(committee, legislator, 'has_member', attributes)
        committee.add_member(legislator)

class Sponsorship(Edge):
    def __init__(self, legislator, bill_version, author_type, direction):
        attributes = {
            'author_type': author_type
        }
        if direction == 1:
            super().__init__(legislator, bill_version, 'wrote', attributes)
        else:
            super().__init__(bill_version, legislator, 'written_by', attributes)

class Reading(Edge):
    def __init__(self, bill_version, committee, date, direction):
        attributes = {
            'date': date
        }
        if direction == 1:
            super().__init__(committee, bill_version, 'read', attributes)
        else:
            super().__init__(bill_version, committee, 'read_by', attributes)

class Donation(Edge):
    def __init__(self, donor, recipient, amount, date, type, direction=1):
        attributes = {
            'amount': amount,
            'date': date
        }
        if type == 'CampaignContribution':
            if direction == 1:
                super().__init__(donor, recipient, 'donated_to', attributes)
            else:
                super().__init__(recipient, donor, 'has_donation', attributes)
        else:
            if direction == 1:
                super().__init__(donor, recipient, 'lobbied', attributes)
            else:
                super().__init__(recipient, donor, 'has_lobbying', attributes)

class Version(Edge):
    def __init__(self, bill_version, bill, order, direction):
        attributes = {
            'order': order
        }
        if direction == 1:
            super().__init__(bill_version, bill, 'is_version', attributes)
        else:
            super().__init__(bill, bill_version, 'has_version', attributes)

class siblingVersion(Edge):
    def __init__(self, version1, version2, direction):
        if direction == 1:
            super().__init__(version1, version2, 'priorVersion')
        else:
            super().__init__(version2, version1, 'nextVersion')

class samePerson(Edge):
    def __init__(self, node1, node2):
        super().__init__(node1, node2, 'samePerson')

In [66]:
class GraphBuilder:
    def __init__(self):
        self.nodes = {}
        self.edges = []
        self.versions = []

    def add_version(self, version):
        self.versions.append(version)

    def add_node(self, node):
        key = (node.type, node.id)
        if key not in self.nodes:
            self.nodes[key] = node

    def get_node(self, type_, id_):
        return self.nodes.get((type_, id_))

    def add_edge(self, edge):
        self.edges.append(edge)

    def build(self):
        return list(self.nodes.values()), self.edges

    def add_bills(self, bill_ids, titles, subjects, titles_embs, subjects_embs, features):
        def process_single_bill(bill):
            try:
                title = titles.get(bill, '')
                subject = subjects.get(bill, '')
                title_emb = titles_embs.get(title, None)
                subject_emb = subjects_embs.get(subject, None)
                measure_type = re.search(r'[A-Za-z]+', bill).group()
                bill_node = Bill(bill, title_emb, subject_emb, measure_type)

                versions = version_id_mapping.get(bill, [])
                versions_ = []
                dates = introduction_dates.get(bill, {}).get('Dates', [])
                fd = sorted(list(set(dates)))[0]
                bill_node.add_date(fd)

                if not versions:
                    return

                for version in versions:
                    digest = features[version]['digest']
                    if str(digest) == 'nan' or version.endswith('VETO'):
                        continue

                    digest_emb = digest_embeddings.get(digest, None)
                    if digest_emb is None:
                        continue

                    version_node = BillVersion(
                        bill, version, digest_emb,
                        features[version]['VoteRequired'],
                        features[version]['LocalProgram'],
                        features[version]['FiscalCommittee'],
                        features[version]['TaxLevy'],
                        features[version]['Urgency']
                    )

                    self.add_node(version_node)
                    if version not in self.versions:
                        self.versions.append(version)
                    versions_.append(version)

                action_df, order_df, outcome = bill_node.align_actions_versions(bill, versions_, dates)
                bill_node.add_actions(action_df)
                bill_node.add_order_df(order_df)
                bill_node.add_outcome(outcome)

                self.add_node(bill_node)
                for i, row in order_df.drop_duplicates(subset='version').iterrows():
                    version_node = self.get_node('bill_version', row['version'])
                    version_node.add_date(row['date'])
                    if version_node:
                        self.add_edge(Version(version_node, bill_node, row['order'], 1))
                        if i < len(order_df) - 1:
                            v2_node = self.get_node('bill_version', order_df.iloc[i + 1]['version'])
                            if v2_node:
                                self.add_edge(siblingVersion(version_node, v2_node, 1))
            except:
                pass
        for bill in tqdm(bill_ids):
            process_single_bill(bill)

    def add_legislators(self, legislators_):
        for legislator in tqdm(legislators_):
            leg_name = legislators[legislator]
            party = leg_parties.get(leg_name)
            occupation = leg_occupations.get(leg_name)
            occ_embedding = occ_embeddings.get(occupation, None)
            legislator_node = Legislator(legislator, party, occ_embedding)
            self.add_node(legislator_node)
            terms = politicians.loc[politicians['full_name'] == leg_name, ['Term', 'District No.', 'chamber']].drop_duplicates()
            for _, term in terms.iterrows():
                term_node = LegislatorTerm(term['Term'], legislator, term['chamber'], term['District No.'])
                self.add_node(term_node)
                self.add_edge(samePerson(legislator_node, term_node))

    def add_committees(self, committees_df):
        for _, row in tqdm(committees_df[['committee_clean', 'Term', 'chamber']].drop_duplicates().iterrows(), total=committees_df[['committee_clean', 'Term', 'chamber']].drop_duplicates().shape[0]):
            committee_name = committee_embeddings.get(row['committee_clean'].lower(), None)
            committee_id = committee_codes.get(row['committee_clean'].lower(), None)
            committee_node = Committee(committee_id, committee_name, row['chamber'], row['Term'])
            self.add_node(committee_node)
            term = row['Term']
            members = politicians.loc[(politicians['committee_clean'] == row['committee_clean']) & (politicians['Term'] == row['Term']), ['position', 'full_name', 'chamber']].drop_duplicates()
            for _, member in members.iterrows():
                leg_id = legislator_codes[member['full_name']]
                leg_node_id = f"{leg_id}_{term}_{member['chamber']}"
                leg_node = self.get_node('legislator_term', leg_node_id)
                self.add_edge(CommitteeMembership(leg_node, committee_node, member['position'], 1))
                committee_node.add_member(leg_node_id)
                leg_node.add_committee(committee_name)

    def add_votes(self):
        for _, row in tqdm(bill_votes.loc[bill_votes['bill_id'].isin(bill_ids)].iterrows(), total=bill_votes.loc[bill_votes['bill_id'].isin(bill_ids)].shape[0]):
            bill_node = self.get_node('bill', row['bill_id'])
            if bill_node is None:
                continue
            order_df = bill_node.order_df
            if order_df is None:
                continue
            try:
                v = order_df.loc[order_df['date'] <= row['vote_date_time'], 'version'].values[-1]
            except:
                v = order_df['version'].values[-1]
            if v is None:
                continue
            v_node = self.get_node('bill_version', v)
            if v_node is None:
                continue
            last = row['legislator_name'].strip().lower()
            legislator = legislators_last_names.get((row['chamber'], last, row['term']), None)
            if legislator is None:
                if len(last.split(' ')) > 1:
                    legislator = row['legislator_name']
                else:
                    continue
            legislator_id = legislator_codes.get(legislator, None)
            leg_term_node = self.get_node('legislator_term', f"{legislator_id}_{row['term']}_{row['chamber']}")
            if leg_term_node is None:
                continue
            vote = row['vote_code']
            motion_id = row['motion_id']
            motion_text = motion_codes.get(motion_id, None)
            if motion_text is None:
                continue
            if row['location_code'] not in ['AFLOOR', 'SFLOOR']:
                actions = v_node.actions
                if row['location_code'] in actions:
                    if row['vote_date_time'] not in actions[row['location_code']]:
                        actions[row['location_code']].append(row['vote_date_time'])
            if motion_text is None:
                motion_embedding = ''
            else:
                motion_embedding = motion_embeddings.get(motion_text, None)
            self.add_edge(Vote(leg_term_node, v_node, vote, motion_embedding, row['vote_date_time'], 1))


    def add_readings(self):
        for version in self.versions:
            v_node = self.get_node('bill_version', version)
            if v_node is None:
                continue
            actions = v_node.actions
            for location, dates in actions.items():
                if location.startswith('CS') or location.startswith('CX'):
                    com = committee_matches.get(location)
                    if com is None:
                        continue
                    committee_id = committee_codes.get(com.lower(), None)
                    for date in dates:
                        year = date.year
                        if year % 2 == 0:
                            year -= 1
                        com_node = self.get_node('committee', f"{committee_id}_{year}")
                        if com_node is None:
                            continue
                        self.add_edge(Reading(v_node, com_node, date, 1))

    def add_sponsorships(self, sponsors):
        for _, row in tqdm(sponsors.iterrows(), total=sponsors.shape[0]):
            version = row['bill_ID']
            version_node = self.get_node('bill_version', version)
            if version_node is None:
                continue
            if row['House'] == 'UNKNOWN':
                com = author_com_matches.get(row['Name'], None)
                year = row['term'].split('-')[0]
                com_node = self.get_node('committee', f"{com}_{year}")
                if com_node is None:
                   continue
                self.add_edge(Sponsorship(com_node, version_node, row['Contribution'], 1))
            else:
                if row['Name'] in ['Mark Stone', 'Cristina Garcia', 'John Campbell', 'Bill Campbell', 'Eduardo Garcia']:
                    leg_id = legislator_codes.get(row['Name'])
                    leg_node = self.get_node('legislator_term', f"{leg_id}_{row['term']}_{row['House'].lower()}")
                else:
                    try:
                        name = re.sub(r'\'', '', re.sub(r'-', ' ', row['Name'])).lower().strip()
                    except:
                        continue
                    leg_name = legislators_last_names.get((row['House'].lower(), name, row['term']), None)
                    if leg_name is None:
                        continue
                    leg_id = legislator_codes.get(leg_name)
                    leg_node = self.get_node('legislator_term', f"{leg_id}_{row['term']}_{row['House'].lower()}")
                self.add_edge(Sponsorship(leg_node, version_node, row['Contribution'], 1))


    def add_lobbyists(self, lobbyists):
        for key in tqdm(lobbyists.keys(), total=len(lobbyists.keys())):
            lobbyist = LobbyFirm(key, lobbyists[key])
            self.add_node(lobbyist)

    def add_donations(self, donations):
        for _, row in tqdm(donations.iterrows(), total=donations.shape[0]):
            firm = row['FIRM_NAME']
            firm_node = self.get_node('lobby_firm', firm)
            if firm_node is None:
                continue
            if row['clean_beneficiary'] in committee_codes:
                com = committee_codes.get(row['clean_beneficiary'])
                year = row['term'].split('-')[0]
                com_node = self.get_node('committee', f"{com}_{year}")
                if com_node is None:
                    continue
                self.add_edge(Donation(firm_node, com_node, row['BENE_AMT'], row['EXPN_DATE'], 'Lobbying', 1))
                firm_node.add_donation(row['BENE_AMT'])

            else:
                dicti = pol_names_terms.get((row['clean_beneficiary'], row['term']), None)
                chamber = dicti['chamber'] if dicti is not None else None
                name = dicti['name'] if dicti is not None else None
                if chamber is None or name is None:
                    continue
                leg_id = legislator_codes.get(name)
                leg_node = self.get_node('legislator_term', f"{leg_id}_{row['term']}_{chamber}")
                if leg_node is None:
                    continue
                self.add_edge(Donation(firm_node, leg_node, row['BENE_AMT'], row['EXPN_DATE'], 'Lobbying', 1))
                firm_node.add_donation(row['BENE_AMT'])

    def add_donors(self, donors):
        for donor in tqdm(donors.keys(), total=len(donors.keys())):
            donor_embedding = donors[donor]
            donor = Donor(donor, donor_embedding)
            self.add_node(donor)

    def add_contributions(self, contributions):
        for _, row in tqdm(contributions.iterrows(), total=contributions.shape[0]):
            expender = row['ExpenderName']
            expender_node = self.get_node('donor', expender)
            if expender_node is None:
                continue
            recipient = row['full_name']
            recipient_id = legislator_codes.get(recipient)
            recipient_node = self.get_node('legislator_term', f"{recipient_id}_{row['Term']}_{row['chamber']}")
            if recipient_node is None:
                continue
            self.add_edge(Donation(expender_node, recipient_node, row['Amount'], row['DateEnd'], 'CampaignContribution', 1))
            expender_node.add_donation(row['Amount'])

In [67]:
builder = GraphBuilder()
builder.add_bills(bill_ids, bill_titles, bill_subjects, title_embeddings, subject_embeddings, features)
builder.add_legislators(legislators)
builder.add_sponsorships(sponsors)

100%|██████████| 45373/45373 [03:51<00:00, 196.06it/s]
100%|██████████| 508/508 [00:00<00:00, 1009.19it/s]
100%|██████████| 674859/674859 [00:12<00:00, 55946.84it/s]


In [68]:
builder.add_committees(politicians)
builder.add_lobbyists(lobbying_firms_embeddings)
builder.add_donations(lob)
builder.add_donors(donor_embeddings)
builder.add_contributions(campaign_contributions)

100%|██████████| 1882/1882 [00:02<00:00, 659.85it/s]
100%|██████████| 1325/1325 [00:00<00:00, 888055.74it/s]
100%|██████████| 104110/104110 [00:01<00:00, 66174.77it/s]
100%|██████████| 1136/1136 [00:00<00:00, 856657.56it/s]
100%|██████████| 7028/7028 [00:00<00:00, 62291.20it/s]


In [69]:
builder.add_votes()

100%|██████████| 6636032/6636032 [10:42<00:00, 10323.58it/s] 


In [70]:
nodes, edges = builder.build()

In [71]:
author_type_map = {
    'LEAD_AUTHOR': 'LEAD_AUTHOR',
    'PRINCIPAL_COAUTHOR': 'PRINCIPAL_COAUTHOR',
    'COAUTHOR': 'COAUTHOR',
    'data="COAUTHOR"': 'COAUTHOR',
    'data="LEAD_AUTHOR"': 'LEAD_AUTHOR',
    'data="PRINCIPAL_COAUTHOR"': 'PRINCIPAL_COAUTHOR',
    'nan': 'AUTHOR'
}
author_levels = {
    'AUTHOR': 1,
    'COAUTHOR': 1,
    'PRINCIPAL_COAUTHOR': 2,
    'LEAD_AUTHOR': 3
}

In [73]:
from sklearn.preprocessing import LabelEncoder

measure_types = bill_vers['MeasureType'].unique()
parties = politicians['Party'].unique()
chambers = politicians['chamber'].unique()
outcome_mapping = {'CHAPTERED': 1, 'VETOED': 0, 'FAILED': -1, 'ENROLLED': 1}
measure_encoder = LabelEncoder()
measure_encoder.fit(measure_types)
party_encoder = LabelEncoder()
party_encoder.fit(parties)
chamber_encoder = LabelEncoder()
chamber_encoder.fit(chambers)
pos = list(positions.values()) + ['member']
pos_encoder = LabelEncoder()
pos_encoder.fit(pos)

from torch_geometric.data import HeteroData
import numpy as np

In [None]:
feature_encoders = {
    'measure_type': measure_encoder,
    'party': party_encoder,
    'chamber': chamber_encoder,
    'position': pos_encoder
}

def get_concat_features(attrs, node=False):
    features = []
    for k, v in attrs.items():
        if isinstance(v, torch.Tensor):
            features.append(v.detach().cpu().numpy())
        elif isinstance(v, np.ndarray):
            try:
                features.append(v[:, 0])
            except:
                features.append(v.reshape(-1))
        else:
            if (v is None) or (v == ''):
                vals = 0
                if node == True:
                    if k in ['title', 'subject', 'digest', 'occupation', 'name']:
                        vals = np.zeros((1, 384), dtype=np.float32).reshape(-1)
            elif k == 'date':
                vals = v.timestamp() if v is not None and v is not pd.NaT else 0
            elif k == 'author_type':
                vals = author_levels[author_type_map[str(v)]] if str(v) in author_levels else 1
            elif k == 'term':
                vals = int(v.split('-')[0]) if v is not None else 2000
            elif k == 'district':
                t = re.sub(r'[^\d]', '', str(v)) if str(v) != '' else 0
                vals = float(t) if t != '' else 0.0
            elif k in ['VoteRequired', 'LocalProgram', 'FiscalCommittee', 'TaxLevy', 'Urgency']:
                vals = int(str(v).lower() == 'yes') if v is not None else 0
            elif k in feature_encoders:
                if k == 'position':
                    vals = pos_encoder.transform([positions.get(v, 'member')])[0] if v is not None else 0
                else:
                    vals = feature_encoders[k].transform([v])[0] if v is not None else 0
            else:
                try:
                    t = float(v)
                    vals = v
                except:
                    vals = 0
            features.append([vals])
    try:
        x = np.concatenate(features, axis=-1).astype(np.float32).reshape(1, -1)
        return x
    except:
        return 'bad'

In [122]:
from collections import defaultdict
node_id_map = defaultdict(dict)
nodes_by_type = defaultdict(list)
edges_by_type = defaultdict(list)
clean_edges = []

for node in nodes:
    nodes_by_type[node.type].append(node)
for edge in edges:
    try:
        edge_type = (edge.source.type, edge.relation, edge.target.type)
        edges_by_type[edge_type].append(edge)
        clean_edges.append(edge)
    except:
        pass

In [142]:
data = HeteroData()
node_attrs = defaultdict(list)

for ntype, nlist in nodes_by_type.items():
    outcomes = []
    bad_count = 0
    bad_ids = []
    for node in nlist:
        attrs = node.features
        x_attr = get_concat_features(attrs, node=True)
        if not isinstance(x_attr, np.ndarray):
            if x_attr == 'bad':
                bad_count += 1
                bad_ids.append(node.id)
                continue
        elif 'bad' in x_attr:
            bad_count += 1
            bad_ids.append(node.id)
            continue
        node_attrs[ntype].append(x_attr)
        if ntype == 'bill':
            outcome = node.outcome
            o = outcome_mapping.get(outcome, -1)
            outcomes.append(o)
    if ntype == 'bill':
        data[ntype].y = np.array(outcomes, dtype=np.int64)
    data[ntype].x = np.vstack(node_attrs[ntype])
    data[ntype].num_nodes = len(nlist) - bad_count
    node_id_map[ntype] = {node.id: i for i, node in enumerate(nlist) if node.id not in bad_ids}


In [131]:
edge_indices = defaultdict(list)
edge_attrs = defaultdict(list)

for etype, elist in edges_by_type.items():
    for edge in elist:
        try:
            src = node_id_map[edge.source.type][edge.source.id]
            dst = node_id_map[edge.target.type][edge.target.id]
        except:
            continue
        edge_indices[etype].append([src, dst])
        edge_attrs[etype].append(get_concat_features(edge.attributes))

In [158]:
for etype in edge_indices.keys():
    edge_idx = torch.tensor(edge_indices[etype], dtype=torch.long).t().contiguous()
    data[etype].edge_index = edge_idx
    if etype in edge_attrs.keys():
        if edge_attrs[etype] is None:
            continue
        elif np.any(edge_attrs[etype] == np.array([['bad']])):
            edge_attrs[etype] = [np.array([[1]]) if i[0] == 'bad' else np.array([[0]]) for i in edge_attrs[etype]]
    edge_attr_np = np.vstack(edge_attrs[etype])
    try:
        edge_attr = torch.from_numpy(edge_attr_np).float()
    except:
        print(edge_attr_np, etype)
        break
    data[etype].edge_attr = edge_attr

In [159]:
torch.save(data, 'data2.pt', _use_new_zipfile_serialization=True)

In [None]:
import json
with open('node_id_map.json', 'w') as f:
    json.dump(node_id_map, f)

: 