In [1]:
# Data source: https://www.propublica.org/datastore/dataset/congressional-data-bulk-legislation-bills

# Get party affiliations from https://projects.propublica.org/api-docs/congress-api/members/

In [7]:
import os
import zipfile
import json
from tqdm import tqdm
import pandas as pd

In [3]:
# Flat list of bill records: one dict per data.json found in the archives.
db = []
# NOTE: the guard in the loop is `i > LIMIT`, so LIMIT + 1 archives are read
# (indices 0..LIMIT). Kept as-is so the amount of data loaded does not change.
LIMIT = 5

# os.listdir returns entries in arbitrary order, so sort them to make the
# LIMIT-selected subset reproducible between runs and machines. Entries that
# are not zip archives (stray files, sub-directories) are skipped instead of
# crashing ZipFile.
archive_names = sorted(
    z for z in os.listdir('data')
    if zipfile.is_zipfile(os.path.join('data', z))
)

for i, z in enumerate(archive_names):
    if i > LIMIT:
        break

    zip_file = os.path.join('data', z)
    with zipfile.ZipFile(zip_file, 'r') as f:
        # Only the per-bill data.json members are read.
        names = [file for file in f.namelist() if file.endswith('data.json')]
        for file in tqdm(names, f'Opening from {zip_file}: ', position=0):
            with f.open(file) as bill:
                data = json.load(bill)
                # Keep only the fields the rest of the notebook uses.
                db.append({'id': data['bill_id'],
                           'title': data['official_title'],
                           'sponsor': data['sponsor'],
                           'cosponsors': data['cosponsors']})

            

Opening from data/100.zip: 100%|██████████| 11278/11278 [00:07<00:00, 1552.25it/s]
Opening from data/101.zip: 100%|██████████| 11787/11787 [00:09<00:00, 1215.49it/s]
Opening from data/102.zip: 100%|██████████| 12016/12016 [00:10<00:00, 1159.72it/s]
Opening from data/103.zip: 100%|██████████| 9822/9822 [00:07<00:00, 1231.84it/s]
Opening from data/104.zip: 100%|██████████| 7991/7991 [00:06<00:00, 1167.25it/s]
Opening from data/105.zip: 100%|██████████| 9141/9141 [00:08<00:00, 1109.40it/s]


In [4]:
# Module-level accumulators, reset every time this cell runs.
#   members: member id -> [display name, state]
#   edges:   sponsor id -> {cosponsor id -> [bill names]}
members, edges = dict(), dict()

def get_id(member):
    """Return the member's ``thomas_id`` as an int, or the string ``'None'``.

    Args:
        member: Sponsor/cosponsor record; expected to be a mapping with a
            ``'thomas_id'`` entry that ``int()`` can convert.

    Returns:
        The integer id, or the sentinel string ``'None'`` when the record is
        not subscriptable (e.g. ``None``), has no ``'thomas_id'`` key, or the
        value cannot be converted to an int.
    """
    try:
        member_id = int(member['thomas_id'])
    # Only the lookup/conversion failures are treated as "no id"; anything
    # else (KeyboardInterrupt, SystemExit, ...) must propagate.
    except (KeyError, TypeError, ValueError):
        member_id = 'None'
    return member_id

def add_to_members(entry):
    """Record a sponsor/cosponsor in the global ``members`` table.

    Records without a usable id (``get_id`` returns ``'None'``) all share a
    single placeholder row ``['N/A', 'N/A']``. Otherwise the ``'name'`` field,
    formatted as ``"Last, First"`` or ``"Last, First, Suffix"``, is rearranged
    into ``"<title> First Last[, Suffix]"`` and stored with the state the
    first time the id is seen; later sightings leave the stored row untouched.

    Args:
        entry: Member record; read for ``'thomas_id'`` (via ``get_id``),
            ``'name'``, ``'title'`` and ``'state'``.

    Returns:
        Always 0.

    Raises:
        ValueError: If ``'name'`` is missing, is not a string, or does not
            contain at least a ``"Last, First"`` pair.
    """
    member_id = get_id(entry)

    if member_id == 'None':
        members[member_id] = ['N/A', 'N/A']
        return 0

    try:
        namesplit = entry['name'].split(', ')
        lastname = namesplit[0]
        firstname = namesplit[1]
        suffix = f', {namesplit[2]}' if len(namesplit) > 2 else ''
    except (AttributeError, IndexError, KeyError, TypeError) as exc:
        # Put the offending record in the error itself (chained to the real
        # cause) instead of printing entry['name'], which could raise again
        # and hide the original failure.
        raise ValueError(f'Cannot parse member name from {entry!r}') from exc

    name = f"{entry['title']} {firstname} {lastname}{suffix}"
    state = entry['state']

    # First sighting wins.
    if member_id not in members:
        members[member_id] = [name, state]

    return 0
        
def compare_cosponsors(sponsor, cosponsor_list, bill_name):
    """Append ``bill_name`` to the sponsor's edge list for every cosponsor.

    Updates the global ``edges`` mapping
    (sponsor id -> cosponsor id -> list of bill names) in place.

    Args:
        sponsor: Record of the bill's sponsor.
        cosponsor_list: List of cosponsor records for the bill.
        bill_name: Label stored on each edge.
    """
    partners = edges.setdefault(get_id(sponsor), {})

    # The sponsor is appended to their own list, so the sponsor -> sponsor
    # ("diagonal") entry collects every bill they sponsored.
    for person in cosponsor_list + [sponsor]:
        partners.setdefault(get_id(person), []).append(bill_name)
    
def summarize_edges():
    """Flatten the global ``edges`` mapping into a list of weighted pairs.

    Returns:
        A list of ``(sponsor id, cosponsor id, number of bills)`` tuples, one
        per entry in ``edges``.
    """
    return [
        (sponsor_id, partner_id, len(bill_names))
        for sponsor_id, partners in edges.items()
        for partner_id, bill_names in partners.items()
    ]
    

# Walk every loaded bill once, filling both `members` and `edges`.
for bill in tqdm(db, 'Processing database: ', position=0):
    sponsor, cosponsors = bill['sponsor'], bill['cosponsors']
    bill_name = f"{bill['id']}: {bill['title']}"

    # Register everyone attached to the bill, sponsor first.
    add_to_members(sponsor)
    for cosponsor in cosponsors:
        add_to_members(cosponsor)

    compare_cosponsors(sponsor, cosponsors, bill_name)

# Number of (sponsor, cosponsor) pairs recorded, diagonal entries included.
len(summarize_edges())

Processing database: 100%|██████████| 62035/62035 [00:02<00:00, 29812.90it/s]


256373

In [5]:
# TODO: add reciprocal (cosponsor -> sponsor) entries so that the edge weights become symmetric

In [12]:
# One row per known member: the dict key becomes `id`, the stored
# [name, state] pair becomes the other two columns.
member_rows = [
    (member_id, name, state)
    for member_id, (name, state) in members.items()
]
members_df = pd.DataFrame(member_rows, columns=['id', 'name', 'state'])
members_df

Unnamed: 0,id,name,state
0,1318,Sen Robert J. Dole,KS
1,1282,Sen Lloyd M. Bentsen,TX
2,1286,Sen Christopher S. Bond,MO
3,1288,Sen Rudy Boschwitz,MN
4,1289,Sen Bill Bradley,NJ
...,...,...,...
963,1484,Rep Vito Fossella,NY
964,1501,Rep Barbara Lee,CA
965,1506,Rep Gregory W. Meeks,NY
966,1469,Rep Robert A. Brady,PA


In [23]:
smry_df = pd.DataFrame(summarize_edges(), columns=['source', 'target', 'weight'])

# Rows whose two endpoints coincide are the sponsor -> sponsor diagonal
# entries; their weight is the number of bills that member sponsored, so
# they are split off as node sizes instead of being kept as edges.
is_diagonal = smry_df['source'] == smry_df['target']
edges_df = smry_df.loc[~is_diagonal].copy()
sizes_df = smry_df.loc[is_diagonal].copy()

In [24]:
def _member_name(member_id):
    """Return the display name stored for ``member_id``, or ``'NA'`` if unknown."""
    # The fallback must be a list: indexing the bare string 'NA' with [0]
    # would yield just 'N'.
    return members.get(member_id, ['NA'])[0]


# Rebuild the named edge table from smry_df rather than rewriting edges_df in
# place. Re-running this cell then always starts from the id columns instead
# of feeding already-translated names back through the id lookup.
edges_df = smry_df[smry_df.source != smry_df.target].assign(
    source=lambda frame: frame.source.map(_member_name),
    target=lambda frame: frame.target.map(_member_name),
)
edges_df

Unnamed: 0,source,target,weight
0,Sen Robert J. Dole,Sen Lloyd M. Bentsen,30
1,Sen Robert J. Dole,Sen Christopher S. Bond,65
2,Sen Robert J. Dole,Sen Rudy Boschwitz,39
3,Sen Robert J. Dole,Sen Bill Bradley,31
4,Sen Robert J. Dole,Sen Dale Bumpers,25
...,...,...,...
256365,"Rep Charles W. ""Chip"" Pickering",Rep Richard W. Pombo,1
256366,"Rep Charles W. ""Chip"" Pickering",Rep Earl Pomeroy,1
256367,"Rep Charles W. ""Chip"" Pickering",Rep Gerald B. H. Solomon,1
256368,"Rep Charles W. ""Chip"" Pickering",Rep Mac Thornberry,1


In [26]:
# Attach each member's diagonal weight (bills they sponsored) as `size`.
# NOTE(review): the inner join keeps only ids present in sizes_df, i.e.
# members with a sponsor -> sponsor entry. Members who only ever cosponsor
# appear to be dropped here even though edges can still point at them;
# confirm that this is intended for the node export.
members_with_sizes = (
    members_df
    .merge(sizes_df, left_on='id', right_on='source', how='inner')
    .loc[:, ['id', 'name', 'state', 'weight']]
    .rename(columns={'weight': 'size'})
)
members_with_sizes

Unnamed: 0,id,name,state,size
0,1318,Sen Robert J. Dole,KS,444
1,1282,Sen Lloyd M. Bentsen,TX,165
2,1286,Sen Christopher S. Bond,MO,123
3,1288,Sen Rudy Boschwitz,MN,115
4,1289,Sen Bill Bradley,NJ,241
...,...,...,...,...
956,1465,Rep Mary Bono Mack,CA,2
957,1484,Rep Vito Fossella,NY,3
958,1501,Rep Barbara Lee,CA,1
959,1506,Rep Gregory W. Meeks,NY,1


In [30]:
# Write the edge and node tables as tab-separated files without the index.
for frame, path in ((edges_df, 'edges.tsv'), (members_with_sizes, 'nodes.tsv')):
    frame.to_csv(path, sep='\t', index=False)