# Parse the 111th–115th Congresses

Export a CSV of URLs to scrape.

In [1]:
import json
import pandas as pd
import glob
import os
from tqdm.notebook import tqdm



In [2]:
# Gather every House bill directory for the 111th through 115th Congresses.
# The downloads do not share one directory layout, so first pick the folder
# holding a given congress's `hr` bills, then glob its sub-directories.
bill_paths = []
for congress in range(111, 116):
    if congress in (111, 112):
        hr_dir = f'data/{congress}/bills/hr/'
    elif congress == 115:
        hr_dir = f'data/{congress}/{congress}/bills/hr/'
    else:
        # 113th and 114th
        hr_dir = f'data/{congress}/congress/data/{congress}/bills/hr/'

    # The trailing separator restricts the match to directories and leaves
    # every returned path ending in a separator.
    bill_paths += glob.glob(hr_dir + '*' + os.path.sep)

In [3]:
# Sanity check: how many bill directories were collected in total.
len(bill_paths)

33089

In [4]:
# Collect the directory of every bill that was signed into law.
passed_bills = []

# loop over all the bill_paths (tqdm renders a progress bar)
for bill_path in tqdm(bill_paths):

    # There's a data.json file in every bill directory. os.path.join yields
    # a correct path whether or not bill_path ends with a separator.
    file_name = os.path.join(bill_path, 'data.json')

    # Read the JSON with an explicit encoding so parsing does not depend on
    # the platform's default, and close the file as soon as it is parsed.
    with open(file_name, encoding='utf-8') as f:
        bill_json = json.load(f)

    # Every bill has a `status` key; keep only the ones where
    # `status` is 'ENACTED:SIGNED'.
    if bill_json['status'] == 'ENACTED:SIGNED':
        # Store the directory path (not the parsed JSON) for later use.
        passed_bills.append(bill_path)

  0%|          | 0/33089 [00:00<?, ?it/s]

In [5]:
# Number of bills whose status is 'ENACTED:SIGNED'.
len(passed_bills)

1161

In [6]:
# Build one record (congress, bill number, congress.gov text URL) per
# enacted bill, then assemble all records into a DataFrame in one step.
bills_data = []
for bill_path in passed_bills:

    # There's a data.json file in every bill directory. os.path.join avoids
    # the doubled separator that plain concatenation produces when bill_path
    # already ends with one.
    file_name = os.path.join(bill_path, 'data.json')

    # Read the JSON with an explicit encoding so parsing does not depend on
    # the platform's default, and close the file as soon as it is parsed.
    with open(file_name, encoding='utf-8') as f:
        bill_json = json.load(f)

    congress = bill_json['congress']
    bill_number = bill_json['number']

    # NOTE: the hard-coded "th" ordinal suffix is correct for the 111th-115th
    # Congresses handled here; it would need real ordinal logic for numbers
    # such as 121 ("121st").
    bill_url = f'https://www.congress.gov/bill/{ congress }th-congress/house-bill/{ bill_number }/text?r=1&s=2&format=txt'
    bills_data.append({
        'congress': congress,
        'bill_number': bill_number,
        'url': bill_url
    })

bills = pd.DataFrame(bills_data)

In [7]:
# Display the DataFrame of congress, bill number, and URL for each enacted bill.
bills

Unnamed: 0,congress,bill_number,url
0,111,5014,https://www.congress.gov/bill/111th-congress/h...
1,111,5470,https://www.congress.gov/bill/111th-congress/h...
2,111,2004,https://www.congress.gov/bill/111th-congress/h...
3,111,3114,https://www.congress.gov/bill/111th-congress/h...
4,111,3326,https://www.congress.gov/bill/111th-congress/h...
...,...,...,...
1156,115,6414,https://www.congress.gov/bill/115th-congress/h...
1157,115,2611,https://www.congress.gov/bill/115th-congress/h...
1158,115,6621,https://www.congress.gov/bill/115th-congress/h...
1159,115,3996,https://www.congress.gov/bill/115th-congress/h...


In [8]:
# Write the URL list to bill_to_scrape.csv, omitting the DataFrame index column.
bills.to_csv('bill_to_scrape.csv', index=False)