# Step 2: Parsing S Bills Data from the 104th-116th Congressional Sessions

## 2.1: Importing necessary packages

In [6]:
import json
import pandas as pd
# Importing glob, which finds file and directory paths that match a shell-style wildcard pattern (see link here: https://docs.python.org/3/library/glob.html)
import glob
import os
# The os import above provides the platform-specific path separator used when building the glob patterns below (see link here: https://docs.python.org/3/library/os.html)
from tqdm.notebook import tqdm

## 2.2: Creating a list called bill_paths to store only the file paths within our new data directory that include information on the bills of interest

In [12]:
# The folder that holds a session's Senate bill directories differs from
# session to session, so record one path template per session.
# '{root}' is the data directory and '{n}' is the session number.
SENATE_BILL_DIR_TEMPLATES = {
    **dict.fromkeys((104, 105), '{root}/{n}/s/'),
    **dict.fromkeys(range(106, 113), '{root}/{n}/bills/s/'),
    **dict.fromkeys((113, 114, 116), '{root}/{n}/congress/data/{n}/bills/s/'),
    115: '{root}/{n}/{n}/bills/s/',
}


def senate_bill_dirs(n, data_root='../data'):
    """Return the Senate bill directory paths found for session `n`.

    Parameters
    ----------
    n : int
        Session number; must be a key of SENATE_BILL_DIR_TEMPLATES.
    data_root : str
        Directory that contains one sub-folder per session.

    Returns
    -------
    list of str
        One path per bill directory, each ending in the path separator,
        in whatever order glob yields them.

    Raises
    ------
    KeyError
        If no folder layout is recorded for `n`.
    """
    bills_dir = SENATE_BILL_DIR_TEMPLATES[n].format(root=glob.escape(data_root), n=n)
    # The trailing separator restricts the match to directories.
    return glob.glob(bills_dir + '*' + os.path.sep)


bill_paths = []
for n in range(104, 117):
    # Glob only this session's pattern instead of all thirteen on every pass.
    bill_paths.extend(senate_bill_dirs(n))

### Checking out the length of the list to get a sense of how many bills it contains

In [13]:
# Number of bill directory paths collected in bill_paths
len(bill_paths)

45437

## 2.3: Creating another list and using another for loop to store the paths of only those bills that have been signed into law

In [14]:
# Paths of bills whose status is 'ENACTED:SIGNED'
passed_bills = []
# Paths whose data.json could not be opened or parsed
no_json = []

# Loop over all the bill_paths
for bill_path in tqdm(bill_paths):
    # Each bill directory is expected to hold a data.json file. os.path.join
    # gives the same path whether or not bill_path ends in a separator.
    file_name = os.path.join(bill_path, 'data.json')

    # Only the read/parse step is guarded, so a path lands in no_json only
    # when its JSON really is missing or unreadable.
    try:
        with open(file_name, encoding='utf-8') as f:
            bill_json = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: file missing or unreadable.
        # ValueError: malformed JSON or bytes that are not valid UTF-8.
        print('Failed for ', str(bill_path))
        print(e)
        no_json.append(bill_path)
        continue

    # Keep the path only when `status` is 'ENACTED:SIGNED'; a record without
    # a status key is treated as not enacted.
    if isinstance(bill_json, dict) and bill_json.get('status') == 'ENACTED:SIGNED':
        passed_bills.append(bill_path)

  0%|          | 0/45437 [00:00<?, ?it/s]

Failed for  ../data/113/congress/data/113/bills/s/s1696/
[Errno 2] No such file or directory: '../data/113/congress/data/113/bills/s/s1696/data.json'


### Checking out the length of the passed_bills list to get a sense of how many bills it contains

In [15]:
# Number of bill paths whose status was 'ENACTED:SIGNED'
len(passed_bills)

1481

## 2.4: Creating another list and using another for loop to extract only the few datapoints we're interested in within our list of passed bills

In [16]:
#Creating a list called bills_data that will store only the data points of interest
bills_data = []
for bill_path in passed_bills:
    
    # Every bill_path contains a data.json file. With string interpolation, we can grab these files.
    file_name = f'{ bill_path }/data.json'
    
    # Reading the json and extracting data on key variables as well as adding the URL to the full bill text
    with open(file_name) as f:
        bill_json = json.load(f)
        congress = bill_json['congress']
        bill_number = bill_json['number']
        bill_url = f'https://www.congress.gov/bill/{ congress }th-congress/senate-bill/{ bill_number }/text?r=1&s=2&format=txt'
        bills_data.append({
            'congress': congress,
            'bill_number': bill_number,
            'url': bill_url
        })
# Converting the bills_data list into a tabular format as a dataframe called 'bills'        
bills = pd.DataFrame(bills_data)

In [17]:
# Displaying the bills DataFrame to check that it has the columns we expect (congress, bill_number, url). It seems that it does!
bills

Unnamed: 0,congress,bill_number,url
0,104,2198,https://www.congress.gov/bill/104th-congress/s...
1,104,2153,https://www.congress.gov/bill/104th-congress/s...
2,104,1970,https://www.congress.gov/bill/104th-congress/s...
3,104,1341,https://www.congress.gov/bill/104th-congress/s...
4,104,1111,https://www.congress.gov/bill/104th-congress/s...
...,...,...,...
1476,116,24,https://www.congress.gov/bill/116th-congress/s...
1477,116,163,https://www.congress.gov/bill/116th-congress/s...
1478,116,199,https://www.congress.gov/bill/116th-congress/s...
1479,116,394,https://www.congress.gov/bill/116th-congress/s...


## 2.5: Exporting our new dataframe to a CSV format for scraping in the next step

In [19]:
# Exporting the dataframe to 'senate_bills_to_scrape.csv' (a relative path, so it is written to the current working directory).
# index=False leaves the row index out of the file. We can use this file in the next step to scrape and add the word count for each bill.
bills.to_csv('senate_bills_to_scrape.csv', index=False)