# Step 3: Reading in the Senate bills for which I'll scrape word count data, sorting them, and grabbing word count data

## 3.1: Importing the necessary packages

In [1]:
import json
import requests
import pandas as pd
from tqdm.notebook import tqdm



## 3.2: Reading the 'senate_bills_to_scrape.csv' file into a new dataframe and sorting the entries in ascending order by congress number, then bill number

In [2]:
# Load the list of Senate bills from disk and, in the same chain, order it by
# congressional session and then by bill number; the index is rebuilt so it
# runs 0..n-1 in the new sorted order
bills_to_scrape = (
    pd.read_csv('senate_bills_to_scrape.csv')
    .sort_values(by=['congress', 'bill_number'])
    .reset_index(drop=True)
)

# Display the dataframe to confirm the rows come out in ascending order
bills_to_scrape

Unnamed: 0,congress,bill_number,url
0,104,1,https://www.congress.gov/bill/104th-congress/s...
1,104,2,https://www.congress.gov/bill/104th-congress/s...
2,104,4,https://www.congress.gov/bill/104th-congress/s...
3,104,39,https://www.congress.gov/bill/104th-congress/s...
4,104,178,https://www.congress.gov/bill/104th-congress/s...
...,...,...,...
1476,116,4116,https://www.congress.gov/bill/116th-congress/s...
1477,116,4148,https://www.congress.gov/bill/116th-congress/s...
1478,116,4209,https://www.congress.gov/bill/116th-congress/s...
1479,116,4762,https://www.congress.gov/bill/116th-congress/s...


## 3.3: Looping through the bills to download
Here I start by converting the dataframe to a list of Python dictionaries (one per bill), which is easier to loop through...

In [3]:
# Serialising the dataframe to a JSON string with one object per row...
records_json = bills_to_scrape.to_json(orient='records')

# ...and parsing that string back into a plain Python list of dictionaries
bills = json.loads(records_json)

In [4]:
# Peeking at the first five records to confirm each one carries the congress, bill_number and url keys
bills[:5]

[{'congress': 104,
  'bill_number': 1,
  'url': 'https://www.congress.gov/bill/104th-congress/senate-bill/1/text?r=1&s=2&format=txt'},
 {'congress': 104,
  'bill_number': 2,
  'url': 'https://www.congress.gov/bill/104th-congress/senate-bill/2/text?r=1&s=2&format=txt'},
 {'congress': 104,
  'bill_number': 4,
  'url': 'https://www.congress.gov/bill/104th-congress/senate-bill/4/text?r=1&s=2&format=txt'},
 {'congress': 104,
  'bill_number': 39,
  'url': 'https://www.congress.gov/bill/104th-congress/senate-bill/39/text?r=1&s=2&format=txt'},
 {'congress': 104,
  'bill_number': 178,
  'url': 'https://www.congress.gov/bill/104th-congress/senate-bill/178/text?r=1&s=2&format=txt'}]

In [5]:
!mkdir -p pages

## 3.4: Using a for loop to download all the .html pages on Congress.gov that contain the full text of each bill


In [7]:
# Looping through bills to download the page at each bill's url and save it as
# pages/senate/<congress>_<bill_number>.html

# Upper bound (in seconds) on how long a single request may wait for the server, so that one
# stalled connection cannot hang the whole run
REQUEST_TIMEOUT_SECONDS = 30

# Bills whose page could not be fetched; kept so they can be inspected or retried afterwards
failed_downloads = []

for bill in tqdm(bills):
    congress = bill['congress']
    bill_number = bill['bill_number']
    bill_url = bill['url']

    # Request the URL. A network problem or an HTTP error status (e.g. 403, 404, 5xx) is recorded
    # and skipped rather than being written to disk as if it were the bill text.
    try:
        page = requests.get(bill_url, timeout=REQUEST_TIMEOUT_SECONDS)
        page.raise_for_status()
    except requests.RequestException as error:
        failed_downloads.append({
            'congress': congress,
            'bill_number': bill_number,
            'url': bill_url,
            'error': str(error),
        })
        continue

    # Save the HTML of the URL
    # See string_interpolation.ipynb notebook in this repo for how f-strings work
    # The encoding is set explicitly so the write does not depend on the platform's default
    with open(f'pages/senate/{congress}_{bill_number}.html', 'w', encoding='utf-8') as f:
        f.write(page.text)

# Report how many downloads failed; the details are in failed_downloads
print(f'{len(failed_downloads)} of {len(bills)} bills failed to download')

  0%|          | 0/1481 [00:00<?, ?it/s]