# Step 4: Scraping the bill word count data

## 4.1: Importing necessary packages

In [1]:
import json
import requests
from bs4 import BeautifulSoup
import re
import string
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm



## 4.2: Reading the 'bills_to_scrape.csv' file into a new dataframe and sorting the entries in ascending order by congress number, then bill number
The 'bills_to_scrape.csv' file that we created in the previous step is looking great, though it could be better organized. Here we read the csv file into a new dataframe and use the .sort_values function to arrange all the bills within it in ascending order first by congressional session number and second by bill number. This will make our dataset cleanear and easier to analyze later on.

In [2]:
bills_to_scrape = pd.read_csv('sjres_to_scrape.csv')
bills_to_scrape = bills_to_scrape.sort_values(by=['congress', 'bill_number']).reset_index(drop=True)
bills_to_scrape

Unnamed: 0,congress,bill_number,url
0,104,20,https://www.congress.gov/bill/104th-congress/s...
1,104,38,https://www.congress.gov/bill/104th-congress/s...
2,104,53,https://www.congress.gov/bill/104th-congress/s...
3,104,64,https://www.congress.gov/bill/104th-congress/s...
4,105,5,https://www.congress.gov/bill/105th-congress/s...
...,...,...,...
64,115,57,https://www.congress.gov/bill/115th-congress/s...
65,115,60,https://www.congress.gov/bill/115th-congress/s...
66,116,65,https://www.congress.gov/bill/116th-congress/s...
67,116,66,https://www.congress.gov/bill/116th-congress/s...


## 4.3: Saving a copy of the bills_to_scrape dataframe called bills_df as a backup 

In [3]:
bills_df = bills_to_scrape.copy()

## 4.4: Converting the bills_to_scrape dataframe into a json file called bills that we can loop through 

In [4]:
bills = json.loads(bills_to_scrape.to_json(orient='records'))

## 4.5: Using a for loop and Beautiful Soup to navigate to each bill URL, identify the component of the page that contains the full bill text, get the text, clean it up, get the word count, and store it in a new column called 'word_count'

In [5]:
# Making the punctuation table before the loop, which will be used to replace punctuation with space in the loop
punctuation_table = str.maketrans({key: ' ' for key in string.punctuation})

In [6]:
for bill in tqdm(bills):
# for bill in tqdm(bills[0:290]): # run this line if you want the first 290
    congress = bill['congress']
    bill_number = bill['bill_number']
    bill_url = bill['url']
    
    with open(f'pages/sjres/{congress}_{ bill_number }.html', 'r') as f:

        # Parse the page with bs4
        soup = BeautifulSoup(f.read(), features='html.parser')

        # Find and get what's inside `id='billTextContainer'`
        bill_text_container = soup.find(id='billTextContainer')
        
        try:
            bill_text = bill_text_container.get_text()

            # Clean up the bill text

            # Replace punctuation with space
            bill_text_cleaned = bill_text.translate(punctuation_table)

            # Replace newlines with space
            bill_text_cleaned = re.sub('\\n', ' ', bill_text_cleaned)

            # Replace multiple spaces with one space
            bill_text_cleaned = re.sub('\s{2,}', ' ', bill_text_cleaned)

            # Get the word count
            bill_word_count = len(bill_text_cleaned.split())

            # # Save the word count into the dataframe
            bills_df.loc[
                (bills_df['bill_number'] == bill_number) & (bills_df['congress'] == congress), 'word_count'
            ] = bill_word_count
        except:
            pass

  0%|          | 0/69 [00:00<?, ?it/s]

In [7]:
bills[12]

{'congress': 106,
 'bill_number': 42,
 'url': 'https://www.congress.gov/bill/106th-congress/senate-joint-resolution/42/text?r=1&s=2&format=txt'}

In [8]:
bills_df.iloc[125]['url']

'https://www.congress.gov/bill/106th-congress/house-bill/1959/text?r=1&s=2&format=txt'

## 4.6: Export the updated dataframe to a CSV file for analysis

In [8]:
bills_df.to_csv('sjres.csv', index=False)