# Step 4: Scraping the Senate bill word count data

## 4.1: Importing necessary packages

In [None]:
import json
import requests
from bs4 import BeautifulSoup
import re
import string
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm



ModuleNotFoundError: No module named 'vega_datasets'

## 4.2: Reading the 'senate_bills_to_scrape.csv' file into a new dataframe and sorting the entries in ascending order by congress number, then bill number

In [9]:
# Never truncate long cell values (the URLs) when a dataframe is displayed
pd.set_option('display.max_colwidth', None)

# Load the list of bills, order it by congress and then bill number,
# and renumber the rows 0..n-1 so the index follows the sorted order
bills_to_scrape = (
    pd.read_csv('senate_bills_to_scrape.csv')
    .sort_values(by=['congress', 'bill_number'])
    .reset_index(drop=True)
)
bills_to_scrape

Unnamed: 0,congress,bill_number,url
0,104,1,https://www.congress.gov/bill/104th-congress/senate-bill/1/text?r=1&s=2&format=txt
1,104,2,https://www.congress.gov/bill/104th-congress/senate-bill/2/text?r=1&s=2&format=txt
2,104,4,https://www.congress.gov/bill/104th-congress/senate-bill/4/text?r=1&s=2&format=txt
3,104,39,https://www.congress.gov/bill/104th-congress/senate-bill/39/text?r=1&s=2&format=txt
4,104,178,https://www.congress.gov/bill/104th-congress/senate-bill/178/text?r=1&s=2&format=txt
...,...,...,...
1476,116,4116,https://www.congress.gov/bill/116th-congress/senate-bill/4116/text?r=1&s=2&format=txt
1477,116,4148,https://www.congress.gov/bill/116th-congress/senate-bill/4148/text?r=1&s=2&format=txt
1478,116,4209,https://www.congress.gov/bill/116th-congress/senate-bill/4209/text?r=1&s=2&format=txt
1479,116,4762,https://www.congress.gov/bill/116th-congress/senate-bill/4762/text?r=1&s=2&format=txt


## 4.3: Making a copy of the bills_to_scrape dataframe called senate_bills_df, so the word counts can be added without modifying the original

In [19]:
# Independent (deep) copy: writing word counts into it leaves bills_to_scrape untouched
senate_bills_df = bills_to_scrape.copy(deep=True)

## 4.4: Converting the bills_to_scrape dataframe into a list of records (one dictionary per bill) called bills that we can loop through

In [20]:
# Serialise the dataframe as a JSON array with one object per row,
# then parse that string back into a Python list of dictionaries
records_json = bills_to_scrape.to_json(orient='records')
bills = json.loads(records_json)

## 4.5: Using a for loop and Beautiful Soup to open each bill's saved HTML page (under pages/senate/), identify the component of the page that contains the full bill text, get the text, clean it up, get the word count, and store it in a new column called 'word_count'

In [38]:
# Translation table built once, ahead of the loop: every ASCII punctuation
# character maps to a single space, ready to be passed to str.translate
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation, ' '))

In [43]:
# For every bill record: read its saved HTML page, extract the text inside the
# element with id='billTextContainer', count the words, and write the count into
# the matching row of senate_bills_df (column 'word_count').
for bill in tqdm(bills):
    congress = bill['congress']
    bill_number = bill['bill_number']

    # Read the saved page up front so the file is closed before parsing starts
    with open(f'pages/senate/{congress}_{ bill_number }.html', 'r') as f:
        page_html = f.read()

    # Parse the page with bs4
    soup = BeautifulSoup(page_html, features='html.parser')

    # Find what's inside `id='billTextContainer'`
    bill_text_container = soup.find(id='billTextContainer')

    # soup.find returns None when the page has no such element. Skip that bill
    # explicitly (its word_count is simply not set) and say which one it was,
    # rather than hiding every possible error behind a bare `except`.
    if bill_text_container is None:
        print(f'No billTextContainer found for congress {congress}, bill {bill_number}; skipping')
        continue

    bill_text = bill_text_container.get_text()

    # Clean up the bill text

    # Replace punctuation with space
    bill_text_cleaned = bill_text.translate(punctuation_table)

    # Collapse every run of whitespace (new lines included) into one space.
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    bill_text_cleaned = re.sub(r'\s+', ' ', bill_text_cleaned)

    # Get the word count
    bill_word_count = len(bill_text_cleaned.split())

    # Save the word count into the dataframe row for this congress + bill number
    is_this_bill = (
        (senate_bills_df['bill_number'] == bill_number)
        & (senate_bills_df['congress'] == congress)
    )
    senate_bills_df.loc[is_this_bill, 'word_count'] = bill_word_count

  0%|          | 0/1481 [00:00<?, ?it/s]

passed!


In [45]:
# Quick look at the dataframe, with long values (the URLs) shown in full
pd.options.display.max_colwidth = None
senate_bills_df

Unnamed: 0,congress,bill_number,url,word_count
0,104,1,https://www.congress.gov/bill/104th-congress/senate-bill/1/text?r=1&s=2&format=txt,10498.0
1,104,2,https://www.congress.gov/bill/104th-congress/senate-bill/2/text?r=1&s=2&format=txt,19733.0
2,104,4,https://www.congress.gov/bill/104th-congress/senate-bill/4/text?r=1&s=2&format=txt,6164.0
3,104,39,https://www.congress.gov/bill/104th-congress/senate-bill/39/text?r=1&s=2&format=txt,29252.0
4,104,178,https://www.congress.gov/bill/104th-congress/senate-bill/178/text?r=1&s=2&format=txt,184.0
...,...,...,...,...
1476,116,4116,https://www.congress.gov/bill/116th-congress/senate-bill/4116/text?r=1&s=2&format=txt,283.0
1477,116,4148,https://www.congress.gov/bill/116th-congress/senate-bill/4148/text?r=1&s=2&format=txt,170.0
1478,116,4209,https://www.congress.gov/bill/116th-congress/senate-bill/4209/text?r=1&s=2&format=txt,375.0
1479,116,4762,https://www.congress.gov/bill/116th-congress/senate-bill/4762/text?r=1&s=2&format=txt,182.0


## 4.6: Export the updated dataframe to a CSV file for analysis

In [46]:
# Write the dataframe, word counts included, to disk; index=False keeps the row index out of the file
senate_bills_df.to_csv(path_or_buf='senate_bills.csv', index=False)