In [1]:
import json
import requests
from bs4 import BeautifulSoup
import re
import string
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm



In [2]:
# Load the list of bills and order it by congress, then by bill number,
# renumbering the index so that row positions follow the sorted order.
bills_to_scrape = (
    pd.read_csv('bill_to_scrape.csv')
    .sort_values(by=['congress', 'bill_number'])
    .reset_index(drop=True)
)
bills_to_scrape

Unnamed: 0,congress,bill_number,url
0,111,1,https://www.congress.gov/bill/111th-congress/h...
1,111,2,https://www.congress.gov/bill/111th-congress/h...
2,111,81,https://www.congress.gov/bill/111th-congress/h...
3,111,131,https://www.congress.gov/bill/111th-congress/h...
4,111,146,https://www.congress.gov/bill/111th-congress/h...
...,...,...,...
1156,115,7243,https://www.congress.gov/bill/115th-congress/h...
1157,115,7279,https://www.congress.gov/bill/115th-congress/h...
1158,115,7318,https://www.congress.gov/bill/115th-congress/h...
1159,115,7319,https://www.congress.gov/bill/115th-congress/h...


In [11]:
# Work on a copy: the word-count loop writes a `word_count` column into
# `bills_df`, while `bills_to_scrape` itself is left as loaded.
bills_df = bills_to_scrape.copy()

In [3]:
# Serialise the dataframe to a JSON string of row records, then parse it back
# so that `bills` is a plain Python list with one dict per bill.
records_json = bills_to_scrape.to_json(orient='records')
bills = json.loads(records_json)

In [4]:
# Translation table that maps every ASCII punctuation character to a single
# space. It is built once here so the loop can reuse it for each bill.
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation, ' '))

In [16]:
# Count the words in every saved bill page and store the result in
# `bills_df['word_count']`. Pages that have no `#billTextContainer` element
# are reported and skipped, so their `word_count` stays empty.
for bill in tqdm(bills):
# for bill in tqdm(bills[0:290]): # run this line if you want the first 290
    congress = bill['congress']
    bill_number = bill['bill_number']

    # Read the saved page, releasing the file handle before any parsing
    with open(f'pages/{congress}_{bill_number}.html', 'r') as f:
        page_html = f.read()

    # Parse the page with bs4
    soup = BeautifulSoup(page_html, features='html.parser')

    # Find and get what's inside `id='billTextContainer'`
    bill_text_container = soup.find(id='billTextContainer')

    # `find` returns None when the element is absent. Report that case
    # explicitly rather than hiding it (and any other error) behind a bare
    # `except: pass`, so unexpected failures are no longer silently dropped.
    if bill_text_container is None:
        print(f'{congress}th congress: {bill_number} does not have #billTextContainer')
        continue

    bill_text = bill_text_container.get_text()

    # Replace punctuation with space so that e.g. "a,b" counts as two words
    bill_text_cleaned = bill_text.translate(punctuation_table)

    # Get the word count. `split()` with no arguments splits on any run of
    # whitespace (spaces, newlines, tabs) and drops empty strings, so no
    # separate regex clean-up of newlines or repeated spaces is needed.
    bill_word_count = len(bill_text_cleaned.split())

    # Save the word count into the dataframe row(s) matching this bill
    is_this_bill = (bills_df['bill_number'] == bill_number) & (bills_df['congress'] == congress)
    bills_df.loc[is_this_bill, 'word_count'] = bill_word_count

  0%|          | 0/290 [00:00<?, ?it/s]

111th congress: 3619 does not have #billTextContainer


In [15]:
# Inspect the record at position 125 of `bills`; per the recorded output this
# is the bill the loop reported as lacking #billTextContainer.
bills[125]

{'congress': 111,
 'bill_number': 3619,
 'url': 'https://www.congress.gov/bill/111th-congress/house-bill/3619/text?r=1&s=2&format=txt'}

In [20]:
# Show the dataframe row(s) for bill 3619 of the 111th congress
bills_df.query('congress == 111 and bill_number == 3619')

Unnamed: 0,congress,bill_number,url,word_count
125,111,3619,https://www.congress.gov/bill/111th-congress/h...,


In [22]:
# Full (untruncated) URL stored at row position 125
bills_df['url'].iloc[125]

'https://www.congress.gov/bill/111th-congress/house-bill/3619/text?r=1&s=2&format=txt'

In [None]:
# Write the bills, including the `word_count` column filled in by the loop,
# to bills.csv; `index=False` leaves the dataframe index out of the file.
bills_df.to_csv('bills.csv', index=False)