# Acquiring Legislation PDFs from Congress.gov

PP project with Shirely and Gifford

In [1]:
import os
import urllib
import pandas as pd

In [2]:
# Congress.gov search-results export, read from the data-raw/ folder.
search_results_filename = "search_results_2021-03-19_0151pm.csv"
search_results_filepath = f"data-raw/{search_results_filename}"

# The first three lines of the file are skipped before the header row is read
# (presumably export preamble -- TODO confirm against the raw CSV).
search_results_df = pd.read_csv(search_results_filepath, skiprows=3)

search_results_df.head()

Unnamed: 0,Legislation Number,URL,Congress,Amends Bill,Title,Sponsor,Date of Introduction,Date Offered,Date Submitted,Date Proposed,Number of Cosponsors,Committees,Latest Action Date,Latest Action
0,H.R. 1319,https://www.congress.gov/bill/117th-congress/h...,117th Congress (2021-2022),,American Rescue Plan Act of 2021,"Rep. Yarmuth, John A. [D-KY-3]",02/24/2021,,,,0,House - Budget,03/11/2021,Became Public Law No: 117-2.
1,H.R. 335,https://www.congress.gov/bill/117th-congress/h...,117th Congress (2021-2022),,To provide for an exception to a limitation ag...,"Rep. Smith, Adam [D-WA-9]",01/15/2021,,,,0,House - Armed Services,01/22/2021,Became Public Law No: 117-1.
2,H.R. 8906,https://www.congress.gov/bill/116th-congress/h...,116th Congress (2019-2020),,Lifespan Respite Care Reauthorization Act of 2020,"Rep. Langevin, James R. [D-RI-2]",12/09/2020,,,,1,House - Energy and Commerce,01/05/2021,Became Public Law No: 116-324.
3,H.R. 8900,https://www.congress.gov/bill/116th-congress/h...,116th Congress (2019-2020),,"Further Continuing Appropriations Act, 2021, a...","Rep. Lowey, Nita M. [D-NY-17]",12/08/2020,,,,0,"House - Appropriations, Budget",12/11/2020,Became Public Law No: 116-215.
4,H.R. 8810,https://www.congress.gov/bill/116th-congress/h...,116th Congress (2019-2020),,National Landslide Preparedness Act,"Rep. DelBene, Suzan K. [D-WA-1]",11/24/2020,,,,6,"House - Natural Resources, Science, Space, and...",01/05/2021,Became Public Law No: 116-323.


We are interested in the `URL` column. For each bill URL we will request the page, parse it for links, and download a PDF of the bill if one is available.

In [3]:
# One bill-page URL per search result.  The values are used exactly as
# exported; no scheme stripping or re-prefixing is applied.
url = search_results_df["URL"]
url

0      https://www.congress.gov/bill/117th-congress/h...
1      https://www.congress.gov/bill/117th-congress/h...
2      https://www.congress.gov/bill/116th-congress/h...
3      https://www.congress.gov/bill/116th-congress/h...
4      https://www.congress.gov/bill/116th-congress/h...
                             ...                        
341    https://www.congress.gov/bill/116th-congress/s...
342    https://www.congress.gov/bill/116th-congress/s...
343    https://www.congress.gov/bill/116th-congress/s...
344    https://www.congress.gov/bill/116th-congress/s...
345    https://www.congress.gov/bill/116th-congress/s...
Name: URL, Length: 346, dtype: object

In [4]:
import requests
from bs4 import BeautifulSoup


# Request headers for the congress.gov page fetches.
# Only a browser-like User-Agent is kept: the Access-Control-* entries that
# used to be here are CORS *response* headers and have no effect when a
# client puts them on an outgoing request.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

In [5]:
# Function that processes one bill URL from the dataframe (it is called once per URL further below)
import requests
import time # make sure we sleep to not overwhelm the servers
from random import randint
import re

def acquire_pdf(url):
    """
    Acquires PDF of bill from congress.gov

    Requests the bill page at ``url``, collects the ``href`` of every element
    nested inside a ``<td>``, and downloads the first link ending in ``.pdf``
    to ``data-output/<url with ':', '/', '.', '-' removed>.pdf``.  The URL and
    the outcome ('text saved', 'no PDF found' or 'unknown error') are then
    appended to ``cache.txt`` as one tab-separated line.  A URL whose exact
    text is already the first field of a line in ``cache.txt`` is skipped
    without any network request.

    needs url exported from search results.

    I.e. https://www.congress.gov/search?q={%22source%22:%22legislation%22,%22congress%22:[%22117%22,%22116%22],%22bill-status%22:%22law%22}&searchResultViewType=expanded

    Parameters
    ----------
    url : str
        Address of a single bill page.

    Returns
    -------
    None
        Progress is reported with ``print``; results are the files written.

    Notes
    -----
    * Uses the notebook-level names ``requests``, ``BeautifulSoup``,
      ``headers``, ``os``, ``re``, ``time`` and ``randint``.
    * A failure while fetching the bill page itself propagates to the caller
      and is *not* written to the cache, so that URL is retried next run.
    * A failure while downloading or saving the PDF is logged as
      'unknown error' and *is* cached; delete its line from ``cache.txt`` to
      retry it.
    """
    # Local import: the notebook only does a bare `import urllib`, which does
    # not guarantee that the `urllib.parse` submodule is loaded.
    from urllib.parse import urljoin

    cache_file = 'cache.txt'
    output_dir = 'data-output'
    base_path = r'https://www.congress.gov'
    request_timeout = 60  # seconds; without it a stalled connection hangs the whole run

    print(url)
    url = str(url)
    url_cleaned = re.sub('[:/.//-]+', '', url)

    # Read the URLs already handled.  Compare whole entries, not substrings:
    # '.../senate-bill/24' must not count as cached just because
    # '.../senate-bill/245' is in the file.  A missing file means "nothing
    # cached yet" (first run).
    try:
        with open(cache_file, 'r') as c:
            cached_urls = {line.split('\t', 1)[0].strip() for line in c}
    except FileNotFoundError:
        cached_urls = set()

    if url in cached_urls:
        print('URL in Cache - file skipped.')
        return

    # `headers` must be passed by keyword; the second positional argument of
    # requests.get is `params` (the query string).
    req = requests.get(url, headers=headers, timeout=request_timeout)
    soup = BeautifulSoup(req.content, 'html.parser')

    # Pause between requests so the server is not overwhelmed.
    time.sleep(randint(2, 5))

    # Collect PDF links found inside table cells.  Only elements that actually
    # carry an href are considered, so there is no None to trip over.
    pdf_path_url = []
    for cell in soup.find_all('td'):
        for tag in cell.find_all(href=True):
            href = tag.get('href')
            if href and href.lower().endswith('.pdf'):
                pdf_path_url.append(href)

    try:
        if not pdf_path_url:
            print("No PDF found")
            outcome = 'no PDF found'
        else:
            # urljoin copes with both site-relative and absolute hrefs.
            pdf_path = urljoin(base_path, pdf_path_url[0])

            res = requests.get(pdf_path, timeout=request_timeout)
            # Do not save an HTML error page under a .pdf name.
            res.raise_for_status()

            os.makedirs(output_dir, exist_ok=True)
            with open(output_dir + '/' + url_cleaned + '.pdf', 'wb') as f:
                f.write(res.content)
            print('text saved!')
            outcome = 'text saved'
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops a long run instead of being logged as a failed download.
        print('Unknown error:', repr(err))
        outcome = 'unknown error'

    with open(cache_file, 'a') as f:
        f.write(url + '\t' + outcome + '\n')
    
    

In [6]:
%%time
url
import timeit

for u in url: 
    acquire_pdf(u)

https://www.congress.gov/bill/117th-congress/house-bill/1319
No PDF found
https://www.congress.gov/bill/117th-congress/house-bill/335
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/8906
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/8900
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/8810
No PDF found
https://www.congress.gov/bill/116th-congress/house-bill/8611
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/8472
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/8354
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/8337
URL in Cache - file skipped.
https://www.congress.gov/bill/116th-congress/house-bill/8276
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/8247
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/7898
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/7810
text saved!
https://www.congress.gov/bill/116

text saved!
https://www.congress.gov/bill/116th-congress/house-bill/3207
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/3196
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/3153
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/3151
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/3144
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/3055
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/3005
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/2969
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/2940
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/2938
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/2744
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/2695
text saved!
https://www.congress.gov/bill/116th-congress/house-bill/2502
text saved!
https://www.congress.gov/bill/116th-con

text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/5076
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/5036
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4996
No PDF found
https://www.congress.gov/bill/116th-congress/senate-bill/4902
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4762
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4684
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4209
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4148
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4126
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4116
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4091
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4075
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/4072
text saved!
https://www.congress.gov/

text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/163
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/153
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/151
text saved!
https://www.congress.gov/bill/116th-congress/senate-bill/134
URL in Cache - file skipped.
https://www.congress.gov/bill/116th-congress/senate-bill/50
URL in Cache - file skipped.
https://www.congress.gov/bill/116th-congress/senate-bill/49
URL in Cache - file skipped.
https://www.congress.gov/bill/116th-congress/senate-bill/47
URL in Cache - file skipped.
https://www.congress.gov/bill/116th-congress/senate-bill/24
URL in Cache - file skipped.
https://www.congress.gov/bill/116th-congress/senate-joint-resolution/67
text saved!
https://www.congress.gov/bill/116th-congress/senate-joint-resolution/66
text saved!
https://www.congress.gov/bill/116th-congress/senate-joint-resolution/65
text saved!
Wall time: 28min 52s
