# Acquiring Legislation PDFs from Congress.gov

IARPC Project
Erick Cohen

In [1]:
import os
import urllib
from pathlib import Path
import pandas as pd
from pyprojroot import here
import random

In [2]:
# Load every committee CSV export found in the raw-data directory and stack
# them into a single DataFrame.
# data_raw_path = here("data-raw")
data_raw_path = '../data-raw'

# os.listdir() returns entries in arbitrary order; sort so the concatenated
# row order (and therefore the resulting index) is reproducible across runs.
committee_exports = sorted(
    filename for filename in os.listdir(data_raw_path) if filename.endswith('.csv')
)

# skiprows=3: the first three lines of each export are skipped before the
# header row is read (presumably export preamble - confirm against a raw file).
li = [
    pd.read_csv(os.path.join(str(data_raw_path), filename), skiprows=3)
    for filename in committee_exports
]

# pd.concat raises ValueError if no CSV export was found in data_raw_path.
search_results_df = pd.concat(li, axis=0, ignore_index=True)

In [3]:
# (rows, columns) of the combined search results.
search_results_df.shape

(4016, 6)

In [4]:
# Remove fully duplicated rows. Reassigning (rather than inplace=True) keeps
# the operation chain-friendly and makes the data flow explicit.
# The surviving rows keep their original index labels (no reset_index), so
# the index is no longer contiguous after this step.
search_results_df = search_results_df.drop_duplicates()
search_results_df.shape

(2230, 6)

In [5]:
# Preview the first rows of the search results.
search_results_df.head()

Unnamed: 0,Report Number,URL,Congress,Accompanies,Title,Committee
0,H. Rept. 117-12,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 233,PROVIDING FOR CONSIDERATION OF THE BILL (H.R. ...,House Rules
1,H. Rept. 117-11,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 198,PROVIDING FOR CONSIDERATION OF THE SENATE AMEN...,House Rules
2,H. Rept. 117-10,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 188,PROVIDING FOR CONSIDERATION OF THE BILL (H.R. ...,House Rules
3,H. Rept. 117-9,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 179,PROVIDING FOR CONSIDERATION OF THE BILL (H.R. ...,House Rules
4,H. Rept. 117-8,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 166,PROVIDING FOR CONSIDERATION OF THE BILL (H.R. ...,House Rules


We are interested in the `URL` column. We will request each URL, parse the returned page, and download a PDF of the document if one is available.

In [6]:
# Series holding the URL column of search_results_df (index labels carried over).
url = search_results_df.URL
# NOTE(review): the two commented-out lines below are an unused attempt to
# normalise the URL scheme. str.strip('https://') removes any of the characters
# h, t, p, s, :, / from both ends of the string, not the literal prefix, so it
# should not be re-enabled as written.
# url = url.apply(lambda x: x.strip('https://'))
# url = url.apply(lambda x: r'https://' + x)
url

0       https://www.congress.gov/congressional-report/...
1       https://www.congress.gov/congressional-report/...
2       https://www.congress.gov/congressional-report/...
3       https://www.congress.gov/congressional-report/...
4       https://www.congress.gov/congressional-report/...
                              ...                        
3821    https://www.congress.gov/congressional-report/...
3822    https://www.congress.gov/congressional-report/...
3823    https://www.congress.gov/congressional-report/...
3824    https://www.congress.gov/congressional-report/...
3825    https://www.congress.gov/congressional-report/...
Name: URL, Length: 2230, dtype: object

In [7]:
import requests
from bs4 import BeautifulSoup


# HTTP request headers for the scraper. Only a browser-like User-Agent is set:
# the Access-Control-* fields that used to be listed here are CORS *response*
# headers emitted by servers and have no effect when sent by a client.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

In [8]:
# Output directory path built with pyprojroot's here().
# NOTE(review): acquire_pdf writes to the relative path '../data-output/' and
# does not read this variable - confirm which location is intended.
data_output_path = here("data-output")

# Function to iterate through each of the URLs from the dataframe
import requests
import time # make sure we sleep to not overwhelm the servers
import re

def _cached_urls(cache_path):
    """Return the set of URLs recorded in the cache file (empty set if the file is missing)."""
    if not os.path.exists(cache_path):
        return set()
    with open(cache_path, 'r') as cache_file:
        # Each cache line is '<url>\t<outcome>'; only the URL field is compared.
        return {
            line.rstrip('\n').split('\t', 1)[0]
            for line in cache_file
            if line.strip()
        }


def _find_pdf_links(soup):
    """Return, in document order, every href ending in '.pdf' found under a <div id="report">."""
    pdf_links = []
    for report_div in soup.find_all('div', attrs={'id': 'report'}):
        for tag in report_div.find_all(True):
            href = tag.get('href')
            if href is not None and href.endswith('.pdf'):
                pdf_links.append(href)
    return pdf_links


def acquire_pdf(url, create_cache=False):
    """
    Download the PDF linked from a congress.gov report page.

    The page at ``url`` is fetched, the first link ending in ``.pdf`` inside
    ``<div id="report">`` is resolved against https://www.congress.gov and the
    file is written to ``../data-output/<url with ':', '/', '.', '-' removed>.pdf``.
    The outcome ('text saved', 'no PDF found' or 'unknown error') is appended
    to ``cache.txt`` in the current working directory as ``<url>\\t<outcome>``;
    a URL already present in that file is skipped without any network request.

    Parameters
    ----------
    url : str
        Report page URL, as exported from a congress.gov search, e.g.
        https://www.congress.gov/search?q={%22source%22:%22legislation%22,%22congress%22:[%22117%22,%22116%22],%22bill-status%22:%22law%22}&searchResultViewType=expanded
    create_cache : bool, default False
        If True, create an empty ``cache.txt`` up front when it does not exist.
        If False and the file is missing, the cache is treated as empty (the
        file is still created when the outcome is appended).

    Returns
    -------
    None

    Notes
    -----
    Relies on the module-level names ``headers``, ``requests``,
    ``BeautifulSoup``, ``time``, ``random``, ``re`` and ``os``. Errors raised
    by the first (page) request propagate to the caller; errors while
    downloading or saving the PDF are recorded as 'unknown error'.
    """
    cache_path = 'cache.txt'

    if create_cache and not os.path.exists(cache_path):
        # 'x' mode refuses to truncate a file that appeared in the meantime.
        with open(cache_path, 'x'):
            pass
        print('cache.txt created')

    print(url)
    # Collapse the URL into a flat token that is safe to use as a file name.
    url_cleaned = re.sub('[:/.//-]+', '', url)

    # Exact match on the URL field: a plain substring test against the whole
    # file would treat '.../report/12' as cached once '.../report/12/1' is.
    if url in _cached_urls(cache_path):
        print('URL in Cache - file skipped.')
        return

    # headers must be passed by keyword: the second positional argument of
    # requests.get is `params` (query string), not the request headers.
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')

    # Random 3-5 second pause so the server is not hammered.
    time.sleep(random.randrange(3, 6))

    pdf_path_url = _find_pdf_links(soup)
    base_path = r'https://www.congress.gov'
    output_dir = '../data-output/'

    try:
        if not pdf_path_url:
            print("No PDF found")
            outcome = 'no PDF found'
        else:
            pdf_path = base_path + pdf_path_url[0]

            res = requests.get(pdf_path)
            # Do not save an HTML error page under a .pdf name.
            res.raise_for_status()

            os.makedirs(output_dir, exist_ok=True)
            with open(output_dir + str(url_cleaned) + '.pdf', 'wb') as pdf_file:
                pdf_file.write(res.content)
            print('text saved!')
            outcome = 'text saved'
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still abort
        # the run instead of being cached as a failed URL.
        print('Unknown error:', repr(exc))
        outcome = 'unknown error'

    with open(cache_path, 'a') as cache_file:
        cache_file.write(str(url) + '\t' + outcome + '\n')

In [10]:
%%time
url
import timeit

for u in url: 
    acquire_pdf(u, create_cache=True)

https://www.congress.gov/congressional-report/117th-congress/house-report/12/1
URL in Cache - file skipped.
https://www.congress.gov/congressional-report/117th-congress/house-report/11/1
URL in Cache - file skipped.
https://www.congress.gov/congressional-report/117th-congress/house-report/10/1
URL in Cache - file skipped.
https://www.congress.gov/congressional-report/117th-congress/house-report/9/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/8/1


KeyboardInterrupt: 