# Acquiring Legislation PDFs from Congress.gov

IARPC Project
Erick Cohen

In [1]:
import os
import urllib
from pathlib import Path
import pandas as pd
from pyprojroot import here
import random

In [2]:
data_raw_path = here("data-raw")
li = []

for file in os.listdir(data_raw_path):
    search_results_filename = file
    search_results_filepath = (str(data_raw_path) + "/" + search_results_filename)
    df = pd.read_csv(search_results_filepath, skiprows=3)
    li.append(df)

search_results_df = pd.concat(li, axis=0, ignore_index=True)

In [3]:
search_results_df.shape

(4016, 6)

In [4]:
# remove duplicate entries
search_results_df.drop_duplicates(inplace=True)
search_results_df.shape

(2230, 6)

In [5]:
search_results_df.head()

Unnamed: 0,Report Number,URL,Congress,Accompanies,Title,Committee
0,H. Rept. 117-12,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 233,PROVIDING FOR CONSIDERATION OF THE BILL (H.R. ...,House Rules
1,H. Rept. 117-11,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 198,PROVIDING FOR CONSIDERATION OF THE SENATE AMEN...,House Rules
2,H. Rept. 117-10,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 188,PROVIDING FOR CONSIDERATION OF THE BILL (H.R. ...,House Rules
3,H. Rept. 117-9,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 179,PROVIDING FOR CONSIDERATION OF THE BILL (H.R. ...,House Rules
4,H. Rept. 117-8,https://www.congress.gov/congressional-report/...,117th Congress (2021-2022),H.Res. 166,PROVIDING FOR CONSIDERATION OF THE BILL (H.R. ...,House Rules


We are interested in the URL. We will attempt to parse through them and download .PDFs of the bills if available

In [6]:
url = search_results_df.URL #[200] 
url = url.apply(lambda x: x.strip('https://'))
url = url.apply(lambda x: r'https://' + x)
url

0       https://www.congress.gov/congressional-report/...
1       https://www.congress.gov/congressional-report/...
2       https://www.congress.gov/congressional-report/...
3       https://www.congress.gov/congressional-report/...
4       https://www.congress.gov/congressional-report/...
                              ...                        
3821    https://www.congress.gov/congressional-report/...
3822    https://www.congress.gov/congressional-report/...
3823    https://www.congress.gov/congressional-report/...
3824    https://www.congress.gov/congressional-report/...
3825    https://www.congress.gov/congressional-report/...
Name: URL, Length: 2230, dtype: object

In [7]:
import requests
from bs4 import BeautifulSoup


headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

In [8]:
data_output_path = here("data-output")

# Function to itterate through each of the URLs from the dataframe 
import requests
import time # make sure we sleep to not overwhelm the servers


def acquire_pdf(url, i):
    """
    Acquires PDF of bill from congress.gov
    
    needs url that can be exported from search results. 
    
    I.e. https://www.congress.gov/search?q={%22source%22:%22legislation%22,%22congress%22:[%22117%22,%22116%22],%22bill-status%22:%22law%22}&searchResultViewType=expanded
    """
    
    print(url)
    
    req = requests.get(url, headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    
    time.sleep(random.randrange(3, 6))
    
    links = []
    for a in soup.find_all('div', attrs={'id' : 'report'}):
        for b in a.findChildren():
            link = (b.get('href'))
            if link != None:
                links.append(link)


    pdf_path_url = [] 
    for l in links:
        if l.endswith(".pdf"):
            pdf_path_url.append(l)

    
    base_path = r'https://www.congress.gov'
    
    try:
        if len(pdf_path_url) < 1:
            print("No PDF found")
        
        else:
            pdf_path = base_path + pdf_path_url[0]

            res = requests.get(pdf_path)

            f = open(str(data_output_path) + '/' + 'file_num_' + str(i) + '.pdf', 'wb')
            f.write(res.content)
            f.close()
            print('text saved!')
        
    except:  
        print('Unknown error')
    

In [9]:
%%time
url
import timeit

for u, i in zip(url, list(range(len(url)))): 
    acquire_pdf(u, i)

https://www.congress.gov/congressional-report/117th-congress/house-report/12/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/11/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/10/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/9/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/8/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/7/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/6/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/5/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/4/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/3/1
text saved!
https://www.congress.gov/congressional-report/117th-congress/house-report/2/1
text saved!
https:/

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/76/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/77/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/78/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/79/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/80/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/81/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/82/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/83/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/84/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/85/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/86/1

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/164/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/165/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/166/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/167/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/168/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/169/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/170/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/171/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/172/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/173/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/253/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/254/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/255/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/256/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/257/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/258/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/259/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/260/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/261/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/262/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/341/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/342/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/343/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/344/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/345/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/346/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/347/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/348/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/349/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/350/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/429/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/430/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/431/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/432/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/433/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/434/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/435/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/436/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/436/2
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/437/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/518/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/519/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/520/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/521/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/522/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/523/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/524/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/525/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/526/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/527/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/607/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/608/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/609/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/610/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/611/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/612/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/613/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/614/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/615/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/616/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/696/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/697/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/698/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/699/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/700/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/701/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/702/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/703/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/704/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/705/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-r

ConnectionError: HTTPSConnectionPool(host='www.congress.gov', port=443): Max retries exceeded with url: /congressional-report/115th-congress/house-report/731/1?Access-Control-Allow-Origin=%2A&Access-Control-Allow-Methods=GET&Access-Control-Allow-Headers=Content-Type&Access-Control-Max-Age=3600&User-Agent=Mozilla%2F5.0+%28X11%3B+Ubuntu%3B+Linux+x86_64%3B+rv%3A52.0%29+Gecko%2F20100101+Firefox%2F52.0 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000022B8EE27088>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

Because there are so many files, the itteration should be chunked to allow the server not be overwhelemed by one function. 

In [23]:
%%time
url[755:]

for u, i in zip(url[755:], list(range(len(url[755:])))): 
    acquire_pdf(u, i + 755)

https://www.congress.gov/congressional-report/115th-congress/house-report/730/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/731/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/732/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/733/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/734/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/735/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/736/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/737/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/738/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/739/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/740/1


text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/819/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/820/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/821/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/822/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/823/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/824/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/825/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/826/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/827/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/828/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/909/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/910/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/911/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/912/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/913/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/914/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/915/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/916/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/917/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/918/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/711/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/710/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/709/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/708/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/707/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/706/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/705/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/704/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/703/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/702/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/621/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/620/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/619/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/618/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/617/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/616/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/615/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/614/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/613/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/612/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/531/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/530/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/529/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/528/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/527/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/526/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/525/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/524/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/523/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/522/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/443/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/442/2
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/442/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/441/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/440/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/439/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/438/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/437/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/436/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/435/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/355/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/354/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/353/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/352/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/351/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/350/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/349/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/348/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/347/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/346/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/270/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/269/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/268/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/267/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/266/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/265/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/264/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/263/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/262/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/261/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/180/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/179/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/178/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/177/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/176/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/175/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/174/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/173/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/172/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/171/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-r

text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/92/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/91/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/90/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/89/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/88/2
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/88/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/87/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/86/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/85/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/84/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/83/1

text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/7/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/6/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/5/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/4/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/3/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/2/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/house-report/1/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/336/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/335/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/334/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/333/

text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/254/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/253/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/252/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/251/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/250/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/249/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/248/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/247/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/246/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/245/1
text saved!
https://www.congress.gov/congressional-report/116th-congre

text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/165/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/164/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/163/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/162/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/161/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/160/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/159/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/158/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/157/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/156/1
text saved!
https://www.congress.gov/congressional-report/116th-congre

text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/76/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/75/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/74/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1130/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1129/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1128/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1127/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1126/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1125/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1124/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/

text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1044/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1043/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1042/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1041/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1040/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1039/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1038/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1037/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1036/1
text saved!
https://www.congress.gov/congressional-report/115th-congress/house-report/1035/1
text saved!
https://www.congress.gov/congressional-report/115th-congre

text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/35/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/36/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/37/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/38/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/39/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/40/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/41/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/42/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/43/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-report/44/1
text saved!
https://www.congress.gov/congressional-report/116th-congress/senate-

In [21]:
range(len(url[755:]))

range(0, 1475)

In [10]:
# url = r'https://www.congress.gov/congressional-report/117th-congress/house-report/9/1' # url[105]
# # url = url.strip('https://')
# # url = r'https://' + str(url)
# url


# req = requests.get(url, headers)
# soup = BeautifulSoup(req.content, 'html.parser')
# print(soup.prettify())

In [11]:
# links = []

# for a in soup.find_all('div', attrs={'id' : 'report'}):
#     for b in a.findChildren():
#         link = (b.get('href'))
#         if link != None:
#             links.append(link)

# print(links)

# pdf_path_url = [] 
# for l in links:
#     if l.endswith(".pdf"):
#         pdf_path_url.append(l)

# pdf_path_url

In [12]:
# import requests

# base_path = r'https://www.congress.gov'
# pdf_path = base_path + pdf_path_url[0]

# res = requests.get(pdf_path)


In [13]:
# print(res)
# f = open('data-raw/test3.pdf', 'wb')
# f.write(res.content)
# f.close()