In [None]:
from bs4 import BeautifulSoup as bs
import requests
import lxml
import pandas as pd
import numpy as np
import os
import playwright
from configparser import ConfigParser
from urllib.parse import urljoin
import json
import pandas as pd
import time

In [None]:
# Defaults used by CDGClient when the caller does not override them.
API_VERSION = "v3"
ROOT_URL = "https://api.congress.gov/"
RESPONSE_FORMAT = "json"


class _MethodWrapper:
    """Bind one method of the parent's session to the parent's base URL.

    Calling the wrapper joins ``endpoint`` onto ``parent.base_url`` and forwards
    all remaining positional and keyword arguments, unchanged, to the wrapped
    session method.

    Returns:
        A ``(body, status_code)`` tuple.  ``body`` is the decoded JSON when the
        response's content-type header starts with ``application/json``;
        otherwise it is the raw ``response.content``.
    """

    def __init__(self, parent, http_method):
        self._parent = parent
        # Raises AttributeError if the session has no attribute of that name.
        self._method = getattr(parent._session, http_method)

    def __call__(self, endpoint, *args, **kwargs):  # full signature passed here
        response = self._method(
            urljoin(self._parent.base_url, endpoint), *args, **kwargs
        )
        # unpack
        if response.headers.get("content-type", "").startswith("application/json"):
            return response.json(), response.status_code
        return response.content, response.status_code


class CDGClient:
    """A sample client to interface with Congress.gov.

    Session methods are exposed dynamically: ``client.get("bill")`` looks up
    ``get`` on the underlying ``requests.Session`` and calls it with the
    endpoint resolved against ``base_url``.

    Args:
        api_key: Key sent with every request in the ``x-api-key`` header.
        api_version: Path segment appended to ``ROOT_URL`` to form ``base_url``.
        response_format: Value sent as the ``format`` query parameter.
        raise_on_error: When true, a response hook calls
            ``raise_for_status()`` on every response.
    """

    def __init__(
        self,
        api_key,
        api_version=API_VERSION,
        response_format=RESPONSE_FORMAT,
        raise_on_error=True,
    ):
        self.base_url = urljoin(ROOT_URL, api_version) + "/"
        self._session = requests.Session()

        # The key travels in a header, not in the query string
        # (original note: do not use url parameters, even if offered).
        self._session.params = {"format": response_format}
        self._session.headers.update({"x-api-key": api_key})

        if raise_on_error:
            self._session.hooks = {
                "response": lambda r, *args, **kwargs: r.raise_for_status()
            }

    def __getattr__(self, method_name):
        """Find the session method dynamically and cache for later.

        Only invoked when normal attribute lookup fails.  Names starting with
        an underscore are never forwarded to the session: without this guard,
        looking up ``_session`` on an instance whose ``__init__`` has not run
        (``__new__``, unpickling, a constructor that raised) would re-enter
        this method endlessly and end in ``RecursionError``.

        Raises:
            AttributeError: for underscore-prefixed names, or when the session
                has no attribute called ``method_name``.
        """
        if method_name.startswith("_"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {method_name!r}"
            )
        method = _MethodWrapper(self, method_name)
        # Cache on the instance so later lookups bypass __getattr__ entirely.
        self.__dict__[method_name] = method
        return method

In [None]:
# Read the Congress.gov API key from the CONGRESS_API_KEY environment variable
# so the secret never has to be saved inside the notebook.  When the variable
# is unset the key falls back to an empty string, exactly as before; for a
# quick local run you may replace the fallback with your key (do not commit it).
api_key = os.environ.get("CONGRESS_API_KEY", "")
client = CDGClient(api_key=api_key)

## Get the PDF links for Senate and House

In [None]:
# Request plan: every year from 1995 through 2023 except 2019 appears twice,
# paired with offset 0 and then offset 250 (two pages of up to 250 issues).
years = [y for y in range(1995, 2024) if y != 2019 for _ in range(2)]
offsets = [0, 250] * (len(years) // 2)

# Accumulators for the Senate Section and House Section PDF URLs
senate, house = [], []

for off, year in zip(offsets, years):
    # One page of the congressional-record listing for this year/offset
    endpoint = f"congressional-record?format=json&y={year}&offset={off}&limit=250"
    data, status_code = client.get(endpoint)

    # Anything other than 200 is reported and the page is skipped
    if status_code != 200:
        print(f"Failed to fetch data for year {year} with offset {off}. Status code: {status_code}")
        continue

    for issue in data['Results']['Issues']:
        issue_links = issue['Links']
        # Same check for both chambers: the entry must exist and carry the
        # expected label before its PDF URLs are collected.
        for chamber, label, bucket in (
            ('Senate', 'Senate Section', senate),
            ('House', 'House Section', house),
        ):
            if chamber in issue_links and issue_links[chamber]['Label'] == label:
                bucket.extend([pdf['Url'] for pdf in issue_links[chamber]['PDF']])

In [None]:
# 2019 needs to be done separately: the request reportedly errors with the
# parameters used for the other years (cause not identified), so a single page
# with limit=210 is requested instead.

# Named explicitly so the failure message reports this request's own values
# rather than whatever `year`/`off` were left behind by an earlier loop.
year_2019, offset_2019 = 2019, 0
endpoint = f"congressional-record?format=json&y={year_2019}&offset={offset_2019}&limit=210"

# Make the API request using your client
data, status_code = client.get(endpoint)

# Check the status code to ensure a successful response
if status_code == 200:
    # Iterate through the Issues and extract Senate and House Section URLs
    for issue in data['Results']['Issues']:
        if 'Senate' in issue['Links'] and issue['Links']['Senate']['Label'] == 'Senate Section':
            senate.extend([pdf['Url'] for pdf in issue['Links']['Senate']['PDF']])
        if 'House' in issue['Links'] and issue['Links']['House']['Label'] == 'House Section':
            house.extend([pdf['Url'] for pdf in issue['Links']['House']['PDF']])
else:
    print(f"Failed to fetch data for year {year_2019} with offset {offset_2019}. Status code: {status_code}")

In [None]:
# Separate the links by the URL segment at index 3 (e.g., "104")


def _bucket_by_url_segment(urls, segment_index=3):
    """Group ``urls`` by one '/'-separated piece of each URL.

    Each URL is split on '/' and the piece at ``segment_index`` becomes its
    key (assumes that piece always sits at this position).  Keys appear in
    first-seen order and each bucket keeps the URLs in their input order.
    """
    buckets = {}
    for url in urls:
        buckets.setdefault(url.split('/')[segment_index], []).append(url)
    return buckets


# Senate: key -> links, plus the same groups as a list of lists
link_senate = _bucket_by_url_segment(senate)
linksen = list(link_senate.values())

# House: key -> links, plus the same groups as a list of lists
link_house = _bucket_by_url_segment(house)
linkho = list(link_house.values())

In [None]:
# Create the subdirectory if it doesn't exist.  exist_ok=True makes this cell
# safe to re-run: with exist_ok=False a second run raised FileExistsError.
subdirectory = '../records_links'
os.makedirs(subdirectory, exist_ok=True)

In [None]:
# Write one text file per Senate group, pairing each group key with its list of
# links (e.g., "../records_links/senate_links_104.txt").
for group_key, links in zip(list(link_senate.keys()), linksen):
    filename = os.path.join(subdirectory, f"senate_links_{group_key}.txt")
    # One link per line, no trailing newline
    with open(filename, 'w') as file:
        file.write('\n'.join(links))
    print(f"Saved links to {filename}")

In [None]:
# Write one text file per House group, pairing each group key with its list of
# links (e.g., "../records_links/house_links_104.txt").
for group_key, links in zip(list(link_house.keys()), linkho):
    filename = os.path.join(subdirectory, f"house_links_{group_key}.txt")
    # One link per line, no trailing newline
    with open(filename, 'w') as file:
        file.write('\n'.join(links))
    print(f"Saved links to {filename}")

In [None]:
# Some links are saved into a text file named "crec" (e.g. house_links_crec.txt): their URLs do not
# have the session number (e.g. "104") in them, so the grouping step keys them under "crec" instead.
# There are only 4 in total, so it makes sense to just correct them manually.

#https://www.congress.gov/crec/2005/07/18/CREC-2005-07-18-house.pdf
#https://www.congress.gov/crec/2016/12/30/CREC-2016-12-30-house-bk2.pdf
#https://www.congress.gov/crec/2017/09/18/CREC-2017-09-18-house.pdf
#https://www.congress.gov/crec/2018/01/03/CREC-2018-01-03-house-bk2.pdf
