In [None]:
###### FIRST CELL TO RUN ########
#This cell determines how many roll calls/sessions happened in each year so that we know which URLs we have
#to call to retrieve all the data for a given year
#######

import requests
import csv

## Enter URL here and the number of years you want the script to pull back from the site
def fetch_votes_by_year_and_roll_and_save_to_csv(csv_filename, years=None, timeout=10):
    """Probe roll-call XML URLs year by year and record the ones that exist.

    For each year, roll numbers are tried in ascending order starting at 1
    (zero-padded to three digits). Every URL that answers with HTTP 200 and a
    non-empty body is written to the CSV; the first URL that does not ends the
    scan for that year.

    Args:
        csv_filename: Path of the CSV file to create (overwritten if present).
            It receives a header row followed by one
            ``Year, Roll Number, URL`` row per roll found.
        years: Iterable of years to scan. Defaults to 2021 through 2024.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the scan indefinitely.
    """
    base_url = "https://clerk.house.gov/evs/{year}/roll{roll}.xml"
    if years is None:
        years = [2021, 2022, 2023, 2024]

    # UTF-8 is stated explicitly because the second cell reads this file back as UTF-8.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Year', 'Roll Number', 'URL'])  # Header row

        for year in years:
            roll = 1
            while True:
                # Roll numbers in the URL are zero-padded to three digits (e.g. 001)
                formatted_roll = str(roll).zfill(3)
                url = base_url.format(year=year, roll=formatted_roll)

                # Only the network call is guarded; CSV or programming errors
                # should surface instead of being printed and skipped.
                try:
                    response = requests.get(url, timeout=timeout)
                except requests.RequestException as e:
                    print(f"An error occurred: {e}")
                    break

                # A non-200 status or an empty body is treated as "past the last roll of this year"
                if response.status_code != 200 or not response.content:
                    print(f"No more rolls found for {year} at roll number {formatted_roll}. Moving to next year...")
                    break

                writer.writerow([year, formatted_roll, url])
                roll += 1

## Name of the CSV file that the roll discovery results are written to.
## Each row holds a year, a zero-padded roll number, and the roll-call XML URL that responded;
## the second cell reads the URL column of this file to know which documents to download.
csv_filename = 'available_rolls.csv'

# Probe the roll-call URLs and write the ones that exist to the CSV named above
fetch_votes_by_year_and_roll_and_save_to_csv(csv_filename)

print(f"Data saved to {csv_filename}")


In [None]:
#####Second Cell to run once first cell is done#######
### Make sure you only run this when ready. It will take a couple of minutes to finish executing depending on your system's specs.

import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

# Function to read the CSV and extract unique URLs
def read_csv(file_path):
    """Return the set of distinct values in the 'URL' column of a CSV file.

    The file is opened as UTF-8 and its first row is used as the header.
    Duplicate URLs collapse into one entry; the result is unordered.
    """
    with open(file_path, newline='', encoding='utf-8') as handle:
        return {record['URL'] for record in csv.DictReader(handle)}

# Worker function for making GET requests with logging
def fetch_url(url):
    """Download one URL and return ``(url, text)``, logging progress to stdout.

    On success the second element is the response body decoded as text. If
    the request fails — connection problem, timeout, or an HTTP error status —
    the failure is printed and the second element is the error message
    instead, so the caller always receives a 2-tuple.
    """
    print(f'Starting fetch for: {url}')
    try:
        response = requests.get(url, timeout=10)
        # Turn 4xx/5xx answers into an exception so an error page is never
        # returned as if it were the requested document.
        response.raise_for_status()
        print(f'Completed fetch for: {url}')
        return url, response.text
    except requests.RequestException as e:
        print(f'Error fetching {url}: {e}')
        return url, str(e)

# Main function to orchestrate the fetching and saving process with logging
def main(csv_path, output_path='Roll XML Data.csv', max_workers=20):
    """Download every URL listed in a CSV concurrently and save the responses.

    Args:
        csv_path: CSV file with a 'URL' column naming the documents to fetch.
        output_path: CSV file to create (overwritten if present) with one
            ``URL, Response`` row per fetched URL.
        max_workers: Number of threads downloading in parallel; raise it if
            the number of URLs grows.
    """
    urls = read_csv(csv_path)
    responses = []

    # The downloads are network-bound, so a thread pool lets the waits overlap.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_url, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                responses.append(future.result())
            except Exception as exc:
                # One failed worker is reported and skipped rather than aborting the batch
                print(f'{url} generated an exception: {exc}')

    # Completion order varies from run to run; sort by URL so the output file
    # is reproducible.
    responses.sort(key=lambda pair: pair[0])

    # Save responses to a new CSV file with UTF-8 encoding
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['URL', 'Response'])
        writer.writeheader()
        writer.writerows(
            {'URL': url, 'Response': response} for url, response in responses
        )

# Run the download step against the CSV of roll-call URLs produced by the first cell
main('available_rolls.csv')

