In [None]:
import requests
from bs4 import BeautifulSoup
import logging
import csv

# Setup basic configuration for logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# URL of the page to scrape
url = "https://www.senate.gov/legislative/votes_new.htm"

# Send a HTTP request to the URL.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from trying to parse an HTTP error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
logging.info('Fetched the votes page from the Senate website')

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find the select element that contains the years and URLs
select_element = soup.find('select', {'name': 'menu'})
if select_element is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(
        f"Could not find <select name='menu'> on {url}; the page layout may have changed"
    )

# Define the years of interest
years_of_interest = ['2020', '2021', '2022', '2023', '2024']
urls_to_fetch = []

# Iterate through the option elements within the select element
for option in select_element.find_all('option'):
    # Extract the text (year) and value (URL) from each option element
    year_text = option.text.strip()
    year_url = option.get('value')
    if not year_url:
        # Options without a value have no URL to follow, so skip them
        # rather than raising KeyError.
        continue
    # Check if the year is one of the years of interest
    if any(year in year_text for year in years_of_interest):
        full_url = f"https://www.senate.gov{year_url}"
        logging.info(f"Found URL for year {year_text}: {full_url}")
        # The download step requests the .xml counterpart of each .htm page.
        urls_to_fetch.append(full_url.replace('.htm', '.xml'))

# Filename for the CSV output
csv_filename = "senate_summarized_raw.csv"

# Open the CSV file for writing. Each successfully fetched document is stored
# as one single-column row containing the raw response body.
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Assuming the structure of the XML is known and consistent
    for xml_url in urls_to_fetch:
        logging.info(f"Fetching data from {xml_url}")
        try:
            # Timeout so one stalled connection cannot hang the whole cell.
            response = requests.get(xml_url, timeout=10)
        except requests.RequestException as exc:
            # Log and move on: one unreachable URL should not discard the
            # documents that can still be downloaded.
            logging.error(f"Error fetching {xml_url}: {exc}")
            continue
        if response.status_code == 200:
            # This is a simplification. You'd likely need to parse the XML and extract specific data
            writer.writerow([response.text])
            logging.info(f"Successfully wrote data from {xml_url}")
        else:
            logging.error(f"Failed to fetch data from {xml_url}")


In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Vote-menu pages to scan, newest session first.
# The first cell already collects a similar list (with .xml suffixes) from the
# live year menu, so this hard-coded copy could be derived from that list
# instead of being spelled out here.
urls_to_fetch = [
    f"https://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_{congress}_{session}.htm"
    for congress, session in ((118, 2), (118, 1), (117, 2), (117, 1), (116, 2))
]

# Function to fetch and parse hyperlinks under the "Vote (Tally)" column
def fetch_vote_tally_links(urls, timeout=10):
    """Collect every hyperlink found in the votes table of each page.

    Args:
        urls: Iterable of page URLs to download and scan.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``(source_url, href)`` tuples, in page order. Pages that do
        not answer with HTTP 200, or that contain no table, contribute nothing.
        Anchors without a usable ``href`` are skipped.

    Raises:
        requests.RequestException: If a request fails at the network level.
    """
    all_links = []  # Store all the hyperlinks here

    for url in urls:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            continue  # Best effort: skip pages that could not be fetched

        soup = BeautifulSoup(response.content, 'html.parser')
        # Prefer the table with id "listOfVotes"; otherwise fall back to the
        # first table on the page.
        table = soup.find('table', id='listOfVotes') or soup.find('table')
        if table is None:
            continue

        # href=True restricts the match to anchors that carry an href, so no
        # (url, None) rows end up in the CSV written from this list.
        # You might still need to filter these links by their table column.
        for link in table.find_all('a', href=True):
            href = link['href'].strip()
            if href:
                all_links.append((url, href))

    return all_links

# Function to write the hyperlinks to a CSV file
def write_links_to_csv(links, filename='vote_tally_links.csv'):
    """Save the collected links as a two-column CSV file.

    Args:
        links: Iterable of rows (typically ``(source_url, href)`` pairs).
        filename: Destination path; the file is overwritten if it exists.
    """
    header = ['Source URL', 'Hyperlink']
    with open(filename, mode='w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        # One CSV row per collected link, in the order given.
        csv_out.writerows(links)

# Main execution flow
# Scrape every page in urls_to_fetch and save the (source page, href) pairs
# using write_links_to_csv's default filename (vote_tally_links.csv).
# In a notebook kernel __name__ is "__main__", so this runs whenever the cell
# is executed.
if __name__ == "__main__":
    links = fetch_vote_tally_links(urls_to_fetch)
    write_links_to_csv(links)

# NOTE: the two calls above are active (not commented out), so running this
# cell performs the network requests and rewrites the CSV.


In [5]:
import csv
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlsplit

import requests

# Setup basic logging
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, e.g. after the first cell has run in the same kernel. It only takes
# effect when this cell is the first to configure logging.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def read_urls_from_csv(filename='vote_tally_links.csv'):
    """Load hyperlinks from the links CSV and turn them into XML URLs.

    The first row is treated as a header. For every following row the second
    column is read as an href, resolved against the senate.gov site root when
    it is relative, kept only if it is an http(s) URL on a senate.gov host,
    and given an ``.xml`` extension when its path ends in ``.htm``.

    Rows that are blank, have fewer than two columns, or whose href is empty,
    a bare ``#fragment``, or not parseable as a URL are skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of absolute URL strings, in file order.
    """
    site_root = 'https://www.senate.gov/'
    urls = []
    with open(filename, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Skip header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            if len(row) < 2:
                continue  # blank or malformed line
            href = row[1].strip()
            if not href or href.startswith('#'):
                # Nothing to fetch: a missing href or an in-page anchor.
                continue
            try:
                # urljoin leaves absolute URLs alone and resolves relative
                # ones against the site root without fusing them onto the
                # hostname when the leading slash is missing.
                parts = urlsplit(urljoin(site_root, href))
                host = parts.hostname or ''
            except ValueError:
                continue  # unparseable URL
            if parts.scheme not in ('http', 'https'):
                continue  # mailto:, javascript:, ...
            # Compare the real hostname rather than searching the whole URL,
            # so "https://example.com/?ref=senate.gov" is rejected.
            if host != 'senate.gov' and not host.endswith('.senate.gov'):
                continue
            if parts.path.endswith('.htm'):
                # Swap only the extension; ".html" pages are left untouched.
                parts = parts._replace(path=parts.path[:-len('.htm')] + '.xml')
            urls.append(parts.geturl())
    return urls

def fetch_url(url):
    """Download one URL and report the outcome as a ``(url, payload)`` pair.

    The payload is the response body on HTTP 200. Otherwise it is a string
    starting with ``"Error: "`` that describes the failure (a non-200 status
    code or the request exception). Request errors are logged and returned,
    not raised. Each request uses a 10-second timeout.
    """
    logging.info(f'Fetching URL: {url}')
    try:
        reply = requests.get(url, timeout=10)
        status = reply.status_code
        if status != 200:
            logging.error(f'Failed to fetch {url} with status code: {status}')
            return url, f'Error: Failed with status code {status}'
        logging.info(f'Successfully fetched: {url}')
        return url, reply.text
    except requests.RequestException as err:
        reason = str(err)
        logging.error(f'Error fetching {url}: {reason}')
        return url, f'Error: {reason}'

def write_responses_to_csv(responses, filename='api_responses_filtered.csv'):
    """Write fetched results to a CSV file and log where they were saved.

    Args:
        responses: Iterable of rows (typically ``(url, payload)`` pairs).
        filename: Destination path; the file is overwritten if it exists.
    """
    column_names = ['URL', 'API Response']
    with open(filename, mode='w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(column_names)
        # One CSV row per result, in the order given.
        csv_out.writerows(responses)
    logging.info(f'Responses written to {filename}')

# Download every URL listed in the links CSV using a pool of 10 worker threads,
# then write all (url, payload) results to the responses CSV.
if __name__ == "__main__":
    urls = read_urls_from_csv()
    responses = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit everything up front so the downloads still run concurrently...
        futures = [executor.submit(fetch_url, url) for url in urls]
        # ...but collect results in submission order, so the CSV rows follow
        # the input order instead of depending on which thread finishes first.
        for url, future in zip(urls, futures):
            try:
                responses.append(future.result())
            except Exception as exc:
                # fetch_url already turns request errors into "Error: ..."
                # payloads; this only catches unexpected failures.
                logging.error(f'{url} generated an exception: {exc}')

    write_responses_to_csv(responses)


2024-03-02 22:39:42,259 - INFO - Fetching URL: https://www.senate.gov/legislative/LIS/roll_call_votes/vote1182/vote_118_2_00067.xml
2024-03-02 22:39:42,262 - INFO - Fetching URL: https://www.senate.gov/legislative/LIS/roll_call_votes/vote1182/vote_118_2_00066.xml
2024-03-02 22:39:42,263 - INFO - Fetching URL: https://www.senate.gov/legislative/LIS/roll_call_votes/vote1182/vote_118_2_00065.xml
2024-03-02 22:39:42,264 - INFO - Fetching URL: https://www.senate.gov/legislative/LIS/roll_call_votes/vote1182/vote_118_2_00064.xml
2024-03-02 22:39:42,266 - INFO - Fetching URL: https://www.senate.gov/legislative/LIS/roll_call_votes/vote1182/vote_118_2_00063.xml
2024-03-02 22:39:42,267 - INFO - Fetching URL: https://www.senate.gov/legislative/LIS/roll_call_votes/vote1182/vote_118_2_00062.xml
2024-03-02 22:39:42,267 - INFO - Fetching URL: https://www.senate.gov/legislative/LIS/roll_call_votes/vote1182/vote_118_2_00061.xml
2024-03-02 22:39:42,269 - INFO - Fetching URL: https://www.senate.gov/legisl