In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from click import style

**This will set up our selenium scraper in order to get information from the page**

In [1]:
#url = 'https://www.opensecrets.org/members-of-congress/members-list?cong_no=118&cycle=2024&sort=N'
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

service = Service(ChromeDriverManager().install())

driver = webdriver.Chrome(service=service)


driver.get('https://www.opensecrets.org/members-of-congress/members-list?cong_no=118&cycle=2024&sort=N')

print('Page title', driver.title)

html_content = driver.page_source


Page title Members of the 118th Congress • OpenSecrets


This will initialize our Beautiful Soup scraper and get all the congresspeople rows that contains all the basic information

In [34]:
soup = BeautifulSoup(html_content, 'html.parser')
print("Title:", soup.title.text)
print("Number of tables:", len(soup.find_all('table')))
print("Number of divs:", len(soup.find_all('div')))
from selenium.webdriver.common.by import By


#print('Number of table rows', len(soup.find_all('tr')))

table_body = soup.find('tbody')

#print(table_body)

congress_people_basic_info = table_body.find_all('tr')


next_button = driver.find_element(By.CSS_SELECTOR, "a.paginate_button.next")
print("helloooo", next_button.get_attribute('outerHTML'))
driver.execute_script("arguments[0].click();", next_button)



Title: Members of the 118th Congress • OpenSecrets
Number of tables: 1
Number of divs: 270
helloooo <a class="paginate_button next" aria-controls="DataTables_Table_0" data-dt-idx="7" tabindex="0" id="DataTables_Table_0_next">Next</a>
None


Functions to convert money from string to int

In [5]:
# Function to parse strings such as '$1,100,900'
def parse_currency_string(value):
    if value[0] == "-":
        return -1 * int(value[1:].strip('$').replace(',', ''))

    return int(value.strip('$').replace(',', ''))


# This function will be similar but will parse string such as '$11.7M' or '$11.7B'
def parse_abbreviated_number(string_amount):
    scale_factors = {'M': 1_000_000, 'K': 1_000, 'B': 1_000_000_000}

    cleaned_string = string_amount.strip().strip('$')
    try:

        scale = cleaned_string[-1].upper()
        numerical_part = cleaned_string[:-1]
        factor = scale_factors[scale]

        return int(float(numerical_part) * factor)

    except (KeyError, ValueError, IndexError) as e:
        print(f"Error: {e}")
        return None


print(parse_abbreviated_number('$11.7M'))
print(parse_currency_string('-$1,100,900'))


11700000
-1100900


**getting basic information from the congressman**

In [41]:
congress_information = []
base_url = 'https://www.opensecrets.org'
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
driver.get("https://www.opensecrets.org/members-of-congress/members-list?cong_no=118&cycle=2024&sort=N")


while True:
    try:
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')
        table_body = soup.find('tbody')
        congress_people_basic_info = table_body.find_all('tr')
        for entry in congress_people_basic_info:
            #print(entry)
            congressman_information = {}
            row = entry.find_all('td')
            #print("This is the row", row)
            link = entry.find_all('a')[0]['href']
            #print("This is the link", link)
            full_link = base_url + link


            congressman_information['link'] = full_link
            congressman_information['name'] = row[0].find('a').text.strip()
            congressman_information['state'] = row[1].text
            congressman_information['chamber'] = row[2].text
            congressman_information['party'] = row[3].text
            congressman_information['raised'] = parse_currency_string(row[4].text)
            congressman_information['spent'] = parse_currency_string(row[5].text)
            congressman_information['cash_on_hand'] = parse_currency_string(row[6].text)
            congressman_information['debts'] = parse_currency_string(row[7].text)

            congress_information.append(congressman_information)


            print(congressman_information)

        next_button = driver.find_element(By.CSS_SELECTOR, "a.paginate_button.next")


        if "disabled" in next_button.get_attribute("class"):
            break

        if next_button:
            driver.execute_script("arguments[0].click();", next_button)

            WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "tbody"))
            )

        else:
            break


    except (KeyError, ValueError, IndexError) as e:
        print(f"Error: {e}")



required = ["link","name","state","chamber","party","raised","spent","cash_on_hand","debts"]
num_fields = len(required)

issues = []

for i, m in enumerate(congress_information):
    missing = [k for k in required if k not in m]
    empties = [k for k in required if k in m and (m[k] in (None, "", []))]
    wrong_types = []
    # simple type checks
    for k in ["raised","spent","cash_on_hand","debts"]:
        if k in m and not isinstance(m[k], (int, float)):
            wrong_types.append(k)
    # quick link sanity
    bad_link = ("link" in m) and not (isinstance(m["link"], str) and m["link"].startswith("https://www.opensecrets.org/"))

    if missing or empties or wrong_types or bad_link:
        issues.append({
            "index": i,
            "name": m.get("name"),
            "missing": missing,
            "empty": empties,
            "wrong_types": wrong_types,
            "bad_link": bad_link,
        })

print(f"Total objects: {len(congress_information)}")
print(f"With issues:   {len(issues)}")
print(issues[:5])  # peek at first few problems


for index, member in enumerate(congress_information):

    if member['name'] == 'Earl Blumenauer':
        print(f"{index}: {member['name']}")










{'link': 'https://www.opensecrets.org/members-of-congress/adams-alma/summary?cid=N00035451&cycle=2024', 'name': 'Alma Adams', 'state': 'North Carolina', 'chamber': 'House', 'party': 'Democrat', 'raised': 714938, 'spent': 684971, 'cash_on_hand': 573409, 'debts': 0}
{'link': 'https://www.opensecrets.org/members-of-congress/aderholt-robert-b/summary?cid=N00003028&cycle=2024', 'name': 'Robert B Aderholt', 'state': 'Alabama', 'chamber': 'House', 'party': 'Republican', 'raised': 1470649, 'spent': 1663198, 'cash_on_hand': 966896, 'debts': 0}
{'link': 'https://www.opensecrets.org/members-of-congress/aguilar-pete/summary?cid=N00033997&cycle=2024', 'name': 'Pete Aguilar', 'state': 'California', 'chamber': 'House', 'party': 'Democrat', 'raised': 5808795, 'spent': 4739480, 'cash_on_hand': 2649071, 'debts': 0}
{'link': 'https://www.opensecrets.org/members-of-congress/alford-mark/summary?cid=N00049617&cycle=2024', 'name': 'Mark Alford', 'state': 'Missouri', 'chamber': 'House', 'party': 'Republican',


Function to convert end of year to the actual year

In [16]:
# We will convert the string '12' or '02' to be 12 and 2 respectively first
# We will then pass it into the function
import datetime

def convert_two_digit_year(year):
    int_year = int(year)
    current_year_two_digit = datetime.datetime.now().year % 100

    if int_year <= current_year_two_digit:
        return 2000 + int_year
    else:
        return 1900 + int_year


In [17]:
# Function will convert text into valid key
# For example, 'PAC Contributions' will be equal to pac_contributions
import re


def clean_key(raw_string):
    """
    Converts a string into a clean, snake_case format for dictionary keys.
    e.g., "PAC Contributions*" -> "pac_contributions"
    """
    # 1. Convert to lowercase
    s = raw_string.lower()
    # 2. Replace all non-alphanumeric characters (except spaces) with nothing
    s = re.sub(r'[^a-z0-9\s]', '', s)
    # 3. Replace spaces and consecutive spaces with a single underscore
    s = re.sub(r'\s+', '_', s)
    # 4. Remove leading/trailing underscores
    s = s.strip('_')
    return s


print(clean_key(' Contributions*'))



contributions


In [59]:
# creating a function that will in charge of scraping from the other page
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
import re
import time


def scrape_individual_page(link_congressman):

    top_contributors_list = []
    top_industries_list = []
    total_vs_avg_list = []
    sources_of_funds = []

    # This is responsible for getting us to the individual member page
    driver.get("about:blank")  # Clear any existing state
    driver.get(link_congressman)
    page_title = driver.title
    individual_page_src = driver.page_source
    individual_soup = BeautifulSoup(individual_page_src, 'html.parser')


    election_dates = individual_soup.find_all('div', class_="Congress--profile-timeline-item")




    total_vs_avg_list = []

    # Dynamically match the plot group regardless of the number
    chart_for_earned = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((
            By.CSS_SELECTOR,
            "[class^='raphael-group-'][class$='-plot-group']"
        ))
    )

    # Find all bar elements
    bar_elements = chart_for_earned.find_elements(By.TAG_NAME, 'rect')
    print(f"Found {len(bar_elements)} bars")

    actions = ActionChains(driver)

    for bar in bar_elements:
        try:
            # Hover over each bar to trigger tooltip
            actions.move_to_element(bar).perform()

            # Wait until tooltip is visible
            tooltip_el = WebDriverWait(driver, 14).until(
                EC.visibility_of_element_located((By.CLASS_NAME, "fc__tooltip"))
            )

            # Get tooltip HTML directly
            tooltip_html = tooltip_el.get_attribute("outerHTML")
            tooltip_soup = BeautifulSoup(tooltip_html, "html.parser")

            # Pull all divs from the tooltip
            data_divs = tooltip_soup.find_all('div')
            if not data_divs or len(data_divs) < 4:
                continue


            print(data_divs[0].text.split(','))
            # Extract the numbers
            year = convert_two_digit_year(data_divs[3].text.split(',')[1])
            total_raised_by_congressman = parse_abbreviated_number(data_divs[3].text.split(',')[2])
            average_raised = parse_abbreviated_number(data_divs[0].text.split(',')[-1])

            # Store the result
            total_vs_avg_list.append({
                'year': year,
                'total_raised_by_congressman': total_raised_by_congressman,
                'average_raised': average_raised
            })

        except Exception as e:
            print("Tooltip did not appear:", e)
            continue





    # these will be switched once we loop through all of the links
    print('Here are the election dates', election_dates)
    # This will include information of the election dates



    for item in election_dates:
        date = item.select_one('.Congress--profile-timeline-date')
        label = item.select_one('.Congress--profile-timeline-label')


        date_text = date.get_text(strip=True) if date else None
        label_text = label.get_text(strip=True)

        print("date", date_text)
        print("label", label_text)
        print(repr(label_text), repr(date_text))

        if not label_text:
            continue

        if label_text == 'First Election':
            congress_information[20]['first_election'] = date_text

        elif label_text in ["Last Election", "Next Election", "Retiring at session end"]:
            congress_information[20]["election_type"] = label_text
            congress_information[20]["election_year"] = date_text if date_text and re.fullmatch(r"\d{4}", date_text) else None



    top_industries_contributors_numbers = individual_soup.find_all('div', class_="Congress--profile-top-numbers--info--stats-number")
    top_industries_contributor_names = individual_soup.find_all('div', class_="Congress--profile-top-numbers--info--stats-name")

    # This will have to be cleaned up to remove dollar sign and commas
    congress_information[20]['top_industry'] = top_industries_contributor_names[0].text
    congress_information[20]['top_contributor'] = top_industries_contributor_names[1].text

    # Getting the numbers for the industry and contributor
    congress_information[20]['top_industry_number'] = parse_currency_string(top_industries_contributors_numbers[0].text)
    congress_information[20]['top_contributor_number'] = parse_currency_string(top_industries_contributors_numbers[1].text)


    tables_for_individual_page = individual_soup.find_all('table', class_="js-scrollable")


    contributor_table = tables_for_individual_page[0]
    industries_table = tables_for_individual_page[1]
    # Total vs average raised

    # {year: 12, raised_by_congressman: 1.23, average_raised: 2.44 }

    # If year < 24 and <= 00

    # This will get everything in the contributors table
    for tr in contributor_table.find_all('tr'):
        top_contributors_object = {}
        contributor_row = tr.find_all('td')
        if not contributor_row:
            continue

        top_contributors_object['Contributor'] = contributor_row[0].text
        top_contributors_object['total'] = contributor_row[1].text
        top_contributors_object['individuals'] = contributor_row[2].text
        top_contributors_object['pacs'] = contributor_row[3].text

        top_contributors_list.append(top_contributors_object)
        congress_information[20]['top_contributors'] = top_contributors_list
        congress_information[20]['total_vs_avg'] = total_vs_avg_list



    # This will get everything in the industries table
    for tr in industries_table.find_all('tr'):

        top_industries_object = {}

        industry_row = tr.find_all('td')

        if not industry_row:
            continue

        top_industries_object['industry'] = industry_row[0].text
        top_industries_object['total'] = industry_row[1].text
        top_industries_object['individuals'] = industry_row[2].text
        top_industries_object['pacs'] = industry_row[3].text


        top_industries_list.append(top_industries_object)

        # The index will be changed dynamically soon
        congress_information[20]['top_industries'] = top_industries_list


    table_for_contributions = individual_soup.find_all('div', class_="HorizontalStackedBar")

    all_contribution_data = []
    for div_element in table_for_contributions:

        trs_in_div = div_element.find_all('tr')
        for td in trs_in_div:
            contribution_data = {}
            cells = td.find_all('td')

            contribution_type = cells[0].text
            contribution_amount = cells[1].text
            contribution_percent = cells[2].text


            contribution_data['contribution_type'] = clean_key(contribution_type)
            contribution_data['contribution_amount'] = parse_currency_string(contribution_amount.strip())
            contribution_data['contribution_percent'] = int(float(contribution_percent.strip().strip('%')))

            all_contribution_data.append(contribution_data)


        congress_information[20]['all_contribution_data'] = all_contribution_data













# Class name for hoverables
# fc__tooltip fusioncharts-div
    return page_title
print(scrape_individual_page(congress_information[20]['link']))
print(congress_information[20])


Found 11 bars
['■\xa0Total Raised by John Barrasso - Campaign Committee', ' 96', ' $549.77K■\xa0Average Raised by Senate Members - Campaign Committee', ' 96', ' $1.81M']
['■\xa0Total Raised by John Barrasso - Campaign Committee', ' 98', ' $501■\xa0Average Raised by Senate Members - Campaign Committee', ' 98', ' $2.1M']
Error: '1'
['■\xa0Total Raised by John Barrasso - Campaign Committee', ' 08', ' $2.59M■\xa0Average Raised by Senate Members - Campaign Committee', ' 08', ' $3.13M']
['■\xa0Total Raised by John Barrasso - Campaign Committee', ' 10', ' $525.53K■\xa0Average Raised by Senate Members - Campaign Committee', ' 10', ' $3.13M']
['■\xa0Total Raised by John Barrasso - Campaign Committee', ' 12', ' $4.01M■\xa0Average Raised by Senate Members - Campaign Committee', ' 12', ' $3.19M']
['■\xa0Total Raised by John Barrasso - Campaign Committee', ' 14', ' $843.91K■\xa0Average Raised by Senate Members - Campaign Committee', ' 14', ' $3.93M']
['■\xa0Total Raised by John Barrasso - Campaign 

In [1]:
driver.close()

NameError: name 'driver' is not defined