In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from click import style

**This sets up our Selenium scraper so that we can load the page and read its HTML.**

In [2]:
#url = 'https://www.opensecrets.org/members-of-congress/members-list?cong_no=118&cycle=2024&sort=N'
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# ChromeDriverManager().install() supplies the chromedriver path that Service needs
# (presumably downloading the binary on first use — see the webdriver_manager docs).
service = Service(ChromeDriverManager().install())

# `driver` is a module-level browser session; the individual-page scraper further down reuses it.
driver = webdriver.Chrome(service=service)


driver.get('https://www.opensecrets.org/members-of-congress/members-list?cong_no=118&cycle=2024&sort=N')

print('Page title', driver.title)

# Snapshot of the page's HTML, parsed with BeautifulSoup in the next cell.
# NOTE(review): there is no explicit wait before this snapshot; if the members table were
# filled in by JavaScript after load, it could be missing here — confirm.
html_content = driver.page_source


Page title Members of the 118th Congress • OpenSecrets


This initializes our Beautiful Soup parser and collects every table row for the members of Congress; each row contains one member's basic information.

In [3]:
soup = BeautifulSoup(html_content, 'html.parser')
print("Title:", soup.title.text)
print("Number of tables:", len(soup.find_all('table')))
print("Number of divs:", len(soup.find_all('div')))

# Every <tr> on the page: one row per member with name, state, chamber, party and some
# financials. This is collected once; the previous version re-ran the identical query
# for every child of <tbody>, and left the name undefined when <tbody> had no children.
congress_people_basic_info = soup.find_all('tr')
print('Number of table rows', len(congress_people_basic_info))

table_body = soup.find('tbody')
if table_body is None:
    # Fail with a readable message instead of the TypeError that iterating None produced.
    raise RuntimeError("No <tbody> found in the page source; the members table is missing")

print(table_body)



Title: Members of the 118th Congress • OpenSecrets
Number of tables: 1
Number of divs: 234
Number of table rows 541
<tbody>
<tr>
<td class="color-category blue" style="height: 100%; vertical-align: middle;"><span class="hiddentext">Adams, Alma</span><a href="/members-of-congress/adams-alma/summary?cid=N00035451&amp;cycle=2024"> Alma Adams</a></td>
<td class="" style="height: 100%; vertical-align: middle;">North Carolina</td>
<td class="" style="height: 100%; vertical-align: middle;">House</td>
<td style=" display: none; height: 100%; vertical-align: middle;">Democrat</td>
<td class="number" style="height: 100%; vertical-align: middle;">$714,938</td>
<td class="number" style="height: 100%; vertical-align: middle;">$684,971</td>
<td class="number" style="height: 100%; vertical-align: middle;">$573,409</td>
<td class="number" style="height: 100%; vertical-align: middle;">$0</td>
</tr>
<tr>
<td class="color-category red" style="height: 100%; vertical-align: middle;"><span class="hiddentext

Functions to convert money from string to int

In [4]:
# Function to parse strings such as '$1,100,900'
def parse_currency_string(value):
    """Convert a formatted whole-dollar string to an int.

    Accepts strings such as '$1,100,900', '-$1,100,900' or '$0'. Surrounding
    whitespace (common in text scraped from table cells) is ignored.

    Args:
        value: The amount as text, with optional leading '-', '$' and thousands commas.

    Returns:
        The amount as an int; negative when the text starts with '-'.

    Raises:
        ValueError: If nothing numeric is left after removing the sign, '$' and commas,
            or if what is left is not a whole number.
    """
    cleaned = value.strip()

    # The site writes negatives as '-$1,100,900', so the sign comes before the '$'.
    negative = cleaned.startswith('-')
    if negative:
        cleaned = cleaned[1:].lstrip()

    cleaned = cleaned.strip('$').replace(',', '')
    if not cleaned.strip():
        # An empty cell used to surface as an IndexError from value[0].
        raise ValueError(f"No numeric amount found in {value!r}")

    amount = int(cleaned)
    return -amount if negative else amount


# This function will be similar but will parse string such as '$11.7M' or '$11.7B'
def parse_abbreviated_number(string_amount):
    """Convert an abbreviated dollar string such as '$11.7M' to an int.

    Supported suffixes are K (thousand), M (million) and B (billion), in either
    case. A value with no letter suffix (e.g. '$500') is taken at face value, and
    a leading '-' makes the result negative.

    Args:
        string_amount: The amount as text, e.g. '$461.84K'.

    Returns:
        The amount in whole dollars as an int, or None (after printing the error)
        when the text is empty, has an unknown suffix, or is not numeric.
    """
    scale_factors = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    cleaned_string = string_amount.strip()
    sign = 1
    if cleaned_string.startswith('-'):
        sign = -1
        cleaned_string = cleaned_string[1:].lstrip()
    cleaned_string = cleaned_string.strip('$').replace(',', '')

    try:
        suffix = cleaned_string[-1].upper()
        if suffix.isalpha():
            # Unknown letters raise KeyError and are reported below.
            factor = scale_factors[suffix]
            numerical_part = cleaned_string[:-1]
        else:
            # No suffix at all: a plain amount under one thousand.
            factor = 1
            numerical_part = cleaned_string

        # round() rather than int(): the float product can land a hair below the
        # true value, and truncation would then drop a whole dollar.
        return sign * round(float(numerical_part) * factor)

    except (KeyError, ValueError, IndexError) as e:
        print(f"Error: {e}")
        return None


# Quick sanity check of both parsers: the expected output is 11700000, then -1100900.
print(parse_abbreviated_number('$11.7M'))
print(parse_currency_string('-$1,100,900'))


11700000
-1100900


**Getting the basic information for each member of Congress**

In [5]:
base_url = 'https://www.opensecrets.org'
congress_information = []

# The first and last <tr> are skipped — presumably a header/footer rather than
# members; TODO confirm that the final row is not a real member.
for entry in congress_people_basic_info[1:-1]:
    row = entry.find_all('td')
    link = entry.find_all('a')[0]['href']

    # The href is site-relative, so prefix the host to get a URL Selenium can open.
    full_link = base_url + link

    # Cell order in each row: name, state, chamber, party, raised, spent, cash on hand, debts.
    congressman_information = {
        'link': full_link,
        'name': row[0].find('a').text.strip(),
        'state': row[1].text,
        'chamber': row[2].text,
        'party': row[3].text,
        'raised': parse_currency_string(row[4].text),
        'spent': parse_currency_string(row[5].text),
        'cash_on_hand': parse_currency_string(row[6].text),
        'debts': parse_currency_string(row[7].text),
    }
    congress_information.append(congressman_information)

# Spot check: report the list position of one known member.
for index, member in enumerate(congress_information):
    if member['name'] == 'Richard Blumenthal':
        print(f"{index}: {member['name']}")










35: Richard Blumenthal


Function to convert a two-digit year (for example '12') into the full four-digit year (2012)

In [6]:
# The function takes the two-digit year as text, e.g. '12' or '02', and converts it with int()
# (giving 12 and 2 respectively) before deciding which century the year belongs to
import datetime

def convert_two_digit_year(year, reference_year=None, lookahead_years=0):
    """Expand a two-digit year into a four-digit year.

    The year is placed in the century of ``reference_year`` unless that would put
    it more than ``lookahead_years`` after ``reference_year``, in which case it is
    placed in the previous century.

    Args:
        year: The two-digit year as an int or as text accepted by ``int()``
            (e.g. '12', ' 02').
        reference_year: Four-digit pivot year. Defaults to the current calendar
            year, which matches the previous behaviour.
        lookahead_years: How many years past ``reference_year`` still count as
            "this century". The default of 0 matches the previous behaviour;
            pass 1 or more if labels for a cycle that ends after the current
            year (e.g. '26' during 2025) should map to the future.

    Returns:
        The four-digit year as an int.

    Raises:
        ValueError: If ``year`` is not an integer in the range 0-99. Previously a
            value such as '2024' silently produced 3924.
    """
    int_year = int(year)
    if not 0 <= int_year <= 99:
        raise ValueError(f"Expected a two-digit year between 0 and 99, got {year!r}")

    if reference_year is None:
        reference_year = datetime.datetime.now().year

    century = reference_year - reference_year % 100
    candidate = century + int_year

    if candidate <= reference_year + lookahead_years:
        return candidate
    return candidate - 100


In [7]:
# Function will convert text into valid key
# For example, 'PAC Contributions' will be equal to pac_contributions
import re


def clean_key(raw_string):
    """
    Turn arbitrary label text into a snake_case dictionary key.

    The text is lower-cased, every character other than a-z, 0-9 and whitespace
    is dropped, each run of whitespace becomes one underscore, and underscores
    at either end are trimmed.
    e.g., "PAC Contributions*" -> "pac_contributions"
    """
    lowered = raw_string.lower()
    # Punctuation is removed outright (not replaced), so "a-b" becomes "ab".
    alphanumeric_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    underscored = re.sub(r'\s+', '_', alphanumeric_only)
    return underscored.strip('_')


# Sanity check: the leading space and the trailing '*' are removed, so this prints 'contributions'.
print(clean_key(' Contributions*'))



contributions


In [8]:
# Create the function that is in charge of scraping each member's individual page
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
import time


def _text_at(elements, index):
    """Return ``elements[index].text``, or None when the page has fewer matches than that."""
    return elements[index].text if index < len(elements) else None


def _parse_chart_tooltip(tooltip_div):
    """Turn one chart tooltip into a {'year', 'total_raised_by_congressman', 'average_raised'} dict.

    Returns None when the tooltip is missing or does not contain the expected nested divs.
    """
    if tooltip_div is None:
        return None

    data_divs = tooltip_div.find_all('div')
    if len(data_divs) < 4:
        return None

    # Each line reads "<series label>, <two-digit year>, <amount>". Split from the
    # right so that a comma inside the label cannot shift the year/amount fields.
    # Index 0 is the member's line and index 3 the comparison (average) line.
    _, _, total_text = data_divs[0].text.rsplit(',', 2)
    _, year_text, average_text = data_divs[3].text.rsplit(',', 2)

    return {
        'year': convert_two_digit_year(year_text),
        'total_raised_by_congressman': parse_abbreviated_number(total_text),
        'average_raised': parse_abbreviated_number(average_text),
    }


def _parse_ranked_table(table, name_key):
    """Read a four-column table (name, total, individuals, pacs) into a list of dicts."""
    parsed_rows = []
    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        # Rows without data cells (e.g. header rows) are skipped.
        if len(cells) < 4:
            continue
        parsed_rows.append({
            name_key: cells[0].text,
            'total': cells[1].text,
            'individuals': cells[2].text,
            'pacs': cells[3].text,
        })
    return parsed_rows


def _parse_contribution_sources(page_soup):
    """Read every 'HorizontalStackedBar' table into dicts of type, amount and percent."""
    all_contribution_data = []
    for div_element in page_soup.find_all('div', class_="HorizontalStackedBar"):
        for tr in div_element.find_all('tr'):
            cells = tr.find_all('td')
            # Same guard as the ranked tables: ignore rows that carry no data cells.
            if len(cells) < 3:
                continue
            all_contribution_data.append({
                'contribution_type': clean_key(cells[0].text),
                'contribution_amount': parse_currency_string(cells[1].text.strip()),
                'contribution_percent': int(float(cells[2].text.strip().strip('%'))),
            })
    return all_contribution_data


def scrape_individual_page(link_congressman, member_record=None):
    """Scrape one member's profile page and store the results on that member's record.

    Uses the module-level Selenium ``driver`` to open the page, hovers over each
    bar of the fundraising chart to read its tooltip (class
    ``fc__tooltip fusioncharts-div``), and parses the election dates, top
    industry/contributor figures, the two ``js-scrollable`` tables and the
    ``HorizontalStackedBar`` contribution tables from the page HTML.

    The record is updated in place with the keys ``first_election``,
    ``upcoming_election``, ``top_industry``, ``top_contributor``,
    ``top_industry_number``, ``top_contributor_number``, ``top_contributors``,
    ``total_vs_avg``, ``top_industries`` and ``all_contribution_data``. Sections
    that are absent from the page are stored as None (single values) or an empty
    list, so every record ends up with the same keys.

    Args:
        link_congressman: Absolute URL of the member's profile page.
        member_record: The dict to update. When omitted, the entry of
            ``congress_information`` whose ``'link'`` equals ``link_congressman``
            is used (previously everything was written to index 2 regardless of
            which link was scraped).

    Returns:
        The title of the scraped page.

    Raises:
        ValueError: If ``member_record`` is omitted and no entry of
            ``congress_information`` has this link.
        selenium TimeoutException: If the chart container does not appear
            within 10 seconds.
    """
    if member_record is None:
        member_record = next(
            (member for member in congress_information if member.get('link') == link_congressman),
            None,
        )
        if member_record is None:
            raise ValueError(f"No entry in congress_information has link {link_congressman!r}")

    # This is responsible for getting us to the individual member page
    driver.get("about:blank")  # Clear any existing state
    driver.get(link_congressman)
    page_title = driver.title
    individual_soup = BeautifulSoup(driver.page_source, 'html.parser')

    election_dates = individual_soup.find_all('div', class_="Congress--profile-timeline-date")

    # NOTE(review): the '81' in this class name looks auto-generated by the chart
    # library and may differ between pages or site versions — confirm.
    chart_for_earned = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'raphael-group-81-plot-group'))
    )
    bar_elements = chart_for_earned.find_elements(By.TAG_NAME, 'rect')

    print(len(bar_elements))
    actions = ActionChains(driver)

    total_vs_avg_list = []
    for bar in bar_elements:
        print("THis is a bar", bar)
        actions.move_to_element(bar).perform()
        try:
            # Wait for tooltip to appear
            # NOTE(review): after the first bar the tooltip may already be visible, so
            # this wait might return before the text switches to the new bar — confirm.
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CLASS_NAME, "fc__tooltip"))
            )

            # The tooltip only exists in the live DOM, so re-read the page source now.
            bar_soup = BeautifulSoup(driver.page_source, 'html.parser')
            tooltip_div = bar_soup.find('div', class_="fc__tooltip fusioncharts-div")

            print(tooltip_div)

            average_vs_total_object = _parse_chart_tooltip(tooltip_div)
            if average_vs_total_object is not None:
                total_vs_avg_list.append(average_vs_total_object)
        except Exception as e:
            # Best effort: one unreadable bar should not abort the whole page.
            print("Tooltip did not appear:", e)
            continue

    member_record['first_election'] = _text_at(election_dates, 0)
    member_record['upcoming_election'] = _text_at(election_dates, 1)

    top_industries_contributors_numbers = individual_soup.find_all('div', class_="Congress--profile-top-numbers--info--stats-number")
    top_industries_contributor_names = individual_soup.find_all('div', class_="Congress--profile-top-numbers--info--stats-name")

    member_record['top_industry'] = _text_at(top_industries_contributor_names, 0)
    member_record['top_contributor'] = _text_at(top_industries_contributor_names, 1)

    # Getting the numbers for the industry and contributor
    # TODO: these are still raw text; they will have to be cleaned up to remove the
    # dollar sign and commas.
    member_record['top_industry_number'] = _text_at(top_industries_contributors_numbers, 0)
    member_record['top_contributor_number'] = _text_at(top_industries_contributors_numbers, 1)

    # The first scrollable table lists contributors, the second lists industries.
    tables_for_individual_page = individual_soup.find_all('table', class_="js-scrollable")
    top_contributors_list = (
        _parse_ranked_table(tables_for_individual_page[0], 'Contributor')
        if len(tables_for_individual_page) > 0 else []
    )
    top_industries_list = (
        _parse_ranked_table(tables_for_individual_page[1], 'industry')
        if len(tables_for_individual_page) > 1 else []
    )

    # Assigned once, outside the parsing loops, so the keys exist even when a table is empty.
    member_record['top_contributors'] = top_contributors_list
    member_record['total_vs_avg'] = total_vs_avg_list
    member_record['top_industries'] = top_industries_list
    member_record['all_contribution_data'] = _parse_contribution_sources(individual_soup)

    return page_title
# Trial run on a single member (index 2 of congress_information): prints the page title
# returned by the scraper, then the member's record so the added fields can be inspected.
print(scrape_individual_page(congress_information[2]['link']))
print(congress_information[2])


7
THis is a bar <selenium.webdriver.remote.webelement.WebElement (session="749c806ee4dac6bf121564ec0d1bc445", element="f.1C517C672122A4189784BD5DFAA826CA.d.2F2F36B05C233C51010925869B17EBE7.e.1000")>
<div class="fc__tooltip fusioncharts-div" style="color: rgb(84, 84, 84); padding: 3px; font-size: 10px; overflow: hidden; border-width: 1px; border-color: rgb(102, 102, 102); white-space: nowrap; position: absolute; border-style: solid; background-color: rgb(255, 255, 255); user-select: none; top: 322.328px; left: 108px; visibility: visible; pointer-events: none; box-shadow: rgba(64, 64, 64, 0.4) 1px 1px 3px; border-radius: 0px; font-family: Verdana, sans; will-change: left, top; max-width: 842px;"><div style="margin: 0;"><div style="color: #3d71b7; display: inline-block;">■ </div><div style="display: inline-block;">Total Raised by Pete Aguilar - Campaign Committee, 12, $461.84K</div></div><div style="margin: 0;"><div style="color: #AAAAAA; display: inline-block;">■ </div><div style="color: