# Scrape for Committee's Contribution Records

This notebook scrapes committees' contribution records from the [Wayne County Finance System](https://wccampaignfinance.com/Public/ReceiptsList) and saves them to [contributions.csv](../data/contributions.csv) in the data folder (the line that writes the file is commented out by default; uncomment it to start a new scrape). Scraping takes a long time, so when testing it is recommended to run it for only a few minutes. A sufficient amount of data has already been scraped into contributions.csv and is ready for immediate use. The scraper is not efficient, and because it waits a fixed amount of time after each command, it can break whenever a page takes longer than that to load.

The second part of this notebook reads the data in contributions.csv and generates pie charts, with details shown on hover, for a given contributor's name. A name associated with several zip codes gets several pie charts, one per zip code, so that different people who share a name are not grouped into the same chart.

## Imports

In [1]:
# Selenium & WebDriver
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service

# Others
import time
import pandas as pd
import numpy as np
from io import StringIO
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re
import random
import shutil

## Scraping

In [2]:
def get_driver():
    '''
    Initialize a Selenium Firefox WebDriver.

    A regular (visible) browser is tried first. If Selenium reports that it
    cannot be started, a headless/invisible browser is started instead and a
    notice is printed.

    Returns:
    WebDriver: A running Firefox WebDriver instance (regular or headless).

    Raises:
    WebDriverException: If the headless fallback cannot be started either.

    Notes:
    - geckodriver is looked up on PATH with shutil.which; the result (possibly
      None) is handed to Service as the executable path.
      NOTE(review): with None, Selenium is presumably left to locate a driver
      on its own - confirm for the installed Selenium version.
    - Only Selenium's WebDriverException triggers the headless fallback, so
      KeyboardInterrupt and programming errors are no longer swallowed.
    '''
    # Imported locally so this cell does not depend on an extra notebook-level import.
    from selenium.common.exceptions import WebDriverException

    gecko_path = shutil.which("geckodriver")
    service = Service(executable_path=gecko_path)
    opts = webdriver.FirefoxOptions()
    try:  # Try starting a regular driver
        driver = webdriver.Firefox(service=service, options=opts)
    except WebDriverException:  # If not possible, start a headless/invisible driver
        opts.add_argument("--headless")
        driver = webdriver.Firefox(service=service, options=opts)
        print('Headless/Invisible Driver')
    return driver

In [3]:
# Address of the Wayne County contribution (receipts) search page
url = "https://wccampaignfinance.com/Public/ReceiptsList"

# Start the Firefox session; booting can take a while, so pause before using it
driver = get_driver()
time.sleep(3)

# Load the search page and wait a fixed two seconds for it to come up
driver.get(url)
time.sleep(2)

In [4]:
# Click on the 'Continue' button
continue_button = driver.find_element(By.XPATH, "//input[@id='btnContinue']")
continue_button.click()
time.sleep(3)

# Click on the 'Reporting Period' dropdown menu
period_dropdown = driver.find_element(By.XPATH, "//select[@id='FilingPeriodName']")
period_dropdown.click()
time.sleep(1)

# Inside that dropdown, pick the option with an empty value to capture all periods
all_periods_option = period_dropdown.find_element(By.XPATH, ".//option[@value='']")
all_periods_option.click()
time.sleep(1)

In [None]:
# Read in the provided list of names (an Excel workbook, not a CSV);
# its 'Last' and 'First' columns drive the scraping loop further down.
official_df = pd.read_excel('../data/List_of_names_1_7_2.xlsx')
def get_contribution_df(last_name, first_name, selenium_id, df=None):
    """
    Scrapes and returns a DataFrame of contribution records for a given contributor using Selenium.

    The function types the contributor's last and first name into the search
    form of the page currently open in the module-level ``driver``, clicks
    'Search', and then reads the result table from every result page by
    following the 'Next >' link until it can no longer be clicked.

    Parameters:
        last_name (str): The last name of the contributor to search for.
        first_name (str): The first name of the contributor to search for.
        selenium_id (str): The HTML `id` of the table element containing the contributions data.
        df (pd.DataFrame, optional): An existing DataFrame to append results to. Defaults to None.

    Returns:
        pd.DataFrame: `df` (if given) followed by every table page scraped for
        this contributor, with a fresh 0..n-1 index.

    Raises:
        selenium.common.exceptions.WebDriverException: If a form field, the
            search button or the result table cannot be found.

    Notes:
        All waits are fixed `time.sleep` delays, so a page that loads more
        slowly than the delay can still make the lookups fail.
    """
    # Imported locally so this cell does not depend on an extra notebook-level import.
    from selenium.common.exceptions import WebDriverException

    def extract_table(selenium_id, df=None):
        """Read the table with id `selenium_id` from every result page and stack the pages."""
        # Collect the pages in a list and concatenate once at the end
        frames = [] if df is None else [df]
        while True:
            table_html = driver.find_element(
                By.XPATH, f"//table[@id='{selenium_id}']"
            ).get_attribute('outerHTML')  # get table from provided id
            frames.append(pd.read_html(StringIO(table_html))[0])  # HTML table -> pandas table

            # Long records are split over several pages. Only a failure to find
            # or click 'Next >' ends the pagination; errors while reading a
            # table propagate instead of silently truncating the data.
            try:
                driver.find_element(By.XPATH, "//a[text()='Next >']").click()
            except WebDriverException:
                break
            time.sleep(5)  # fixed wait for the next page to load
        return pd.concat(frames, ignore_index=True)

    last_name_box = driver.find_element(By.XPATH, "//input[@id='txtContributorName']")
    last_name_box.clear()
    last_name_box.send_keys(last_name)  # Fill in the last name of the contributor
    time.sleep(1)

    first_name_box = driver.find_element(By.XPATH, "//input[@id='txtFirstName']")
    first_name_box.clear()
    first_name_box.send_keys(first_name)  # Fill in the first name of the contributor
    time.sleep(1)

    search_button = driver.find_element(By.XPATH, "//input[@id='btnSearch']")
    search_button.click()  # Click the 'Search' button
    time.sleep(7)  # fixed wait for the search results to load

    # Use the table id supplied by the caller instead of a hard-coded one
    return extract_table(selenium_id, df)

# Walk through every (last, first) name pair in the provided dataset and keep
# appending each person's contribution records to one growing DataFrame.
df = None
for last, first in zip(official_df['Last'], official_df['First']):
    df = get_contribution_df(last, first, 'LoadViewReceipts', df)

    # save the data into data/contributions.csv, uncomment the line below if you want to start a whole new scraping process
    # df.to_csv('../data/contributions.csv', index=False) 

## Visualization

In [None]:
zip_code_pattern = r'\b([A-Z]{2})\s(\d{5})\b' # The pattern is 2 letters for state code and 5 numbers for zip code 


def _extract_state_zip(address):
    """Return the first 'ST 12345' match found in `address`, or 'Not Found'.

    'Not Found' is returned both when `address` is not a string (e.g. NaN)
    and when the string contains no state + zip code match; the latter case
    used to raise AttributeError because re.search returned None.
    The returned text is the whole match, i.e. state code, a space and the
    five-digit zip code.
    """
    if not isinstance(address, str):
        return 'Not Found'
    match = re.search(zip_code_pattern, address)
    return match.group() if match else 'Not Found'


contrs = pd.read_csv('../data/contributions.csv') # read in the contribution data saved in data/contributions.csv
contrs = contrs.replace("No records to view.", np.nan) # Replace every "No records to view." with np.nan
contrs.dropna(how='all', inplace=True) # Drop all rows that have no information

# convert string into datetime object
contrs['Transaction Date'] = contrs['Transaction Date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').date()) 
# drop all the '$' and ',' symbols from the 'Contribution Amount' column and convert the value into float
contrs['Contribution Amount'] = contrs['Contribution Amount'].apply(lambda x: float(x.translate(str.maketrans("","", "$,"))))
# extract the state + zip code pattern from each address to make a new column called 'Contributor Zip Code'
contrs['Contributor Zip Code'] = contrs['Contributor Address'].apply(_extract_state_zip)

In [None]:
def make_dynamic_chart(name):
    """
    Generate and display a series of pie charts showing political contributions by a specific contributor.

    Rows of the module-level ``contrs`` table whose 'Contributor Name'
    contains `name` are selected and de-duplicated. For each unique value of
    'Contributor Zip Code' among them, a separate pie chart is generated.
    Each chart displays the total contributions to the different receiving
    committees for that zip code, with hover text listing the individual
    contributions (date, amount, contributor type, contribution type).

    Parameters:
    -----------
    name : str
        The (partial or full) name of the contributor to search for.
        Case-insensitive and matched literally (not as a regular expression);
        rows with a missing contributor name never match.

    Returns:
    --------
    str or pandas.DataFrame
        A message string if no contributor is found with the given name.
        Otherwise an interactive Plotly figure is displayed and the matching,
        de-duplicated rows are returned.
    """
    # regex=False keeps characters such as '.' or '(' in a name literal;
    # na=False keeps the mask strictly boolean when a name is missing.
    name_mask = contrs['Contributor Name'].str.contains(name, case=False, regex=False, na=False)
    subdf = contrs[name_mask].drop_duplicates() # matching rows without exact duplicates
    if subdf.empty: # If there's no info found for the person, return a string that notifies such
        return f"No information found for {name}"

    zipcodes = subdf['Contributor Zip Code'].unique() # all unique zip codes among the matches

    fig = make_subplots(rows=len(zipcodes), cols=1,
                        specs=[[{'type': 'domain'}]] * len(zipcodes),
                        subplot_titles=[f"Zip Code: {z}" for z in zipcodes]) # one pie ('domain') subplot per zip code

    def make_hovertext(group): # function to make the displayed text when hovered
        # One '<br>'-terminated line per contribution:
        # Transaction Date | Contribution Amount | Contributor Type | Contribution Type
        return ''.join(
            f"{row['Transaction Date']} | "
            f"${row['Contribution Amount']} | "
            f"{row['Contributor Type']} | "
            f"{row['Contribution Type']}<br>"
            for _, row in group.iterrows()
        )

    for row_number, zipcode in enumerate(zipcodes, start=1): # iterate through all the unique zip codes
        zip_df = subdf[subdf['Contributor Zip Code'] == zipcode].sort_values('Transaction Date')
        by_committee = zip_df.groupby('Receiving Committee')
        pie_data = by_committee['Contribution Amount'].sum().reset_index() # total per receiving committee
        # Build the hover text by iterating over the groups rather than using
        # groupby.apply, which is deprecated for frames that include the
        # grouping column and is brittle when there are no groups.
        hover_by_committee = {committee: make_hovertext(group) for committee, group in by_committee}
        pie_data['HoverText'] = pie_data['Receiving Committee'].map(hover_by_committee)
        fig.add_trace(go.Pie(
            labels=pie_data['Receiving Committee'],
            values=pie_data['Contribution Amount'],
            rotation=90,
            customdata=pie_data[['HoverText']].values,  # must be 2D array
            hovertemplate=('<b>%{label}</b><br>Total: $%{value:,}<br><br>'
                           '<i>Date | Amount | Contributor | Contribution</i><br>'
                           '%{customdata[0]}<extra></extra>')
        ), row=row_number, col=1) # make the pie chart with hovering

    fig.update_layout(
        height=300 * len(zipcodes),  # Adjust based on number of charts
        title_text=f"Contributions by {name} per ZIP Code",
        showlegend=False
    ) # adjust the layout

    fig.show() # show the pie charts
    return subdf # return the matching rows

In [None]:
make_dynamic_chart("Mary Sheffield") # generate the pie chart(s) for contributor "Mary Sheffield"; change the name to look up someone else