In [1]:

import pandas as pd
import os
import sys
import plotly.express as px
import ipywidgets as widgets
import ipywidgets as widgets
from IPython.display import display, clear_output
# Location of the project checkout that contains the local `utils` package.
# The default is the original author's machine-specific path; set the
# CLIMATE_CABINET_REPO environment variable to point somewhere else.
module_directory = os.environ.get(
    'CLIMATE_CABINET_REPO',
    '/Users/necabotheking/Documents/Github/2023-fall-clinic-climate-cabinet',
)
# Guard the append so re-running this cell does not add duplicate entries.
if module_directory not in sys.path:
    sys.path.append(module_directory)

from utils.preprocess_mi_campaign_data import read_and_skip_errors
from utils.constants import FILEPATH, MI_CONTRIBUTION_COLUMNS


## Michigan Campaign Finance Data 1995 - 2023 Exploratory Data Analysis

#### Task 1: Read in the Dataset and merge into one Pandas DataFrame

In [2]:
# Read every file found in the FILEPATH directory into its own DataFrame.
# os.path.join is used instead of string concatenation so the path is built
# correctly whether or not FILEPATH ends with a path separator.
df_lst = [
    read_and_skip_errors(os.path.join(FILEPATH, file), MI_CONTRIBUTION_COLUMNS)
    for file in os.listdir(FILEPATH)
]

In [3]:
# Raise the column display limit so wide frames are not truncated,
# then preview the first loaded file.
pd.set_option('display.max_columns', 100)
df_lst[0].head()

Unnamed: 0,doc_seq_no,page_no,contribution_id,cont_detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,can_first_name,can_last_name,contribtype,f_name,l_name_or_org,address,city,state,zip,occupation,employer,received_date,amount,aggregate,extra_desc
0,472254,0,464000,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347,IND,,,DIRECT,TERRY,LOZANO,2034 EDWARD LANE WEST,KIMBALL,MI,48074-0000,FACTORY WORKER,FIAT CHRYSLER AUTOMOBILES N.V.,10/01/2018,15.0,165.0,
1,472254,0,464001,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347,IND,,,DIRECT,PATRICK,CLAERHOUT,121 MURPHY DR.,ST. CLAIR,MI,48079-0000,,,10/01/2018,5.0,55.0,
2,472254,0,464002,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347,IND,,,DIRECT,MIROSLAV,MARINKOVSKA,14433 COULEE DR,STERLING HTS,MI,48313-5320,,,10/01/2018,5.0,60.0,
3,472254,0,464003,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347,IND,,,DIRECT,TODD,GWYNN,8368 YOLANDA,DETROIT,MI,48234-0000,,,10/01/2018,5.0,35.0,
4,472254,0,464004,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347,IND,,,DIRECT,KENT,SMITH,5800 FORESTER RD,CARSONVILLE,MI,48419-0000,,,10/01/2018,5.0,55.0,


In [4]:
# Stack all per-file frames into a single frame, then keep an untouched copy
# so the original values are still available after errors are coerced to NaN.
merged_df = pd.concat(objs=df_lst)
merged_df_copy = merged_df.copy(deep=True)

In [5]:
# Inspect the dtype pandas assigned to each column of the merged frame.
merged_df.dtypes

# amount and aggregate are reported as object here, so they should be converted to numeric

doc_seq_no           int64
page_no              int64
contribution_id      int64
cont_detail_id       int64
doc_stmnt_year       int64
doc_type_desc       object
com_legal_name      object
common_name         object
cfr_com_id         float64
com_type            object
can_first_name      object
can_last_name       object
contribtype         object
f_name              object
l_name_or_org       object
address             object
city                object
state               object
zip                 object
occupation          object
employer            object
received_date       object
amount              object
aggregate           object
extra_desc          object
dtype: object

In [6]:
# Convert both monetary columns to numbers; entries that cannot be parsed become NaN.
for money_col in ('amount', 'aggregate'):
    merged_df[money_col] = pd.to_numeric(merged_df[money_col], errors='coerce')

#### Task 2: Answer the following questions
- Is the dataset relational (are there multiple tables that relate to each other)?
    - Example → Donor ID that connects to another dataset with the other information. If not we must make the dataset relational
- For each column, what are its contents? How many blanks or nulls are there? What is the format? If it is one of several types, what are those types?
    - Percentage of nulls/blanks
- Who are the top 10 contributors in your data? The top 10 recipients?
- Make a bar chart with plotly comparing contributions by donor type or recipient type (PAC, individual, etc) and one comparing recipients by the office type they are running for
- If you have multiple years, are they all similar? If not, is the difference explicable (maybe by election schedules)
    - Have an option to toggle at the top of the notebook to use different years.
    - Utility functions to import into the jupyter notebook, should be generalizable

---

#### Question: 
- For each column, what are its contents? How many blanks or nulls are there? What is the format? If it is one of several types, what are those types?


In [7]:
# Re-check the dtypes after the to_numeric conversion of amount and aggregate.
merged_df.dtypes

doc_seq_no           int64
page_no              int64
contribution_id      int64
cont_detail_id       int64
doc_stmnt_year       int64
doc_type_desc       object
com_legal_name      object
common_name         object
cfr_com_id         float64
com_type            object
can_first_name      object
can_last_name       object
contribtype         object
f_name              object
l_name_or_org       object
address             object
city                object
state               object
zip                 object
occupation          object
employer            object
received_date       object
amount             float64
aggregate          float64
extra_desc          object
dtype: object

In [8]:
# Share of missing values in each column, expressed as a percentage
null_percentage = (
    merged_df.isna()
    .mean()
    .mul(100)
    .rename_axis('Column Name')
    .reset_index(name='Missing Percentage')
)

# The percentage of null values in each column is as follows below
display(null_percentage)

Unnamed: 0,Column Name,Missing Percentage
0,doc_seq_no,0.0
1,page_no,0.0
2,contribution_id,0.0
3,cont_detail_id,0.0
4,doc_stmnt_year,0.0
5,doc_type_desc,0.0
6,com_legal_name,0.000549
7,common_name,0.0
8,cfr_com_id,0.000549
9,com_type,0.0


#### Question: Are committees related across the different datasets based on the contribution_id or the cfr_com_id?

- The datasets are relational based upon the cfr_com_id, as shown below

In [9]:
# First two rows whose contribution_id equals 1
merged_df.loc[merged_df['contribution_id'] == 1].head(2)

Unnamed: 0,doc_seq_no,page_no,contribution_id,cont_detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,can_first_name,can_last_name,contribtype,f_name,l_name_or_org,address,city,state,zip,occupation,employer,received_date,amount,aggregate,extra_desc
15317,527446,0,1,0,2022,JULY QUARTERLY CS,CONSTELLATION ENERGY CORPORATION EMPLOYEE POLI...,CONSTELLATION ENERGY CORPORATION EMP,520856.0,IND,,,DIRECT,JORGE,ACEVEDO,104 HUNTS BLUFF RD,SPARKS GLENCOE,MD,21152-9702,SVP COMMERCIALIZATION & DEV,CONSTELLATION ENERGY GENERATIO,04/29/2022,138.8,961.46,
58061,527582,0,1,0,2022,JULY QUARTERLY CS,FRIENDS OF FARMERS INSURANCE PAC,FRIENDS OF FARMERS INSURANCE PAC,513260.0,IND,,,DIRECT,GBADE,AKINJIDE,13516 NORTHLINE RD,SOUTHGATE,MI,48195-1080,,,05/01/2022,10.0,50.0,


In [10]:
# Last two rows whose contribution_id equals 1
merged_df.loc[merged_df['contribution_id'] == 1].tail(2)

Unnamed: 0,doc_seq_no,page_no,contribution_id,cont_detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,can_first_name,can_last_name,contribtype,f_name,l_name_or_org,address,city,state,zip,occupation,employer,received_date,amount,aggregate,extra_desc
427436,300635,0,1,0,2008,JULY TRIANNUAL CS,ROCHE INC GOOD GOVERNMENT COMMITTEE,ROCHE INC GOOD GOVERNMENT COMMITTEE,508368.0,POL,,,DIRECT,JOY,RUSSELL,1185 STONEGATE RD,HUMMELSTOWN,PA,17036-9776,SENIOR MANAGER STATE GOVERNMEN,HLR SERVICE CORP.,04/30/2008,100.0,600.0,
451950,300760,0,1,0,2008,JULY TRIANNUAL CS,HUNTINGTON NATIONAL BANK-MICHIGAN POLITICAL AC...,HUNTINGTON NATIONAL BANK-MICHIGAN PO,843.0,IND,,,DIRECT,SUSAN,BLANCHARD,1 NORTH MAIN STREET,MOUNT CLEMENS,MI,48043-5613,VICE PRESIDENT,HUNTINGTON NATIONAL BANK,04/30/2008,2.0,26.0,


In [11]:
# Last two rows filed under committee id 508347
merged_df.loc[merged_df['cfr_com_id'] == 508347.0].tail(2)

Unnamed: 0,doc_seq_no,page_no,contribution_id,cont_detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,can_first_name,can_last_name,contribtype,f_name,l_name_or_org,address,city,state,zip,occupation,employer,received_date,amount,aggregate,extra_desc
279613,297851,0,832353,0,2008,APRIL TRIANNUAL CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,,,DIRECT,ROBERT I,CUTHRELL,200 CASEMER RD,LAKE ORION,MI,48360-1305,,,01/31/2008,20.0,20.0,
279614,297851,0,832355,0,2008,APRIL TRIANNUAL CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,,,DIRECT,EDWARD,DUNCAN,17424 ARLENE,FRASER,MI,48026-1781,,,01/31/2008,16.48,16.48,


In [12]:
# First two rows filed under committee id 508347
merged_df.loc[merged_df['cfr_com_id'] == 508347.0].head(2)

Unnamed: 0,doc_seq_no,page_no,contribution_id,cont_detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,can_first_name,can_last_name,contribtype,f_name,l_name_or_org,address,city,state,zip,occupation,employer,received_date,amount,aggregate,extra_desc
0,472254,0,464000,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,,,DIRECT,TERRY,LOZANO,2034 EDWARD LANE WEST,KIMBALL,MI,48074-0000,FACTORY WORKER,FIAT CHRYSLER AUTOMOBILES N.V.,10/01/2018,15.0,165.0,
1,472254,0,464001,0,2018,OCTOBER QUARTERLY CS,UAW MICHIGAN VOLUNTARY POLITICAL ACTION COMMITTEE,UAW MICHIGAN VOLUNTARY POLITICAL ACT,508347.0,IND,,,DIRECT,PATRICK,CLAERHOUT,121 MURPHY DR.,ST. CLAIR,MI,48079-0000,,,10/01/2018,5.0,55.0,


#### Question: 
- Who are the top 10 contributors in your data? The top 10 recipients?

In [13]:

# Ten largest summed amounts per com_legal_name in the merged dataframe (1995 - 2023)
# NOTE(review): com_legal_name looks like the committee that filed/received the
# contribution, while the donor appears to be in f_name / l_name_or_org —
# confirm this grouping really answers "top contributors".
top_10_contributors = (
    merged_df.groupby('com_legal_name')['amount']
    .sum()
    .reset_index()
    .sort_values(by='amount', ascending=False)
    .head(10)
)
display(top_10_contributors)

Unnamed: 0,com_legal_name,amount
5293,MICHIGAN REPUBLICAN PARTY,61185192.16
4077,GRETCHEN WHITMER FOR GOVERNOR,49676252.13
2959,DEMOCRATIC STATE CENTRAL COMMITTEE,46894893.06
5846,REPRODUCTIVE FREEDOM FOR ALL,45250239.65
4179,HOUSE REPUBLICAN CAMPAIGN COMMITTEE,42246910.0
3017,DICK DEVOS FOR GOVERNOR,41787282.72
5212,MICHIGAN HOUSE DEMOCRATIC FUND,35020446.07
6075,SENATE REPUBLICAN CAMPAIGN COMMITTEE,31190387.33
5756,PROTECTING MICHIGAN TAXPAYERS,26547849.19
5903,RICK SNYDER FOR MICHIGAN,25195856.96


In [14]:
# Ten candidates with the largest summed amounts in the merged dataframe (1995 - 2023).
# groupby drops rows with missing keys by default, so rows without a candidate
# first/last name are not part of this ranking.
candidate_keys = ['can_first_name', 'can_last_name']
top_10_recipients = (
    merged_df.groupby(candidate_keys)['amount']
    .sum()
    .reset_index()
    .sort_values(by='amount', ascending=False)
    .head(10)
)
display(top_10_recipients)

Unnamed: 0,can_first_name,can_last_name,amount
1375,GRETCHEN,WHITMER,50526077.01
3247,RICHARD,DEVOS,41787282.72
3277,RICHARD,SNYDER,25195856.96
307,BILL,SCHUETTE,18122344.6
1698,JENNIFER,GRANHOLM,16019575.71
3613,SHRI,THANEDAR,13105555.22
2238,KEVIN,RINKE,10493219.07
3960,TUDOR,DIXON,8786024.47
1781,JOCELYN,BENSON,8522913.3
3114,PERRY,JOHNSON,7964951.42


- Make a bar chart with plotly comparing contributions by donor type or recipient type (PAC, individual, etc) and one comparing recipients by the office type they are running for

Note: 
- DIS (District Party)
- STA (State Party)
- BAL (Ballot Question)
- COU (County Party)
- POL (Political Party)
- GUB (Gubernatorial)
- CAN (Candidate)
- IND (Independent PAC)

In [15]:
# Row counts per committee type, used for the plotly bar chart.
# Rows whose com_type holds the misread value 'MENOMINEE COUNTY DEMOCRATIC PARTY'
# are filtered out first.
com_type_count = (
    merged_df.loc[merged_df['com_type'] != 'MENOMINEE COUNTY DEMOCRATIC PARTY', 'com_type']
    .value_counts()
    .rename_axis('Committee_Type')
    .reset_index(name='Count')
)

In [16]:
# Plot contributions by committee type across every statement year in the data.
# The year range on the axis label is read from the data rather than hard-coded,
# so it cannot drift from what was actually loaded.
first_year = merged_df['doc_stmnt_year'].min()
last_year = merged_df['doc_stmnt_year'].max()

fig = px.bar(
    com_type_count,
    x='Committee_Type',
    y='Count',
    title='Michigan Contributions by Committee Type',  # fixed typo: "Contributons"
    text='Count',
)
fig.update_layout(
    xaxis_title='Committee Types',
    yaxis_title=f'{first_year}-{last_year} Count',
    xaxis={'categoryorder': 'total ascending'}
)
fig.show()

In [17]:
# Row counts per contribution type, used for the bar chart below
contribution_type_count = (
    merged_df['contribtype']
    .value_counts()
    .rename_axis('Cont_Type')
    .reset_index(name='Count')
)

In [18]:
# Plot contributions by contribution type across every statement year in the data.
# The year range on the axis label is read from the data rather than hard-coded,
# so it cannot drift from what was actually loaded.
first_year = merged_df['doc_stmnt_year'].min()
last_year = merged_df['doc_stmnt_year'].max()

fig = px.bar(
    contribution_type_count,
    x='Cont_Type',
    y='Count',
    title='Michigan Committee Contributions Type',
    text='Count',
)
fig.update_layout(
    xaxis_title='Contribution Types',
    yaxis_title=f'{first_year}-{last_year} Count',
    xaxis={'categoryorder': 'total ascending'}
)
fig.show()

Contribution Type and Amounts by Year

In [39]:
# Utility functions for graph & dropdown

# Distinct statement years present in the data, in ascending order
years = sorted(merged_df['doc_stmnt_year'].unique())

# Dropdown used to pick the year that the per-year plots are drawn for.
# `button_style` was dropped: it is not a Dropdown option (it belongs to
# button-like widgets) and had no effect on this widget.
year_selector = widgets.Dropdown(
    options=years,
    value=years[0],
    description='Select Year: ',
    disabled=False
    )

def plot_year_contribution_types(year):
    """
    Plots the number of contributions of each contribution type for one year

    Rows of the module-level ``merged_df`` whose ``doc_stmnt_year`` equals
    ``year`` are counted per ``contribtype`` and shown as a plotly bar chart.

    Inputs:
            year (int): year to plot

    Return: None
    """
    filtered_data = merged_df[merged_df['doc_stmnt_year'] == year]
    filtered_data = filtered_data['contribtype'].value_counts().reset_index()
    filtered_data.columns = ['Cont_Type', 'Count']

    fig = px.bar(
        filtered_data,
        x='Cont_Type',
        y='Count',
        # fixed typo in the chart title: "Polital" -> "Political"
        title=f'Political Contributions Count from {year} by Type',
        text='Count',
    )
    fig.update_layout(
        xaxis_title='Contribution Types',
        yaxis_title=f'{year} Count',
    )
    fig.show()
    
def plot_committee_types_by_year(year):
    """
    Plots the number of contributions per committee type for one year

    Rows of the module-level ``merged_df`` for the given statement year are
    counted per ``com_type`` (skipping the misread
    'MENOMINEE COUNTY DEMOCRATIC PARTY' value) and shown as a plotly bar chart.

    Inputs:
            year (int): year to plot

    Return: None
    """
    in_year = merged_df['doc_stmnt_year'] == year
    well_formed = merged_df['com_type'] != 'MENOMINEE COUNTY DEMOCRATIC PARTY'
    type_counts = (
        merged_df.loc[in_year & well_formed, 'com_type']
        .value_counts()
        .rename_axis('Committee_Type')
        .reset_index(name='Count')
    )

    chart = px.bar(
        type_counts,
        x='Committee_Type',
        y='Count',
        title=f'Donations by Committee Type from {year}',
        text='Count',
    )
    chart.update_layout(xaxis_title='Committee Types', yaxis_title=f'{year} Count')
    chart.show()
    
def update_plots(change):
    """
    Redraw both per-year bar charts for the year currently selected in the dropdown

    Registered as the observer callback of ``year_selector`` and also called
    directly (``update_plots(None)``) to draw the initial plots.

    Inputs:
            change: change notification supplied by the widget when the
                    dropdown value changes, or None when called directly.
                    It is not read; the year comes from ``year_selector.value``.

    Returns: None
    """
    selected_year = year_selector.value
    # Draw into the module-level ``output`` widget that is displayed next to
    # the dropdown. Previously a brand-new ``widgets.Output()`` was created
    # here on every call and never displayed, so the figures were rendered
    # into a widget nobody could see.
    with output:
        clear_output(wait=True)
        plot_committee_types_by_year(selected_year)
        plot_year_contribution_types(selected_year)

#### Select a year below and rerun the cells

In [36]:
# Select a year
# Create the output area and show it together with the year dropdown.

output = widgets.Output()
display(year_selector, output)

Dropdown(description='Select Year: ', index=6, options=(1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, …

Output()

In [37]:
# Graph by Year
# Call update_plots whenever the dropdown's value changes, and call it once
# now (with no change object) for the currently selected year.
year_selector.observe(update_plots, names='value')
update_plots(None)

---

#### Miscellaneous Data Analysis & Issues

In [None]:
# (number of rows, number of columns) of the merged frame
merged_df.shape

(20777043, 25)

Display the 5 largest donations given (1999 - 2023)

In [None]:
# Five rows with the largest values in the amount column.
# NOTE(review): in the output, contribution_id 4344 (10/31/2012) shows up under
# both an AMENDED PRE-GENERAL and a POST-GENERAL statement with the same amount,
# so summed totals may count re-filed contributions more than once — confirm.
merged_df.nlargest(5, 'amount')

Unnamed: 0,doc_seq_no,page_no,contribution_id,cont_detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,can_first_name,can_last_name,contribtype,f_name,l_name_or_org,address,city,state,zip,occupation,employer,received_date,amount,aggregate,extra_desc
61331,367248,0,4344,0,2012,AMENDED PRE-GENERAL CS,THE PEOPLE SHOULD DECIDE,THE PEOPLE SHOULD DECIDE,515899.0,BAL,,,DIRECT,,DIBC HOLDINGS INC,12225 STEPHENS ROAD,WARREN,MI,48089-0000,,,10/31/2012,9175000.0,33389560.0,
66249,368738,0,4344,0,2012,POST-GENERAL CS,THE PEOPLE SHOULD DECIDE,THE PEOPLE SHOULD DECIDE,515899.0,BAL,,,DIRECT,,DIBC HOLDINGS INC,12225 STEPHENS ROAD,WARREN,MI,48089-0000,,,10/31/2012,9175000.0,33389560.0,
2434,278674,0,84316,0,2006,PRE-GENERAL CS,DICK DEVOS FOR GOVERNOR,DICK DEVOS FOR GOVERNOR,512426.0,GUB,RICHARD,DEVOS,DIRECT,DICK,DEVOS,126 OTTAWA AVE,GRAND RAPIDS,MI,49503-0000,CHAIRMAN,WINDQUEST COMPANIES,09/08/2006,9000000.0,28385131.95,
59449,527618,0,4603,0,2022,PRE-PRIMARY CS,KEVIN RINKE FOR GOVERNOR,KEVIN RINKE FOR GOVERNOR,520134.0,GUB,KEVIN,RINKE,LOAN FROM A PERSON,KEVIN,RINKE,P. O. BOX 80915,LANSING,MI,48908-0000,CANDIDATE,KEVIN RINKE FOR GOVERNOR,03/31/2022,8000000.0,10000000.0,
156477,536252,0,36268,0,2022,PRE-GENERAL CS,REPRODUCTIVE FREEDOM FOR ALL,REPRODUCTIVE FREEDOM FOR ALL,520255.0,BAL,,,DIRECT,,OPEN SOCIETY POLICY CENTER,1730 PENNSYLVANIA AVE NW 7TH FLOOR,WASHINGTON,DC,20006-0000,,,09/13/2022,4500000.0,4500000.0,


Issues:

Some rows are read in without raising an error, but their values are shifted into the wrong columns, which causes errors in the "amount" column in particular. For example, there are rows with the amount listed as "USDA FOREST SERVICE" or "UNEMPLOYED".

Additionally, the rows containing "MENOMINEE COUNTY DEMOCRATIC PARTY" data are read in incorrectly after the candidate common name. The cell below displays the 112 rows with this issue.

In [None]:
# Rows whose com_type holds the committee name instead of a committee type code
misread_mask = merged_df['com_type'].eq('MENOMINEE COUNTY DEMOCRATIC PARTY')
error_rows = merged_df.loc[misread_mask]
display(error_rows)

Unnamed: 0,doc_seq_no,page_no,contribution_id,cont_detail_id,doc_stmnt_year,doc_type_desc,com_legal_name,common_name,cfr_com_id,com_type,can_first_name,can_last_name,contribtype,f_name,l_name_or_org,address,city,state,zip,occupation,employer,received_date,amount,aggregate,extra_desc
356,517630,0,4409,0,2022,ANNUAL CS,,MENOMINEE COUNTY DEMOCRATIC PARTY,,MENOMINEE COUNTY DEMOCRATIC PARTY,513814,COU,,,DIRECT,WILLIAM,HUPY,1415 7TH STREET,MENOMINEE,MI,49858-0000,,,,22.0
357,517630,0,4410,0,2022,ANNUAL CS,,MENOMINEE COUNTY DEMOCRATIC PARTY,,MENOMINEE COUNTY DEMOCRATIC PARTY,513814,COU,,,DIRECT,GRACE,CORRINGE,W8728 NORTH LAKE DRIVE 37,VULCAN,MI,49892-0000,,,,40.0
358,517630,0,4412,0,2022,ANNUAL CS,,MENOMINEE COUNTY DEMOCRATIC PARTY,,MENOMINEE COUNTY DEMOCRATIC PARTY,513814,COU,,,DIRECT,DEEANNE,POHLMANN,4203 MICHIGAN SHORES DR,MENOMINEE,MI,49858-0000,,,,10.0
359,517630,0,4414,0,2022,ANNUAL CS,,MENOMINEE COUNTY DEMOCRATIC PARTY,,MENOMINEE COUNTY DEMOCRATIC PARTY,513814,COU,,,DIRECT,JOANNE,HIESHETTER,W5094 BIRCH CREEK ROAD NO 6,MENOMINEE,MI,49858-0000,,,,5.0
361,517630,0,4416,0,2022,ANNUAL CS,,MENOMINEE COUNTY DEMOCRATIC PARTY,,MENOMINEE COUNTY DEMOCRATIC PARTY,513814,COU,,,DIRECT,NATALIE,LASHMET,N194 WEST DR,MENOMINEE,MI,49858-0000,NURSE,,,235.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35815,505509,0,4324,0,2021,ANNUAL CS,,MENOMINEE COUNTY DEMOCRATIC PARTY,,MENOMINEE COUNTY DEMOCRATIC PARTY,513814,COU,,,DIRECT,MARI,NEGRO,5131 W. NO. 41 RD.,HERMANSVILLE,MI,49847-0000,RETIRED,,,100.0
35816,505509,0,4350,0,2021,ANNUAL CS,,MENOMINEE COUNTY DEMOCRATIC PARTY,,MENOMINEE COUNTY DEMOCRATIC PARTY,513814,COU,,,DIRECT,LOIS,BERTRAND,400 2ND ST.,MENOMINEE,MI,49858-0000,,,,50.0
35817,505509,0,4352,0,2021,ANNUAL CS,,MENOMINEE COUNTY DEMOCRATIC PARTY,,MENOMINEE COUNTY DEMOCRATIC PARTY,513814,COU,,,DIRECT,ELIZABETH,TUCKER,W3709 GROVE LN,SPAULDING,MI,49886-0000,,,,50.0
35818,505509,0,4354,0,2021,ANNUAL CS,,MENOMINEE COUNTY DEMOCRATIC PARTY,,MENOMINEE COUNTY DEMOCRATIC PARTY,513814,COU,,,DIRECT,VICKIE,KNUTH,N11882 29.60 LN.,BARK RIVER,MI,49807-0000,RETIRED,,,20.0
