In [1]:
# Import dependencies
# Basic modules
import re
import datetime
from operator import add
import os

# Data analysis modules
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Notebook-wide configuration used by the cells below.

# Election year under analysis; change this value to look at a different year.
election_year = "2016"

# Root directory of the FEC data, read from the FEC_data_path environment
# variable (a KeyError is raised here if the variable is not set). Per the
# original note, this normally points at the demo data downloaded by the
# installer script. To use a different data set, assign the path directly, e.g.:
#datapath = '../../data/2016/'
datapath = os.environ['FEC_data_path']

# Independent-expenditure CSV for the chosen year, located under
# <datapath>/<election_year>/.
independent_expenditure_file = (
    '{0}/{1}/independent_expenditure_{1}.csv'
    .format(datapath, election_year)
)

# Spark session for this analysis, obtained via getOrCreate().
spark = (
    SparkSession
    .builder
    .appName('2016_Presidental_Election_Indpendent_Expenditures')
    .getOrCreate()
)

In [3]:
# Load the independent-expenditure CSV into a Spark dataframe; the first line
# of the file supplies the column names.
ind_exp = spark.read.csv(independent_expenditure_file, header=True)

# Work on the underlying RDD of rows and keep only general-election ('G')
# expenditures for the presidential office ('P').
# The first two conditions are the original "empty line" guards.
# NOTE(review): on a pyspark Row, `in` appears to test field *names* rather
# than values, so `'None' not in row` may never reject anything -- confirm
# what this guard was meant to remove before relying on it.
presidental_expenditures_rdd = ind_exp.rdd.filter(
    lambda row: (
        len(row) != 0
        and 'None' not in row
        and row.ele_type == 'G'
        and row.can_office == 'P'
    )
)

In [4]:
# Expenditures whose target is Donald Trump: either the candidate name contains
# both 'donald' and 'trump' (case-insensitive), or the row carries the candidate
# ID 'P80001571'.
# A missing (null) cand_name is treated as an empty string so that a row with
# no name cannot raise AttributeError and abort the whole Spark job; such a row
# can still match through its cand_id.
trump_expenditures = (
    presidental_expenditures_rdd
    .filter(
        lambda x: (
            'trump' in (x.cand_name or '').lower()
            and 'donald' in (x.cand_name or '').lower()
        )
        or x.cand_id == 'P80001571'
    )
)

In [5]:
# Expenditures whose target is Hillary Clinton, matched on the candidate name
# containing both 'hillary' and 'clinton' (case-insensitive). Unlike the Trump
# filter, no candidate ID is used here.
# A missing (null) cand_name is treated as an empty string so that a row with
# no name is simply skipped instead of raising AttributeError and aborting the
# whole Spark job.
clinton_expenditures = (
    presidental_expenditures_rdd
    .filter(
        lambda x: 'hillary' in (x.cand_name or '').lower()
        and 'clinton' in (x.cand_name or '').lower()
    )
)

In [102]:
class candidate_expenditures_summary:

    '''
        Summarize independent-expenditure records for a single candidate.

        The supplied RDD is split into opposing (sup_opp == 'O') and
        supporting (sup_opp == 'S') expenditures. For the full set and for
        each of the two groups the class can report the number of records
        and the sum of their exp_amo values, and render those figures as a
        one-line text report.

        Construction only defines the filtered/summed RDDs; the counts and
        totals are gathered when collect_all() or generate_report() is called.
    '''

    def __init__(self, candidate_expenditures_rdd, cand_name=None, election_year=None):
        '''
            candidate_expenditures_rdd -- RDD of rows exposing `sup_opp` and
                `exp_amo` attributes (exp_amo must be None or convertible
                with float()).
            cand_name -- optional candidate name used as a report prefix;
                any falsy value is stored as None.
            election_year -- optional year used as a report prefix; any
                falsy value is stored as None.
        '''
        self.cand_name = cand_name or None
        self.election_year = election_year or None

        self.candidate_expenditures = candidate_expenditures_rdd
        self.candidate_opposition = (
            self.candidate_expenditures
            .filter(lambda x: x.sup_opp == 'O')
        )
        self.candidate_support = (
            self.candidate_expenditures
            .filter(lambda x: x.sup_opp == 'S')
        )

        # Each sum_* attribute is an RDD holding at most one element: the
        # total amount for that group (no element when the group is empty).
        self.sum_expenditures = self._sum_amounts(self.candidate_expenditures)
        self.sum_support = self._sum_amounts(self.candidate_support)
        self.sum_opposition = self._sum_amounts(self.candidate_opposition)

    @staticmethod
    def _sum_amounts(expenditures_rdd):
        '''Return an RDD containing the sum of the non-null exp_amo values.'''
        return (
            expenditures_rdd
            .filter(lambda x: x.exp_amo is not None)
            # A single shared key funnels every amount into one total.
            .map(lambda x: (1, float(x.exp_amo)))
            .reduceByKey(add)
            .map(lambda x: x[1])
        )

    @staticmethod
    def _collect_total(sum_rdd):
        '''Collect a one-element sum RDD, rounded to cents; 0.0 when empty.'''
        totals = sum_rdd.collect()
        # An empty group yields no element at all, so indexing [0]
        # unconditionally would raise IndexError.
        return round(totals[0], 2) if totals else 0.0

    def collect_all(self):
        '''
            Run the counts and sums and store the results on the instance as
            *_expenditures_count and *_expenditures_sum attributes.
        '''
        self.total_expenditures_count = self.candidate_expenditures.count()
        self.support_expenditures_count = self.candidate_support.count()
        self.opposition_expenditures_count = self.candidate_opposition.count()

        self.total_expenditures_sum = self._collect_total(self.sum_expenditures)
        self.support_expenditures_sum = self._collect_total(self.sum_support)
        self.opposition_expenditures_sum = self._collect_total(self.sum_opposition)
        # Misspelled name kept as an alias for any code that already reads it.
        self.oppostion_expenditures_sum = self.opposition_expenditures_sum

    # it might be worth it to move this kind of report out to its own class
    def generate_report(self):
        '''
            Return the summary as a single formatted string.

            Calls collect_all() first, so every call re-evaluates the counts
            and sums. The candidate name and election year are prepended
            only when they were provided.
        '''
        self.collect_all()
        _exp_sum = self.total_expenditures_sum
        _sup_sum = self.support_expenditures_sum
        _opp_sum = self.opposition_expenditures_sum
        _exp_count = self.total_expenditures_count
        _sup_count = self.support_expenditures_count
        _opp_count = self.opposition_expenditures_count
        _report_string = f"{_exp_count:,} independent expenditures totaling ${_exp_sum:,.2f}.  {_sup_count:,} supporting expenditures totaling ${_sup_sum:,.2f}; {_opp_count:,} opposing expenditures totaling ${_opp_sum:,.2f}"
        if self.cand_name:
            _report_candidate = f"{self.cand_name} was the focus of "
            _report_string = _report_candidate + _report_string
        if self.election_year:
            _report_year = f"In election year {self.election_year} "
            _report_string = _report_year + _report_string
        return _report_string

In [103]:
# Summarize the expenditures that targeted Clinton and build the text report.
# The year label is taken from the notebook-level `election_year` setting so
# the report follows whatever year the data was loaded for.
clinton_summary = candidate_expenditures_summary(
    clinton_expenditures,
    cand_name="Hillary Clinton",
    election_year=election_year,
)
clinton_report = clinton_summary.generate_report()

In [104]:
# Summarize the expenditures that targeted Trump and build the text report.
# The year label is taken from the notebook-level `election_year` setting so
# the report follows whatever year the data was loaded for.
trump_summary = candidate_expenditures_summary(
    trump_expenditures,
    cand_name="Donald Trump",
    election_year=election_year,
)
trump_report = trump_summary.generate_report()

In [105]:
# Show both candidate reports, one per line (Trump first, then Clinton).
print(trump_report, clinton_report, sep="\n")

In election year 2016 Donald Trump was the focus of 30,636 independent expenditures totaling $384,636,978.31.  990 supporting expenditures totaling $164,685,202.53; 29,639 opposing expenditures totaling $219,928,129.39
In election year 2016 Hilary Clinton was the focus of 44,446 independent expenditures totaling $177,452,001.93.  40,893 supporting expenditures totaling $51,573,455.02; 3,549 opposing expenditures totaling $125,862,557.20
