In [7]:
import pandas as pd
import numpy as np
import plotly.express as px
import zipfile

#### FEC Data Retrieval and Processing

In [8]:
#define retrieval years for data pull from FEC
retYears = [*range(2004, 2026, 2)]
retYearsTrim = [str(a)[2:] for a in retYears]
retYears = [str(a) for a in retYears]

In [9]:
#define key election years to filter out mid-cycle noise and zero in on desired date range
elecYears = [*range(2004, 2026, 2)]
elecYears = [str(a) for a in elecYears]

In [10]:
#iterate data load and processing by number of desired retrieval years
for i in range(len(retYears)):
    
    #read in spend data csv
    tailDF = pd.read_csv("https://www.fec.gov/files/bulk-downloads/" + retYears[i] + "/oppexp" + retYearsTrim[i] + ".zip", compression="zip", delimiter="|", header=None)

    #read in spend header csv
    headDF = pd.read_csv("https://www.fec.gov/files/bulk-downloads/data_dictionaries/oppexp_header_file.csv", header=None)

    #add header to main DF
    rawDF = pd.concat([headDF, tailDF], axis=0)
    rawDF.columns = rawDF.iloc[0]
    rawDF = rawDF[1:]
    rawDF.reset_index(drop=True, inplace=True)

    #read in committee master data csv
    cmtTailDF = pd.read_csv("https://www.fec.gov/files/bulk-downloads/" + retYears[i] + "/cm" + retYearsTrim[i] + ".zip", compression="zip", delimiter="|", header=None)

    #read in committee header csv
    cmtHeadDF = pd.read_csv("https://www.fec.gov/files/bulk-downloads/data_dictionaries/cm_header_file.csv", header=None)

    #add cmt header to main cmt DF
    cmtRawDF = pd.concat([cmtHeadDF, cmtTailDF], axis=0)
    cmtRawDF.columns = cmtRawDF.iloc[0]
    cmtRawDF = cmtRawDF[1:]
    cmtRawDF.reset_index(drop=True, inplace=True)

    #add party affiliation from cmt DF to spend DF
    cmtPtyID = cmtRawDF[["CMTE_ID", "CMTE_PTY_AFFILIATION", "CAND_ID"]]
    spendDF = rawDF.merge(cmtPtyID, how="left", on="CMTE_ID")

    #read in candidate master data csv
    canTailDF = pd.read_csv("https://www.fec.gov/files/bulk-downloads/" + retYears[i] + "/cn" + retYearsTrim[i] + ".zip", compression="zip", delimiter="|", header=None)

    #read in candidate header csv
    canHeadDF = pd.read_csv("https://www.fec.gov/files/bulk-downloads/data_dictionaries/cn_header_file.csv", header=None)

    #add can header to main can DF
    canRawDF = pd.concat([canHeadDF, canTailDF], axis=0)
    canRawDF.columns = canRawDF.iloc[0]
    canRawDF = canRawDF[1:]
    canRawDF.reset_index(drop=True, inplace=True)

    #add office type from can DF to spend DF
    canOffID = canRawDF[["CAND_ID", "CAND_ELECTION_YR", "CAND_OFFICE"]]
    spendDF = spendDF.merge(canOffID, how="left", on="CAND_ID")

    #trim spend DF to desired fields
    spendTrimDF = spendDF[["CMTE_ID", "RPT_YR", "STATE", "TRANSACTION_AMT", "PURPOSE", "CATEGORY_DESC", "CMTE_PTY_AFFILIATION", "CAND_ID", "CAND_ELECTION_YR", "CAND_OFFICE"]]

    #pivot to aggregate on desired fields
    spendPivotDF = pd.pivot_table(spendTrimDF, index=["CAND_ELECTION_YR", "CMTE_PTY_AFFILIATION", "CAND_OFFICE", "STATE"], values="TRANSACTION_AMT", aggfunc="sum")
    spendPivotDF = spendPivotDF.reset_index()

    #add processed DF to cumulative DF
    if (i == 0):
        spendFinalDF = spendPivotDF
    else:
        spendFinalDF = pd.concat([spendFinalDF, spendPivotDF], axis=0)
        spendFinalDF.reset_index(drop=True, inplace=True)

#repivot entire DF to reaggregate
spendFinalDF = pd.pivot_table(spendFinalDF, index=["CAND_ELECTION_YR", "CMTE_PTY_AFFILIATION", "CAND_OFFICE", "STATE"], values="TRANSACTION_AMT", aggfunc="sum")
spendFinalDF = spendFinalDF.reset_index()

#convert years to string
spendFinalDF["CAND_ELECTION_YR"] = spendFinalDF["CAND_ELECTION_YR"].astype(str)

#convert candidate office field to a more understandable name
spendFinalDF.loc[spendFinalDF["CAND_OFFICE"] == "H", "CAND_OFFICE"] = "House"
spendFinalDF.loc[spendFinalDF["CAND_OFFICE"] == "S", "CAND_OFFICE"] = "Senate"
spendFinalDF.loc[spendFinalDF["CAND_OFFICE"] == "P", "CAND_OFFICE"] = "President"

#filter for only desired election years
spendFinalDF = spendFinalDF[spendFinalDF["CAND_ELECTION_YR"].isin(elecYears)]

#sort by year for visualization
spendFinalDF = spendFinalDF.sort_values(by="CAND_ELECTION_YR")


Columns (5,11,14,16,17) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (5,11,16) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (5,11,16,17) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (5,11,16) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (5,11,16,17) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (5,11,16) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (16) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (11,16) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (11,16) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (16) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (11,16) have mixed types. Specify dtype option on imp

#### Exploratory Analysis and Visualization

In [11]:
fig = px.histogram(spendFinalDF, x="CAND_ELECTION_YR", y="TRANSACTION_AMT", color="CAND_OFFICE", barmode="stack")
fig.show()

In [12]:
#export to csv
spendFinalDF.to_csv("FEC_Spend_Data.csv", index=False)