In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below.
plt.style.use('fivethirtyeight') 
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline 

# Step 1: Cleaning and prepping our datasets for analysis

Data source: https://www.electionreturns.pa.gov/ 

In [None]:
# Importing data: read each election-returns file and keep only its first
# seven columns (the remaining ones are unnecessary/empty)
pres_2012, pres_2016, midterm_2018, pres_2020 = (
    pd.read_csv(csv_path).iloc[:, :7]
    for csv_path in (
        "data/2012_presidential.csv",
        "data/2016_presidential.csv",
        "data/2018_midterm.csv",
        "data/2020_presidential.csv",
    )
)
# The media market table is loaded with all of its columns
media_markets = pd.read_csv("data/media_markets.csv")

In [None]:
# Data cleaning: upper-case the county names in the media market table before
# they are used as a merge key (presumably the election files store
# "County Name" in upper case -- verify against the CSVs)
media_markets["County"] = media_markets["County"].str.upper()

# One big dataset for our primary analysis: every election loaded above
# (2012, 2016, 2018 and 2020)
all_returns = [pres_2012, pres_2016, midterm_2018, pres_2020]
elections = pd.concat(all_returns)

# Add the media market info, then drop the now-redundant "County" key column
merged_returns = elections.merge(media_markets, left_on="County Name", right_on="County")
elections_mm = merged_returns.drop("County", axis="columns")

elections_mm.head()

In [None]:
# One problem: votes look like the wrong datatype.
# Check the Python type of the "Votes" entry in the row labelled 0.
type(elections_mm.loc[0, "Votes"])

In [None]:
# Data cleaning: votes are strings containing commas! Need to switch to int for quant analysis.
# Casting to str first keeps this cell safe to re-run: once the column is already
# integer it no longer has a `.str` accessor, so the un-guarded form raised
# AttributeError on a second execution.
elections_mm["Votes"] = (
    elections_mm["Votes"]
    .astype(str)
    .str.replace(",", "", regex=False)  # literal comma, not a regex pattern
    .astype(int)
)
elections_mm.head()

In [None]:
# We don't care too much about 3rd parties, so let's consolidate and then remove
def parties(name_str):
    """Collapse a party label to "Republican", "Democratic" or "Other".

    The two major-party labels are returned unchanged (exact, case-sensitive
    match); every other value maps to "Other".
    """
    major_parties = ("Republican", "Democratic")
    return name_str if name_str in major_parties else "Other"

# Relabel every row's party so only "Republican", "Democratic" and "Other" remain
party_labels = elections_mm["Party Name"]
elections_mm["Party Name"] = party_labels.apply(parties)

In [None]:
# Separating cleaned data into different dataframes
def by_election(election, office):
    """Return the rows of `elections_mm` for one election and one office.

    Parameters
    ----------
    election : str
        Value matched exactly against the "Election Name" column.
    office : str
        Value matched exactly against the "Office Name" column.

    Returns
    -------
    pandas.DataFrame
        The rows of the notebook-level `elections_mm` table where both
        columns match.
    """
    # A named function rather than a lambda bound to a name (PEP 8): it
    # reports as `by_election` in tracebacks and can carry this docstring.
    same_election = elections_mm["Election Name"] == election
    same_office = elections_mm["Office Name"] == office
    return elections_mm[same_election & same_office]

# One table per race, selected by (election name, office name)
potus_2012, potus_2016, gov_2018, sen_2018 = (
    by_election(election_name, office_name)
    for election_name, office_name in (
        ("2012 General Election", "President of the United States"),
        ("2016 Presidential Election", "President of the United States"),
        ("2018 General Election", "Governor"),
        ("2018 General Election", "United States Senator"),
    )
)

In [None]:
# Changing votes to percents to compare across media market
def vote_percent(market):
    """Convert vote counts into percentages of their total.

    Parameters
    ----------
    market : pandas.Series or numpy.ndarray
        Vote counts to normalise (in this notebook: one media market's
        votes, one entry per party).

    Returns
    -------
    Same kind of object as `market`, each entry being 100 * count / total.
    """
    # Use the object's own .sum(): for a pandas Series it skips missing
    # values, so one absent party no longer turns every percentage in the
    # row into NaN (the builtin sum() propagates NaN into the total).
    return 100 * market / market.sum()

# Dropping 3rd parties after calculating percents to maintain correct values
def props_by_mm(election):
    """Return Democratic and Republican vote percentages per media market.

    Parameters
    ----------
    election : pandas.DataFrame
        Rows with "Votes", "Media Market" and "Party Name" columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by media market with columns ["Democratic", "Republican"].
        Each value is that party's percentage of all votes in the market;
        the other party columns are included in the total and dropped only
        after the percentages are computed.
    """
    votes_by_market = election.pivot_table(
        values="Votes",
        index="Media Market",
        columns="Party Name",
        # The string "sum" replaces the builtin `sum`, which pandas >= 2.1
        # deprecates as an aggregation callable.
        aggfunc="sum",
        # A party with no rows in a market counts as 0 votes instead of NaN,
        # so the row total (and therefore every percentage) stays defined.
        fill_value=0,
    )
    percents = votes_by_market.apply(vote_percent, axis="columns")
    return percents.loc[:, ["Democratic", "Republican"]]


# Party vote shares per media market, one table per race
potus_2012_mm, potus_2016_mm, gov_2018_mm, sen_2018_mm = map(
    props_by_mm, (potus_2012, potus_2016, gov_2018, sen_2018)
)

# Step 2: What Media Markets should we focus on? What's our strategy?

Let's compare each party's share of the vote as a proportion, so that we can compare accurately across media markets of different sizes.

In [None]:
# For all graphs: Blue = Dem, Red = Republican
# Draw one horizontal bar chart per race, each with its own title
race_charts = []
for race_shares, chart_title in (
    (potus_2012_mm, "Voting by Media Market, POTUS 2012 (Obama - D v. Romney - R)"),
    (potus_2016_mm, "Voting by Media Market, POTUS 2016 (Clinton - D v. Trump - R)"),
    (gov_2018_mm, "Voting by Media Market, Governor 2018 (Wolf - D v. Wagner - R)"),
    (sen_2018_mm, "Voting by Media Market, US Senate 2018 (Casey Jr. - D v. Barletta - R)"),
):
    chart = race_shares.plot(kind="barh", legend=None)
    chart.set_title(chart_title)
    race_charts.append(chart)

# Keep the individual axes available under their original names
ax1, ax2, ax3, ax4 = race_charts


These graphs are useful for getting a general idea of which party tends to win each media market, but we want to summarize the data across all 4 elections to get a better overview of general trends.

In [None]:
# Summarize these 4 elections by stacking their per-market share tables
# into one big dataset of proportions

share_tables = [potus_2012_mm, potus_2016_mm, gov_2018_mm, sen_2018_mm]
props = pd.concat(share_tables)
props.head()

In [None]:
# Now let's summarize the media markets into 3 groups: 
# Saints = always votes for your party
# Sinners = never votes for your party
# Salvageables = sometimes votes for your party, sometimes not

def winner(props):
    """Score one row of party vote shares: -1 for Democrats, +1 for Republicans.

    Parameters
    ----------
    props : pandas.Series or sequence
        Assumed to hold the Democratic value first and the Republican value
        second; any further entries are ignored.

    Returns
    -------
    int
        -1 if the first value is larger, +1 if the second is larger, and 0
        when neither is strictly larger (a tie, or values that cannot be
        ordered such as NaN).
    """
    # Read the two values by position explicitly. Plain `props[0]` on a Series
    # with string labels relies on a positional fallback that pandas >= 2.1
    # deprecates; lists and tuples are still indexed directly.
    by_position = props.iloc if hasattr(props, "iloc") else props
    dem_share, rep_share = by_position[0], by_position[1]
    if dem_share > rep_share:
        return -1
    if dem_share < rep_share:
        return +1
    # Neither side is ahead: return a neutral score instead of an implicit None
    return 0

# Score every row of the stacked table, store it as "Winner", preview four rows
row_scores = props.apply(winner, axis="columns")
props["Winner"] = row_scores
props.head(4)

In [None]:
# Comparing media markets for the 4 elections. Who's our saints/sinners/salvageables?
# Net score per market = sum of its "Winner" values across the stacked races.
# The groupby's own .sum() replaces .agg(sum): passing the builtin `sum` is
# deprecated in pandas >= 2.1.
scores = props.groupby("Media Market")[["Winner"]].sum()

# Colour each bar from the sign of its score instead of a hard-coded list, so
# the colours stay correct if the set or order of media markets changes.
bar_colors = [
    "Blue" if score < 0 else "Red" if score > 0 else "Gray"
    for score in scores["Winner"]
]

plt.barh(scores.index, scores["Winner"], color=bar_colors);
plt.title("Winners in each Media Market by Party");
# Recall that left = Democratic leaning, Right = Republican leaning


Now that we know the history of Pennsylvania voting for the President, Governor, and Senate in the 2010s, we can figure out a strategy!

There are two main options: 

>1) A base-motivation strategy: convince your saints to come out in significant numbers compared to the opposition and depress the opposition's votes
>
> 2) A salvageable strategy: convince salvageable voters (i.e. voters we can flip or who are undecided) to vote for your party to bring you into the majority
 
Most campaigns will use a combination of both options. To help us decide this, let's look at raw numbers for votes. In particular, let's focus on the battlegrounds of Erie and Pittsburgh, since those are the regions that have flipped from Obama to Trump. 

In [None]:
# One note: dropping 2018 (and 2020) so only the remaining elections are compared
in_pittsburgh = elections_mm["Media Market"] == "Pittsburgh"
major_party = elections_mm["Party Name"] != "Other"
not_2020 = elections_mm["Election Name"] != "2020 Presidential Election"
not_2018 = elections_mm["Election Name"] != "2018 General Election"

# Total votes per election and party. The string "sum" replaces the builtin
# `sum`, which pandas >= 2.1 deprecates as an aggregation callable.
pittsburgh_votes = elections_mm[in_pittsburgh & major_party & not_2020 & not_2018].pivot_table(
    values="Votes", index="Election Name", columns="Party Name", aggfunc="sum"
)
pittsburgh_votes.plot(kind="bar")
plt.legend(loc="lower center");
plt.title("Votes in Pittsburgh by Election and Party");


In [None]:
# Same comparison for the Erie market: third parties, 2018 and 2020 are excluded
in_erie = elections_mm["Media Market"] == "Erie"
major_party = elections_mm["Party Name"] != "Other"
not_2020 = elections_mm["Election Name"] != "2020 Presidential Election"
not_2018 = elections_mm["Election Name"] != "2018 General Election"

# Total votes per election and party. The string "sum" replaces the builtin
# `sum`, which pandas >= 2.1 deprecates as an aggregation callable.
erie_votes = elections_mm[in_erie & major_party & not_2020 & not_2018].pivot_table(
    values="Votes", index="Election Name", columns="Party Name", aggfunc="sum"
)
erie_votes.plot(kind="bar");
plt.legend(loc="lower center");
plt.title("Votes in Erie by Election and Party");


So it appears that Republicans had a very high turnout in 2016 compared to Democrats, which helped them win these battleground markets (especially in Erie, which was hit hard by an economic downturn in manufacturing prior to 2016).


# Step 3: So, what happened in 2020?

In [None]:
potus_2020 = by_election("2020 Presidential Election", "President of the United States")
# Titled like the other per-race charts so the figure stands on its own
ax_2020 = props_by_mm(potus_2020).plot(kind="barh", legend=None)
ax_2020.set_title("Voting by Media Market, POTUS 2020 (Biden - D v. Trump - R)");

# Trump won Wilkes Barre Scranton, Pittsburgh, Johnstown Altoona, Harrisburg, and Erie
# So how did Biden win? 

In [None]:
# Recall 2016: a very similar picture (re-draws the 2016 shares chart)
chart_2016_title = "Voting by Media Market, POTUS 2016 (Clinton - D v. Trump - R)"
ax2 = potus_2016_mm.plot(legend=None, kind="barh")
ax2.set_title(chart_2016_title);

In [None]:
# Total votes by party in 2016
# Sum only the "Votes" column with the groupby's own .sum(): .agg(sum) with the
# builtin is deprecated in pandas >= 2.1 and also aggregates every other column.
party_totals_2016 = potus_2016.groupby("Party Name")[["Votes"]].sum()
print(party_totals_2016["Votes"].sum(), "people voted in 2016.")
party_totals_2016

In [None]:
# Total votes by party in 2020
# Sum only the "Votes" column with the groupby's own .sum(): .agg(sum) with the
# builtin is deprecated in pandas >= 2.1 and also aggregates every other column.
party_totals_2020 = potus_2020.groupby("Party Name")[["Votes"]].sum()
print(party_totals_2020["Votes"].sum(), "people voted in 2020.")
party_totals_2020

In [None]:
# Let's break it down by media market. 2016
# Total votes per media market and party, keeping the two major parties.
# The string "sum" replaces the builtin `sum`, which pandas >= 2.1 deprecates
# as an aggregation callable.
votes_2016 = potus_2016.pivot_table(
    values="Votes", index="Media Market", columns="Party Name", aggfunc="sum"
)[["Democratic", "Republican"]]
votes_2016

In [None]:
# 2020 by media market
# ~800k more people voted in 2020! But how were those votes spread out? Let's find out below.
# Total votes per media market and party, keeping the two major parties.
# The string "sum" replaces the builtin `sum`, which pandas >= 2.1 deprecates
# as an aggregation callable.
votes_2020 = potus_2020.pivot_table(
    values="Votes", index="Media Market", columns="Party Name", aggfunc="sum"
)[["Democratic", "Republican"]]
votes_2020

In [None]:
# Change in Democratic votes per media market, 2020 minus 2016
dem_gain = votes_2020["Democratic"] - votes_2016["Democratic"]
print("The increase in Democratic votes from 2016 to 2020 was:", sum(dem_gain))
dem_gain

In [None]:
# Change in Republican votes per media market, 2020 minus 2016
rep_gain = votes_2020["Republican"] - votes_2016["Republican"]
print("The increase in Republican votes from 2016 to 2020 was:", sum(rep_gain))
rep_gain

In [None]:
# Democrats beat Republicans in turnout for most regions in 2020
# True where the Democratic vote gain since 2016 exceeds the Republican gain
dem_gain = votes_2020["Democratic"] - votes_2016["Democratic"]
rep_gain = votes_2020["Republican"] - votes_2016["Republican"]
dem_gain > rep_gain

In [None]:
# By how many votes in each region? + = Democratic gain, - = Republican gain
# (Democratic gain since 2016 minus Republican gain since 2016, per media market)
dem_gain = votes_2020["Democratic"] - votes_2016["Democratic"]
rep_gain = votes_2020["Republican"] - votes_2016["Republican"]
dem_gain - rep_gain

In [None]:
# In 2016, Democrats lost by this many votes: 0.6% of the total vote share
# (Democratic minus Republican votes, summed over all media markets)
margin_2016 = votes_2016["Democratic"] - votes_2016["Republican"]
sum(margin_2016)

In [None]:
# But, in 2020, Democrats increased turnout by this many votes overall compared to Republicans
# (total Democratic gain since 2016 minus total Republican gain since 2016)
dem_gain = votes_2020["Democratic"] - votes_2016["Democratic"]
rep_gain = votes_2020["Republican"] - votes_2016["Republican"]
sum(dem_gain) - sum(rep_gain)

In [None]:
# and won by this many votes! That's just 1.3% of the total vote share in 2020.
# (Democratic minus Republican votes, summed over all media markets)
margin_2020 = votes_2020["Democratic"] - votes_2020["Republican"]
sum(margin_2020)

In [None]:
# And we can see how they won here in 2020 in terms of raw numbers and markets: it's all Philadelphia
# (Democratic minus Republican votes in each media market)
dem_2020 = votes_2020["Democratic"]
rep_2020 = votes_2020["Republican"]
dem_2020 - rep_2020

In [None]:
# Compared to 2016 (notice lower votes in Philly + larger Republican shares everywhere else)
# (Democratic minus Republican votes in each media market)
dem_2016 = votes_2016["Democratic"]
rep_2016 = votes_2016["Republican"]
dem_2016 - rep_2016