# 2016 US Election Forecast

This is a re-implementation of [Drew Linzer's election forecasting model](http://votamatic.org/wp-content/uploads/2013/07/Linzer-JASA13.pdf), originally implemented in Stan by [Pierre-Antoine Kremp](https://github.com/pkremp/polls). The model is fit using PyMC3.

In [175]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import us
from datetime import date

## Import data

Download and process data from the Huffington Post. 

In [3]:
# Lower-cased name of every entry in us.STATES; these are substituted into the chart slugs.
states = [entry.name.lower() for entry in us.STATES]
# Names left out of the default slug pattern. California and Florida are re-added
# below with their own slugs; District of Columbia is not re-added.
bad_states = 'district of columbia', 'florida', 'california'
# State slugs first (in us.STATES order), then the national chart and the two special cases.
stubs = [
    "2016-{0}-president-trump-vs-clinton".format(name)
    for name in states
    if name not in bad_states
] + [
    "2016-general-election-trump-vs-clinton",
    "2016-california-presidential-general-election-trump-vs-clinton",
    "2016-florida-presidential-general-election-trump-vs-clinton",
]

In [10]:
def url(stub):
    """Return the Pollster CSV address for a chart stub, turning each space into a hyphen."""
    return "http://elections.huffingtonpost.com/pollster/{0}.csv".format(stub.replace(' ', '-'))

In [29]:
# One frame per chart, downloaded as CSV. The second '-'-separated piece of the stub
# is stored as `state`: the state name for state charts (multi-word names still
# contain spaces at this point) and 'general' for the national chart.
raw_polls = [
    pd.read_csv(url(slug)).assign(state=slug.split('-')[1])
    for slug in stubs
]

In [44]:
# Stack the per-chart frames into one table and lower-case every column label.
all_polls = pd.concat(raw_polls).rename(columns=str.lower)
all_polls.shape

(2892, 20)

In [45]:
# Number of missing values in each column.
all_polls.isnull().sum()

affiliation                  0
clinton                      0
end date                     0
entry date/time (et)         0
johnson                   2535
mcmullin                  2883
mode                         0
number of observations     528
other                      923
partisan                     0
pollster                     0
pollster url                 0
population                   0
question iteration           0
question text             1793
source url                   0
start date                   0
trump                        1
undecided                  146
state                        0
dtype: int64

Date-time conversion

In [119]:
# Parse the field-period boundaries from their string columns.
all_polls['end'] = pd.to_datetime(all_polls['end date'])
all_polls['begin'] = pd.to_datetime(all_polls['start date'])
# Length of the field period in whole days.
all_polls['poll_time'] = (all_polls.end - all_polls.begin).dt.days
# Midpoint of the field period; an odd number of days puts it at 12:00.
all_polls['poll_date'] = (all_polls.end - (all_polls.end - all_polls.begin) / 2)
# ISO week number of the midpoint. Read from Timestamp.isocalendar() instead of the
# `.dt.week` accessor, which later pandas releases deprecated and then removed, so the
# cell runs on both the pinned and current pandas versions.
all_polls['week'] = all_polls.poll_date.apply(lambda ts: ts.isocalendar()[1])
# Monday=0 ... Sunday=6.
all_polls['day_of_week'] = all_polls.poll_date.dt.dayofweek

Deal with inconsistency in pollster names

In [81]:
# Map alternate spellings of the same pollster onto a single label.
all_polls['pollster'] = all_polls['pollster'].replace({
    "Fox News": "FOX",
    "WashPost": "Washington Post",
    "ABC News": "ABC",
})

Combine other candidate categories

In [68]:
# Fold the johnson and mcmullin shares into `other`, treating a missing value as 0.
# NOTE: `other` is both an input and the output, so re-running this cell adds the
# johnson and mcmullin shares a second time.
all_polls['other'] = all_polls[['johnson', 'mcmullin', 'other']].fillna(0).sum(axis=1)

In [83]:
# Combined Clinton + Trump share; a missing value in either column leaves the sum NaN.
all_polls['both'] = all_polls['clinton'].add(all_polls['trump'])

Fill NA values where needed.

In [82]:
# A poll that reports no undecided share is treated as 0 undecided.
all_polls['undecided'] = all_polls['undecided'].fillna(0)

Important dates

In [174]:
# Earliest poll midpoint kept by the row filter.
start_date = date(year=2016, month=4, day=1)
# Election day.
election_date = date(year=2016, month=11, day=8)

Rows and columns we need for analysis

In [110]:
# Keep polls that report a sample size above 1 (a missing sample size compares False
# and is dropped too), whose midpoint is on or after start_date, and whose population
# is one of the three listed types.
# start_date is wrapped in pd.Timestamp because newer pandas releases refuse to compare
# a datetime64 column with a plain datetime.date; the wrapped comparison gives the same
# mask on older releases.
rows_to_keep = ((all_polls['number of observations']>1)
               & (all_polls.poll_date >= pd.Timestamp(start_date))
               & (all_polls.population.isin(['Likely Voters', 'Registered Voters', 'Adults'])))

# Columns carried forward into the analysis table.
cols_to_keep = ['state', 'begin', 'end', 'poll_time', 'poll_date', 'week', 'day_of_week', 
               'pollster', 'mode', 'population', 'number of observations',
               'clinton', 'trump', 'both', 'other']

In [146]:
# Apply the row mask and column list, then switch to shorter column names.
poll_data = all_polls.loc[rows_to_keep, cols_to_keep].rename(columns={
    'mode': 'method',
    'population': 'vtype',
    'number of observations': 'n_obs',
})

Derived columns

In [147]:
# Integer code for the sampled population.
poll_data['poll_type'] = poll_data['vtype'].replace({
    "Likely Voters": 0,
    "Registered Voters": 1,
    "Adults": 2,
})
# Clinton's share of the two-party (Clinton + Trump) total.
poll_data['p_clinton'] = poll_data['clinton'] / poll_data['both']
# Approximate respondent counts implied by the reported percentages.
# NOTE(review): these are not rounded, so they can be fractional (485 * 35 / 100 = 169.75);
# confirm that is what the Binomial likelihood should receive.
poll_data['n_clinton'] = poll_data['n_obs'].mul(poll_data['clinton']).div(100)
poll_data['n_respondents'] = poll_data['n_obs'].mul(poll_data['both']).div(100)

In [148]:
# Preview the first rows, including the derived columns.
poll_data.head()

Unnamed: 0,state,begin,end,poll_time,poll_date,week,day_of_week,pollster,method,vtype,n_obs,clinton,trump,both,other,poll_type,p_clinton,n_clinton,n_respondents
0,alabama,2016-10-25,2016-10-31,6,2016-10-28 00:00:00,43,4,SurveyMonkey,Internet,Likely Voters,485.0,35.0,55.0,90.0,0.0,0,0.388889,169.75,436.5
1,alabama,2016-10-23,2016-10-29,6,2016-10-26 00:00:00,43,2,UPI/CVOTER,Internet,Likely Voters,349.0,37.0,58.0,95.0,0.0,0,0.389474,129.13,331.55
2,alabama,2016-10-07,2016-10-27,20,2016-10-17 00:00:00,42,0,Ipsos/Reuters,Internet,Likely Voters,505.0,39.0,51.0,90.0,0.0,0,0.433333,196.95,454.5
3,alabama,2016-10-18,2016-10-24,6,2016-10-21 00:00:00,42,4,SurveyMonkey,Internet,Likely Voters,415.0,36.0,52.0,88.0,0.0,0,0.409091,149.4,365.2
4,alabama,2016-10-09,2016-10-16,7,2016-10-12 12:00:00,41,2,UPI/CVOTER,Internet,Likely Voters,327.0,38.0,57.0,95.0,0.0,0,0.4,124.26,310.65


In [150]:
# (rows, columns) before duplicate polls are dropped.
poll_data.shape

(1670, 19)

Remove overlapping polls, keeping the first row for each combination of state, poll date and pollster

In [152]:
# Keep only the first row for each (state, poll_date, pollster) combination.
poll_data = poll_data.drop_duplicates(subset=['state', 'poll_date', 'pollster'], keep='first')

In [153]:
# (rows, columns) after duplicate polls are dropped.
poll_data.shape

(1242, 19)

In [154]:
# Save the cleaned polls as CSV; the path is relative to the working directory.
poll_data.to_csv('data/clean/poll_data_2016.csv')

### 2012 data

Used to derive priors and state weights, and to look up each state's electoral votes.

In [164]:
# Load the 2012 results indexed by full state name. The index column is selected by
# its header rather than by a negative position, so the load no longer depends on
# the order of the columns in the file.
data_2012 = pd.read_csv('data/raw/2012.csv', index_col='state_name')

In [165]:
# Obama's share of the national two-party (Obama + Romney) vote count in 2012.
national_score = (
    data_2012['obama_count'].sum()
    / (data_2012['romney_count'] + data_2012['obama_count']).sum()
)
national_score

0.51963863890611295

In [166]:
# Obama's two-party share in each state.
data_2012['score'] = data_2012['obama_count'] / (data_2012['romney_count'] + data_2012['obama_count'])
# How far each state sits from the national two-party share.
data_2012['diff_score'] = data_2012['score'] - national_score
# Each state's fraction of the growth-adjusted 2012 vote total (the fractions sum to 1).
data_2012['share_national'] = (
    (data_2012['total_count'] * (1 + data_2012['adult_pop_growth_2011_15']))
    .pipe(lambda adjusted: adjusted / adjusted.sum())
)

In [167]:
# Preview the 2012 table with the derived score columns.
data_2012.head()

Unnamed: 0_level_0,state,obama,romney,obama_count,romney_count,total_count,ev,adult_pop_growth_2011_15,score,diff_score,share_national
state_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,AL,38.36,60.55,795696,1255925,2074338,9,0.021734,0.387838,-0.131801,0.015766
Alaska,AK,40.81,54.8,122640,164676,300495,3,0.033483,0.426847,-0.092792,0.00231
Arizona,AZ,44.59,53.65,1025232,1233654,2299254,11,0.071607,0.453866,-0.065772,0.018329
Arkansas,AR,36.88,60.57,394409,647744,1069468,6,0.020381,0.378456,-0.141183,0.008118
California,CA,60.24,37.12,7854285,4839958,13038547,55,0.056436,0.618728,0.099089,0.102468


### Constants

In [173]:
# Number of polls (rows) in the cleaned table.
POLLS = len(poll_data)
# Number of distinct pollster labels.
POLLSTERS = len(poll_data.pollster.unique())
# Number of distinct `state` labels; the national chart's 'general' label counts
# as one of them when it is present.
STATES = len(poll_data.state.unique())

## Specify model

In [168]:
from pymc3 import Model, sample
from pymc3 import Binomial, Normal
from pymc3.math import invlogit

In [None]:
# Work-in-progress model specification. This cell has no execution count, and π,
# n_respondents and n_clinton are not assigned as bare names anywhere in the
# visible notebook — TODO define them before running.
with Model() as election_model:
    
    # Pollster house effect
    μ_c = Normal('μ_c', 0, 1, shape=POLLSTERS)
    
    # Polling error
    # NOTE(review): shape is STATES-1; presumably this leaves out the national
    # ('general') series counted in STATES — confirm.
    poll_error = Normal('poll_error', 0, 1, shape=STATES-1)
    
    # Prior for the difference between national and weighted average of state parameters
    α = Normal('α', 0, 1)
    
    # NOTE(review): only the variable name is passed, so no parameters are set
    # explicitly here — confirm the intended prior.
    δ_a = Normal('δ_a', )
    
    # Binomial likelihood of Clinton count
    likelihood = Binomial('likelihood', n_respondents, π, observed=n_clinton)

## Platform information

In [1]:
%load_ext watermark

In [7]:
%watermark -v -m -g -p pandas,numpy,pymc3

CPython 3.5.2
IPython 5.1.0

pandas 0.19.0
numpy 1.11.2
pymc3 3.0.rc2

compiler   : GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.54)
system     : Darwin
release    : 16.1.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
Git hash   : 6c363171114ef79674b6b85be416ad70c121ed5d
