In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# import the processed polls from 2004 to 2020
polls_2004 = pd.read_csv('../../data/processed/2004_mean_polls.csv')
polls_2008 = pd.read_csv('../../data/processed/2008_mean_polls.csv')
polls_2012 = pd.read_csv('../../data/processed/2012_mean_polls.csv')
polls_2016 = pd.read_csv('../../data/processed/2016_mean_polls.csv')
polls_2020 = pd.read_csv('../../data/processed/2020_mean_polls.csv')

In [3]:
# import the results margin from 2004 to 2016
results_2004 = pd.read_csv('../../data/processed/2004_presidential_election.csv')[['state', '2004_results_margin']]
results_2008 = pd.read_csv('../../data/processed/2008_presidential_election.csv')[['state', '2008_results_margin']]
results_2012 = pd.read_csv('../../data/processed/2012_presidential_election.csv')[['state', '2012_results_margin']]
results_2016 = pd.read_csv('../../data/processed/2016_presidential_election.csv')[['state', '2016_results_margin']]
results_2020 = pd.read_csv('../../data/processed/2020_presidential_election.csv')[['state', '2020_results_margin']]

In [4]:
# calculate the error for 2004
error_2004 = polls_2004.merge(results_2004, how = 'right', on = 'state', sort = False)
error_2004['2004_error'] = error_2004['2004_polls_margin'] - error_2004['2004_results_margin']
error_2004

Unnamed: 0,state,2004_polls_margin,2004_results_margin,2004_error
0,AL,19.619048,25.62,-6.000952
1,AK,25.0,25.55,-0.55
2,AZ,7.705882,10.47,-2.764118
3,AR,3.513889,9.76,-6.246111
4,CA,-10.285714,-9.95,-0.335714
5,CO,4.138889,4.67,-0.531111
6,CT,-11.923077,-10.37,-1.553077
7,DE,-8.3,-7.59,-0.71
8,DC,-67.0,-79.84,12.84
9,FL,1.064706,5.01,-3.945294


In [5]:
# calculate the error for 2008
error_2008 = polls_2008.merge(results_2008, how = 'right', on = 'state', sort = False)
error_2008['2008_error'] = error_2008['2008_polls_margin'] - error_2008['2008_results_margin']
error_2008

Unnamed: 0,state,2008_polls_margin,2008_results_margin,2008_error
0,AL,21.203846,21.58,-0.376154
1,AK,11.85,21.54,-9.69
2,AZ,10.0,8.52,1.48
3,AR,13.384615,19.85,-6.465385
4,CA,-15.078947,-24.06,8.981053
5,CO,-3.801563,-8.95,5.148437
6,CT,-16.0,-22.37,6.37
7,DE,-17.344444,-25.0,7.655556
8,DC,-67.5,-85.92,18.42
9,FL,0.745361,-2.82,3.565361


In [6]:
# calculate the error for 2012
error_2012 = polls_2012.merge(results_2012, how = 'right', on = 'state', sort = False)
error_2012['2012_error'] = error_2012['2012_polls_margin'] - error_2012['2012_results_margin']
error_2012

Unnamed: 0,state,2012_polls_margin,2012_results_margin,2012_error
0,AL,16.5,22.2,-5.7
1,AK,,14.0,
2,AZ,2.842105,9.1,-6.257895
3,AR,23.833333,23.7,0.133333
4,CA,-21.305,-23.1,1.795
5,CO,-3.173134,-5.4,2.226866
6,CT,-12.736842,-17.3,4.563158
7,DE,,-18.6,
8,DC,-80.0,-83.6,3.6
9,FL,-1.915414,-0.9,-1.015414


In [7]:
# calculate the error for 2016
error_2016 = polls_2016.merge(results_2016, how = 'right', on = 'state', sort = False)
error_2016['2016_error'] = error_2016['2016_polls_margin'] - error_2016['2016_results_margin']
error_2016

Unnamed: 0,state,2016_polls_margin,2016_results_margin,2016_error
0,AL,20.0,27.73,-7.73
1,AK,4.133333,14.73,-10.596667
2,AZ,1.926829,3.5,-1.573171
3,AR,21.2,26.92,-5.72
4,CA,-23.09375,-30.11,7.01625
5,CO,-4.829268,-4.91,0.080732
6,CT,-8.75,-13.64,4.89
7,DE,-15.5,-11.37,-4.13
8,DC,,-86.77,
9,FL,-2.25831,1.2,-3.45831


# 2020 based on the mean error from 2004 to 2016 (best prediction so far)

In [8]:
# merge all errors up to 2016
error_for_2020 = error_2004[['state', '2004_error']].merge(error_2008[['state', '2008_error']], how = 'right', on = 'state', sort = False).merge(error_2012[['state', '2012_error']], how = 'right', on = 'state', sort = False).merge(error_2016[['state', '2016_error']], how = 'right', on = 'state', sort = False)
error_for_2020

Unnamed: 0,state,2004_error,2008_error,2012_error,2016_error
0,AL,-6.000952,-0.376154,-5.7,-7.73
1,AK,-0.55,-9.69,,-10.596667
2,AZ,-2.764118,1.48,-6.257895,-1.573171
3,AR,-6.246111,-6.465385,0.133333,-5.72
4,CA,-0.335714,8.981053,1.795,7.01625
5,CO,-0.531111,5.148437,2.226866,0.080732
6,CT,-1.553077,6.37,4.563158,4.89
7,DE,-0.71,7.655556,,-4.13
8,DC,12.84,18.42,3.6,
9,FL,-3.945294,3.565361,-1.015414,-3.45831


In [9]:
# calculate the mean error
error_for_2020['mean_error'] = error_for_2020.mean(numeric_only = True, axis = 1)
error_for_2020

Unnamed: 0,state,2004_error,2008_error,2012_error,2016_error,mean_error
0,AL,-6.000952,-0.376154,-5.7,-7.73,-4.951777
1,AK,-0.55,-9.69,,-10.596667,-6.945556
2,AZ,-2.764118,1.48,-6.257895,-1.573171,-2.278796
3,AR,-6.246111,-6.465385,0.133333,-5.72,-4.574541
4,CA,-0.335714,8.981053,1.795,7.01625,4.364147
5,CO,-0.531111,5.148437,2.226866,0.080732,1.731231
6,CT,-1.553077,6.37,4.563158,4.89,3.56752
7,DE,-0.71,7.655556,,-4.13,0.938519
8,DC,12.84,18.42,3.6,,11.62
9,FL,-3.945294,3.565361,-1.015414,-3.45831,-1.213414


In [10]:
# convert the dataframe to latex
print(error_for_2020.to_latex(index = False))

\begin{tabular}{lrrrrr}
\toprule
state &  2004\_error &  2008\_error &  2012\_error &  2016\_error &  mean\_error \\
\midrule
   AL &   -6.000952 &   -0.376154 &   -5.700000 &   -7.730000 &   -4.951777 \\
   AK &   -0.550000 &   -9.690000 &         NaN &  -10.596667 &   -6.945556 \\
   AZ &   -2.764118 &    1.480000 &   -6.257895 &   -1.573171 &   -2.278796 \\
   AR &   -6.246111 &   -6.465385 &    0.133333 &   -5.720000 &   -4.574541 \\
   CA &   -0.335714 &    8.981053 &    1.795000 &    7.016250 &    4.364147 \\
   CO &   -0.531111 &    5.148437 &    2.226866 &    0.080732 &    1.731231 \\
   CT &   -1.553077 &    6.370000 &    4.563158 &    4.890000 &    3.567520 \\
   DE &   -0.710000 &    7.655556 &         NaN &   -4.130000 &    0.938519 \\
   DC &   12.840000 &   18.420000 &    3.600000 &         NaN &   11.620000 \\
   FL &   -3.945294 &    3.565361 &   -1.015414 &   -3.458310 &   -1.213414 \\
   GA &   -3.266667 &    2.731176 &    0.488235 &   -2.283846 &   -0.582775 \\
   HI

  print(error_for_2020.to_latex(index = False))


In [12]:
# create a dataframe for 2020
polls_2020['error_mean'] = error_for_2020['mean_error']
polls_2020['2020_polls_adjusted'] = polls_2020['2020_polls_margin'] - polls_2020['error_mean']
polls_2020['2020_results_margin'] = results_2020['2020_results_margin']
polls_2020['2020_error_raw'] = polls_2020['2020_polls_margin'] - polls_2020['2020_results_margin']
polls_2020['2020_error_adjusted'] = polls_2020['2020_polls_adjusted'] - polls_2020['2020_results_margin']
lower_error = []
pred_raw = []
pred_adjusted = []
result = []
for i in range(len(polls_2020)):
    if polls_2020['2020_error_raw'][i] > polls_2020['2020_error_adjusted'][i]:
        lower_error.append('raw')
    if polls_2020['2020_error_raw'][i] < polls_2020['2020_error_adjusted'][i]:
        lower_error.append('adjusted')
    if polls_2020['2020_polls_margin'][i] < 0:
        pred_raw.append('D')
    if polls_2020['2020_polls_margin'][i] > 0:
        pred_raw.append('R')
    if polls_2020['2020_polls_adjusted'][i] < 0:
        pred_adjusted.append('D')
    if polls_2020['2020_polls_adjusted'][i] > 0:
        pred_adjusted.append('R')
    if polls_2020['2020_results_margin'][i] < 0:
        result.append('D')
    if polls_2020['2020_results_margin'][i] > 0:
        result.append('R')
polls_2020['lower_error'] = lower_error
polls_2020['pred_raw'] = pred_raw
polls_2020['pred_adjusted'] = pred_adjusted
polls_2020['result'] = result
polls_2020

Unnamed: 0,state,2020_polls_margin,error_mean,2020_polls_adjusted,2020_results_margin,2020_error_raw,2020_error_adjusted,lower_error,pred_raw,pred_adjusted,result
0,AL,20.277778,-4.951777,25.229554,25.46,-5.182222,-0.230446,adjusted,R,R,R
1,AK,6.533333,-6.945556,13.478889,10.06,-3.526667,3.418889,adjusted,R,R,R
2,AZ,-2.808163,-2.278796,-0.529367,-0.31,-2.498163,-0.219367,adjusted,D,D,D
3,AR,24.222222,-4.574541,28.796763,27.62,-3.397778,1.176763,adjusted,R,R,R
4,CA,-28.84375,4.364147,-33.207897,-29.16,0.31625,-4.047897,raw,D,D,D
5,CO,-12.514286,1.731231,-14.245517,-13.5,0.985714,-0.745517,raw,D,D,D
6,CT,-23.307692,3.56752,-26.875213,-20.07,-3.237692,-6.805213,raw,D,D,D
7,DE,-25.555556,0.938519,-26.494074,-18.97,-6.585556,-7.524074,raw,D,D,D
8,DC,-78.166667,11.62,-89.786667,-86.75,8.583333,-3.036667,raw,D,D,D
9,FL,-2.859649,-1.213414,-1.646235,3.36,-6.219649,-5.006235,adjusted,D,D,R


In [14]:
# calculate the number of states where adjusting the polls result in a lower error
(polls_2020['lower_error'] == 'adjusted').sum()

31

In [15]:
# find the misclassified states
misclassification = []
for i in range(len(polls_2020)):
    if polls_2020['2020_polls_adjusted'][i] * polls_2020['2020_results_margin'][i] < 0:
        misclassification.append(polls_2020['state'][i])
misclassification

['FL', 'GA']

In [16]:
# export the dataframe to a csv file
polls_2020.to_csv('../../data/processed/2020_adjusted_polls_and_results.csv', index = False)

In [17]:
# convert the dataframe to latex
print(polls_2020.to_latex(index = False))

\begin{tabular}{lrrrrrrllll}
\toprule
state &  2020\_polls\_margin &  error\_mean &  2020\_polls\_adjusted &  2020\_results\_margin &  2020\_error\_raw &  2020\_error\_adjusted & lower\_error & pred\_raw & pred\_adjusted & result \\
\midrule
   AL &          20.277778 &   -4.951777 &            25.229554 &                25.46 &       -5.182222 &            -0.230446 &    adjusted &        R &             R &      R \\
   AK &           6.533333 &   -6.945556 &            13.478889 &                10.06 &       -3.526667 &             3.418889 &    adjusted &        R &             R &      R \\
   AZ &          -2.808163 &   -2.278796 &            -0.529367 &                -0.31 &       -2.498163 &            -0.219367 &    adjusted &        D &             D &      D \\
   AR &          24.222222 &   -4.574541 &            28.796763 &                27.62 &       -3.397778 &             1.176763 &    adjusted &        R &             R &      R \\
   CA &         -28.843750 &    4.

  print(polls_2020.to_latex(index = False))


# 2016 based on the mean error from 2004 to 2012

In [89]:
# try the same with 2016
error_for_2016 = error_2004[['state', '2004_error']].merge(error_2008[['state', '2008_error']], how = 'right', on = 'state', sort = False).merge(error_2012[['state', '2012_error']], how = 'right', on = 'state', sort = False)
error_for_2016['mean_error'] = error_for_2016.mean(numeric_only = True, axis = 1)
error_for_2016

Unnamed: 0,state,2004_error,2008_error,2012_error,mean_error
0,AL,-6.000952,-0.376154,-5.7,-4.025702
1,AK,-0.55,-9.69,,-5.12
2,AZ,-2.764118,1.48,-6.257895,-2.514004
3,AR,-6.246111,-6.465385,0.133333,-4.192721
4,CA,-0.335714,8.981053,1.795,3.480113
5,CO,-0.531111,5.148437,2.226866,2.281397
6,CT,-1.553077,6.37,4.563158,3.126694
7,DE,-0.71,7.655556,,3.472778
8,DC,12.84,18.42,3.6,11.62
9,FL,-3.945294,3.565361,-1.015414,-0.465116


In [90]:
# merge the dataframe for 2016
polls_2016 = polls_2016.merge(error_for_2016[['state', 'mean_error']], how = 'right', on = 'state')
polls_2016

Unnamed: 0,state,2016_polls_margin,mean_error
0,AL,20.0,-4.025702
1,AK,4.133333,-5.12
2,AZ,1.926829,-2.514004
3,AR,21.2,-4.192721
4,CA,-23.09375,3.480113
5,CO,-4.829268,2.281397
6,CT,-8.75,3.126694
7,DE,-15.5,3.472778
8,DC,,11.62
9,FL,-2.25831,-0.465116


In [91]:
# create a dataframe for 2016
polls_2016['2016_polls_adjusted'] = polls_2016['2016_polls_margin'] - polls_2016['mean_error']
polls_2016['2016_results_margin'] = results_2016['2016_results_margin']
polls_2016['2016_error_raw'] = polls_2016['2016_polls_margin'] - polls_2016['2016_results_margin']
polls_2016['2016_error_adjusted'] = polls_2016['2016_polls_adjusted'] - polls_2016['2016_results_margin']
lower_error = []
pred_raw = []
pred_adjusted = []
result = []
for i in range(len(polls_2016)):
    if polls_2016['2016_error_raw'][i] > polls_2016['2016_error_adjusted'][i]:
        lower_error.append('raw')
    if polls_2016['2016_error_raw'][i] < polls_2016['2016_error_adjusted'][i]:
        lower_error.append('adjusted')
    if math.isnan(polls_2016['2016_error_adjusted'][i]):
        lower_error.append(np.nan)
    if polls_2016['2016_polls_margin'][i] < 0:
        pred_raw.append('D')
    if polls_2016['2016_polls_margin'][i] > 0:
        pred_raw.append('R')
    if math.isnan(polls_2016['2016_polls_margin'][i]):
        pred_raw.append(np.nan)
    if polls_2016['2016_polls_adjusted'][i] < 0:
        pred_adjusted.append('D')
    if polls_2016['2016_polls_adjusted'][i] > 0:
        pred_adjusted.append('R')
    if math.isnan(polls_2016['2016_polls_adjusted'][i]):
        pred_adjusted.append(np.nan)
    if polls_2016['2016_results_margin'][i] < 0:
        result.append('D')
    if polls_2016['2016_results_margin'][i] > 0:
        result.append('R')
    if math.isnan(polls_2016['2016_results_margin'][i]):
        result.append(np.nan)
polls_2016['lower_error'] = lower_error
polls_2016['pred_raw'] = pred_raw
polls_2016['pred_adjusted'] = pred_adjusted
polls_2016['result'] = result
polls_2016

Unnamed: 0,state,2016_polls_margin,mean_error,2016_polls_adjusted,2016_results_margin,2016_error_raw,2016_error_adjusted,lower_error,pred_raw,pred_adjusted,result
0,AL,20.0,-4.025702,24.025702,27.73,-7.73,-3.704298,adjusted,R,R,R
1,AK,4.133333,-5.12,9.253333,14.73,-10.596667,-5.476667,adjusted,R,R,R
2,AZ,1.926829,-2.514004,4.440833,3.5,-1.573171,0.940833,adjusted,R,R,R
3,AR,21.2,-4.192721,25.392721,26.92,-5.72,-1.527279,adjusted,R,R,R
4,CA,-23.09375,3.480113,-26.573863,-30.11,7.01625,3.536137,raw,D,D,D
5,CO,-4.829268,2.281397,-7.110666,-4.91,0.080732,-2.200666,raw,D,D,D
6,CT,-8.75,3.126694,-11.876694,-13.64,4.89,1.763306,raw,D,D,D
7,DE,-15.5,3.472778,-18.972778,-11.37,-4.13,-7.602778,raw,D,D,D
8,DC,,11.62,,-86.77,,,,,,D
9,FL,-2.25831,-0.465116,-1.793194,1.2,-3.45831,-2.993194,adjusted,D,D,R


In [92]:
# calculate the number of states where adjusting the polls result in a lower error
(polls_2016['lower_error'] == 'adjusted').sum()

24

In [93]:
# find the misclassified states
misclassification = []
for i in range(len(polls_2016)):
    if polls_2016['2016_polls_adjusted'][i] * polls_2016['2016_results_margin'][i] < 0:
        misclassification.append(polls_2016['state'][i])
misclassification

['FL', 'IA', 'MI', 'OH', 'PA', 'WI']

In [94]:
# export the dataframe to a csv file
polls_2016.to_csv('../../data/processed/2016_adjusted_polls_and_results.csv', index = False)

# 2020 based on the error of 2016

In [95]:
# create a dataframe for 2020 based on 2016
polls_2020_2016 = polls_2020[['state', '2020_polls_margin']].merge(error_2016[['state', '2016_error']])
polls_2020_2016['2020_polls_adjusted'] = polls_2020_2016['2020_polls_margin'] - polls_2020_2016['2016_error']
polls_2020_2016['2020_results_margin'] = results_2020['2020_results_margin']
polls_2020_2016['2020_error_raw'] = polls_2020_2016['2020_polls_margin'] - polls_2020_2016['2020_results_margin']
polls_2020_2016['2020_error_adjusted'] = polls_2020_2016['2020_polls_adjusted'] - polls_2020_2016['2020_results_margin']
lower_error = []
for i in range(len(polls_2020_2016)):
    if polls_2020_2016['2020_error_raw'][i] > polls_2020_2016['2020_error_adjusted'][i]:
        lower_error.append('raw')
    if polls_2020_2016['2020_error_raw'][i] < polls_2020_2016['2020_error_adjusted'][i]:
        lower_error.append('adjusted')
    if math.isnan(polls_2020_2016['2020_error_adjusted'][i]):
        lower_error.append(np.nan)
polls_2020_2016['lower_error'] = lower_error
polls_2020_2016

Unnamed: 0,state,2020_polls_margin,2016_error,2020_polls_adjusted,2020_results_margin,2020_error_raw,2020_error_adjusted,lower_error
0,AL,20.277778,-7.73,28.007778,25.46,-5.182222,2.547778,adjusted
1,AK,6.533333,-10.596667,17.13,10.06,-3.526667,7.07,adjusted
2,AZ,-2.808163,-1.573171,-1.234993,-0.31,-2.498163,-0.924993,adjusted
3,AR,24.222222,-5.72,29.942222,27.62,-3.397778,2.322222,adjusted
4,CA,-28.84375,7.01625,-35.86,-29.16,0.31625,-6.7,raw
5,CO,-12.514286,0.080732,-12.595017,-13.5,0.985714,0.904983,raw
6,CT,-23.307692,4.89,-28.197692,-20.07,-3.237692,-8.127692,raw
7,DE,-25.555556,-4.13,-21.425556,-18.97,-6.585556,-2.455556,adjusted
8,DC,-78.166667,,,-86.75,8.583333,,
9,FL,-2.859649,-3.45831,0.598661,3.36,-6.219649,-2.761339,adjusted


In [96]:
# calculate the number of states where adjusting the polls result in a lower error
(polls_2020_2016['lower_error'] == 'adjusted').sum()

35

In [97]:
# find the misclassified states
misclassification = []
for i in range(len(polls_2020_2016)):
    if polls_2020_2016['2020_polls_adjusted'][i] * polls_2020_2016['2020_results_margin'][i] < 0:
        misclassification.append(polls_2020_2016['state'][i])
misclassification

['GA', 'MI', 'PA', 'WI']