### Import modules

In [1]:
%%capture
!pip install realclearpolitics

import pandas as pd 
import numpy as np 
import json 
import requests 

from bokeh.io import output_notebook, show 
from bokeh.plotting import figure 
import bokeh.palettes 
import bokeh.transform

import rcp

### test api

In [2]:
polls = rcp.get_polls(candidate = 'Biden')
polls[0]

{'url': 'https://www.realclearpolitics.com/epolls/2020/president/us/general_election_trump_vs_biden-6247.html',
 'title': 'General Election: Trump vs. Biden',
 'poll': 'IBD/TIPP'}

In [3]:
d = {}
for poll in polls: 
    if poll['title'] not in d: 
        d[poll['title']] = poll['url']

In [4]:
d # see what's there 

{'General Election: Trump vs. Biden': 'https://www.realclearpolitics.com/epolls/2020/president/us/general_election_trump_vs_biden-6247.html',
 'New Hampshire: Trump vs. Biden': 'https://www.realclearpolitics.com/epolls/2020/president/nh/new_hampshire_trump_vs_biden-6779.html',
 'Texas: Trump vs. Biden': 'https://www.realclearpolitics.com/epolls/2020/president/tx/texas_trump_vs_biden-6818.html',
 'North Carolina: Trump vs. Biden': 'https://www.realclearpolitics.com/epolls/2020/president/nc/north_carolina_trump_vs_biden-6744.html',
 'Utah: Trump vs. Biden': 'https://www.realclearpolitics.com/epolls/2020/president/ut/utah_trump_vs_biden-7195.html',
 'New York: Trump vs. Biden': 'https://www.realclearpolitics.com/epolls/2020/president/ny/new_york_trump_vs_biden-7040.html',
 'Florida: Trump vs. Biden': 'https://www.realclearpolitics.com/epolls/2020/president/fl/florida_trump_vs_biden-6841.html',
 'Pennsylvania: Trump vs. Biden': 'https://www.realclearpolitics.com/epolls/2020/president/pa/pe

In [5]:
url = d['General Election: Trump vs. Biden']

In [6]:
res = rcp.get_poll_data(url)

In [7]:
df = pd.DataFrame(res[0]['data'])

In [8]:
df = df.iloc[:40, :]

In [9]:
df['Biden (D)'] = pd.to_numeric(df['Biden (D)'])

In [10]:
df['Trump (R)'] = pd.to_numeric(df['Trump (R)'])

In [11]:
df.loc[df['MoE'] == '--', 'MoE'] = np.nan

In [12]:
df['MoE'] = pd.to_numeric(df['MoE'])

In [13]:
df['start_date'] = [x.split('-')[0].strip() for x in df['Date']]

df['end_date'] = [x.split('-')[1].strip() for x in df['Date']]

df[['start_date','end_date']] = df[['start_date','end_date']].apply(lambda x: x + '/20')

df['start_date'] = pd.to_datetime(df['start_date'])

df['end_date'] = pd.to_datetime(df['end_date'])

df['poll_date'] = ((df['end_date'] - df['start_date']) / 2) + df['start_date']

In [14]:
df.to_csv('rcp_project_intermediate_df.csv')

In [68]:
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
src = ColumnDataSource(df)
output_notebook()

In [16]:
df.tail()

Unnamed: 0,Poll,Date,Sample,MoE,Biden (D),Trump (R),Spread,start_date,end_date,poll_date
35,EmersonEmerson,2/16 - 2/18,1250 RV,2.7,48.0,52.0,Trump +4,2020-02-16,2020-02-18,2020-02-17 00:00:00
36,NBC News/Wall St. JrnlNBC/WSJ,2/14 - 2/17,900 RV,3.3,52.0,44.0,Biden +8,2020-02-14,2020-02-17,2020-02-15 12:00:00
37,NPR/PBS/MaristNPR/PBS,2/13 - 2/16,1164 RV,3.7,50.0,44.0,Biden +6,2020-02-13,2020-02-16,2020-02-14 12:00:00
38,QuinnipiacQuinnipiac,2/5 - 2/9,1519 RV,2.5,50.0,43.0,Biden +7,2020-02-05,2020-02-09,2020-02-07 00:00:00
39,NBC News/Wall St. JrnlNBC/WSJ,1/26 - 1/29,1000 RV,3.1,50.0,44.0,Biden +6,2020-01-26,2020-01-29,2020-01-27 12:00:00


In [43]:
df['trump_rolling'] = df['Trump (R)'].rolling(10).mean()

In [44]:
df['trump_rolling'] = df['trump_rolling'].fillna(method = 'backfill')

In [45]:
df['biden_rolling'] = df['Biden (D)'].rolling(10).mean().fillna(method = 'backfill')

In [56]:
df.head(10)

Unnamed: 0,Poll,Date,Sample,MoE,Biden (D),Trump (R),Spread,start_date,end_date,poll_date,trump_rolling,biden_rolling
0,RCP Average,4/13 - 4/29,--,,47.4,42.1,Biden +5.3,2020-04-13,2020-04-29,2020-04-21 00:00:00,42.21,47.54
1,IBD/TIPPIBD/TIPP,4/26 - 4/29,948 RV,3.3,43.0,43.0,Tie,2020-04-26,2020-04-29,2020-04-27 12:00:00,42.21,47.54
2,EmersonEmerson,4/26 - 4/28,1200 RV,2.8,48.0,42.0,Biden +6,2020-04-26,2020-04-28,2020-04-27 00:00:00,42.21,47.54
3,Economist/YouGovYouGov,4/26 - 4/28,1222 RV,3.2,47.0,41.0,Biden +6,2020-04-26,2020-04-28,2020-04-27 00:00:00,42.21,47.54
4,USA Today/SuffolkUSA Today,4/21 - 4/25,1000 RV,3.0,50.0,40.0,Biden +10,2020-04-21,2020-04-25,2020-04-23 00:00:00,42.21,47.54
5,Economist/YouGovYouGov,4/19 - 4/21,1144 RV,3.4,48.0,42.0,Biden +6,2020-04-19,2020-04-21,2020-04-20 00:00:00,42.21,47.54
6,The Hill/HarrisXThe Hill,4/19 - 4/20,958 RV,3.2,42.0,40.0,Biden +2,2020-04-19,2020-04-20,2020-04-19 12:00:00,42.21,47.54
7,NBC News/Wall St. JrnlNBC/WSJ,4/13 - 4/15,900 RV,3.3,49.0,42.0,Biden +7,2020-04-13,2020-04-15,2020-04-14 00:00:00,42.21,47.54
8,Harvard-HarrisHarris,4/14 - 4/16,LV,,53.0,47.0,Biden +6,2020-04-14,2020-04-16,2020-04-15 00:00:00,42.21,47.54
9,Economist/YouGovYouGov,4/12 - 4/14,1166 RV,3.4,48.0,43.0,Biden +5,2020-04-12,2020-04-14,2020-04-13 00:00:00,42.21,47.54


In [89]:
### Make LOESS Line 

from statsmodels.nonparametric.smoothers_lowess import lowess

def make_lowess(series):
    endog = series.values
    exog = series.index.values

    smooth = lowess(endog, exog)
    index, data = np.transpose(smooth)

    return pd.Series(data, index=pd.to_datetime(index)) 

polling_data = df.copy()

polling_data = polling_data.set_index('poll_date')

trump_lowess = make_lowess(polling_data['Trump (R)'])
biden_lowess = make_lowess(polling_data['Biden (D)'])

trump_lowess.head(3)

In [93]:
dem = '#0015BC'
rep = '#E9141D'

In [97]:
df.head()

Unnamed: 0,Poll,Date,Sample,MoE,Biden (D),Trump (R),Spread,start_date,end_date,poll_date,trump_rolling,biden_rolling
0,RCP Average,4/13 - 4/29,--,,47.4,42.1,Biden +5.3,2020-04-13,2020-04-29,2020-04-21 00:00:00,42.21,47.54
1,IBD/TIPPIBD/TIPP,4/26 - 4/29,948 RV,3.3,43.0,43.0,Tie,2020-04-26,2020-04-29,2020-04-27 12:00:00,42.21,47.54
2,EmersonEmerson,4/26 - 4/28,1200 RV,2.8,48.0,42.0,Biden +6,2020-04-26,2020-04-28,2020-04-27 00:00:00,42.21,47.54
3,Economist/YouGovYouGov,4/26 - 4/28,1222 RV,3.2,47.0,41.0,Biden +6,2020-04-26,2020-04-28,2020-04-27 00:00:00,42.21,47.54
4,USA Today/SuffolkUSA Today,4/21 - 4/25,1000 RV,3.0,50.0,40.0,Biden +10,2020-04-21,2020-04-25,2020-04-23 00:00:00,42.21,47.54


In [111]:
df = df.merge(trump_lowess.rename('trump_trend'), left_on = 'poll_date', right_index = True)
df = df.merge(biden_lowess.rename('biden_trend'), left_on = 'poll_date', right_index = True)
df.head()

Unnamed: 0,Poll,Date,Sample,MoE,Biden (D),Trump (R),Spread,start_date,end_date,poll_date,trump_rolling,biden_rolling,trump_trend,biden_trend
0,RCP Average,4/13 - 4/29,--,,47.4,42.1,Biden +5.3,2020-04-13,2020-04-29,2020-04-21 00:00:00,42.21,47.54,41.739657,47.413975
1,IBD/TIPPIBD/TIPP,4/26 - 4/29,948 RV,3.3,43.0,43.0,Tie,2020-04-26,2020-04-29,2020-04-27 12:00:00,42.21,47.54,41.616366,47.197934
2,EmersonEmerson,4/26 - 4/28,1200 RV,2.8,48.0,42.0,Biden +6,2020-04-26,2020-04-28,2020-04-27 00:00:00,42.21,47.54,41.625971,47.216388
2,EmersonEmerson,4/26 - 4/28,1200 RV,2.8,48.0,42.0,Biden +6,2020-04-26,2020-04-28,2020-04-27 00:00:00,42.21,47.54,41.625971,47.216388
2,EmersonEmerson,4/26 - 4/28,1200 RV,2.8,48.0,42.0,Biden +6,2020-04-26,2020-04-28,2020-04-27 00:00:00,42.21,47.54,41.625971,47.216388


In [112]:
src = ColumnDataSource(df)

In [106]:
from bokeh.models import Span
from bokeh.models import HoverTool

hover = HoverTool(
tooltips = [('Date', '@poll_date'), 
           ('Trump Average:', "@{Biden (D)}"), 
           ('Biden Average:', '@')], 
    formatters = {
        '@poll_date' : 'datetime'
    }, 
    mode = 'vline'
)

p = figure(plot_height = 400, plot_width = 400, x_axis_type = 'datetime', 
           y_axis_type = 'linear',
           title = 'General Election Polls, March / April 2020',
          tools = [hover])

p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = '% Support'

p.circle(df['poll_date'],  y = df['Trump (R)'], color = '#E9141D', size = 7, alpha = .4)
p.circle('poll_date', 'Biden (D)', color = '#0015BC', size = 7, alpha = .4, source = src)

ln = Span(location = 50, dimension = 'width', line_color = 'black', line_dash = [4], line_width = 2, line_alpha = .5)

p.line(x = trump_lowess.index, y = trump_lowess, line_color = '#E9141D', alpha = .75, line_width = 2.5) # Trump trend
p.line(x = biden_lowess.index, y = biden_lowess, line_color = dem, alpha = .75, line_width = 2.5 )

p.add_layout(ln)
p.y_range.start = 30
p.y_range.end = 60
show(p)

In [113]:
p = figure(plot_height = 400, plot_width = 400, x_axis_type = 'datetime', 
           y_axis_type = 'linear',
           title = 'General Election Polls, March / April 2020')

In [116]:
import bokeh.plotting as bkp

In [None]:
bkp.figure.Figure.circle

In [178]:
df['spread'] = df['biden_trend'] - df['trump_trend']

In [220]:
src = ColumnDataSource(df)

### Add band dimensions

In [199]:
df.head(1)

Unnamed: 0,Poll,Date,Sample,MoE,Biden (D),Trump (R),Spread,start_date,end_date,poll_date,trump_rolling,biden_rolling,trump_trend,biden_trend,spread
0,RCP Average,4/13 - 4/29,--,,47.4,42.1,Biden +5.3,2020-04-13,2020-04-29,2020-04-21,42.21,47.54,41.739657,47.413975,5.674318


In [204]:
df['biden_lower'] = df['biden_trend'] - df['MoE']

In [223]:
import bokeh.models as bkm

hover = HoverTool(
tooltips = [('Date', '@poll_date{%F}'), 
           ('Biden Average:', "@biden_trend"), 
           ('Trump Average', '@trump_trend'),
           ('Spread', '@spread')], 
    formatters = {'@poll_date' : 'datetime'}, 
    mode = 'vline',
    names = ['biden_trend']

)

p = figure(plot_height = 400, plot_width = 400, x_axis_type = 'datetime', 
           y_axis_type = 'linear',
           title = 'General Election Polls, YTD 2020')

p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = '% Support'

p.circle('poll_date',  y = 'Trump (R)', color = '#E9141D', size = 7, alpha = .4, source = src)
p.circle('poll_date', 'Biden (D)', color = '#0015BC', size = 7, alpha = .4, source = src)



ln = Span(location = 50, dimension = 'width', line_color = 'black', line_dash = [4], line_width = 2, line_alpha = .5)

p.line(x = 'poll_date', y = 'trump_trend', source = src, line_color = '#E9141D', alpha = .75, line_width = 2.5, name = 'trump_trend') # Trump trend
p.line(x = 'poll_date', y = 'biden_trend', source = src, line_color = '#0015BC', alpha = .75, line_width = 2.5, name = 'biden_trend' )

b1 = bkm.Band(base = 'poll_date', upper = 'trump_upper', lower = 'trump_lower', level = 'underlay', line_color = None, fill_color = rep, 
             fill_alpha = .25, 
            source = src)

b2 = bkm.Band(base = 'poll_date', upper = 'biden_upper', lower = 'biden_lower', level = 'underlay', line_color = None, fill_color = dem, 
             fill_alpha = .25, 
            source = src)

p.add_layout(b1)
p.add_layout(b2)
# hover.renderers.append([src])
p.add_tools(hover)
p.add_layout(ln)
p.y_range.start = 30
p.y_range.end = 60
show(p)

Timestamp('2020-04-27 12:00:00')