### Creating Bar Charts for Bokeh Visualization
Historical Trend Comparison for Significant Variables

In [3]:
# imports
import geopandas as gpd
import pandas as pd
import numpy as np
import json
from bokeh.io import output_notebook
from bokeh.models import (CDSView, ColorBar, ColumnDataSource,
                          CustomJS, CustomJSFilter, 
                          GeoJSONDataSource, HoverTool,
                          LinearColorMapper, Slider, LabelSet)
from bokeh.layouts import column, row, widgetbox
from bokeh.palettes import brewer, Spectral5
from bokeh.plotting import figure, show

In [4]:
# lift the column cap so wide dataframes are shown with every column
pd.options.display.max_columns = None

In [5]:
# location of the combined county-level CSV (raw file hosted on GitHub)
data = "https://raw.githubusercontent.com/ehuang13/w210-presidential-election/master/data/Data-Jul07/combined_jul05.csv"

# load the CSV into a dataframe, decoding the text as ISO-8859-1
df = pd.read_csv(filepath_or_buffer=data, encoding="ISO-8859-1")

In [6]:
# check out the data: peek at 5 random rows
# (random_state is fixed so the same rows are shown on every re-run)
df.sample(5, random_state=42)

Unnamed: 0,YEAR_FIPS,YEAR,STATE_FIPS,COUNTY_FIPS,STATE,COUNTY,REP_CANDIDATE,DEM_CANDIDATE,REP_VOTES,DEM_VOTES,COUNTY_TOTALVOTES,WINNING_CANDIDATE,WINNING_PARTY,WINNING_PARTY_BINARY,HOUSE_WINNING_BINARY,SENATE_WINNING_BINARY,UNEMPLOYMENT_RATE,AVG_WAGE_SALARY,AA_FEMALE,AA_MALE,BA_FEMALE,BA_MALE,H_FEMALE,H_MALE,IA_FEMALE,IA_MALE,NA_FEMALE,NA_MALE,TOT_FEMALE,TOT_MALE,TOT_POP,WA_FEMALE,WA_MALE,TOT_POP_LESS19,TOT_MALE_LESS19,TOT_FEMALE_LESS19,TOT_POP_20to39,TOT_MALE_20to39,TOT_FEMALE_20to39,TOT_POP_40to59,TOT_MALE_40to59,TOT_FEMALE_40to59,TOT_POP_Above60,TOT_MALE_Above60,TOT_FEMALE_Above60,AA_FEMALE%,AA_MALE%,BA_FEMALE%,BA_MALE%,H_FEMALE%,H_MALE%,IA_FEMALE%,IA_MALE%,NA_FEMALE%,NA_MALE%,WA_FEMALE%,WA_MALE%,TOT_FEMALE%,TOT_MALE%,TOT_POP_LESS19%,TOT_POP_20to39%,TOT_POP_40to59%,TOT_POP_Above60%,MARGIN_VICTORY
14414,200839013,2008,39,13,Ohio,Belmont County,John McCain,Barack Obama,15422,16302,32411.0,Barack Obama,democrat,0,0,0,6.1,30784,136,121,627,2290,157,245,42,42,1,7,35051,35379,70430,33856,32584,15824,8228,7596,16847,9495,7352,21315,10693,10622,16444,6963,9481,0.001931,0.001718,0.008902,0.032515,0.002229,0.003479,0.000596,0.000596,1.4e-05,9.9e-05,0.480704,0.462644,0.497671,0.502329,0.224677,0.239202,0.302641,0.23348,0.027151
1767,201228089,2012,28,89,Mississippi,Madison County,Mitt Romney,Barack Obama,28507,20722,49571.0,Mitt Romney,republican,1,1,-1,6.2,0,1070,1062,19759,16970,1254,1548,179,230,41,55,49784,45754,95538,28377,27140,28177,14409,13768,24939,11720,13219,27486,13071,14415,14936,6554,8382,0.0112,0.011116,0.206818,0.177626,0.013126,0.016203,0.001874,0.002407,0.000429,0.000576,0.297023,0.284075,0.521091,0.478909,0.29493,0.261037,0.287697,0.156336,-0.157047
5379,201641047,2016,41,47,Oregon,Marion County,Donald Trump,Hillary Clinton,63377,57788,136840.0,Donald Trump,republican,1,0,0,5.1,0,4008,3311,1826,2794,39836,43419,3995,4257,1551,1428,162946,161632,324578,146317,144669,91766,47085,44681,87140,44754,42386,80326,40294,40032,65346,29499,35847,0.012348,0.010201,0.005626,0.008608,0.122732,0.133771,0.012308,0.013115,0.004779,0.0044,0.450791,0.445714,0.502024,0.497976,0.282724,0.268472,0.247478,0.201326,-0.040843
5681,201648021,2016,48,21,Texas,Bastrop County,Donald Trump,Hillary Clinton,16328,10569,28454.0,Donald Trump,republican,1,1,1,3.7,36689,379,340,2940,3249,12954,14254,654,735,51,60,38226,39768,77994,33393,34588,21856,11213,10643,18277,9667,8610,22096,11304,10792,15765,7584,8181,0.004859,0.004359,0.037695,0.041657,0.16609,0.182758,0.008385,0.009424,0.000654,0.000769,0.428148,0.44347,0.490115,0.509885,0.280227,0.234339,0.283304,0.202131,-0.202397
11812,200448111,2004,48,111,Texas,Dallam County,George W. Bush,John Kerry,1473,305,1782.0,George W. Bush,republican,1,1,1,3.8,29666,13,10,49,58,947,1009,40,38,0,0,2992,3156,6148,2863,3017,2043,1071,972,1724,911,813,1517,787,730,864,387,477,0.002115,0.001627,0.00797,0.009434,0.154034,0.164118,0.006506,0.006181,0.0,0.0,0.46568,0.490729,0.486662,0.513338,0.332303,0.280416,0.246747,0.140534,-0.655443


### Create Static Bar Chart
* x-axis: all the election years
* y-axis: any selected feature, such as unemployment rate, averaged by year

**Steps Required:**
* aggregate the selected feature by year
* plot bar chart

**Reference Sources:**
* https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html
* https://towardsdatascience.com/interactive-bar-charts-with-bokeh-7230e5653ba3

In [30]:
# configure Bokeh so that show() renders plots inline within this jupyter notebook;
# run this cell before any cell that calls show()
output_notebook()

In [7]:
# column whose historical trend we want to chart (any feature can be used here)
feature = "UNEMPLOYMENT_RATE"

# mean of the chosen column for each election year
grouped_feature = (
    df.loc[:, ["YEAR", feature]]
    .groupby(df["YEAR"])
    .agg("mean")
)

In [8]:
# check out the grouped feature dataframe (one row per election year)
grouped_feature

Unnamed: 0_level_0,YEAR,UNEMPLOYMENT_RATE
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,2000,4.381991
2004,2004,5.694162
2008,2008,5.828448
2012,2012,7.855599
2016,2016,5.243806


In [61]:
# bokeh visualization
# x positions (years) and bar heights (yearly means) taken from the grouped frame
years, plot_feature = (list(grouped_feature[col]) for col in ("YEAR", feature))

# create the figure that will hold the bars
p = figure(title="Average Unemployment Rate by Year",
           plot_height=400)

# one bar per election year, each filled with its own Spectral5 colour
p.vbar(x=years, top=plot_feature, width=3, fill_color=Spectral5)

# put a tick under every bar and name both axes
p.xaxis.ticker = years
p.xaxis.axis_label = "Years"
p.yaxis.axis_label = "Average Unemployment Rate"

# render the chart
show(p)

### Package Bar Chart Plot into a Function

In [69]:
def bar_plot(feature = "UNEMPLOYMENT_RATE"):
    """Show a Bokeh bar chart of the per-year mean of one column of ``df``.

    Parameters
    ----------
    feature : str, default "UNEMPLOYMENT_RATE"
        Name of the column of the notebook-level dataframe ``df`` to average
        within each value of ``YEAR``.

    Returns
    -------
    The result of calling ``show`` on the finished figure.

    Raises
    ------
    KeyError
        If ``feature`` (or ``YEAR``) is not a column of ``df``.
    """
    # aggregate selected feature by year (groupby sorts the years ascending)
    yearly_mean = df.groupby("YEAR")[feature].mean()

    # take the years from the data itself rather than a hard-coded list, so the
    # bar positions always line up with the aggregated values
    years = yearly_mean.index.tolist()
    plot_feature = yearly_mean.tolist()

    # one colour per bar; cycle through the palette when there are more bars
    # than colours so the colour list always matches the number of bars
    bar_colors = [Spectral5[i % len(Spectral5)] for i in range(len(years))]

    # initialize the plot, naming the plotted feature in the title
    p = figure(plot_height=400,
               title="Average {} by Year".format(feature))

    # plot bar chart
    p.vbar(x=years, top=plot_feature, width=3, fill_color=bar_colors)

    # label axis and ticks (one tick under each bar)
    p.xaxis.axis_label = "Years"
    p.yaxis.axis_label = "Average {}".format(feature)
    p.xaxis.ticker = years

    return show(p)
        

Test plotting other selected features

In [82]:
# see all available features for selection: every column from
# UNEMPLOYMENT_RATE onward (looked up by name instead of a hard-coded position)
df.columns[df.columns.get_loc("UNEMPLOYMENT_RATE"):]

Index(['UNEMPLOYMENT_RATE', 'AVG_WAGE_SALARY', 'AA_FEMALE', 'AA_MALE',
       'BA_FEMALE', 'BA_MALE', 'H_FEMALE', 'H_MALE', 'IA_FEMALE', 'IA_MALE',
       'NA_FEMALE', 'NA_MALE', 'TOT_FEMALE', 'TOT_MALE', 'TOT_POP',
       'WA_FEMALE', 'WA_MALE', 'TOT_POP_LESS19', 'TOT_MALE_LESS19',
       'TOT_FEMALE_LESS19', 'TOT_POP_20to39', 'TOT_MALE_20to39',
       'TOT_FEMALE_20to39', 'TOT_POP_40to59', 'TOT_MALE_40to59',
       'TOT_FEMALE_40to59', 'TOT_POP_Above60', 'TOT_MALE_Above60',
       'TOT_FEMALE_Above60', 'AA_FEMALE%', 'AA_MALE%', 'BA_FEMALE%',
       'BA_MALE%', 'H_FEMALE%', 'H_MALE%', 'IA_FEMALE%', 'IA_MALE%',
       'NA_FEMALE%', 'NA_MALE%', 'WA_FEMALE%', 'WA_MALE%', 'TOT_FEMALE%',
       'TOT_MALE%', 'TOT_POP_LESS19%', 'TOT_POP_20to39%', 'TOT_POP_40to59%',
       'TOT_POP_Above60%', 'MARGIN_VICTORY'],
      dtype='object')

In [83]:
# chart the yearly average margin of victory
test1 = bar_plot(feature="MARGIN_VICTORY")

In [84]:
# mean MARGIN_VICTORY per election year, to compare against the chart
test1_grouped = (
    df.loc[:, ["YEAR", "MARGIN_VICTORY"]]
    .groupby(df["YEAR"])
    .agg("mean")
)
test1_grouped

Unnamed: 0_level_0,YEAR,MARGIN_VICTORY
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,2000,-0.171779
2004,2004,-0.2153
2008,2008,-0.152396
2012,2012,-0.208777
2016,2016,-0.314008


In [72]:
# chart the yearly average share of the population aged 19 or under
test2 = bar_plot(feature="TOT_POP_LESS19%")

In [76]:
# mean TOT_POP_LESS19% per election year, to compare against the chart
test2_grouped = (
    df.loc[:, ["YEAR", "TOT_POP_LESS19%"]]
    .groupby(df["YEAR"])
    .agg("mean")
)
test2_grouped

Unnamed: 0_level_0,YEAR,TOT_POP_LESS19%
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,2000,0.284323
2004,2004,0.275984
2008,2008,0.267129
2012,2012,0.261081
2016,2016,0.251323
