In [1]:
#My first question is whether the proportion of voters who voted in the 2016 primary but not in the 2016 general
#is significantly different from other years
#in the data cleaning workbook I talked about how this data is less accurate for earlier timepoints
#since people may have moved away and no longer be counted 
#and there will be voters who may have lived in other counties during a particular election & only now live in HC
#But, I think it's reasonable to assume that this movement is independent of a voters likelihood to not vote
#in a general afer voting in the primary

In [2]:
#Will perform chi-square test to test the null hypothesis of no difference in the proportions 
#among the 9 years, using a significance level of .01. 
#If the null hypothesis is rejected, will use the Marascuillo procedure to compare all years pairwise
import pandas as pd
import scipy.stats as stats
votes = pd.read_csv('voting_hist.csv')
votes.head()

Unnamed: 0,SOS_VOTERID,YEAR,P,G
0,OH0013910067,0,,X
1,OH0013638654,0,,X
2,OH0013744969,0,R,X
3,OH0020648383,0,,
4,OH0021001272,0,,


In [3]:
votes['P']=[1 if pd.isna(x) == False else 0 for x in votes['P']]
votes['G']=[1 if pd.isna(x) == False else 0 for x in votes['G']]
votes.head()

Unnamed: 0,SOS_VOTERID,YEAR,P,G
0,OH0013910067,0,0,1
1,OH0013638654,0,0,1
2,OH0013744969,0,1,1
3,OH0020648383,0,0,0
4,OH0021001272,0,0,0


In [4]:
v_counts = votes[['YEAR', 'P', 'G']][votes['P']==1]
v_counts['YEAR']=v_counts['YEAR']+2000
v_counts = v_counts.groupby('YEAR').agg({'P':'sum', 'G':'sum'}).reset_index()
v_counts

Unnamed: 0,YEAR,P,G
0,2000,95741,93950
1,2002,42996,39812
2,2004,91939,91016
3,2006,75629,71211
4,2008,195812,190398
5,2010,87715,81948
6,2012,93744,92151
7,2014,65694,61202
8,2016,225535,217373
9,2018,110227,107268


In [5]:
v_prop = v_counts.rename(columns={'P':'TOTAL', 'G':'G_VOTED'})
v_prop['G_NOTVOTED']=v_prop['TOTAL']-v_prop['G_VOTED']
v_prop['PHAT_VOTED']=v_prop['G_VOTED']/v_prop['TOTAL']
v_prop['PHAT_NOTVOTED']=v_prop['G_NOTVOTED']/v_prop['TOTAL']
v_prop

Unnamed: 0,YEAR,TOTAL,G_VOTED,G_NOTVOTED,PHAT_VOTED,PHAT_NOTVOTED
0,2000,95741,93950,1791,0.981293,0.018707
1,2002,42996,39812,3184,0.925947,0.074053
2,2004,91939,91016,923,0.989961,0.010039
3,2006,75629,71211,4418,0.941583,0.058417
4,2008,195812,190398,5414,0.972351,0.027649
5,2010,87715,81948,5767,0.934253,0.065747
6,2012,93744,92151,1593,0.983007,0.016993
7,2014,65694,61202,4492,0.931622,0.068378
8,2016,225535,217373,8162,0.96381,0.03619
9,2018,110227,107268,2959,0.973155,0.026845


In [8]:
import numpy as np
voted = v_prop['G_VOTED'].tolist()
not_voted = v_prop['G_NOTVOTED'].tolist()
voted_array = np.asarray(voted).reshape((1,10))
notvoted_array = np.asarray(not_voted).reshape((1,10))
c_table = np.vstack((voted_array, notvoted_array))
c_table

array([[ 93950,  39812,  91016,  71211, 190398,  81948,  92151,  61202,
        217373, 107268],
       [  1791,   3184,    923,   4418,   5414,   5767,   1593,   4492,
          8162,   2959]])

In [24]:
#Chi-square - from https://machinelearningmastery.com/chi-squared-test-for-machine-learning/
stat, p, dof, expected = stats.chi2_contingency(c_table)
print('dof=%d' % dof)
print(expected)
# interpret test-statistic
prob = 0.99
critical = stats.chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
	print('Dependent (reject H0)')
else:
	print('Independent (fail to reject H0)')
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
	print('Dependent (reject H0)')
else:
	print('Independent (fail to reject H0)')

dof=9
[[ 92325.92659848  41462.33630345  88659.54361807  72931.31994356
  188827.40246186  84586.21334209  90400.15942018  63350.7005563
  217490.18555674 106295.21219927]
 [  3415.07340152   1533.66369655   3279.45638193   2697.68005644
    6984.59753814   3128.78665791   3343.84057982   2343.2994437
    8044.81444326   3931.78780073]]
probability=0.990, critical=21.666, stat=11454.216
Dependent (reject H0)
significance=0.010, p=0.000
Dependent (reject H0)


In [31]:
#So we reject the null hypothesis & will now use the Marascuillo procedure to compare all years pairwise
#Code adopted from R code on https://itl.nist.gov/div898/handbook/prc/section4/prc474.htm

# Set the proportions of interest. (proportion of primary voters who did not vote in corresponding general)
p = v_prop['PHAT_NOTVOTED'].tolist()
n = v_prop['TOTAL'].tolist()
years = v_prop['YEAR'].tolist()

N = len(p)
dof = N-1

results = []
import math
# Compute critical values.
for i in range(0,N):
    for j in range(1,N):
        value = round(math.fabs(p[i]-p[j]),3)
        critical_range = round(math.sqrt(stats.chi2.pdf(prob, dof)*math.sqrt(p[i]*(1-p[i])/(n[i]) + p[j]*(1-p[j])/n[j])),3)
        compared = str(years[i])+'-'+str(years[j])
        results.append([compared, value, critical_range])
results_df = pd.DataFrame(results, columns=['Years', 'TestStatistic', 'CriticalValue'])
#Those pairs that have a test statistic that exceeds the critical value are significant at the α level.
results_df['Significant'] = results_df['TestStatistic']-results_df['CriticalValue']
sig = results_df.loc[results_df['Significant'] > 0]
n_sig = results_df.loc[results_df['Significant'] <= 0]

In [32]:
sig
#this is kind of interestng to me because I would have expected a lot more uniformity between at least
#presidential vs/ non-presidential years - i.e. I would have thought comparing 2000 & 2004 would have not been sig
#at .01, granted it's a very small difference
#this looks like 2008 & 2018 are the only years with similar primary vs. general attendance

Unnamed: 0,Years,TestStatistic,CriticalValue,Significant
0,2000-2002,0.055,0.002,0.053
1,2000-2004,0.009,0.001,0.008
2,2000-2006,0.040,0.001,0.039
3,2000-2008,0.009,0.001,0.008
4,2000-2010,0.047,0.001,0.046
...,...,...,...,...
83,2018-2006,0.032,0.001,0.031
85,2018-2010,0.039,0.001,0.038
86,2018-2012,0.010,0.001,0.009
87,2018-2014,0.042,0.002,0.040


In [33]:
n_sig

Unnamed: 0,Years,TestStatistic,CriticalValue,Significant
9,2002-2002,0.0,0.002,-0.002
19,2004-2004,0.0,0.001,-0.001
29,2006-2006,0.0,0.002,-0.002
39,2008-2008,0.0,0.001,-0.001
44,2008-2018,0.001,0.001,0.0
49,2010-2010,0.0,0.002,-0.002
59,2012-2012,0.0,0.001,-0.001
69,2014-2014,0.0,0.002,-0.002
79,2016-2016,0.0,0.001,-0.001
84,2018-2008,0.001,0.001,0.0


In [1]:
#Mapping - based on https://towardsdatascience.com/how-safe-are-the-streets-of-santiago-e01ba483ce4b
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point, Polygon

%matplotlib inline
sf=gpd.read_file('cenblk2010.shp')
sf

UnboundLocalError: local variable 'arith_flex' referenced before assignment

In [2]:
conda install geopandas

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Eliza\Anaconda3

  added / updated specs:
    - geopandas


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.7.12               |           py37_0         3.0 MB
    psycopg2-2.8.3             |   py37h7a1dbc1_0         170 KB
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be INSTALLED:

  descartes          pkgs/main/noarch::descartes-1.1.0-py_3
  mapclassify        pkgs/main/noarch::mapclassify-2.0.1-py_0
  psycopg2           pkgs/main/win-64::psycopg2-2.8.3-py37h7a1dbc1_0

The following packages will be UPDATED:

  certifi             conda-forge::certifi-2019.6.16-py37_1 --> pkgs/main::certifi-2019.9.11-py37