In [1]:
# Import Moddules needed
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF
# Silence warnings 
import warnings
warnings.filterwarnings("ignore")

In [10]:
# Building a naive bayesian classifier

# Set seed
np.random.seed(1234)

# read in data
turnout_data = pd.read_csv("turnout.csv")

In [16]:
# Create bins for continuous variables so they can be transformed into dummy variables
turnout_data['edu_bins'] = pd.cut(turnout_data['educate'], bins = 5, labels = ['low', 'midlow', 'mid', 'midhigh', 'high'])
turnout_data['inc_bins'] = pd.cut(turnout_data['income'], bins = 5, labels = ['low', 'midlow', 'mid', 'midhigh', 'high'])
turnout_data['age_bins'] = pd.cut(turnout_data['age'], bins = 8, labels = ['17-26','27-36','37-46','47-56','57-66','67-76','77-86','86-95'])

# Add binned dummy variables to new pandas df
tdat_new = pd.get_dummies(turnout_data)

# Drop continuous variables and id (because of redundancy) from df
tdat_new.drop(['id','age','educate','income'], axis=1, inplace=True)

tdat_new.head()

Unnamed: 0,vote,white,edu_bins_low,edu_bins_midlow,edu_bins_mid,edu_bins_midhigh,edu_bins_high,inc_bins_low,inc_bins_midlow,inc_bins_mid,inc_bins_midhigh,inc_bins_high,age_bins_17-26,age_bins_27-36,age_bins_37-46,age_bins_47-56,age_bins_57-66,age_bins_67-76,age_bins_77-86,age_bins_86-95
0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
3,1,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,1,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [17]:
# Train-Test split (just using Pandas)
train = tdat_new.sample(frac=.8).reset_index(drop=True)
test = tdat_new.drop(train.index).reset_index(drop=True)

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

# Look at the head of the data
train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,vote,white,edu_bins_low,edu_bins_midlow,edu_bins_mid,edu_bins_midhigh,edu_bins_high,inc_bins_low,inc_bins_midlow,inc_bins_mid,inc_bins_midhigh,inc_bins_high,age_bins_17-26,age_bins_27-36,age_bins_37-46,age_bins_47-56,age_bins_57-66,age_bins_67-76,age_bins_77-86,age_bins_86-95
0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0
3,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [18]:
# Calculate class probabilities: Pr(class)
N = train.shape[0]

# Subset the data by class
v1 = train.query("vote == 1")
v0 = train.query("vote == 0")

# Calculate the probability for each class
pr_v1 = v1.shape[0]/N
pr_v0 = v0.shape[0]/N

# Print the probabilities
print(
f"""
Pr(vote = 1): {pr_v1}
Pr(vote = 0): {pr_v0}
""")


Pr(vote = 1): 0.740625
Pr(vote = 0): 0.259375



In [19]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities in 
    the binary data using a specified outcome variable, outcome_var
    
    This function uses dictionaries with tuple keys to keep
    track of the variable, its value, and the outcome, which we're conditioning on. 
    
    This function returns the probabilities of each class and the condition probabilities
    for each variable that is iterated over
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs

# Save class and conditional probabilities
class_probs, cond_probs = calc_probs(train, outcome_var='vote')

In [20]:
# Print the class and conditional probabilities dictionaries
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.259375, 1: 0.740625}


conditional probabilities

{('age_bins_17-26', 0, 0): 0.7421686746987952,
 ('age_bins_17-26', 0, 1): 0.869198312236287,
 ('age_bins_17-26', 1, 0): 0.25783132530120484,
 ('age_bins_17-26', 1, 1): 0.1308016877637131,
 ('age_bins_27-36', 0, 0): 0.7566265060240964,
 ('age_bins_27-36', 0, 1): 0.7805907172995781,
 ('age_bins_27-36', 1, 0): 0.2433734939759036,
 ('age_bins_27-36', 1, 1): 0.21940928270042195,
 ('age_bins_37-46', 0, 0): 0.8650602409638555,
 ('age_bins_37-46', 0, 1): 0.7932489451476793,
 ('age_bins_37-46', 1, 0): 0.13493975903614458,
 ('age_bins_37-46', 1, 1): 0.20675105485232068,
 ('age_bins_47-56', 0, 0): 0.8795180722891567,
 ('age_bins_47-56', 0, 1): 0.8421940928270042,
 ('age_bins_47-56', 1, 0): 0.12048192771084337,
 ('age_bins_47-56', 1, 1): 0.15780590717299578,
 ('age_bins_57-66', 0, 0): 0.8987951807228916,
 ('age_bins_57-66', 0, 1): 0.8742616033755274,
 ('age_bins_57-66', 1, 0): 0.10120481927710843,
 ('age_bins_57-66', 1, 1

In [21]:
def predict(data,class_probs,cond_probs):
    '''
    Function calculates the conditiona probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    '''
    store_preds = []
    for i,row in data.iterrows():
        pr_1 = 1; pr_0 = 1
        for j in range(1,len(row.index)):
            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        store_preds.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

# Run 
preds = predict(train, class_probs, cond_probs)
preds.head()

Unnamed: 0,pr_0,pr_1,pred
0,5.2e-05,0.001137,1
1,0.000281,0.003093,1
2,2e-06,7.2e-05,1
3,0.001297,0.000399,0
4,0.002502,0.002277,0


In [22]:
# Calculate predictive accuracy of the training data
accuracy = sum(train.vote == preds.pred)/train.shape[0]
accuracy

0.7275

In [24]:
# Calculate predictive accuracy of the test data
test_preds = predict(test, class_probs, cond_probs)
test_accuracy = sum(test.vote == test_preds.pred)/test.shape[0]
test_accuracy

0.71

After separating the continuous variables into categorical dummies and calculating the conditional probabilities of voting given age bracket, income bracket, education level, and race, we have found that our bayesian classifier correctly predicts voting outcomes with 72.8% accuracy on our training data and 71% accuracy on our test data. While the model could improve its predictive accuracy, it also performs better than a coin flip (or, it has more than 50% predictive accuracy).