# Coding Discussion 5
# Joanne Lauer jml450
# November 14, 2021

# Break the data up into a training (1600 entries, 80%) and test dataset (400 entries, 20%).
# Build a Naive Bayesian Classifier from scratch that tries to predict whether a respondent 
# will vote in a presidential election or not, pr(Vote==1).

In [1]:
# import libraries
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

# Plotting libraries 
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

# Silence warnings 
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Set seed
np.random.seed(1234)


In [4]:

# Read file using url
url = 'https://raw.githubusercontent.com/edunford/coding_discussions_ppol564_fall2021/main/05_coding_discussion/turnout.csv'
turnout = pd.read_csv(url)
print(turnout.head(5))
type(turnout)


   id  age  educate  income  vote  white
0   1   60     14.0  3.3458     1      1
1   2   51     10.0  1.8561     0      1
2   3   24     12.0  0.6304     0      1
3   4   38      8.0  3.4183     1      1
4   5   25     12.0  2.7852     1      1


pandas.core.frame.DataFrame

In [5]:
# drop the id column which is the same as the pandas indicies
turnout.drop(['id'], axis=1, inplace=True)

In [16]:
# create bins for non bianary variables to transform into binary dummy variables

turnout['e_bins'] = pd.cut(turnout['educate'], bins=3, labels=['below','average','high'])
turnout['i_bins'] = pd.cut(turnout['income'], bins=3, labels=['low','mid','high'])
turnout['a_bins'] = pd.cut(turnout['age'], bins=9, labels=['01-19','20-29','30-39','40-49','50-59','60-69','70-79','80-89','90-99'])

turnout_b = pd.get_dummies(turnout)
turnout_b.head

<bound method NDFrame.head of       age  educate  income  vote  white  e_bins_below  e_bins_average  \
0      60     14.0  3.3458     1      1             0               0   
1      51     10.0  1.8561     0      1             0               1   
2      24     12.0  0.6304     0      1             0               1   
3      38      8.0  3.4183     1      1             0               1   
4      25     12.0  2.7852     1      1             0               1   
...   ...      ...     ...   ...    ...           ...             ...   
1995   26     16.0  3.3834     0      1             0               0   
1996   34     12.0  2.9170     1      1             0               1   
1997   51     16.0  7.8949     1      1             0               0   
1998   22     10.0  2.4811     0      1             0               1   
1999   59     10.0  0.5523     0      1             0               1   

      e_bins_high  i_bins_low  i_bins_mid  i_bins_high  a_bins_01-19  \
0               1    

In [17]:
# drop the categorical variables and replace with the new bin based dummy variables
turnout_b.drop(['age','educate','income'],axis=1, inplace=True)
turnout_b.head

<bound method NDFrame.head of       vote  white  e_bins_below  e_bins_average  e_bins_high  i_bins_low  \
0        1      1             0               0            1           1   
1        0      1             0               1            0           1   
2        0      1             0               1            0           1   
3        1      1             0               1            0           1   
4        1      1             0               1            0           1   
...    ...    ...           ...             ...          ...         ...   
1995     0      1             0               0            1           1   
1996     1      1             0               1            0           1   
1997     1      1             0               0            1           0   
1998     0      1             0               1            0           1   
1999     0      1             0               1            0           1   

      i_bins_mid  i_bins_high  a_bins_01-19  a_bins_20-29

In [18]:
# Train-Test split (just using Pandas)
train = turnout_b.sample(frac=.8).reset_index(drop=True)
test = turnout_b.drop(train.index).reset_index(drop=True)
# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])
# Look at the head of the data
train.head()     

Training Data: 1600 
Test Data: 400


Unnamed: 0,vote,white,e_bins_below,e_bins_average,e_bins_high,i_bins_low,i_bins_mid,i_bins_high,a_bins_01-19,a_bins_20-29,a_bins_30-39,a_bins_40-49,a_bins_50-59,a_bins_60-69,a_bins_70-79,a_bins_80-89,a_bins_90-99
0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0
2,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0
3,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0
4,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0


In [19]:
N = train.shape[0]

# Subset the data by class
vote1 = train.query("vote == 1")
vote0 = train.query("vote == 0")

# Calculate the probability for each class
pr_vote1 = vote1.shape[0]/N
pr_vote0 = vote0.shape[0]/N

# Print the probabilities
print(
f"""
Pr(vote1 = 1): {pr_vote1}
Pr(vote0 = 0): {pr_vote0}
""")


Pr(vote1 = 1): 0.740625
Pr(vote0 = 0): 0.259375



In [21]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities for binary data. 
    
    Dictionaries with tuple keys to track of the variable, value, and the outcome, which we're conditioning on. 

    Returns the probabilities of each class and the condition probabilities for each variable given which
    class is being iterated on in the for loop.
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs


# Run
class_probs, cond_probs = calc_probs(train,outcome_var="vote")

# Print
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.259375, 1: 0.740625}


conditional probabilities

{('a_bins_01-19', 0, 0): 0.7734939759036145,
 ('a_bins_01-19', 0, 1): 0.8978902953586498,
 ('a_bins_01-19', 1, 0): 0.22650602409638554,
 ('a_bins_01-19', 1, 1): 0.1021097046413502,
 ('a_bins_20-29', 0, 0): 0.7831325301204819,
 ('a_bins_20-29', 0, 1): 0.8,
 ('a_bins_20-29', 1, 0): 0.21686746987951808,
 ('a_bins_20-29', 1, 1): 0.2,
 ('a_bins_30-39', 0, 0): 0.8289156626506025,
 ('a_bins_30-39', 0, 1): 0.810126582278481,
 ('a_bins_30-39', 1, 0): 0.1710843373493976,
 ('a_bins_30-39', 1, 1): 0.189873417721519,
 ('a_bins_40-49', 0, 0): 0.8987951807228916,
 ('a_bins_40-49', 0, 1): 0.8514767932489451,
 ('a_bins_40-49', 1, 0): 0.10120481927710843,
 ('a_bins_40-49', 1, 1): 0.14852320675105485,
 ('a_bins_50-59', 0, 0): 0.9156626506024097,
 ('a_bins_50-59', 0, 1): 0.8675105485232067,
 ('a_bins_50-59', 1, 0): 0.08433734939759036,
 ('a_bins_50-59', 1, 1): 0.13248945147679325,
 ('a_bins_60-69', 0, 0): 0.9156626506024097,
 ('a

In [22]:
def predict(data,class_probs,cond_probs):
    '''
    Function calculates the conditional probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    '''
    store_preds = []
    for i,row in data.iterrows():
        pr_1 = 1; pr_0 = 1
        for j in range(1,len(row.index)):
            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        store_preds.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

# Run 
preds = predict(train, class_probs, cond_probs)
preds.head()

Unnamed: 0,pr_0,pr_1,pred
0,7e-05,0.00094,1
1,0.000557,0.004486,1
2,1.8e-05,0.000818,1
3,0.007656,0.003587,0
4,0.007656,0.003587,0


# Run your algorithm and see how it predicts on the test data by calculating the predictive accuracy.
# Does your model perform better than chance (i.e. coin flip)?

In [23]:
accuracy = sum(train.vote == preds.pred)/train.shape[0]
accuracy

0.7475

# The accuracy is .74 which is on par for the training values.  

In [25]:
test_preds = predict(test, class_probs, cond_probs)
test_accuracy = sum(test.vote == test_preds.pred)/test.shape[0]
test_accuracy

0.7175

# The prediction is .71 which is better than a coin flip but not out performing the accuracy of the model so it seems to be relatively good.  