# Can we predict whether someone will vote or not?


##### Please break the data up into a training (1600 entries, 80%) and test dataset (400 entries, 20%).

In [144]:
#importing relevant methods

import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

# Plotting libraries 
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

# Silence warnings 
import warnings
warnings.filterwarnings("ignore")

In [145]:
turnout = pd.read_csv('/Users/juliezantao/coding_discussions_ppol564_fall2021/05_coding_discussion/turnout.csv')
turnout.head()

Unnamed: 0,id,age,educate,income,vote,white
0,1,60,14.0,3.3458,1,1
1,2,51,10.0,1.8561,0,1
2,3,24,12.0,0.6304,0,1
3,4,38,8.0,3.4183,1,1
4,5,25,12.0,2.7852,1,1


In [146]:
# Train-Test split (just using Pandas)
train = turnout.sample(frac=.8).reset_index(drop=True) #pulling out random 80%
test = turnout.drop(train.index).reset_index(drop=True) #pulling out the other 20%

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

Training Data: 1600 
Test Data: 400


In [147]:
# Look at the head of the data
train.head()

Unnamed: 0,id,age,educate,income,vote,white
0,159,40,15.0,13.6702,1,1
1,1271,45,12.0,0.4607,1,0
2,1723,22,13.0,0.7568,1,1
3,1215,33,19.0,5.6622,1,1
4,599,48,12.0,3.7042,1,1


##### Build a Naive Bayesian Classifier from scratch that tries to predict whether a respondent will vote in a presidential election or not, pr(Vote==1). The classifier must be built from scratch. Do not use a third party ML or statistical package.

In [148]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities in 
    the binary data. 
    
    Note that I'm using dictionaries with tuple keys to keep
    track of the variable, it's val, and the outcome, which we're conditioning on. 
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs


# Run
class_probs, cond_probs = calc_probs(train,outcome_var="vote")

# Print
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.246875, 1: 0.753125}


conditional probabilities

{('age', 0, 0): -41.278481012658226,
 ('age', 0, 1): -45.38091286307054,
 ('age', 1, 0): 42.278481012658226,
 ('age', 1, 1): 46.38091286307054,
 ('educate', 0, 0): -9.716455696202532,
 ('educate', 0, 1): -11.605394190871369,
 ('educate', 1, 0): 10.716455696202532,
 ('educate', 1, 1): 12.605394190871369,
 ('id', 0, 0): -1058.427848101266,
 ('id', 0, 1): -979.0730290456431,
 ('id', 1, 0): 1059.427848101266,
 ('id', 1, 1): 980.0730290456431,
 ('income', 0, 0): -1.8713470886075951,
 ('income', 0, 1): -3.2998618257261416,
 ('income', 1, 0): 2.871347088607595,
 ('income', 1, 1): 4.299861825726142,
 ('white', 0, 0): 0.22784810126582278,
 ('white', 0, 1): 0.1203319502074689,
 ('white', 1, 0): 0.7721518987341772,
 ('white', 1, 1): 0.8796680497925311}


In [149]:
vote1 = train.query("vote == 1")
vote0 = train.query("vote == 0")

# Class probabilities.
prob_v1 = y1.shape[0]/train.shape[0]
prob_v0 = y0.shape[0]/train.shape[0]



In [150]:
#Calculating the conditional probability for white

#Given vote =0 
white1_vote0 = vote0.query("white == 1").shape[0]/vote0.shape[0]
white0_vote0 = vote0.query("white == 0").shape[0]/vote0.shape[0]

#Given vote =1
white1_vote1 = vote1.query("white == 1").shape[0]/vote1.shape[0]
white0_vote1 = vote1.query("white == 0").shape[0]/vote1.shape[0]


#Printing the results
print(
f"""
Pr(white = 1 |vote = 0): {white1_vote0}
Pr(white = 0 |vote = 0): {white0_vote0}
Pr(white = 1 |vote = 1): {white1_vote1}
Pr(white = 0 |vote = 1): {white0_vote1}
""")


Pr(white = 1 |vote = 0): 0.7721518987341772
Pr(white = 0 |vote = 0): 0.22784810126582278
Pr(white = 1 |vote = 1): 0.8796680497925311
Pr(white = 0 |vote = 1): 0.12033195020746888



In [151]:
# Collect the mean and standard dev. of each conditional distribution
dist_locs = \
{("age",1):{'mean':y1.age.mean(),'sd':y1.age.std()},
 ("age",0):{'mean':y0.age.mean(),'sd':y0.age.std()},
 ("educate",1):{'mean':y1.educate.mean(),'sd':y1.educate.std()},
 ("educate",0):{'mean':y0.educate.mean(),'sd':y0.educate.std()},
 ("income",1):{'mean':y1.income.mean(),'sd':y1.income.std()},
 ("income",0):{'mean':y0.income.mean(),'sd':y0.income.std()},
 ("white",1):{'mean':y1.white.mean(),'sd':y1.white.std()},
 ("white",0):{'mean':y0.white.mean(),'sd':y0.white.std()}
}

# Print
pp.pprint(dist_locs)

{('age', 0): {'mean': 42.13895781637717, 'sd': 18.684352298158444},
 ('age', 1): {'mean': 46.345029239766085, 'sd': 16.803818419680265},
 ('educate', 0): {'mean': 10.805210918114144, 'sd': 3.2678743289074386},
 ('educate', 1): {'mean': 12.52172096908939, 'sd': 3.299746019262978},
 ('income', 0): {'mean': 2.855613151364764, 'sd': 2.1986517629056457},
 ('income', 1): {'mean': 4.216433416875523, 'sd': 2.9060407685079377},
 ('white', 0): {'mean': 0.7766749379652605, 'sd': 0.41699214476797114},
 ('white', 1): {'mean': 0.8830409356725146, 'sd': 0.321505825475229}}


In [168]:
def predict(data,dist_locs):
    
    #An empty list that will be used to store data in the function
    store_preds = []
    
    #Iterating through the data
    for i,row in data.iterrows():
        
        # Get the predictions using a Gaussan distribution
        pr_0 = 1; pr_1 = 1
        
        for j in range(1,4): 
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        if(row["white"]==0):
            pr_0*=white0_vote0
        else:
            pr_0*=white1_vote0
    
            
        if(row["white"]==0):
            pr_1*=white0_vote1
        else:
            pr_1*=white1_vote1
        
        pr_0 *= prob_v0
        pr_1 *= prob_v1
        
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
    return pd.DataFrame(store_preds,columns=["Vote_0","Vote_1","Final_Prediction"])

In [163]:
# Running function
pred_train = predict(train, dist_locs)

In [165]:
#Getting snapshot 
pred_train.head(10)

Unnamed: 0,Vote_0,Vote_1,Final_Prediction
0,2.235721e-10,9.169134e-07,1
1,1.386429e-05,1.514891e-05,1
2,2.603869e-05,4.423111e-05,1
3,1.557438e-06,2.433082e-05,1
4,7.602855e-05,0.0002509295,1
5,7.187075e-05,0.0001796456,1
6,1.384173e-05,9.083325e-06,0
7,3.233333e-05,0.0001168288,1
8,3.615985e-05,5.936521e-05,1
9,2.063777e-05,8.815863e-05,1


##### Run your algorithm and see how it predicts on the test data by calculating the predictive accuracy.

In [166]:
#Examining the predictive accuracy of the training data.
accuracy_train = sum(train.vote == pred_train.Final_Prediction)/train.shape[0]
accuracy_train

0.7475

In [156]:
#Run the function for test data
pred_test = predict(test, dist_locs)

In [157]:
pred_test.head(10)

Unnamed: 0,Vote_0,Vote_1,Final_Prediction
0,5.688398e-06,1.335242e-05,1
1,2.07704e-05,0.0001815165,1
2,2.660769e-05,7.008527e-05,1
3,7.602813e-05,0.0001876561,1
4,1.767109e-06,4.169324e-05,1
5,5.036046e-05,8.363252e-05,1
6,6.165724e-08,1.480605e-08,0
7,3.838394e-05,7.603654e-05,1
8,1.126187e-05,7.306106e-06,0
9,3.65449e-05,0.0001059103,1


In [158]:
#Examining the predictive accuracy on the test data.
accuracy_test = sum(test.vote == pred_test.Final_Prediction)/test.shape[0]
accuracy_test

0.7075

##### Does your model perform better than chance (i.e. coin flip)?

My model does perform better than a chance flip. As we can see, the train model's (1600 entries, 80%) accuracy is 0.7475 and the test model is 0.7075. Since these are both greater than 0.5 which is the probability of a coin flip by chance, it is possible to see that this model does in fact perform better than a chance flip. 