In [1]:
#Import packages

import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

# Silence warnings 
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Set seed
np.random.seed()

# read in data
turnout_data = pd.read_csv("turnout.csv")

# Train-Test split (just using Pandas)
train = turnout_data.sample(frac=.8).reset_index(drop=True)
test = turnout_data.drop(train.index).reset_index(drop=True)

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

# Look at the head of the data
train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,id,age,educate,income,vote,white
0,113,54,12.0,2.917,1,1
1,270,24,12.0,6.7838,1,1
2,432,36,12.0,6.5589,1,1
3,187,23,14.0,2.2706,0,1
4,1242,70,0.0,1.7723,1,1


In [3]:
# For binary predictor variable
# Drop the unrelated columns
train_white = train.drop(columns = ['id', 'age','educate', 'income'])
train_white
test_white =test.drop(columns = ['id', 'age','educate', 'income'])
test_white

Unnamed: 0,vote,white
0,1,0
1,1,1
2,0,1
3,1,1
4,0,1
...,...,...
395,0,1
396,1,1
397,1,1
398,0,1


In [4]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities in 
    the binary data. 
    
    Parameters
    ------
    data: a dataframe of the character of the binary varibale
    outcome_var: a string of the vairable we're conditioning on
    
    Returns
    ------
    class_probs: a dictionary contains class probabilities data
    cond_probs:a dictionary contains conditional probabilities data
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs


# Run
class_probs, cond_probs = calc_probs(train_white,outcome_var="vote")

# Print
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.24625, 1: 0.75375}


conditional probabilities

{('white', 0, 0): 0.2182741116751269,
 ('white', 0, 1): 0.12520729684908793,
 ('white', 1, 0): 0.7817258883248731,
 ('white', 1, 1): 0.8747927031509121}


In [5]:
# For continuous predictor variable 
# Drop the unrelated columns
train_con = train.drop(columns = ['id','white'])
order = ['vote','age', 'educate', 'income']
train_con = train_con[order]
test_con =test.drop(columns = ['id','white'])
test_con=test_con[order]
train_con

Unnamed: 0,vote,age,educate,income
0,1,54,12.0,2.9170
1,1,24,12.0,6.7838
2,1,36,12.0,6.5589
3,0,23,14.0,2.2706
4,1,70,0.0,1.7723
...,...,...,...,...
1595,1,48,16.0,8.3565
1596,0,26,11.0,3.0640
1597,1,41,12.0,6.2740
1598,1,30,19.0,2.7068


In [6]:
#Calculate class probabilities

vote1 = train_con.query("vote == 1")
vote0 = train_con.query("vote == 0")

# Class probabilities.
pr_vote1 = vote1.shape[0]/train_con.shape[0]
pr_vote0 = vote0.shape[0]/train_con.shape[0]
print(pr_vote1)
print(pr_vote0)

0.75375
0.24625


In [7]:
# Collect the mean and standard dev. of each conditional distribution
dist_locs = \
{("age",1):{'mean':vote1.age.mean(),'sd':vote1.age.std()},
 ("age",0):{'mean':vote0.age.mean(),'sd':vote0.age.std()},
 ("educate",1):{'mean':vote1.educate.mean(),'sd':vote1.educate.std()},
 ("educate",0):{'mean':vote0.educate.mean(),'sd':vote0.educate.std()},
 ("income",1):{'mean':vote1.income.mean(),'sd':vote1.income.std()},
 ("income",0):{'mean':vote0.income.mean(),'sd':vote0.income.std()},

}

# Print
pp.pprint(dist_locs)

{('age', 0): {'mean': 43.21827411167513, 'sd': 19.37596609600872},
 ('age', 1): {'mean': 46.549751243781095, 'sd': 17.066346368256582},
 ('educate', 0): {'mean': 10.558375634517766, 'sd': 3.3637255791793828},
 ('educate', 1): {'mean': 12.541873963515755, 'sd': 3.3211425525925637},
 ('income', 0): {'mean': 2.782161167512692, 'sd': 2.214063372959448},
 ('income', 1): {'mean': 4.240855970149254, 'sd': 2.899198155663477}}


In [8]:
def predict(data,dist_locs):
    '''
    Function calculates the conditional probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    Parameters
    ------
    data: a dataframe of the character of the continuous varibales
    dist_locs: a dictionary contains the mean and standard deviation of the continous variables
    
    Returns
    ------
    storepred: a dataframe  of both the probabilities and the most likely class
    '''
    store_preds = []
    for i,row in data.iterrows():
        
        # Get the predictions using a Gaussan distribution
        pr_0 = 1; pr_1 = 1
        #Multiply the continuous variables
        for j in range(1,4):
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
           
        
        #Multiply the binary variables    
        pr_0 *= cond_probs[(row.index[5],row.values[5],0)]
        pr_1 *= cond_probs[(row.index[5],row.values[5],1)]
        
        pr_0 *= pr_vote0
        pr_1 *= pr_vote1
        
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
        storepred = pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])
    
    return storepred

# Run
preds_train = predict(train,dist_locs)

In [9]:
# Predicted output
preds_train.head(10)

Unnamed: 0,pr_0,pr_1,pred
0,6.606318e-05,0.0002059285,1
1,9.226594e-06,7.148479e-05,1
2,1.682764e-05,0.0001508629,1
3,2.83489e-05,7.088282e-05,1
4,2.129968e-07,5.520862e-08,0
5,3.10708e-06,2.556835e-05,1
6,1.754821e-05,9.227792e-05,1
7,5.118322e-05,6.705961e-05,1
8,5.837086e-06,6.129951e-05,1
9,2.89396e-05,9.274136e-05,1


In [10]:
#Calculate predictive accuracy for training data
accuracy_train = sum(train.vote == preds_train.pred)/train.shape[0]
accuracy_train

0.7475

In [11]:
#Calculate predictive accuracy for test data
preds_test = predict(test,dist_locs)
accuracy_test = sum(test.vote == preds_test.pred)/test.shape[0]
accuracy_test

0.7175

For this Naive Bayesian Classifier, we obtained predictive accuracy of 74.8% on the training data and 71.8% on the test data. The model's accuracy is not very high but still performs better than coin flip.