In [6]:
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

# Plotting libraries 
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

# Silence warnings 
import warnings
warnings.filterwarnings("ignore")

In [4]:
dat = pd.read_csv("/Users/Lawrence/Desktop/Georgetown_G1/PPOL_564_DS/coding_discussions_ppol564_fall2021/05_coding_discussion/turnout.csv")
dat

Unnamed: 0,id,age,educate,income,vote,white
0,1,60,14.0,3.3458,1,1
1,2,51,10.0,1.8561,0,1
2,3,24,12.0,0.6304,0,1
3,4,38,8.0,3.4183,1,1
4,5,25,12.0,2.7852,1,1
...,...,...,...,...,...,...
1995,1996,26,16.0,3.3834,0,1
1996,1997,34,12.0,2.9170,1,1
1997,1998,51,16.0,7.8949,1,1
1998,1999,22,10.0,2.4811,0,1


In [8]:
np.random.seed(100)

#Train_Test split
train = dat.sample(frac=.8).reset_index(drop=True)
test = dat.drop(train.index).reset_index(drop=True)

print("Training Data:", train.shape[0],
     "\nTest Data:", test.shape[0])

train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,id,age,educate,income,vote,white
0,1026,64,12.0,1.8967,0,1
1,1209,22,14.0,6.0903,1,1
2,1056,48,14.0,2.3866,1,1
3,368,84,7.0,0.2364,1,1
4,816,30,12.0,2.9193,1,1


In [11]:
cols_cate = train[['vote', 'white']]
cols_cont = train[['age', 'educate', 'income', 'vote']]

In [38]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities in 
    the binary data. 
    
    Note that I'm using dictionaries with tuple keys to keep
    track of the variable, it's val, and the outcome, which we're conditioning on. 
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs

In [39]:
class_probs, cond_probs = calc_probs(cols_cate, 'vote')

In [41]:
#Subset the train data by the 'vote' column.
v_1 = train.query("vote == 1")
v_0 = train.query("vote == 0")

# Collect the mean and standard dev. of each conditional distribution
dist_locs = \
{("age",1):{'mean':v_1.age.mean(),'sd':v_1.age.std()},
 ("age",0):{'mean':v_0.age.mean(),'sd':v_0.age.std()},
 ("educate",1):{'mean':v_1.educate.mean(),'sd':v_1.educate.std()},
 ("educate",0):{'mean':v_0.educate.mean(),'sd':v_0.educate.std()},
 ("income",1):{'mean':v_1.income.mean(),'sd':v_1.income.std()},
 ("income",0):{'mean':v_0.income.mean(),'sd':v_0.income.std()}
}

print(dist_locs)

{('age', 1): {'mean': 45.90939597315436, 'sd': 16.823282983666747}, ('age', 0): {'mean': 42.21078431372549, 'sd': 18.7406895835342}, ('educate', 1): {'mean': 12.59270134228188, 'sd': 3.284125431568544}, ('educate', 0): {'mean': 10.713235294117647, 'sd': 3.3343379634605665}, ('income', 1): {'mean': 4.281981879194631, 'sd': 2.8969616440117725}, ('income', 0): {'mean': 2.7734566176470583, 'sd': 2.1651119385592343}}


In [65]:
def predict(data,class_probs,cond_probs, dist_locs):
    '''
    Function calculates the conditiona probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    '''
    store_preds = []
    #Iterate through each row.
    for i,row in data.iterrows():
        #Set the initial probabilities to 0.
        pr_1 = 1; pr_0 = 1
        #Multiply conditional prob of categorical variable. 
        pr_0 *= cond_probs[(row.index[5],row.values[5],0)]
        pr_1 *= cond_probs[(row.index[5],row.values[5],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        #Multiply the conditional probabilities of continuous variables. 
        for j in range(1,4):
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
    
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

In [66]:
preds=predict(train,class_probs,cond_probs,dist_locs)
accuracy = sum(train.vote == preds.pred)/train.shape[0]
accuracy
# Calculate the predictive accuracy on training data. 

0.741875

In [67]:
test_preds = predict(test, class_probs, cond_probs, dist_locs)
test_accuracy = sum(test.vote == test_preds.pred)/test.shape[0]
test_accuracy
#Calculate the predictive accuracy on test data.
#The accuracy is 74.31% and 71%, it is better than coin flips.

0.71