### Coding Discussion: Classifier

Goal: create a classifier to predict whether or not an individual voted

In [108]:
#import packages
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

#display full 
pd.set_option("display.max_rows", None, "display.max_columns", None)

import warnings
warnings.filterwarnings("ignore")

import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

In [109]:
#load the data
turnout = pd.read_csv('turnout.csv')
turnout.head()

Unnamed: 0,id,age,educate,income,vote,white
0,1,60,14.0,3.3458,1,1
1,2,51,10.0,1.8561,0,1
2,3,24,12.0,0.6304,0,1
3,4,38,8.0,3.4183,1,1
4,5,25,12.0,2.7852,1,1


In [110]:
#break the data into test (1600, 80%) and training (400, 20%) data sets
train = turnout.sample(frac=.8).reset_index(drop=True)
test = turnout.drop(train.index).reset_index(drop=True)

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

Training Data: 1600 
Test Data: 400


In [111]:
train_binary_vals = train.drop(columns=['id', 'age', 'educate', 'income'])
test_binary_vals = train.drop(columns=['id', 'age', 'educate', 'income'])

#number 
N = train.shape[0]

#subset the data
vote1 = train.query("vote==1")
vote0 = train.query("vote==0")

#calculate the probability for each class
pr_vote_1 = vote1.shape[0]/N
pr_vote_0 = vote0.shape[0]/N

#print the probabilities 
print(
f"""
Pr(vote = 1): {pr_vote_1}
Pr(vote = 0): {pr_vote_0}
""")


Pr(vote = 1): 0.753125
Pr(vote = 0): 0.246875



In [112]:
# Given vote == 1
w1_vote1 = vote1.query("white == 1").shape[0]/vote1.shape[0]
w0_vote1 = vote1.query("white == 0").shape[0]/vote1.shape[0]


# Given vote == 0
w1_vote0 = vote0.query("white == 1").shape[0]/vote0.shape[0]
w0_vote0 = vote0.query("white == 0").shape[0]/vote0.shape[0]

print(
f"""
Pr(white = 1 |vote = 1): {w1_vote1}
Pr(white = 0 |vote = 1): {w0_vote1}
Pr(white = 1 |vote = 0): {w1_vote0}
Pr(white = 0 |vote = 0): {w0_vote0}
""")


Pr(white = 1 |vote = 1): 0.8780082987551867
Pr(white = 0 |vote = 1): 0.12199170124481327
Pr(white = 1 |vote = 0): 0.789873417721519
Pr(white = 0 |vote = 0): 0.21012658227848102



In [113]:
prob_vote1 = w0_vote1 * pr_vote_1
prob_vote0 = w0_vote0 * pr_vote_0

print(f"""
Pr(vote = 1) = {prob_vote1}
Pr(vote = 0) = {prob_vote0}
""")


Pr(vote = 1) = 0.091875
Pr(vote = 0) = 0.051875000000000004



In [114]:
prob_vote1 = w1_vote1 * pr_vote_1
prob_vote0 = w1_vote0 * pr_vote_0

print(f"""
Pr(cw = 1) = {prob_vote1}
Pr(cw = 0) = {prob_vote0}
""")


Pr(cw = 1) = 0.66125
Pr(cw = 0) = 0.195



In [115]:
#calculate the underlying probabilities and 
#then calculate the predictions for each observation in the data.

def calc_probs(data,outcome_var=""):
    '''
    calculates the class + conditional probabilities from 
    the binary data
    
    Note that I'm using dictionaries with tuple keys to keep
    track of the variable, it's val, and the outcome, which we're conditioning on. 
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs


class_probs, cond_probs = calc_probs(train_binary_vals,outcome_var="vote")

print(class_probs)

{0: 0.246875, 1: 0.753125}


In [116]:
#build a prediction function that combs through the observations in the 
#data and calculates the probabilities and makes a class prediction.

def predict(data,class_probs,cond_probs):
    '''
    Function calculates the conditiona probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    '''
    store_preds = []
    for i,row in data.iterrows():
        pr_1 = 1; pr_0 = 1
        for j in range(1,len(row.index)):
            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        store_preds.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

preds = predict(train_binary_vals, class_probs, cond_probs)
preds.head()

Unnamed: 0,pr_0,pr_1,pred
0,0.195,0.66125,1
1,0.195,0.66125,1
2,0.195,0.66125,1
3,0.195,0.66125,1
4,0.195,0.66125,1


In [117]:
#calculate predictive accuracy 
#(i.e. how many correct prediction did we make).

accuracy = sum(train_binary_vals.vote == preds.pred)/train.shape[0]
accuracy

0.753125

In [118]:
#We obtained predictive accuracy of 73.8% on the training data, not bad!
#now try to predict the outcomes in the test data and see how we do.

test_preds = predict(test_binary_vals, class_probs, cond_probs)
test_accuracy = sum(test_binary_vals.vote == test_preds.pred)/test.shape[0]
test_accuracy

3.0125

In [119]:
# Reorganize the data -- training
train_drop = train.drop(columns=['id', 'white', 'age'])
train_drop = train_drop[['vote', 'educate', 'income']]
y,x1,x2 = train_drop.iloc[1,:]
train_drop.columns = ['y', 'x1', 'x2']

#test 
test_drop = test.drop(columns=['id', 'white', 'age'])
test_drop = test_drop[['vote', 'educate', 'income']]
test_drop.columns = ['y', 'x1', 'x2']

In [120]:
#probabilities

y1 = train_drop.query("y == 1")
y0 = train_drop.query("y == 0")


pr_y1 = y1.shape[0]/train_drop.shape[0]
pr_y0 = y0.shape[0]/train_drop.shape[0]

In [121]:
# mean and standard deviation for each conditional distribution
dist_locs = \
{("x1",1):{'mean':y1.x2.mean(),'sd':y1.x1.std()},
 ("x1",0):{'mean':y0.x2.mean(),'sd':y0.x1.std()},
 ("x2",1):{'mean':y1.x1.mean(),'sd':y1.x2.std()},
 ("x2",0):{'mean':y0.x1.mean(),'sd':y0.x2.std()}
}

# Print
pp.pprint(dist_locs)

{('x1', 0): {'mean': 2.6539081012658228, 'sd': 3.1636351995604635},
 ('x1', 1): {'mean': 4.293774937759336, 'sd': 3.273200337414391},
 ('x2', 0): {'mean': 10.546835443037974, 'sd': 2.1238444238780207},
 ('x2', 1): {'mean': 12.569294605809128, 'sd': 2.964062784451219}}


In [122]:
# Prediction for the 1 class
a = st.norm(dist_locs[("x1",1)]['mean'], dist_locs[("x1",1)]['sd']).pdf(x1)
b = st.norm(dist_locs[("x2",1)]['mean'], dist_locs[("x2",1)]['sd']).pdf(x2)
c = pr_y1
pr_1 = a * b * c 

# Prediction for the 0 class
a = st.norm(dist_locs[("x1",0)]['mean'], dist_locs[("x1",0)]['sd']).pdf(x1)
b = st.norm(dist_locs[("x2",0)]['mean'], dist_locs[("x2",0)]['sd']).pdf(x2)
c = pr_y0
pr_0 = a * b * c 

print(
f'''
    Pr(y == 1| X): {pr_1}
    Pr(y == 0| X): {pr_0}
''')


    Pr(y == 1| X): 3.3278635761960385e-06
    Pr(y == 0| X): 9.369806075462246e-08



In [123]:
def predict(data,dist_locs):
    ''''''
    store_preds = []
    for i,row in data.iterrows():
        
        # Get the predictions using a Gaussan distribution
        pr_0 = 1; pr_1 = 1
        for j in range(1,len(row)):
            
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        pr_0 *= pr_y0
        pr_1 *= pr_y1
        
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

preds_train = predict(train_drop,dist_locs)

In [124]:
# Predictive accuracy of training data 
accuracy_train = sum(train_drop.y == preds_train.pred)/train_drop.shape[0]
accuracy_train

0.753125

In [125]:
# Test test df 
preds_test = predict(test_drop, dist_locs)

# Predictive accuracy of test data 
accuracy_train = sum(test_drop.y == preds_test.pred)/test_drop.shape[0]
accuracy_train

0.7

In [126]:
#Predicted output
preds_train.head()

Unnamed: 0,pr_0,pr_1,pred
0,1.549384e-05,0.0001150544,1
1,9.369806e-08,3.327864e-06,1
2,1.364836e-08,5.013006e-07,1
3,5.679197e-10,1.345477e-07,1
4,7.356374e-08,3.386269e-06,1


In [127]:
#Predicted output
preds_test.head()

Unnamed: 0,pr_0,pr_1,pred
0,6.321212e-09,2.976234e-07,1
1,1.042173e-05,8.465148e-05,1
2,2.113932e-09,3.027764e-07,1
3,2.520785e-07,6.348042e-06,1
4,5.21432e-05,0.0003376867,1


Results: The model gives us the ability to predict whether or not someone voted with 70% accuracy (per the test data). Notably, this accuracy is slightly less than that acheived via the training data. However, 70% accuracy is still far better than a coin flip!  