### Coding Discussion 5
##### Alia Abdelkader

#### Can we predict whether or not someone will vote?

In [1]:
import os
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

# Silence warnings
import warnings
warnings.filterwarnings("ignore")

# Change working directory to the location of the cloned repository, where the data file is located
os.chdir('/Users/Alia/Desktop')

#### Building a Naive Bayesian Classifier - Binary Predictor

- vote = 1 means the person voted
- vote = 0 means the person did not vote

- white = 1 means the person is white
- white = 0 means the person is not white

In [2]:
# Set seed
np.random.seed(1234)

# Read in data
edata = pd.read_csv("turnout.csv")

# Train-Test split
train = edata.sample(frac=.8).reset_index(drop=True)
test = edata.drop(train.index).reset_index(drop=True)

# Print off the split count
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

Training Data: 1600 
Test Data: 400


##### Calculate class probabilities

In [3]:
N = train.shape[0]

# Subset the data by class (voted or did not vote)
v1 = train.query("vote == 1")
v0 = train.query("vote == 0")

# Calculate the probability for each class
pr_v_1 = v1.shape[0]/N
pr_v_0 = v0.shape[0]/N

# Print the probabilities
print(
f"""
Pr(vote = 1): {pr_v_1}
Pr(vote = 0): {pr_v_0}
""")


Pr(vote = 1): 0.7425
Pr(vote = 0): 0.2575



##### Calculate conditional probabilities

In [4]:
# Given V == 1
w1_v1 = v1.query("white == 1").shape[0]/v1.shape[0]
w0_v1 = v1.query("white== 0").shape[0]/v1.shape[0]

# Given V == 0
w1_v0 = v0.query("white == 1").shape[0]/v0.shape[0]
w0_v0 = v0.query("white == 0").shape[0]/v0.shape[0]

print(
f"""
Pr(white = 1 | vote = 1): {w1_v1}
Pr(white = 0 | vote = 1): {w0_v1}
Pr(white = 1 | vote = 0): {w1_v0}
Pr(white = 0 | vote = 0): {w0_v0}
""")

# Predict whether a person will vote if they are not white

prob_v1 = w0_v1 * pr_v_1
prob_v0 = w0_v0 * pr_v_0

print(f"""
Pr(v = 1) = {prob_v1}
Pr(v = 0) = {prob_v0}
""")

print("Because 0.0975 is greater than 0.0519, we would predict that the person would vote (v=1).")


Pr(white = 1 | vote = 1): 0.8686868686868687
Pr(white = 0 | vote = 1): 0.13131313131313133
Pr(white = 1 | vote = 0): 0.7985436893203883
Pr(white = 0 | vote = 0): 0.20145631067961164


Pr(v = 1) = 0.09750000000000002
Pr(v = 0) = 0.051875

Because 0.0975 is greater than 0.0519, we would predict that the person would vote (v=1).


##### Predicting Multiple Observations - Binary

In [5]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities in
    the binary data.
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var):
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs

# Drop columns not needed for binary predictors
train_binary = train.drop(columns=['id', 'age', 'educate', 'income'])
test_binary = test.drop(columns=['id', 'age', 'educate', 'income'])

# Run
class_probs, cond_probs = calc_probs(train_binary,outcome_var="vote")

# Print
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.2575, 1: 0.7425}


conditional probabilities

{('white', 0, 0): 0.20145631067961167,
 ('white', 0, 1): 0.13131313131313127,
 ('white', 1, 0): 0.7985436893203883,
 ('white', 1, 1): 0.8686868686868687}


In [6]:
def predict_binary(data,class_probs,cond_probs):
    '''
    Function calculates the conditional probability for membership into each class.
    Then returns both the probabilities and the most likely class.
    '''
    store_preds = []
    for i,row in data.iterrows():
        pr_1 = 1; pr_0 = 1
        for j in range(1,len(row.index)):
            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        store_preds.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

# Run
preds = predict_binary(train_binary,class_probs,cond_probs)
print(preds.head(10))
print('\n')

# Calculate the model's predictive accuracy
accuracy = sum(train_binary.vote == preds.pred)/train_binary.shape[0]
print(f"""Training accuracy = {accuracy}""")

# Check the model's accuracy on the test data
binary_preds = predict_binary(test_binary, class_probs, cond_probs)
binary_test_accuracy = sum(test_binary.vote == binary_preds.pred)/test_binary.shape[0]
print(f"""Test accuracy = {binary_test_accuracy}""")

       pr_0    pr_1  pred
0  0.205625  0.6450     1
1  0.051875  0.0975     1
2  0.205625  0.6450     1
3  0.205625  0.6450     1
4  0.205625  0.6450     1
5  0.205625  0.6450     1
6  0.205625  0.6450     1
7  0.205625  0.6450     1
8  0.205625  0.6450     1
9  0.205625  0.6450     1


Training accuracy = 0.7425
Test accuracy = 0.7


The classifier's accuracy when run on the test data was 70%, which is better than chance (i.e. a coin flip.) Predictably, the accuracy on the training data, which has more observations, was 74%.

#### Building a Naive Bayesian Classifier - Continuous Predictors

For this model, we'll use a probability density function for Gaussian (normal) distribution to convert continuous values into probabilities, thereby mapping the continuous predictors into a probability space.

We can use information regarding the distribution of each continuous predictor and find out where any single point is on that continuous variable's probability distribution.

In this model, we'll use only *income* and *educate* as our two predictors.

In [7]:
# Drop columns not needed for the specified model, rearrange
train_cont = train.drop(columns=['id', 'white', 'age'])
train_cont = train_cont[['vote', 'educate', 'income']]
test_cont = test.drop(columns=['id', 'white', 'age'])
test_cont = test_cont[['vote', 'educate', 'income']]

# Subset data by "vote"
v1 = train_cont.query("vote == 1")
v0 = train_cont.query("vote == 0")

# Calculate class probabilities
pr_v1 = v1.shape[0]/train_cont.shape[0]
pr_v0 = v0.shape[0]/train_cont.shape[0]

# Calculate mean and stdev of each continuous distribution
dist_locs = \
{("educate",1):{'mean':v1.educate.mean(), 'sd': v1.educate.std()}, # Educate given v1
 ("educate",0):{'mean':v0.educate.mean(), 'sd': v0.educate.std()}, # Educate given v0
 ("income",1):{'mean':v1.income.mean(), 'sd': v1.income.std()}, # Income given v1
 ("income",0):{'mean':v0.income.mean(), 'sd': v0.income.std()} # Income given v0
}

pp.pprint(dist_locs)

{('educate', 0): {'mean': 10.62864077669903, 'sd': 3.304381091983527},
 ('educate', 1): {'mean': 12.558922558922559, 'sd': 3.295714127444309},
 ('income', 0): {'mean': 2.7381618932038836, 'sd': 2.2429913729337625},
 ('income', 1): {'mean': 4.229461952861947, 'sd': 2.8482089910676964}}


Now that we have mapped the variables, we can predict whether each person will vote.

##### Predicting Multiple Observations - Continuous

In [8]:
def predict_cont(data,dist_locs):
    '''
    Function calculates the conditional probability for membership into each class.
    Then returns both the probabilities and the most likely class.
    '''
    store_preds = []
    for i,row in data.iterrows():

        # Get the predictions using a Gaussan distribution
        pr_0 = 1; pr_1 = 1
        for j in range(1,len(row)):
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'],
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        pr_0 *= pr_v0
        pr_1 *= pr_v1

        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1

        store_preds.append([pr_0,pr_1,class_pred])

    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

# Run
preds = predict_cont(train_cont,dist_locs)
print(preds.head(10))

print('\n')

# Calculate the model's predictive accuracy
accuracy = sum(train_cont.vote == preds.pred)/train_cont.shape[0]
print(f"""Training accuracy = {accuracy}""")

cont_preds = predict_cont(test_cont, dist_locs)
cont_accuracy = sum(test_cont.vote == cont_preds.pred)/test_cont.shape[0]
print(f"""Test accuracy = {cont_accuracy}""")

       pr_0      pr_1  pred
0  0.001206  0.004321     1
1  0.000252  0.000086     0
2  0.004438  0.005513     1
3  0.003170  0.002730     0
4  0.003276  0.010273     1
5  0.000466  0.000190     0
6  0.004891  0.009462     1
7  0.001431  0.006917     1
8  0.002290  0.004604     1
9  0.002755  0.011423     1


Training accuracy = 0.73125
Test accuracy = 0.7075


For continuous predictors, the classifier's accuracy when run on the test data was 70.7%, which is better still than chance (i.e. a coin flip.) The accuracy on the training data, which is more robust, was 73.1%, which is to be expected.