## Coding Discussion 5
### Abigail Paterson
#### arp140

In [1]:
import pandas as pd
import numpy as np

import pprint as pp
import scipy.stats as st # for Normal PDF

# Set seed
np.random.seed(37)

In [2]:
#read data
turnout = pd.read_csv("../turnout.csv")

In [3]:
#drop the id col
turnout.drop(["id"], axis=1, inplace=True)
turnout

Unnamed: 0,age,educate,income,vote,white
0,60,14.0,3.3458,1,1
1,51,10.0,1.8561,0,1
2,24,12.0,0.6304,0,1
3,38,8.0,3.4183,1,1
4,25,12.0,2.7852,1,1
...,...,...,...,...,...
1995,26,16.0,3.3834,0,1
1996,34,12.0,2.9170,1,1
1997,51,16.0,7.8949,1,1
1998,22,10.0,2.4811,0,1


In [4]:
#create dummy variables
turnout['edu_bins'] = pd.cut(turnout['educate'], bins=5, labels=['low','midlow','mid','midhigh','high'])
turnout['inc_bins'] = pd.cut(turnout['income'], bins=5, labels=['low','midlow','mid','midhigh','high'])
turnout['age_bins'] = pd.cut(turnout['age'], bins=8, labels=['16-26','27-36','37-46','47-56','57-65','65-75','75-85','85-95'])

turn_dummies = pd.get_dummies(turnout)

In [5]:
# drop original variables
turn_dummies.drop(['age','educate','income'],axis=1, inplace=True)
turn_dummies

Unnamed: 0,vote,white,edu_bins_low,edu_bins_midlow,edu_bins_mid,edu_bins_midhigh,edu_bins_high,inc_bins_low,inc_bins_midlow,inc_bins_mid,inc_bins_midhigh,inc_bins_high,age_bins_16-26,age_bins_27-36,age_bins_37-46,age_bins_47-56,age_bins_57-65,age_bins_65-75,age_bins_75-85,age_bins_85-95
0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
3,1,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,1,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0
1996,1,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0
1997,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
1998,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [6]:
# split the data into train and test data for evaluating the model
train = turn_dummies.sample(frac=.8).reset_index(drop=True)
test = turn_dummies.drop(train.index).reset_index(drop=True)

In [7]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities for the input dataset of binary
    variables stored in 'data' including a specified dependent variable column name, 'outcome_var'.
    
    Uses dictionaries with tuple keys to keep track of the variable,
    it's val, and the outcome being conditioned on.
    
    Returns the probabilities of each class and the condition probabilities for each variable given which
    class is being iterated on in the for loop.
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs


# Run the model to determine voting probability based on included dummary variables
class_probs, cond_probs = calc_probs(train,outcome_var="vote")

In [8]:
# Print the class probability for voting or not
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
# Pring the condition probability for voting based on each individual dummy variable included
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.2625, 1: 0.7375}


conditional probabilities

{('age_bins_16-26', 0, 0): 0.7476190476190476,
 ('age_bins_16-26', 0, 1): 0.8728813559322034,
 ('age_bins_16-26', 1, 0): 0.2523809523809524,
 ('age_bins_16-26', 1, 1): 0.1271186440677966,
 ('age_bins_27-36', 0, 0): 0.7666666666666666,
 ('age_bins_27-36', 0, 1): 0.773728813559322,
 ('age_bins_27-36', 1, 0): 0.23333333333333334,
 ('age_bins_27-36', 1, 1): 0.22627118644067798,
 ('age_bins_37-46', 0, 0): 0.8666666666666667,
 ('age_bins_37-46', 0, 1): 0.8008474576271186,
 ('age_bins_37-46', 1, 0): 0.13333333333333333,
 ('age_bins_37-46', 1, 1): 0.19915254237288135,
 ('age_bins_47-56', 0, 0): 0.8785714285714286,
 ('age_bins_47-56', 0, 1): 0.8364406779661017,
 ('age_bins_47-56', 1, 0): 0.12142857142857143,
 ('age_bins_47-56', 1, 1): 0.1635593220338983,
 ('age_bins_57-65', 0, 0): 0.888095238095238,
 ('age_bins_57-65', 0, 1): 0.8779661016949153,
 ('age_bins_57-65', 1, 0): 0.11190476190476191,
 ('age_bins_57-65', 1, 1): 0.1

In [9]:
def predict(data,class_probs,cond_probs):
    '''
    Function calculates the conditional probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    '''
    store_preds = []
    for i,row in data.iterrows():
        pr_1 = 1; pr_0 = 1
        for j in range(1,len(row.index)):
            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        store_preds.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])


In [10]:
#run 
preds = predict(train, class_probs, cond_probs)
preds.head()

Unnamed: 0,pr_0,pr_1,pred
0,3.1e-05,1.6e-05,0
1,0.000253,0.000196,0
2,4.3e-05,0.000772,1
3,0.000207,0.001593,1
4,4.6e-05,0.000879,1


In [11]:
# evaluate the accuracy of the model on the training data

accuracy = sum(train.vote == preds.pred)/train.shape[0]
accuracy

0.730625

In [12]:
test_preds = predict(test, class_probs, cond_probs)
test_accuracy = sum(test.vote == test_preds.pred)/test.shape[0]
test_accuracy

0.7125

Our model is able to predict if someone will vote with 73% accuracy on the training data and 71% accuracy on the test data. This is much better a coin flip. In order to do this, we transformed our data into binary dummy variables. Additioinially, we can be fairly confident that our model is not overfitting because it neither perfeclty predicts the training data nor see a dramatic change in accuracy for the test data.