### Coding Discussions

In [1]:
# import the necessary packages.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

import warnings
warnings.filterwarnings("ignore")

In [2]:
# we load the data as a pandas data frame and name it.
turnout_whole = pd.read_csv("turnout.csv")

In [3]:
# we check the dimensions of the data 
turnout_whole.head(1)

Unnamed: 0,id,age,educate,income,vote,white
0,1,60,14.0,3.3458,1,1


This is as anticipated. There are 2000 observations of 6 variables. 

In order to use machine learning, we have to break the data set into two parts

In [4]:
# first we label the outcome and the predictors
outcome = turnout_whole['vote']
predictors = turnout_whole[['age', 'educate', 'income', 'white']]

In [5]:
# pull out the data out into data sets
predictor_train, predictor_test, outcome_train, outcome_test = train_test_split(predictors, outcome, test_size = 0.2 , random_state = 499) 

In [6]:
# merge to create a single data frame for each of test and train 
train = pd.merge(predictor_train, outcome_train, left_index=True, right_index=True)

# reset and drop previous index to ensure we don't have errors later on
train = train.reset_index(drop = True)

#repeat.
test = pd.merge(predictor_test, outcome_test, left_index=True, right_index=True)
test = test.reset_index(drop = True)

For this part, we will only use the training data set. We'll develop the model and then later we will test it on the test data set.

In [7]:
# show train
train.head()

Unnamed: 0,age,educate,income,white,vote
0,21,12.0,1.838,1,0
1,33,12.0,4.4578,1,1
2,29,19.0,3.8606,1,1
3,49,12.0,1.8127,1,1
4,52,11.0,0.7594,0,1


In [8]:
# first we calculate the probabilities of each outcome
N = train.shape[0]

# subset the data by class (group of dependent variable outcomes)
vote_1 = train.query("vote == 1")
vote_0 = train.query("vote == 0")

# calculate the probability for each class
pr_vote_1 = vote_1.shape[0]/N
pr_vote_0 = vote_0.shape[0]/N



Pr(vote = 1): 0.74625
Pr(vote = 0): 0.25375



In [9]:
def calc_probs(data,outcome_var="",binary_var=[]):
    '''
    Calc probs, this functon calculates conditional probability for binary variables
    
    parameters:
        data:
            a pandas data frame with independent and dependent variables
        outcome_var:
            a string with the name of the outcome variable's column
        binary_var:
            a list of string column names.
    return:
        cond_probs:
            a dictionary with a touple key
    '''
    # generate empty dictionary containers.
    class_probs = {}
    cond_probs = {}
    # locate all variables that are not the outcome.
    vars = [v for v in data.columns if v == any()]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return cond_probs


# run
cond_probs = calc_probs(train,outcome_var="vote",binary_var=["white"])

In [11]:
# collect the mean and standard dev. of each conditional distribution
dist_locs = \
{("age",1):{'mean':vote_1.age.mean(),'sd':vote_0.age.std()},
 ("age",0):{'mean':vote_0.age.mean(),'sd':vote_0.age.std()},
 ("income",1):{'mean':vote_1.income.mean(),'sd':vote_1.income.std()},
 ("income",0):{'mean':vote_0.income.mean(),'sd':vote_0.income.std()},
 ("educate",1):{'mean':vote_1.educate.mean(),'sd':vote_1.educate.std()},
 ("educate",0):{'mean':vote_0.educate.mean(),'sd':vote_0.educate.std()}
}

{('age', 0): {'mean': 43.357142857142854, 'sd': 19.080415548294297},
 ('age', 1): {'mean': 46.23869346733668, 'sd': 19.080415548294297},
 ('educate', 0): {'mean': 10.652709359605911, 'sd': 3.1947225075904124},
 ('educate', 1): {'mean': 12.523031825795645, 'sd': 3.2706391049603347},
 ('income', 0): {'mean': 2.807944088669951, 'sd': 2.1931252923066418},
 ('income', 1): {'mean': 4.3193976549413735, 'sd': 2.901648439970187}}


In [28]:
def predict(data,dist_locs,cond_probs,outcome_variable):
    '''
    Predict takes in a data frame and several dictionaries in order to conduct a classification on the outcome variable. This function 'tests' a model.
    
    parameters: 
        data: 
            a pandas data frame with relevant variables.
        dist_locs:
            a dictionary of the continuous variable probabilities
        cond_probs:
            a dictionary of the binary variable probabilities
    '''
    store_preds = []
    for i,row in data.iterrows():
        
        # get the predictions using a Gaussan distribution
        # reset the probabilities
        pr_0 = 1; pr_1 = 1
        for j in range(1,len(row)):
            if row.index[j] != outcome_variable: # ignore the outcome
                if row.index[j] != "white": # the if statement only works for the continuous variables
                    # we use *= to multiply the probabilities
                    pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
                    pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
                else: # this code is for the binary variable, white
                    pr_0 *= cond_probs[(row.index[j],int(row.values[j]),0)]
                    pr_1 *= cond_probs[(row.index[j],int(row.values[j]),1)]
        pr_0 *= pr_vote_0
        pr_1 *= pr_vote_1
        
        # assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

# Run
preds_train = predict(train,dist_locs, cond_probs, "vote")

In [29]:
accuracy = sum(train.vote == preds_train.pred)/train.shape[0]
accuracy

0.73

Now, we'll look at the testing data set. We do this to see how well our model does with data we haven't seen before.

In [26]:
# run on the test
preds_test = predict(test,dist_locs,cond_probs, "vote")

In [27]:
# get test accuracy
accuracy = sum(test.vote == preds_test.pred)/test.shape[0]
accuracy

0.7575

In the end, our model preforms at 73% accuracy on the test model and slightly better (76% accuracy) on the test data. The model does outpreform a coin flip.