# Preperation

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st

In [2]:
#Load the data.
dta = pd.read_csv('../turnout.csv')

In [3]:
# Set seed
np.random.seed(1)

# Train-Test split (just using Pandas).
train = dta.sample(frac=.8).reset_index(drop=True)
test = dta.drop(train.index).reset_index(drop=True)

# Look at the head of the data.
train.head()

Unnamed: 0,id,age,educate,income,vote,white
0,675,35,10.0,2.7852,1,1
1,1700,37,12.0,3.4183,0,1
2,1283,82,10.0,0.9689,1,1
3,1316,25,14.0,3.3834,1,1
4,1211,33,10.0,2.9072,0,1


# The process 

I will divide the data into two subsets, one for categrical variable(white or not), one for continuous variables. And I will calculate the class probability and conditional probabilities base on two different types of variables. In the end, I will calculateb the predictive probability combining the above conditional probabilities.

In [4]:
#Subset the data.
cat_cols = train[['vote', 'white']]
con_cols = train[['age', 'educate', 'income', 'vote']]

In [5]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities in 
    the binary data. 
    
    Arguements
    ----------
    data: a data set only contains categorical variables.
    outcome_var: The variable that serves as the condition.
    
    Return
    ------
    The class probability and the conditional probabilities.
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs

In [6]:
#Assign them to objects for later usage.
class_probs, cond_probs = calc_probs(cat_cols, 'vote')

In [7]:
#Subset the train data by the 'vote' column.
v1 = train.query("vote == 1")
v0 = train.query("vote == 0")

# Collect the mean and standard dev. of each conditional distribution
dist_locs = \
{("age",1):{'mean':v1.age.mean(),'sd':v1.age.std()},
 ("age",0):{'mean':v0.age.mean(),'sd':v0.age.std()},
 ("educate",1):{'mean':v1.educate.mean(),'sd':v1.educate.std()},
 ("educate",0):{'mean':v0.educate.mean(),'sd':v0.educate.std()},
 ("income",1):{'mean':v1.income.mean(),'sd':v1.income.std()},
 ("income",0):{'mean':v0.income.mean(),'sd':v0.income.std()}
}

In [8]:
def predict(data,class_probs,cond_probs,dist_locs):
    '''
    This function takes the original data, the class probability, the conditional probabilty of the categrical variable and the statistics of each conditional distribution as input and returns both the probabilities and the most likely class.
    
    Arguements
    ----------
    data: the original data.
    class_probs: the class probability of the prediction variable.
    cond_probs: the conditional probability of the categorical variable.
    dist_locs: the mean and standard dev. of each conditional distribution
    
    Return
    ------
    The probabilities and the most likely class.
    '''
    store_preds = []
    #Iterate through each row.
    for i,row in data.iterrows():
        #Set the initial probabilities to 0.
        pr_0 = 1; pr_1 = 1
        #Mutiply all the conditional probabilities of continuous varables.
        for j in range(1, 4):
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        #Then mutiply the conditional probability of categorical variable and the class probability.
        pr_0 *= cond_probs[(row.index[5],row.values[5],0)]
        pr_1 *= cond_probs[(row.index[5],row.values[5],1)]
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])


In [9]:
#Calculate the predictive accuracy on training data.
preds=predict(train,class_probs,cond_probs,dist_locs)
accuracy = sum(train.vote == preds.pred)/train.shape[0]
accuracy

0.743125

In [10]:
#Calculate the predictive accuracy on test data.
test_preds = predict(test, class_probs, cond_probs, dist_locs)
test_accuracy = sum(test.vote == test_preds.pred)/test.shape[0]
test_accuracy

0.71

The classifier has a accuracy of 74.31% for the training data and has a accuracy of 71% for the test data. It is indeed better than coin flips(50%).