In [3]:
import pandas as pd
import numpy as np
#for working with income
import scipy.stats as st
#for pretty printing
import pprint as pp


In [4]:
# read in data
vote_dta = pd.read_csv("../turnout.csv")

In [5]:
# break the data up into a training (1600 entries, 80%) and test dataset (400 entries, 20%). 

# Train-Test split (using Pandas)
trainprime = vote_dta.sample(frac=.8).reset_index(drop=True)
test = vote_dta.drop(trainprime.index).reset_index(drop=True)

display(test.head())

Unnamed: 0,id,age,educate,income,vote,white
0,1601,31,16.0,3.9394,1,0
1,1602,53,12.0,6.3352,1,1
2,1603,65,12.0,0.8284,0,1
3,1604,34,12.0,3.3834,1,1
4,1605,28,12.0,8.7545,0,1


In [6]:
#removed id, must define column number either by variable or directly to remove
left_df = trainprime.iloc[:, :1]
#left is ids so right will remain
right_df = trainprime.iloc[:, 1:]
train = right_df

#create binary variables, looking back i did not need to do the above portion,
# but thought I would keep it for the recored
train_bin = train.drop(columns=['age','educate','income'])
test_bin = test.drop(columns=['id','age','educate','income'])

#create numreic version
train_num = train.drop(columns=['white'])
test_num = test.drop(columns=['id','white'])

print(test_bin.head())
print(test_num.head())


   vote  white
0     1      0
1     1      1
2     0      1
3     1      1
4     0      1
   age  educate  income  vote
0   31     16.0  3.9394     1
1   53     12.0  6.3352     1
2   65     12.0  0.8284     0
3   34     12.0  3.3834     1
4   28     12.0  8.7545     0


In [7]:
def calc_prob(data, outcome=""):
    '''
    Function calculates the class and conditional probabilities in
    the binary data. produces tuples to keep
    track of the variable, it's val, and the outcome
    '''
    # Generate empty dictionary containers.
    class_probs = {}
    cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each
            # variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs


x = calc_prob(train_bin,outcome="vote")
#separate out the class prob from the cond prob
class_probs = x[0]
cond_probs = x[1]

# Print with pp to format better
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.2575, 1: 0.7425}


conditional probabilities

{('white', 0, 0): 0.2038834951456311,
 ('white', 0, 1): 0.1262626262626263,
 ('white', 1, 0): 0.7961165048543689,
 ('white', 1, 1): 0.8737373737373737}


In [8]:
def predict(data,class_probs,cond_probs):
    '''
    Function calculates the conditiona probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    '''
    #
    stored_pred = []
    # iterate through the rows of data
    for i,row in data.iterrows():
        #set initial probabilities to be modified
        pr_1 = 1; pr_0 = 1
        # mutiply conditional probabilities of continuous varables
        for j in range(1,len(row.index)):
            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        stored_pred.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])
    return pd.DataFrame(stored_pred,columns=["pr_0","pr_1","pred"])

# Run 
preds = predict(train_bin, class_probs, cond_probs)
preds.head()

Unnamed: 0,pr_0,pr_1,pred
0,0.205,0.64875,1
1,0.0525,0.09375,1
2,0.0525,0.09375,1
3,0.205,0.64875,1
4,0.205,0.64875,1


In [9]:
bin_accuracy = sum(train_bin.vote == preds.pred)/train.shape[0]
print(bin_accuracy)
test_preds = predict(test_bin, class_probs, cond_probs)
bin_test_acc = sum(test_bin.vote == test_preds.pred)/test.shape[0]
bin_test_acc

0.7425


0.7

The binary predictor is more accurate than a coin flip when attempting to predict the test data.

In [10]:
#Subset train data by 'vote' column.
vote1 = train_num.query("vote == 1")
vote0 = train_num.query("vote == 0")

# probabilities
pr_vote1 = vote1.shape[0]/train_num.shape[0]
pr_vote0 = vote0.shape[0]/train_num.shape[0]


# Collect the mean and standard dev. of each condition
dist_locs = \
{("age",1):{'mean':vote1.age.mean(),'sd':vote1.age.std()},
 ("age",0):{'mean':vote0.age.mean(),'sd':vote0.age.std()},
 ("educate",1):{'mean':vote1.educate.mean(),'sd':vote1.educate.std()},
 ("educate",0):{'mean':vote0.educate.mean(),'sd':vote0.educate.std()},
 ("income",1):{'mean':vote1.income.mean(),'sd':vote1.income.std()},
 ("income",0):{'mean':vote0.income.mean(),'sd':vote0.income.std()}
}
print(dist_locs)

{('age', 1): {'mean': 46.20454545454545, 'sd': 16.82927043172548}, ('age', 0): {'mean': 42.61407766990291, 'sd': 19.36935387315589}, ('educate', 1): {'mean': 12.508838383838384, 'sd': 3.2209635040880196}, ('educate', 0): {'mean': 10.688106796116505, 'sd': 3.2593387158255354}, ('income', 1): {'mean': 4.22915244107744, 'sd': 2.8895094614694696}, ('income', 0): {'mean': 2.746505097087379, 'sd': 2.1695839598930142}}


In [16]:
age, ed, inc, vote = train_num.iloc[1,:]
# Prediction for the 1 class
a1 = st.norm(dist_locs[("age",1)]['mean'], dist_locs[("age",1)]['sd']).pdf(age)
b1 = st.norm(dist_locs[("educate",1)]['mean'], dist_locs[("educate",1)]['sd']).pdf(ed)
c1 = st.norm(dist_locs[("income",1)]['mean'], dist_locs[("income",1)]['sd']).pdf(inc)
d1 = pr_vote1
pr_1 = a1 * b1 * c1 * d1 

# Prediction for the 0 class
a0 = st.norm(dist_locs[("age",0)]['mean'], dist_locs[("age",0)]['sd']).pdf(age)
b0 = st.norm(dist_locs[("educate",0)]['mean'], dist_locs[("educate",0)]['sd']).pdf(ed)
c0 = st.norm(dist_locs[("income",0)]['mean'], dist_locs[("income",0)]['sd']).pdf(inc)
d0 = pr_vote0
pr_0 = a0 * b0 * c0 * d0 

print(
f'''
    Pr(y == 1| X): {pr_1}
    Pr(y == 0| X): {pr_0}
''')


    Pr(y == 1| X): 5.252583494610646e-06
    Pr(y == 0| X): 4.353899564219851e-06



In [12]:
def num_predict(data,dist_locs):
    ''''''
    store_preds = []
    for i,row in data.iterrows():
        
        # set predictions 
        pr_0 = 1; pr_1 = 1
        #subtract 1 since that is the vote value
        for j in range(0,len(row)-1):
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        pr_0 *= pr_vote0
        pr_1 *= pr_vote1
 
 #       # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])
pred_train = num_predict(train_num,dist_locs)
print(pred_train.head())

#determine accuracy of the predictive model for training data
accuracy_train = sum(train_num.vote == pred_train.pred)/train_num.shape[0]
print(accuracy_train)

           pr_0          pr_1  pred
0  2.925197e-08  1.318409e-05     1
1  6.979789e-07  3.526562e-07     0
2  4.106011e-05  5.140602e-05     1
3  7.269929e-06  6.113275e-05     1
4  2.944359e-05  7.601721e-05     1


0.745

In [16]:
# run by test data
preds_test = num_predict(test_num, dist_locs)

# determine accuracy against test data
acc_test = sum(test_num.vote == preds_test.pred)/test_num.shape[0]
acc_test

0.7225

The numeric predictor is more accurate than the previous binary predictor, and also a coin flip.