In [291]:
#prepare file
import pandas as pd
import numpy as np

In [292]:
#load in data
voteData = pd.read_csv('turnout.csv')

#reset index to be id
voteData = voteData.set_index('id')

#rearrange columns so vote is last
voteData = voteData[['age', 'educate', 'income', 'white', 'vote']]

#display head
voteData.head()

Unnamed: 0_level_0,age,educate,income,white,vote
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,60,14.0,3.3458,1,1
2,51,10.0,1.8561,1,0
3,24,12.0,0.6304,1,0
4,38,8.0,3.4183,1,1
5,25,12.0,2.7852,1,1


In [293]:
#create labels for age (use later)
mylist = voteData['age'].tolist()
labels1 = list(dict.fromkeys(mylist))

#create labels for educate (use later)
mylist2 = voteData['educate'].tolist()
labels2 = list(dict.fromkeys(mylist2))


#place income into bins (1-10)
labels3 = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
for j in voteData.iloc[:,[2]]:
    voteData[j] = pd.cut(voteData[j],bins=len(labels3),labels=labels3)

#create labels for vote (use later)
labels4 = [0, 1]


In [294]:
#display head of data frame with binned income data
voteData.head()

Unnamed: 0_level_0,age,educate,income,white,vote
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,60,14.0,3,1,1
2,51,10.0,2,1,0
3,24,12.0,1,1,0
4,38,8.0,3,1,1
5,25,12.0,2,1,1


In [295]:
#split into Train and Test df
dfTrain = voteData.sample(n=400)
dfTest = voteData.drop(dfTrain.index)

In [296]:
#Display Test head
dfTest.head()

Unnamed: 0_level_0,age,educate,income,white,vote
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,60,14.0,3,1,1
3,24,12.0,1,1,0
4,38,8.0,3,1,1
5,25,12.0,2,1,1
6,67,12.0,2,1,1


In [297]:
#Create function to calculate percentages of each vote outcome
def count(data,colname,label,target):
    '''
    This function takes in data frame & counts occurences of each variable for a specified outcome
    
    Input:
    data: data set to be used
    colname: neme of column we're using to calculate categorical occurences
    label: category the value is in
    target: the outcome we're trying to predict
    
    Output:
    The number of times a column's category coincides with a certain outcome 
    '''
    condition = (data[colname] == label) & (data['vote'] == target)
    return len(data[condition])

In [298]:
#Calculate percentages of each vote outcome
count_0 = count(dfTrain,'vote',0,0)
count_1 = count(dfTrain,'vote',1,1)

#Use counts above to calculate probabilities of each outcome
prob_0 = count_0/len(dfTrain)
prob_1 = count_1/len(dfTrain)

#print probabilities
print(prob_0)
print(prob_1)

0.27
0.73


In [299]:
#consolidate labels created earlier
labels = [labels1, labels2, labels3, labels4]

In [300]:
#create empty dictionary to store probabilities
probabilities = {0:{},1:{}}

In [301]:
#Train data

#Set counter for column number (use to interate through column labels)
counter = 0

#Iterate through columns
for col in dfTrain.columns[:-1]:
        probabilities[0][col] = {}
        probabilities[1][col] = {}
        
        #iterate through column's labels (ie categories)
        for category in labels[counter]:
            
            #calculate counts of each category
            count_ct_0 = count(dfTrain,col,category,0)
            count_ct_1 = count(dfTrain,col,category,1)
            
            #calculate probabilities of each category
            probabilities[0][col][category] = count_ct_0 / count_0
            probabilities[1][col][category] = count_ct_1 / count_1
        
        #move to next set of labels
        counter = counter+1

In [302]:
#display categorical probabilities when vote is 0 (for brevity)
print(probabilities[0])

{'age': {60: 0.009259259259259259, 51: 0.046296296296296294, 24: 0.037037037037037035, 38: 0.018518518518518517, 25: 0.037037037037037035, 67: 0.0, 40: 0.009259259259259259, 56: 0.009259259259259259, 32: 0.037037037037037035, 75: 0.0, 46: 0.009259259259259259, 52: 0.009259259259259259, 22: 0.06481481481481481, 30: 0.0, 69: 0.009259259259259259, 34: 0.009259259259259259, 76: 0.0, 29: 0.027777777777777776, 49: 0.0, 47: 0.018518518518518517, 39: 0.0, 58: 0.0, 44: 0.018518518518518517, 31: 0.018518518518518517, 74: 0.009259259259259259, 27: 0.037037037037037035, 65: 0.018518518518518517, 70: 0.009259259259259259, 54: 0.0, 23: 0.018518518518518517, 84: 0.018518518518518517, 37: 0.009259259259259259, 33: 0.018518518518518517, 57: 0.009259259259259259, 26: 0.037037037037037035, 79: 0.009259259259259259, 53: 0.0, 68: 0.0, 28: 0.009259259259259259, 43: 0.0, 80: 0.018518518518518517, 50: 0.018518518518518517, 59: 0.018518518518518517, 35: 0.018518518518518517, 77: 0.009259259259259259, 48: 0.027

In [303]:
#initialize empty table for test data predictions
predicted = []

In [304]:
#iterate through test data
for row in range(0,len(dfTest)):
        
        #initialize variables for final probabilities
        prod_0 = prob_0
        prod_1 = prob_1
        
        #calculate final probabilities based on calculated probabilities of categories
        for feature in dfTest.columns[:-1]:
            prod_0 *= probabilities[0][feature][dfTest[feature].iloc[row]]
            prod_1 *= probabilities[1][feature][dfTest[feature].iloc[row]]
        
        #Predict the outcome
        if prod_0 > prod_1:
            predicted.append(0)
        else:
            predicted.append(1)

In [305]:
#convert predicted to data frame 
dfPred = pd.DataFrame(predicted)

In [306]:
#re-initialize testing data frame to match indices, and add predictions to it
dfTemp = dfTest.reset_index()
dfTemp['predicted'] = dfPred

In [307]:
#add a column for where the prediction matches the actual value 
dfTemp['Same'] = np.where( dfTemp['vote'] == dfTemp['predicted'] , 1, 0)

In [308]:
#display
dfTemp.head()

Unnamed: 0,id,age,educate,income,white,vote,predicted,Same
0,1,60,14.0,3,1,1,1,1
1,3,24,12.0,1,1,0,0,1
2,4,38,8.0,3,1,1,1,1
3,5,25,12.0,2,1,1,1,1
4,6,67,12.0,2,1,1,1,1


In [309]:
#Count number of times the prediction is accurate, over total num observations
dfTemp[["Same"]].sum()/1600

Same    0.7375
dtype: float64

Per the above output, my model is 73.75% accurate, which is better than a coin flip!