In [22]:
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

# Silence warnings 
import warnings
warnings.filterwarnings("ignore")

In [71]:
turnout = pd.read_csv('/Users/davidlopez/Desktop/PPOL564/coding_discussions_ppol564_fall2021/05_coding_discussion/turnout.csv')

# Rearrange columns of dataset
turnout = turnout[['vote', 'white', 'age', 'educate', 'income']]

# Set seed
np.random.seed(1234)

# Train-Test split (just using Pandas)
train = turnout.sample(frac=.8).reset_index(drop=True)
test = turnout.drop(train.index).reset_index(drop=True)

# Create subsets for binary data
train_dummies = train[['vote','white']]
test_dummies = test[['vote', 'white']]

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

# Look at the head of the data
train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,vote,white,age,educate,income
0,1,1,78,16.0,1.3131
1,1,0,72,3.0,0.6765
2,1,1,64,10.0,1.3131
3,1,1,80,8.0,1.1839
4,1,1,19,14.0,2.9072


In [72]:
N = train.shape[0]


# Subset the data by class
vote1 = train.query("vote == 1")
vote0 = train.query("vote == 0")

# Calculate the probability for each class
pr_vote1 = vote1.shape[0]/N
pr_vote0 = vote0.shape[0]/N

# Print the probabilities
print(
f"""
Pr(vote = 1): {pr_vote1}
Pr(vote = 0): {pr_vote0}
""")



Pr(vote = 1): 0.7425
Pr(vote = 0): 0.2575



In [86]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities in 
    the binary data. 
    
    Input: dataframe and y variable
    Output: dictionaries for probabilities of each class and conditional variable
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs


# Run
class_probs, cond_probs = calc_probs(train,outcome_var="vote")

# Print
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.2575, 1: 0.7425}


conditional probabilities

{('age', 0, 0): -41.601941747572816,
 ('age', 0, 1): -45.32491582491583,
 ('age', 1, 0): 42.601941747572816,
 ('age', 1, 1): 46.32491582491583,
 ('educate', 0, 0): -9.62864077669903,
 ('educate', 0, 1): -11.558922558922559,
 ('educate', 1, 0): 10.62864077669903,
 ('educate', 1, 1): 12.558922558922559,
 ('income', 0, 0): -1.7381618932038831,
 ('income', 0, 1): -3.2294619528619526,
 ('income', 1, 0): 2.738161893203883,
 ('income', 1, 1): 4.229461952861953,
 ('white', 0, 0): 0.20145631067961167,
 ('white', 0, 1): 0.13131313131313127,
 ('white', 1, 0): 0.7985436893203883,
 ('white', 1, 1): 0.8686868686868687}


In [75]:
def predict(data,class_probs,cond_probs):
    '''
    Function calculates the conditional probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    
    Input: dataframe, dictionaries from calc_probs function
    output: dataframe with 
    '''
    store_preds = []
    for i,row in data.iterrows():
        pr_1 = 1; pr_0 = 1
        for j in range(1,len(row.index)):
            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        store_preds.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

# Run 
preds = predict(train_dummies, class_probs, cond_probs)
preds.head()

Unnamed: 0,pr_0,pr_1,pred
0,0.205625,0.645,1
1,0.051875,0.0975,1
2,0.205625,0.645,1
3,0.205625,0.645,1
4,0.205625,0.645,1


In [87]:
# Calculate predictive accuracy for training data

train_accuracy = sum(train.vote == preds.pred)/train.shape[0]
train_accuracy

0.7425

In [81]:
# Calculate accuracy for test data

test_preds = predict(test_dummies, class_probs, cond_probs)
test_accuracy = sum(test_dummies.vote == test_preds.pred)/test.shape[0]

test_accuracy

0.7

In [89]:
# Collect means and std devs for each conditional distribution

dist_locs = \
{("age",1):{'mean':vote1.age.mean(),'sd':vote1.age.std()},
 ("age",0):{'mean':vote0.age.mean(),'sd':vote0.age.std()},
 ("educate",1):{'mean':vote1.educate.mean(),'sd':vote1.educate.std()},
 ("educate",0):{'mean':vote0.educate.mean(),'sd':vote0.educate.std()},
 ("income",1):{'mean':vote1.income.mean(),'sd':vote1.income.std()},
 ("income",0):{'mean':vote0.income.mean(),'sd':vote0.income.std()}
}

# Print
pp.pprint(dist_locs)

{('age', 0): {'mean': 42.601941747572816, 'sd': 19.147825402160812},
 ('age', 1): {'mean': 46.32491582491583, 'sd': 16.924844588853727},
 ('educate', 0): {'mean': 10.62864077669903, 'sd': 3.3043810919835317},
 ('educate', 1): {'mean': 12.558922558922559, 'sd': 3.2957141274443202},
 ('income', 0): {'mean': 2.738161893203883, 'sd': 2.242991372933763},
 ('income', 1): {'mean': 4.229461952861953, 'sd': 2.848208991067699}}


In [96]:
# Prediction for the 1 class
a = st.norm(dist_locs[("age",1)]['mean'], dist_locs[("age",1)]['sd']).pdf(age)
b = st.norm(dist_locs[("educate",1)]['mean'], dist_locs[("educate",1)]['sd']).pdf(educate)
c = st.norm(dist_locs[("income",1)]['mean'], dist_locs[("income",1)]['sd']).pdf(income)

d = pr_vote1

pr_1 = a * b * c * d

# Prediction for the 0 class
a = st.norm(dist_locs[("age",0)]['mean'], dist_locs[("age",0)]['sd']).pdf(age)
b = st.norm(dist_locs[("educate",0)]['mean'], dist_locs[("educate",0)]['sd']).pdf(educate)
c = st.norm(dist_locs[("income",0)]['mean'], dist_locs[("income",0)]['sd']).pdf(income)

d = pr_vote0
pr_0 = a * b * c * d

print(
f'''
    Pr(y == 1| X): {pr_1}
    Pr(y == 0| X): {pr_0}
''')


    Pr(y == 1| X): 6.427332059133578e-07
    Pr(y == 0| X): 1.617304556289908e-06



In [101]:
train_conts = train[['vote', 'age', 'educate', 'income']]
test_conts = test[['vote', 'age', 'educate', 'income']]

def predict(data,dist_locs):
    '''
    Function calculates the conditional probability for continuous predictors.
    Then returns both the probabilities and the most likely class. 
    '''
    store_preds = []
    for i,row in data.iterrows():
        
        # Get the predictions using a Gaussan distribution
        pr_0 = 1; pr_1 = 1
        for j in range(1,len(row)):
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        pr_0 *= pr_vote0
        pr_1 *= pr_vote1
        
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

# Run
preds_train = predict(train_conts,dist_locs)

preds_train.head(10)


Unnamed: 0,pr_0,pr_1,pred
0,5e-06,1.767746e-05,1
1,2e-06,6.427332e-07,0
2,5e-05,7.533258e-05,1
3,1e-05,8.89076e-06,0
4,3.2e-05,6.577414e-05,1
5,3e-06,1.418703e-06,0
6,6e-05,8.628565e-05,1
7,1.8e-05,6.307415e-05,1
8,4.2e-05,0.0001025867,1
9,4.3e-05,0.000149839,1


In [102]:
accuracy_train = sum(train.vote == preds_train.pred)/train.shape[0]
accuracy_train

0.74375

In [105]:
preds_test = predict(test_conts,dist_locs)

accuracy_test = sum(test.vote == preds_test.pred)/test.shape[0]
accuracy_test

0.7225

### Findings

From the results of predictive accuracy of 0.7 and 0.7225 we can observe that the model performs better than a coin flip in predicting probablity of voting.