In [1]:
import pandas as pd
import numpy as np
import pprint as pp # for printing

# Silence warnings 
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Set seed
np.random.seed(1234)

In [3]:
# read in data
data = pd.read_csv("/Users/anandigupta/Desktop/coding_discussions_ppol564_fall2021/05_coding_discussion/turnout.csv")

# Train-Test split (just using Pandas)
train = data.sample(frac=.8).reset_index(drop=True)
test = data.drop(train.index).reset_index(drop=True)

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

# Look at the head of the data
train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,id,age,educate,income,vote,white
0,1749,78,16.0,1.3131,1,1
1,935,72,3.0,0.6765,1,0
2,1034,64,10.0,1.3131,1,1
3,244,80,8.0,1.1839,1,1
4,929,19,14.0,2.9072,1,1


In [4]:
#Calculate class probabilities

N = train.shape[0]


In [5]:
# Bin continuous variables into 25% quartiles 
train['age_qt'] = pd.qcut(train['age'], 4, labels=['bottom 25', 'lower 25', 'upper 25', 'top 25'], duplicates='drop')
train['educate_qt'] = pd.qcut(train['educate'], 4, labels=['bottom 25', 'lower 25', 'upper 25', 'top 25'],duplicates='drop')
train['income_qt'] = pd.qcut(train['income'], 4, labels=['bottom 25', 'lower 25', 'upper 25', 'top 25'], duplicates='drop')

# Select data for modeling
train=train[['vote', 'age_qt', 'educate_qt', 'income_qt', 'white']]


In [6]:
# Subset the data by class
v1 = train.query("vote == 1")
v0 = train.query("vote == 0")

# Calculate the probability for each class
pr_v_1 = v1.shape[0]/N
pr_v_0 = v0.shape[0]/N

# Print the probabilities
print(
f"""
Pr(vote = 1): {pr_v_1}
Pr(vote = 0): {pr_v_0}
""")


Pr(vote = 1): 0.7425
Pr(vote = 0): 0.2575



In [7]:
train.head()

Unnamed: 0,vote,age_qt,educate_qt,income_qt,white
0,1,top 25,top 25,bottom 25,1
1,1,top 25,bottom 25,bottom 25,0
2,1,top 25,bottom 25,bottom 25,1
3,1,top 25,bottom 25,bottom 25,1
4,1,bottom 25,upper 25,lower 25,1


In [8]:
def calc_probs(data,outcome_var=""):
    '''
    Function calculates the class and conditional probabilities in 
    the binary data. 
    
    Note that I'm using dictionaries with tuple keys to keep
    track of the variable, it's val, and the outcome, which we're conditioning on. 
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    
    #calculate class probabilities as v1 or v2 dimensions/full data dimensions
    class_probs[1] = v1.shape[0]/N
    class_probs[0] = v0.shape[0]/N
    
    # Calculate the Conditional Probabilities $Pr(data | class)$  
    
    #given vote = 1
    cond_probs['white', 1, 1] = v1.query("white == 1").shape[0]/v1.shape[0]
    cond_probs['white', 0, 1] = v1.query("white == 0").shape[0]/v1.shape[0]
    
    #given vote = 0
    cond_probs['white', 0, 0] = v0.query("white == 1").shape[0]/v0.shape[0]
    cond_probs['white', 1, 0] = v0.query("white == 0").shape[0]/v0.shape[0]

    var_names = ['age_qt', 'educate_qt', 'income_qt']
    
    for v in var_names:
        #given vote = 1
        cond_probs[v, 'bottom 25', 1] = v1.loc[v1[v] == "bottom 25"].shape[0]/v1.shape[0]
        cond_probs[v, 'lower 25', 1] = v1.loc[v1[v] == "lower 25"].shape[0]/v1.shape[0]
        cond_probs[v, 'upper 25', 1] = v1.loc[v1[v] == "upper 25"].shape[0]/v1.shape[0]
        cond_probs[v, 'top 25', 1] = v1.loc[v1[v] == "top 25"].shape[0]/v1.shape[0]
        
        #given vote = 0
        cond_probs[v, 'bottom 25', 0] = v0.loc[v0[v] == "bottom 25"].shape[0]/v0.shape[0]
        cond_probs[v, 'lower 25', 0] = v0.loc[v0[v] == "lower 25"].shape[0]/v0.shape[0]
        cond_probs[v, 'upper 25', 0] = v0.loc[v0[v] == "upper 25"].shape[0]/v0.shape[0]
        cond_probs[v, 'top 25', 0] = v0.loc[v0[v] == "top 25"].shape[0]/v0.shape[0]

    return class_probs, cond_probs


# Run
class_probs, cond_probs = calc_probs(train,outcome_var="vote")

# Print
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.2575, 1: 0.7425}


conditional probabilities

{('age_qt', 'bottom 25', 0): 0.3737864077669903,
 ('age_qt', 'bottom 25', 1): 0.20875420875420875,
 ('age_qt', 'lower 25', 0): 0.2354368932038835,
 ('age_qt', 'lower 25', 1): 0.255050505050505,
 ('age_qt', 'top 25', 0): 0.24757281553398058,
 ('age_qt', 'top 25', 1): 0.24915824915824916,
 ('age_qt', 'upper 25', 0): 0.14320388349514562,
 ('age_qt', 'upper 25', 1): 0.28703703703703703,
 ('educate_qt', 'bottom 25', 0): 0.41019417475728154,
 ('educate_qt', 'bottom 25', 1): 0.20454545454545456,
 ('educate_qt', 'lower 25', 0): 0.3859223300970874,
 ('educate_qt', 'lower 25', 1): 0.3787878787878788,
 ('educate_qt', 'top 25', 0): 0.08009708737864078,
 ('educate_qt', 'top 25', 1): 0.2415824915824916,
 ('educate_qt', 'upper 25', 0): 0.12378640776699029,
 ('educate_qt', 'upper 25', 1): 0.1750841750841751,
 ('income_qt', 'bottom 25', 0): 0.41019417475728154,
 ('income_qt', 'bottom 25', 1): 0.19528619528619529,
 ('income_qt', 'l

In [9]:
def predict(data,class_probs,cond_probs):
    '''
    Function calculates the conditional probability for membership into each class.
    Then returns both the probabilities and the most likely class. 
    '''
    store_preds = []
    for i,row in data.iterrows():
        pr_1 = 1; pr_0 = 1
        for j in range(1,len(row.index)):

            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        store_preds.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

# Run 
preds = predict(train, class_probs, cond_probs)
preds.head(10)

Unnamed: 0,pr_0,pr_1,pred
0,0.000422,0.007582,1
1,0.008566,0.00097,0
2,0.002161,0.006419,1
3,0.002161,0.006419,1
4,0.00067,0.005834,1
5,0.002161,0.006419,1
6,0.002089,0.012622,1
7,0.000434,0.00805,1
8,0.000651,0.010296,1
9,0.000513,0.00641,1


In [10]:
#calculate predictive accuracy as sum of obs where training vote = predicted value/N

accuracy = sum(train.vote == preds.pred)/train.shape[0]
accuracy

0.723125

In [11]:
#Repeat initial cleaning for test data

# Bin continuous variables into 25% quartiles 
test['age_qt'] = pd.qcut(test['age'], 4, labels=['bottom 25', 'lower 25', 'upper 25', 'top 25'], duplicates='drop')
test['educate_qt'] = pd.qcut(test['educate'], 4, labels=['bottom 25', 'lower 25', 'upper 25', 'top 25'],duplicates='drop')
test['income_qt'] = pd.qcut(test['income'], 4, labels=['bottom 25', 'lower 25', 'upper 25', 'top 25'], duplicates='drop')

# Select data for testing model on
test=test[['vote', 'age_qt', 'educate_qt', 'income_qt', 'white']]

In [12]:
#calculate predictive accuracy for test data

test_preds = predict(test, class_probs, cond_probs)
test_accuracy = sum(test.vote == test_preds.pred)/test.shape[0]
test_accuracy

0.6975

After converting our continuous data into categorical bins, the model predicts whether or not an individual will vote based on their age, education, income, and race with 72% accuracy for the training data and 70% accuracy for the test data (which is indeed better than a coin flip which would be accurate about 50% of the time).