# Coding Discussion 5
### Colette Yeager

In [144]:
# Import packages
import numpy as np
import pandas as pd
import scipy.stats as st

In [145]:
# Read in data and set training and test datasets

# Set seed
np.random.seed(1234)

# Read in data
data = pd.read_csv("/Users/coletteyeager/coding_discussions_ppol564_fall2021/05_coding_discussion/turnout.csv")

# Train-Test split
train = data.sample(frac=.8).reset_index(drop=True)
test = data.drop(train.index).reset_index(drop=True)

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

# Look at the head of the data
train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,id,age,educate,income,vote,white
0,1749,78,16.0,1.3131,1,1
1,935,72,3.0,0.6765,1,0
2,1034,64,10.0,1.3131,1,1
3,244,80,8.0,1.1839,1,1
4,929,19,14.0,2.9072,1,1


In [146]:
# Calculate vote probabilities

# Query data
v1 = train.query("vote == 1")
v0 = train.query("vote == 0")

# Vote probabilities
pr_v1 = v1.shape[0]/train.shape[0]
pr_v0 = v0.shape[0]/train.shape[0]
class_probs = [pr_v0, pr_v1]
class_probs

[0.2575, 0.7425]

In [147]:
# Subset into binary and continuous variable datasets
binary = train[['vote', 'white']]
continuous = train[['vote', 'age', 'educate', 'income']]

In [148]:
# Binary calculation

def calc_probs(data, outcome_var = ""):
    '''
    This function calculates the conditional probabilities for the binary variable.
    
    Arguments
    ---------
    data: DataFrame
        A data set containing the binary variables 
    outcome_var: str
        A string with the name of the variable being looked at
    
    Return
    ------
    cond_probs: dictionary
        A dictionary containing the conditional probabilities for the binary variables
    '''
    
    # Create empty dictionary
    cond_probs = {}
    # Get non-outcome variables from the data
    vars = [v for v in binary.columns if v!= outcome_var]
    for y, d in data.groupby(outcome_var):
        for v in vars:
            # Get conditinoal probabilities
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v, 1, y)] = pr
            cond_probs[(v, 0, y)] = 1 - pr
    return cond_probs

In [149]:
# Get the conditional probabilities and print them out
cond_probs = calc_probs(binary, "vote")
cond_probs

{('white', 1, 0): 0.7985436893203883,
 ('white', 0, 0): 0.20145631067961167,
 ('white', 1, 1): 0.8686868686868687,
 ('white', 0, 1): 0.13131313131313127}

In [150]:
# Get the mean and standard deviation for each continuous variable
vars = [v for v in continuous.columns if v != "vote"] # Collect each variable
dist_locs = {} # Create empty dictionary
for v in vars:
    dist_locs.update({(v, 1): {'mean': v1[v].mean(), 'sd': v1[v].std()}})
    dist_locs.update({(v, 0): {'mean': v0[v].mean(), 'sd': v0[v].std()}})
    
dist_locs

{('age', 1): {'mean': 46.32491582491583, 'sd': 16.924844588853716},
 ('age', 0): {'mean': 42.601941747572816, 'sd': 19.147825402160812},
 ('educate', 1): {'mean': 12.558922558922559, 'sd': 3.295714127444309},
 ('educate', 0): {'mean': 10.62864077669903, 'sd': 3.304381091983527},
 ('income', 1): {'mean': 4.229461952861947, 'sd': 2.8482089910676964},
 ('income', 0): {'mean': 2.7381618932038836, 'sd': 2.2429913729337625}}

In [156]:
def predict(data, dist_locs, cond_probs, class_probs):
    '''
    This function calculates the conditional probabilities for the continuous variables.
    
    Arguments
    ---------
    data: DataFrame
        A data set containing the continuous variables 
    dist_locs: Dictionary
        A dictionary containing the mean and standard deviation for each continuous variable option
    cond_probs: Dictionary
        A dictionary containing the conditional probability for each binary variable
    class_probs: List
        A list containing the class probabilities
    
    Return
    ------
    pred: DataFrame
        A data set containing the conditional probabilities and predicted outcome
    '''
    
    store_preds = []
    for i,row in data.iterrows():
        # Get the predictions using a Gaussan distribution
        pr_0 = 1; pr_1 = 1
        for j in range(1, 4): #Subset to only look at continuous variables
            # Multiply all the continuous probailities
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        # Multiply by binary probabilities
        pr_0 *= cond_probs['white', row['white'], 0]
        pr_1 *= cond_probs['white', row['white'], 1]
        # Multiple by class proabilities
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
        # Turn to DataFrame
        pred = pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])
        
    return pred

In [157]:
# Run prediction function on training data
preds_train = predict(train, dist_locs, cond_probs, class_probs)
preds_train.head()

Unnamed: 0,pr_0,pr_1,pred
0,3.632617e-06,1.535617e-05,1
1,3.258162e-07,8.439931e-08,0
2,3.954328e-05,6.544042e-05,1
3,7.829549e-06,7.723286e-06,0
4,2.550202e-05,5.713713e-05,1


In [158]:
# Compare prediction to actual
accuracy_train = sum(train.vote == preds_train.pred)/train.shape[0]
accuracy_train

0.736875

In [160]:
# Run prediction functino on test data
preds_test = predict(test,dist_locs, cond_probs, class_probs)
preds_test.head()

Unnamed: 0,pr_0,pr_1,pred
0,4e-06,1.5e-05,1
1,2e-05,0.000179,1
2,3e-05,6.8e-05,1
3,7.3e-05,0.000187,1
4,2e-06,4e-05,1


In [161]:
# Compare prediction to actual
accuracy_test = sum(test.vote == preds_test.pred)/test.shape[0]
accuracy_test

0.715

From the training data, we have a predictive accuracy of 73.68%. From the test data, we have a predictive accuracy of 71.5%. This is definitely better than a coin flip, which has a 50% accuracy. 