## Nikhila Iyer
### November 14, 2021
### Coding Discussion #5

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF
import warnings # for silencing warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the turnout data from disk and discard the 'id' column.
# NOTE: the path is absolute and specific to one machine; adjust it when running elsewhere.
turnout_csv = "/Users/nikhilaiyer/Documents/GRAD SCHOOL/ppol564/coding_discussions_ppol564_fall2021/05_coding_discussion/turnout.csv"
vote_data = pd.read_csv(turnout_csv)
vote_data = vote_data.drop(columns='id')

In [3]:
# Split the data (train - test)
# Sample 80% of the rows for training. The sampled frame still carries the
# original row labels of vote_data, so dropping those labels leaves exactly
# the 20% of rows that were NOT sampled. The indices are reset only after the
# drop: resetting the training index first would make the drop remove labels
# 0..n_train-1 instead, turning "test" into the tail of the file and letting
# it overlap with the training rows.
# (Pass random_state=<int> to .sample() if a reproducible split is needed.)
train = vote_data.sample(frac=.8)
test = vote_data.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)

# Print off the split count to check
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

# Look at the head of the data
train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,age,educate,income,vote,white
0,33,12.0,4.6493,1,1
1,64,10.0,1.6472,1,1
2,28,12.0,2.917,1,1
3,22,12.0,1.7443,1,1
4,66,14.0,7.5945,0,1


In [4]:
# Predicting multiple observations with binary data
def calc_probs(data, focus_var=""):
    '''
    Estimate class probabilities and per-class conditional probabilities from a dataset.

    Every column other than `focus_var` is treated as a predictor. The conditional
    probability of a predictor being 1 is computed as (column sum) / (rows in the class),
    so the predictors are assumed to be coded 0/1.

    Arguments
    --------------
    data : DataFrame containing the outcome column and the predictor columns
    focus_var : string, name of the outcome column to group by

    Returns
    --------------
    tuple(2):
        class_probs : {outcome value: share of all rows with that outcome}
        cond_probs  : {(predictor name, value, outcome value): probability}, with one
                      entry for value 1 and one for value 0 per predictor and outcome
    '''
    n_rows = data.shape[0]
    predictors = [col for col in data.columns if col != focus_var]

    class_probs, cond_probs = {}, {}
    for outcome, group in data.groupby(focus_var):
        group_size = group.shape[0]
        class_probs[outcome] = group_size / n_rows
        for predictor in predictors:
            # Share of rows in this class where the predictor equals 1;
            # the complement is stored for the value 0.
            share_of_ones = group[predictor].sum() / group_size
            cond_probs[(predictor, 1, outcome)] = share_of_ones
            cond_probs[(predictor, 0, outcome)] = 1 - share_of_ones
    return class_probs, cond_probs

In [5]:
# Estimate the class probabilities and the conditional probabilities of 'white' given 'vote' on the training rows
class_probs, cond_probs = calc_probs(data=train[["vote", "white"]], focus_var="vote")

In [6]:
# Print both probability tables, each under its own heading,
# with an extra blank gap between the two sections
sections = (
    ("class probabilities", class_probs),
    ("conditional probabilities", cond_probs),
)
for position, (heading, table) in enumerate(sections):
    if position:
        print("\n")
    print(heading, end="\n\n")
    pp.pprint(table)

class probabilities

{0: 0.241875, 1: 0.758125}


conditional probabilities

{('white', 0, 0): 0.20155038759689925,
 ('white', 0, 1): 0.11953833470733721,
 ('white', 1, 0): 0.7984496124031008,
 ('white', 1, 1): 0.8804616652926628}


In [7]:
def bin_predict(data, class_probs, cond_probs):
    '''
    Score every row of `data` for class 0 and class 1 and pick the more likely class.

    For each row, the score of a class is the product of the conditional probabilities
    of the observed predictor values given that class, multiplied by the class probability.

    Arguments
    --------------
    data : DataFrame; the first column is skipped (the calls in this file put the
           outcome there) and every remaining column is used as a predictor
    class_probs : dictionary with entries for classes 0 and 1
    cond_probs : dictionary keyed by (predictor name, observed value, class)

    Returns
    --------------
    dataframe: columns "pr_0", "pr_1" (the two scores) and "pred" (the chosen class);
               when the scores are equal the prediction is 1
    '''
    scored_rows = []
    for _, obs in data.iterrows():
        score_0 = score_1 = 1
        # Walk the predictors by position, leaving out the first column
        for name, value in zip(obs.index[1:], obs.values[1:]):
            score_0 *= cond_probs[(name, value, 0)]
            score_1 *= cond_probs[(name, value, 1)]
        score_0 *= class_probs[0]
        score_1 *= class_probs[1]
        chosen = 1 if score_1 >= score_0 else 0
        scored_rows.append([score_0, score_1, chosen])
    return pd.DataFrame(scored_rows, columns=["pr_0", "pr_1", "pred"])

In [9]:
# Score the training rows with the binary model and preview the first rows of the result
preds_bin_table = bin_predict(
    data=train[['vote', 'white']],
    class_probs=class_probs,
    cond_probs=cond_probs,
)
preds_bin_table.head()

Unnamed: 0,pr_0,pr_1,pred
0,0.193125,0.6675,1
1,0.193125,0.6675,1
2,0.193125,0.6675,1
3,0.193125,0.6675,1
4,0.193125,0.6675,1


In [10]:
# Accuracy of the binary model on the training rows:
# number of rows where the prediction equals the observed vote, over the row count
bin_accuracy = (train.vote == preds_bin_table.pred).sum() / len(train)
bin_accuracy

0.758125

In [13]:
# Continuous predictors only (no binary 'white'); 'vote' is kept as the first column
# because the prediction functions skip column 0
train_cont = train.loc[:, ['vote', "age", "educate", "income"]]
# Training rows split by observed outcome, used below for the per-class means and standard deviations
vote0 = train.query("vote == 0")
vote1 = train.query("vote == 1")

In [14]:
# Mean and standard deviation of each continuous variable within each vote class,
# keyed by (variable name, class)
dist_locs = {
    (var_name, outcome): {'mean': subset[var_name].mean(), 'sd': subset[var_name].std()}
    for var_name in ("age", "educate", "income")
    for outcome, subset in ((1, vote1), (0, vote0))
}

pp.pprint(dist_locs)

{('age', 0): {'mean': 42.5968992248062, 'sd': 18.68841293221688},
 ('age', 1): {'mean': 46.04122011541632, 'sd': 16.92630939293045},
 ('educate', 0): {'mean': 10.813953488372093, 'sd': 3.115274358613906},
 ('educate', 1): {'mean': 12.53833470733718, 'sd': 3.251994492862486},
 ('income', 0): {'mean': 2.879895348837209, 'sd': 2.275269279953376},
 ('income', 1): {'mean': 4.275148639736191, 'sd': 2.920399047028736}}


In [15]:
def cont_predict(data, dist_locs, class_probs=None):
    '''
    Score every row of `data` for class 0 and class 1 using normal densities and
    return both scores plus the predicted class.

    For each class, the score is the product of the normal pdf of every predictor
    value (using that class's mean and standard deviation) multiplied by the class
    probability.

    Arguments
    --------------
    data : DataFrame of continuous variables; the first column is skipped (the calls
           in this file put the outcome there) and every remaining column is a predictor
    dist_locs : dictionary keyed by (variable name, class) whose values hold the
                'mean' and 'sd' of that variable within that class
    class_probs : optional dictionary with entries for classes 0 and 1. When omitted,
                  the notebook-level `class_probs` variable is used so that existing
                  two-argument calls keep working.

    Return
    --------------
    dataframe: columns "pr_0", "pr_1" (the two scores) and "pred" (the predicted class);
               when the scores are equal the prediction is 0

    Raises
    --------------
    ValueError: if `class_probs` is not passed and no notebook-level `class_probs` exists
    '''
    if class_probs is None:
        # Backward compatibility: earlier calls relied on a global `class_probs`.
        class_probs = globals().get("class_probs")
        if class_probs is None:
            raise ValueError("class_probs must be passed or defined at notebook level")

    n_rows = data.shape[0]
    likelihood = {0: np.ones(n_rows), 1: np.ones(n_rows)}
    # Evaluate each predictor column in one vectorised pdf call per class,
    # walking the columns by position and leaving out the first one.
    for position in range(1, data.shape[1]):
        name = data.columns[position]
        values = data.iloc[:, position].to_numpy(dtype=float)
        for outcome in (0, 1):
            params = dist_locs[(name, outcome)]
            likelihood[outcome] = likelihood[outcome] * st.norm.pdf(
                values, loc=params['mean'], scale=params['sd'])

    pr_0 = likelihood[0] * class_probs[0]
    pr_1 = likelihood[1] * class_probs[1]
    # Ties go to class 0
    pred = np.where(pr_0 >= pr_1, 0, 1)
    return pd.DataFrame({"pr_0": pr_0, "pr_1": pr_1, "pred": pred})

In [16]:
# Score the training rows with the normal-density model and preview the first ten rows
preds_cont_table = cont_predict(data=train_cont, dist_locs=dist_locs)
preds_cont_table.head(n=10)

Unnamed: 0,pr_0,pr_1,pred
0,7e-05,0.000218,1
1,5e-05,8.4e-05,1
2,7.9e-05,0.00015,1
3,5.2e-05,7.4e-05,1
4,4e-06,7.1e-05,1
5,4e-06,6.2e-05,1
6,1e-05,8e-06,0
7,6.7e-05,0.000231,1
8,2e-06,7.8e-05,1
9,1.9e-05,2.3e-05,1


In [17]:
# Same column layout as train_cont, taken from the test rows
test_cont = test.loc[:, ['vote', "age", "educate", "income"]]

In [19]:
# Accuracy of the continuous model on the training rows:
# number of rows where the prediction equals the observed vote, over the row count
cont_acccuracy = (train.vote == preds_cont_table.pred).sum() / len(train)
cont_acccuracy

0.755625