### PPOL564 | Data Science 1: Foundations | Coding discussion 05
####  Alvaro Altamirano Montoya

#### 1 : Import Libraries, set WD, and load files.

In [42]:
# 1.1 Importing required libraries
import numpy as np, pandas as pd, os
import scipy.stats as st
import pprint as pp # for printing

In [43]:
# 1.2 Set paths and read files
path = r'C:\Users\unily\Documents\Georgetown\PPOL 564 - Intro to Data Science\Coding discussions\5'
os.chdir(path) # Set WD
# 1.3 Load and split data in training and test
# Set seed
np.random.seed(1234)
# read in data
vt_data = pd.read_csv("turnout.csv")
# Train-Test split (just using Pandas)
train = vt_data.sample(frac=.8).reset_index(drop=True)
test = vt_data.drop(train.index).reset_index(drop=True)
print('training: '+str(train.shape[0]), '; test: '+str(test.shape[0]))

training: 1600 ; test: 400


#### Calculate Class Probabilities: $Pr(class)$

In [44]:
N = train.shape[0]
# Subset the data by class
vt1 = train.query("vote == 1")
vt0 = train.query("vote == 0")
# Calculate the probability for each class
pr_vt_1 = vt1.shape[0]/N
pr_vt_0 = vt0.shape[0]/N
# Print the probabilities
print(
f"""
Pr(vote = 1): {pr_vt_1}
Pr(vote = 0): {pr_vt_0}
""")


Pr(vote = 1): 0.7425
Pr(vote = 0): 0.2575



#### Conditional probabilities function

In [45]:
def calc_probs(data,outcome_var=""):
    '''
    calc_probs():
    This function calculates the class and conditional probabilities for the dummy variables. 
    -------------------------------------------------------------------------------------------
    Arguments:
    The prior probabilities based on the dimensions of variables/outcomes within original dataset.
    -------------------------------------------------------------------------------------------
    return:
    Dictionary objects with three key/values sets.
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v != outcome_var]
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs

# Run
class_probs, cond_probs = calc_probs(train,outcome_var="vote")
# Print
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n" + "conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.2575, 1: 0.7425}

conditional probabilities

{('age', 0, 0): -41.601941747572816,
 ('age', 0, 1): -45.32491582491583,
 ('age', 1, 0): 42.601941747572816,
 ('age', 1, 1): 46.32491582491583,
 ('educate', 0, 0): -9.62864077669903,
 ('educate', 0, 1): -11.558922558922559,
 ('educate', 1, 0): 10.62864077669903,
 ('educate', 1, 1): 12.558922558922559,
 ('id', 0, 0): -1052.5606796116506,
 ('id', 0, 1): -971.0942760942761,
 ('id', 1, 0): 1053.5606796116506,
 ('id', 1, 1): 972.0942760942761,
 ('income', 0, 0): -1.7381618932038831,
 ('income', 0, 1): -3.2294619528619526,
 ('income', 1, 0): 2.738161893203883,
 ('income', 1, 1): 4.229461952861953,
 ('white', 0, 0): 0.20145631067961167,
 ('white', 0, 1): 0.13131313131313127,
 ('white', 1, 0): 0.7985436893203883,
 ('white', 1, 1): 0.8686868686868687}


#### Class prediction. Create conditional probabilities and predict using bayes formula. 

In [46]:
# Calculate Class Probabilities
y1 = train.query("vote == 1")
y0 = train.query("vote == 0")
#del y1['vote'], y0['vote']
# Class probabilities.
pr_y1 = y1.shape[0]/train.shape[0]
pr_y0 = y0.shape[0]/train.shape[0]
# Collect the mean and standard dev. of each conditional distribution
dist_locs = \
{("income",1):{'mean':y1.income.mean(),'sd':y1.income.std()},
 ("income",0):{'mean':y0.income.mean(),'sd':y0.income.std()},
  ("age",1):{'mean':y1.age.mean(),'sd':y1.age.std()},
 ("age",0):{'mean':y0.age.mean(),'sd':y0.age.std()},
   ("educate",1):{'mean':y1.educate.mean(),'sd':y1.educate.std()},
 ("educate",0):{'mean':y0.educate.mean(),'sd':y0.educate.std()},
}
# Print
pp.pprint(dist_locs)

{('age', 0): {'mean': 42.601941747572816, 'sd': 19.147825402160812},
 ('age', 1): {'mean': 46.32491582491583, 'sd': 16.924844588853716},
 ('educate', 0): {'mean': 10.62864077669903, 'sd': 3.304381091983527},
 ('educate', 1): {'mean': 12.558922558922559, 'sd': 3.295714127444309},
 ('income', 0): {'mean': 2.7381618932038836, 'sd': 2.2429913729337625},
 ('income', 1): {'mean': 4.229461952861947, 'sd': 2.8482089910676964}}


In [47]:
#Calculating the conditional probability for the binary variable
#Given vote = 0 
w1v0 = y0.query("white == 1").shape[0]/y0.shape[0]
w0v0 = y0.query("white == 0").shape[0]/y0.shape[0]
#Given vote = 1
w1v1 = y1.query("white == 1").shape[0]/y1.shape[0]
w0v1 = y1.query("white == 0").shape[0]/y1.shape[0]

In [48]:
# Created copies of both train and tests without the dependent variable
train_xs = train.copy()
test_xs = test.copy()
train_xs = train_xs[['white','age','income','educate']]
test_xs = test_xs[['white','age','income','educate']]

In [49]:
def predict(data,dist_locs):
    '''
    Predict():
    Multiplies the conditional probabilities of the continous variables with the conditional probability of the discrete variable for each individual respondent. Then, this is 
    multiplied with each class probability. 
    -------------------------------------------------------------------------------------------------------------------------
    Arguments: 
    A dataframe and a dictionary object with the conditional probabilities.
    -------------------------------------------------------------------------------------------------------------------------
    Return Value: 
    A classification of class by individual, within a dataframe. 
    ------------------------------------------------------------------------------------------------------------------------
    '''    
    store_preds = []
    for i,row in data.iterrows():
        
        # Get the predictions using a Gaussian distribution
        pr_0 = 1; pr_1 = 1
        for j in range(1,len(row)):
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        
        # Get predictions for dummy variable using conditional probabilities from previuos exercise
        if(row["white"]==0):
            pr_0*=w0v0 # given vote = 0
        else:
            pr_0*=w1v0
        if(row["white"]==0):
            pr_1*=w0v1 # given vote = 1
        else:
            pr_1*=w1v1
            
        #Multiplying with the class probability 
        pr_0 *= pr_y0
        pr_1 *= pr_y1
        
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])

#### Prediction metrics and discussion

In [50]:
# Accuracy metric for train data
preds_train = predict(train_xs,dist_locs)
accuracy_train = sum(train.vote == preds_train.pred)/train.shape[0]
print("Accuracy metric for train data: " + str(accuracy_train*100)[0:4]+'%')

Accuracy metric for train data: 73.6%


In [51]:
# Accuracy metric for test data
preds_test = predict(test_xs, dist_locs)
accuracy_test = sum(test.vote == preds_test.pred)/test.shape[0]
print("Accuracy metric for train data: "+ str(accuracy_test*100)[0:4]+'%')

Accuracy metric for train data: 71.5%


#### In terms of performance metrics, the accuracy coefficient is greater for the training set than for the test set, although by a small percentage difference (73.6% vs 71.5%). Result = ~72% > ~50% (coin flip).