In [1]:
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF 
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
turnout = pd.read_csv("turnout.csv") #reading in the dataset
turnout.head()

Unnamed: 0,id,age,educate,income,vote,white
0,1,60,14.0,3.3458,1,1
1,2,51,10.0,1.8561,0,1
2,3,24,12.0,0.6304,0,1
3,4,38,8.0,3.4183,1,1
4,5,25,12.0,2.7852,1,1


In [3]:
np.random.seed(42)
train = turnout.sample(frac=.8).reset_index(drop=True) #splitting 80% dataset for training
train.head()

Unnamed: 0,id,age,educate,income,vote,white
0,1861,54,10.0,1.2659,1,1
1,354,44,16.0,3.1676,1,0
2,1334,45,14.0,5.178,1,0
3,906,80,8.0,1.261,0,1
4,1290,26,14.0,7.0281,0,1


In [4]:
test = turnout.drop(train.index).reset_index(drop=True)# and the rest 20% for testing
test.head()

Unnamed: 0,id,age,educate,income,vote,white
0,1601,31,16.0,3.9394,1,0
1,1602,53,12.0,6.3352,1,1
2,1603,65,12.0,0.8284,0,1
3,1604,34,12.0,3.3834,1,1
4,1605,28,12.0,8.7545,0,1


In [5]:
print("No of observations in training data:", train.shape[0], "\nNo of observations in test data:", test.shape[0])

No of observations in training data: 1600 
No of observations in test data: 400


## Naive Bayes Classifier

$$Pr(class | data) = Pr( x_1| class)\times Pr( x_2| class) \times \dots \times  Pr(class)$$

### APPROACH: Following the Naive Bayes Classifier approach, we will mulitpy the conditional probabilities of both discrete and continuous variables with each other and then multipy the product with class proability to arrive at the predictions

### Conditional Probability of discrete variables

#### Class Probabilities - we calculate the probability of someone voting and not voting in the training dataset

In [6]:
vote_0 = train.query("vote == 0") #pulling observations of people who didn't vote
vote_1 = train.query("vote == 1") #pulling observations of people who voted

N = train.shape[0]

#Calculating class probability
pr_vote_0 = vote_0.shape[0]/N  
pr_vote_1 = vote_1.shape[0]/N

#Printing the class probabilities that we calculated
print(
f"""
Pr(vote = 1): {pr_vote_1}
Pr(vote = 0): {pr_vote_0}
""")


Pr(vote = 1): 0.75125
Pr(vote = 0): 0.24875



#### Conditional Probabilities

In [7]:

white1_vote1 = vote_1.query("white==1").shape[0]/ vote_1.shape[0] #probaility of a person being white given that they vote
white0_vote1 = vote_1.query("white==0").shape[0]/ vote_1.shape[0] #probaility of a person not being white given that they vote

#probaility of a person being white given that they didn't vote
white1_vote0 = vote_0.query("white==1").shape[0]/ vote_0.shape[0]

#probaility of a person not being white given that they didn't vote
white0_vote0 = vote_0.query("white==0").shape[0]/ vote_0.shape[0] 

print(
f"""
Pr(white = 1 |vote = 1): {white1_vote1}
Pr(white = 0 |vote = 1): {white0_vote1}

Pr(white = 1 |vote = 0): {white1_vote0}
Pr(white = 0 |vote = 0): {white0_vote0}
""")


Pr(white = 1 |vote = 1): 0.8743760399334443
Pr(white = 0 |vote = 1): 0.12562396006655574

Pr(white = 1 |vote = 0): 0.7889447236180904
Pr(white = 0 |vote = 0): 0.21105527638190955



#### Predicting Multiple observations

In [8]:
# We generate dictionaries for class and conditional probabilities

def calc_probs(data, var="vote"):
    '''
    Function calculates the class and conditional probabilities in 
    the binary data. 
    
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}

    # iterate through the class outcomes
    for v_0, v_1 in data.groupby("vote"): 
        # calculate the class probabilities
        class_probs.update({v_0: v_1.shape[0]/data.shape[0]})
        pr = v_1["white"].sum()/v_1.shape[0]
        cond_probs[("white",1,v_0)] = pr 
        cond_probs[("white",0,v_0)] = 1 - pr
    return class_probs, cond_probs


# storing the function output into class_probs and cond_probs objects
class_probs, cond_probs = calc_probs(train,"vote")

# Printing class and conditional probabilities
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.24875, 1: 0.75125}


conditional probabilities

{('white', 0, 0): 0.21105527638190957,
 ('white', 0, 1): 0.12562396006655574,
 ('white', 1, 0): 0.7889447236180904,
 ('white', 1, 1): 0.8743760399334443}


In [10]:
def predict(data,cond_probs):
    '''
    Function calculates the conditional probability for membership into each class.
    '''
    store_preds = []
    for i,row in data.iterrows():
        
        pr_0 = cond_probs[('white',row.white,0)] 
        pr_1 = cond_probs[('white',row.white,1)]  
        
        store_preds.append([pr_0,pr_1])
    return pd.DataFrame(store_preds,columns=["pr_d_0","pr_d_1"])

#calculating the conditional probabilitie for the discrete variable on the training data set 
preds_discrete = predict(train, cond_probs)
preds_discrete.head()

Unnamed: 0,pr_d_0,pr_d_1
0,0.788945,0.874376
1,0.211055,0.125624
2,0.211055,0.125624
3,0.788945,0.874376
4,0.788945,0.874376


### Continuous variables on our dataset- age, educate, income

In [17]:
# Collect the mean and standard dev. of each conditional distribution
dist_locs = \
{("age",1):{'mean':vote_1.age.mean(),'sd':vote_1.age.std()},
 ("age",0):{'mean':vote_0.age.mean(),'sd':vote_0.age.std()},
 ("educate",1):{'mean':vote_1.educate.mean(),'sd':vote_1.educate.std()},
 ("educate",0):{'mean':vote_0.educate.mean(),'sd':vote_0.educate.std()},
 ("income",1):{'mean':vote_1.income.mean(),'sd':vote_1.income.std()},
 ("income",0):{'mean':vote_0.income.mean(),'sd':vote_0.income.std()}
}

pp.pprint(dist_locs)

{('age', 0): {'mean': 42.37185929648241, 'sd': 19.006625175567997},
 ('age', 1): {'mean': 46.44925124792013, 'sd': 16.936758606996662},
 ('educate', 0): {'mean': 10.825376884422111, 'sd': 3.1563432966353564},
 ('educate', 1): {'mean': 12.517054908485857, 'sd': 3.2786113780338866},
 ('income', 0): {'mean': 2.8102128140703533, 'sd': 2.2937124906875623},
 ('income', 1): {'mean': 4.254539267886853, 'sd': 2.942782307111688}}


In [12]:
def predict_cont(data,dist_locs):
    '''
    This function calculates the product of conditional probabilities of continuous variables assuming normal distribution based 
    on their descriptive statistics.
    '''
    store_preds = []
    
    #for each row, we multiply the conditional probabilties corresponding to the three continuous variables and store them.
    for i,row in data.iterrows():
        
        # Get the predictions using a Gaussan distribution
        pr_c_0 = 1; pr_c_1 = 1
        for j in [1,2,3]: #1,2,3 are the column indices of the continuous variables in our dataset
            
            #we get the probability density correspoinding to a variable's value from a normal distribution 
            #based on the variable's mean and standard devaition
            pr_c_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_c_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
            

        store_preds.append([pr_c_0,pr_c_1])
        
    return pd.DataFrame(store_preds,columns=["pr_c_0","pr_c_1"])

# Conditional probabilities of the continuous variables on the training data set 
preds_continuous = predict_cont(train,dist_locs)
preds_continuous.head()

Unnamed: 0,pr_c_0,pr_c_1
0,0.000295,0.000156
1,0.000118,0.000204
2,0.000162,0.000333
3,3.5e-05,1.3e-05
4,3.5e-05,0.000109


In [13]:
#Now that we have the conditional probabilities of each obeservations based on a class , 
#we can calcuate the probabilty of observing a class, given the data 
#by multiplying the condtitional probabilities with each other and then with the class probability.

def join_cont_dis(dataset1, dataset2):
    data_set = dataset1.join(dataset2) #Joining all the conditional probabilties.
    #multiplying the conditional probabilities with class probability
    data_set['pr_0'] = data_set['pr_d_0'] * data_set['pr_c_0'] * class_probs[0] 
    data_set['pr_1'] = data_set['pr_d_1'] * data_set['pr_c_1'] * class_probs[1]
    data_set['final_pred'] = [1 if row['pr_1'] > row['pr_0'] else 0 for index, row in data_set.iterrows()]
    
    return (data_set)

#This dataset just shows the conditional probability of the discrete variable given a class and 
#the product of the conditional probabilities of the continuous variables pr_c_0, pr_c_1. 
#And the final prediction based on conditional and class probabilities
train_output = join_cont_dis(preds_discrete,preds_continuous )
train_output.head()

Unnamed: 0,pr_d_0,pr_d_1,pr_c_0,pr_c_1,pr_0,pr_1,final_pred
0,0.788945,0.874376,0.000295,0.000156,5.8e-05,0.000103,1
1,0.211055,0.125624,0.000118,0.000204,6e-06,1.9e-05,1
2,0.211055,0.125624,0.000162,0.000333,8e-06,3.1e-05,1
3,0.788945,0.874376,3.5e-05,1.3e-05,7e-06,8e-06,1
4,0.788945,0.874376,3.5e-05,0.000109,7e-06,7.1e-05,1


In [19]:
#Accuracy on the training data set
accuracy_train_final = sum(train.vote == train_output.final_pred)/train.shape[0]
print("Accuracy on training data set: ",accuracy_train_final)

Accuracy on training data set:  0.743125


In [20]:
# Running the model on test data

Pred_discrete_test = predict(test, cond_probs) # Conditional probabilities of binary varibales
predict_continuous_test = predict_cont(test,dist_locs) #Product of conditional probabilities of continous variables
test_output = join_cont_dis(Pred_discrete_test,predict_continuous_test ) 
#Putting the data generated from above functions together and then creating the final prediction for each observation
accuracy_test = sum(test.vote == test_output.final_pred)/test.shape[0]
#Calculating accuracy
print("Accuracy on test data set: ",accuracy_test)


Accuracy on test data set:  0.7125


### The model's accuracy on training data set is74% and on testing data set is 71%. 