# PPOL564 | Data Science 1: Foundations 

## Coding Discussion 5



#### By      : Sonali Subbu Rathinam 
#### NetID : ss4608
#### Date   : 14/11/2021

In [1]:
#Importing the required modules

import numpy as np 
import pandas as pd

# for printing
import pprint as pp 

# for Normal PDF
import scipy.stats as st 

#To ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Reading the csv file
turnout = pd.read_csv("turnout.csv")

In [3]:
#Viewing the data
print(turnout)

        id  age  educate  income  vote  white
0        1   60     14.0  3.3458     1      1
1        2   51     10.0  1.8561     0      1
2        3   24     12.0  0.6304     0      1
3        4   38      8.0  3.4183     1      1
4        5   25     12.0  2.7852     1      1
...    ...  ...      ...     ...   ...    ...
1995  1996   26     16.0  3.3834     0      1
1996  1997   34     12.0  2.9170     1      1
1997  1998   51     16.0  7.8949     1      1
1998  1999   22     10.0  2.4811     0      1
1999  2000   59     10.0  0.5523     0      1

[2000 rows x 6 columns]


In [4]:
#Splitting the data into training and testing data in a 80:20 ratio. 
train = turnout.sample(frac=.8).reset_index(drop=True)
test = turnout.drop(train.index).reset_index(drop=True)

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

# Look at the head of the training data
train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,id,age,educate,income,vote,white
0,941,38,14.0,4.3348,1,1
1,1811,31,11.0,2.7068,0,1
2,1426,86,9.0,0.4118,0,1
3,1879,48,6.0,0.5523,0,0
4,532,51,10.0,3.3834,0,1


The formula for Naive Bayes Classification is as follows:
<br><br>

$$Pr(class | data) = Pr( x_1| class)\times Pr( x_2| class) \times \dots \times  Pr(class)$$

<br><br>

Hence, the following steps are followed to classify each individual respondent in the  data: 
1. Calculate the class probability. 
2. Calculate the conditional probabilities of the discrete variables in the data. 
3. Calculate the conditional probabilities of the continous variables in the data. 
4. Obtain the product of the aforementioned probabilities for a respondent. 
5. Obtain the product of the value in the previous step with each class probability. 
6. An individual respondent is assigned the class with which it has higher probability (higher product value). 

#### 1.  Calculate the class probability 

In [5]:
# Subset the data by class 
vote1 = train.query("vote == 1")
vote0 = train.query("vote == 0")

In [6]:
# Class probabilities.
pr_vote1 = vote1.shape[0]/train.shape[0]
pr_vote0 = vote0.shape[0]/train.shape[0]

#Printing the results
print("Vote = 1 : ", pr_vote1, "\nVote = 0 : ", pr_vote0)

Vote = 1 :  0.745625 
Vote = 0 :  0.254375


#### 2. Calculate the conditional probabilities of the discrete variables in the data.

As we have only one discrete input variable, *white*, its conditional probability is calculated.

In [7]:
#Calculating the conditional probability for white

#Given vote =0 
white1_vote0 = vote0.query("white == 1").shape[0]/vote0.shape[0]
white0_vote0 = vote0.query("white == 0").shape[0]/vote0.shape[0]

#Given vote =1
white1_vote1 = vote1.query("white == 1").shape[0]/vote1.shape[0]
white0_vote1 = vote1.query("white == 0").shape[0]/vote1.shape[0]


#Printing the results
print(
f"""
Pr(white = 1 |vote = 0): {white1_vote0}
Pr(white = 0 |vote = 0): {white0_vote0}
Pr(white = 1 |vote = 1): {white1_vote1}
Pr(white = 0 |vote = 1): {white0_vote1}
""")


Pr(white = 1 |vote = 0): 0.7665847665847666
Pr(white = 0 |vote = 0): 0.2334152334152334
Pr(white = 1 |vote = 1): 0.8801341156747695
Pr(white = 0 |vote = 1): 0.11986588432523052



#### 3. Calculate the conditional probabilities of the continous variables in the data.

We have three continous variables, *age, educate* and *income*. 

In [8]:
#Obtaining the mean and standard dev. of each conditional distribution
#This is a dictionary whose values contain the values required to calculate the conditional probabilities of each variable. 
dist_locs = \
{("age",1):{'mean':vote1.age.mean(),'sd':vote1.age.std()},
 ("age",0):{'mean':vote0.age.mean(),'sd':vote0.age.std()},
 ("educate",1):{'mean':vote1.educate.mean(),'sd':vote1.educate.std()},
 ("educate",0):{'mean':vote0.educate.mean(),'sd':vote0.educate.std()},
 ("income",1):{'mean':vote1.income.mean(),'sd':vote1.income.std()},
 ("income",0):{'mean':vote0.income.mean(),'sd':vote0.income.std()},
}

# Print
pp.pprint(dist_locs)

{('age', 0): {'mean': 43.729729729729726, 'sd': 19.46933108461713},
 ('age', 1): {'mean': 46.24140821458508, 'sd': 16.935885672695832},
 ('educate', 0): {'mean': 10.61916461916462, 'sd': 3.2933245349552562},
 ('educate', 1): {'mean': 12.45222129086337, 'sd': 3.264266135083623},
 ('income', 0): {'mean': 2.7325761670761706, 'sd': 2.2187345798361373},
 ('income', 1): {'mean': 4.242505280804694, 'sd': 2.9278864215557983}}


In [9]:
def predict(data,dist_locs):
    '''
    In the predict function, steps 3 -6 are carried out. The conditional probabilities of the continous variables are 
    multiplied with the conditional probability of the discrete variable for each individual respondent. Then, this is 
    multiplied with each class probability. The class associated with the higher probability is assigned to each respondent. 
    -------------------------------------------------------------------------------------------------------------------------
    Arguments: 
    data -> the data we need to work on. 
    dist_locs -> the dictionary that contains the required values for calculating conditional probabilites of continous
    variables
    -------------------------------------------------------------------------------------------------------------------------
    Return Value: 
    A dataframe, that contains the predicted class for each respondent. 
    ------------------------------------------------------------------------------------------------------------------------
    '''
    
    #An empty list that will be used to store data in the function
    store_preds = []
    
    #Iterating through the data
    for i,row in data.iterrows():
        
        
        #Initialising the probabilites to 1. 
        pr_0 = 1; pr_1 = 1
        
        
        # Get the conditional probabilites for continous variables using a Gaussan distribution
        for j in range(1,4): #Iterating through the required columns 
            
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
            
        
        #We are checking for the value of the discrete variable for each individual respondent, and then multiplying the 
        #appropriate discrete conditional probability
        if(row["white"]==0):
            pr_0*=white0_vote0
        else:
            pr_0*=white1_vote0
    
            
        if(row["white"]==0):
            pr_1*=white0_vote1
        else:
            pr_1*=white1_vote1
            
        
        #Multiplying with the class probability 
        pr_0 *= pr_vote0
        pr_1 *= pr_vote1
        
        
        
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])
        
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","Final_Prediction"])

In [10]:
# Run the function for training data
pred_train = predict(train, dist_locs)

In [11]:
#check the result
pred_train.head(15)

Unnamed: 0,pr_0,pr_1,Final_Prediction
0,3.8e-05,0.000204,1
1,7e-05,0.000136,1
2,4e-06,4e-06,0
3,6e-06,2e-06,0
4,7.6e-05,0.000179,1
5,3e-06,2.6e-05,1
6,3e-06,5e-06,1
7,3e-05,0.000137,1
8,2.8e-05,6e-05,1
9,5.5e-05,6.7e-05,1


In [12]:
#Accuracy of Prediction for the training data
accuracy_train = sum(train.vote == pred_train.Final_Prediction)/train.shape[0]
accuracy_train

0.74125

In [13]:
#Run the function for test data
pred_test = predict(test, dist_locs)

In [14]:
pred_test.head(15)

Unnamed: 0,pr_0,pr_1,Final_Prediction
0,4.858402e-06,1.288538e-05,1
1,1.904413e-05,0.000182377,1
2,3.036557e-05,6.995595e-05,1
3,6.738822e-05,0.0001880787,1
4,1.445936e-06,4.353968e-05,1
5,4.649896e-05,8.456979e-05,1
6,9.757092e-08,1.388731e-08,0
7,4.385315e-05,7.626277e-05,1
8,1.051163e-05,7.446327e-06,0
9,3.089978e-05,0.0001071054,1


In [15]:
#Accuracy of Prediction for the test data
accuracy_test = sum(test.vote == pred_test.Final_Prediction)/test.shape[0]
accuracy_test

0.7125

From the above results, we know that the predictions on our training data has an accuracy of 74.125%. This is higher than the accuracy of predictions on the test data, which is 71%. Both these predictions perform better than chance/coin flip, which would have an accuracy of about 50%. 