# Gaussian Naive Bayes

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

### Getting and splitting the data

The aim is to build a classifier that predicts whether a user will buy a new SUV, given the user's age and estimated salary.

In [2]:
# Load the ads dataset (path is relative to the working directory) and preview its first rows.
data = pd.read_csv('Social_Network_Ads.csv')
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
# Select the features and the target by column name rather than by position,
# so the selection does not silently pick other columns if the CSV layout changes.
X = data[['Age', 'EstimatedSalary']].values
y = data['Purchased'].values

In [4]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

The split function returns the sets as `numpy.ndarray` objects, so let's convert them back to `pandas.DataFrame`.

In [5]:
# Rebuild labelled frames from the split arrays:
# feature column 0 is Age, feature column 1 is EstimatedSalary.
data_train = pd.DataFrame({
    'Age': X_train[:, 0],
    'EstimatedSalary': X_train[:, 1],
    'Purchased': y_train,
})
data_test = pd.DataFrame({
    'Age': X_test[:, 0],
    'EstimatedSalary': X_test[:, 1],
    'Purchased': y_test,
})

In [6]:
# Show (rows, columns) of the training frame, then of the test frame, one per line.
print(data_train.shape, data_test.shape, sep='\n')

(300, 3)
(100, 3)


In [7]:
# Preview the first five rows of the training frame.
data_train.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,44,39000,0
1,32,120000,1
2,38,50000,0
3,32,135000,1
4,52,21000,1


In [8]:
# Preview the first five rows of the test frame.
data_test.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,30,87000,0
1,38,50000,0
2,35,75000,0
3,30,79000,0
4,35,50000,0


### Separate data_train by class

In [9]:
# Map every label found in 'Purchased' to the training rows that carry it.
by_class = data_train.groupby('Purchased')
datasets = {label: rows for label, rows in by_class}

In [10]:
# First rows of the training subset whose 'Purchased' label is 0.
datasets[0].head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,44,39000,0
2,38,50000,0
6,39,42000,0
7,38,61000,0
8,36,50000,0


In [11]:
# First rows of the training subset whose 'Purchased' label is 1.
datasets[1].head()

Unnamed: 0,Age,EstimatedSalary,Purchased
1,32,120000,1
3,32,135000,1
4,52,21000,1
5,53,104000,1
12,42,73000,1


### Calculate Mean and Standard deviation

Class 1 -> Purchased

In [12]:
# Class 1: per-feature mean and standard deviation of the training rows.
# np.std is called with its default ddof=0, i.e. the population standard deviation.
purchased_rows = data_train[data_train["Purchased"] == 1]

mean_age_purchased = np.mean(purchased_rows["Age"])
std_age_purchased = np.std(purchased_rows["Age"])

mean_es_purchased = np.mean(purchased_rows["EstimatedSalary"])
std_es_purchased = np.std(purchased_rows["EstimatedSalary"])

In [13]:
# Report the class-1 statistics rounded to two decimals, one per line.
print(
    "mean_age_purchased = {:03.2f}".format(mean_age_purchased),
    "std_age_purchased = {:03.2f}".format(std_age_purchased),
    "mean_es_purchased = {:03.2f}".format(mean_es_purchased),
    "std_es_purchased = {:03.2f}".format(std_es_purchased),
    sep="\n",
)

mean_age_purchased = 45.99
std_age_purchased = 8.55
mean_es_purchased = 85594.59
std_es_purchased = 42206.57


Class 0 -> Not purchased

In [14]:
# Class 0: per-feature mean and standard deviation of the training rows.
# np.std is called with its default ddof=0, i.e. the population standard deviation.
not_purchased_rows = data_train[data_train["Purchased"] == 0]

mean_age_not_purchased = np.mean(not_purchased_rows["Age"])
std_age_not_purchased = np.std(not_purchased_rows["Age"])

mean_es_not_purchased = np.mean(not_purchased_rows["EstimatedSalary"])
std_es_not_purchased = np.std(not_purchased_rows["EstimatedSalary"])

In [15]:
# Report the class-0 statistics rounded to two decimals, one per line.
print(
    "mean_age_not_purchased = {:03.2f}".format(mean_age_not_purchased),
    "std_age_not_purchased = {:03.2f}".format(std_age_not_purchased),
    "mean_es_not_purchased = {:03.2f}".format(mean_es_not_purchased),
    "std_es_not_purchased = {:03.2f}".format(std_es_not_purchased),
    sep="\n",
)

mean_age_not_purchased = 33.51
std_age_not_purchased = 7.83
mean_es_not_purchased = 60179.89
std_es_not_purchased = 24557.88


### Class Probabilities

In [16]:
# Class priors: the share of training rows that belongs to each class.
n_purchased = datasets[1].shape[0]
n_not_purchased = datasets[0].shape[0]
n_train_rows = n_not_purchased + n_purchased

p_purchased = n_purchased / n_train_rows
p_not_purchased = n_not_purchased / n_train_rows

In [17]:
# Report both class priors rounded to two decimals, one per line.
print(
    "p_purchased = {:03.2f}".format(p_purchased),
    "p_not_purchased = {:03.2f}".format(p_not_purchased),
    sep="\n",
)

p_purchased = 0.37
p_not_purchased = 0.63


### Gaussian Probability Function

In [18]:
def gauss(x, mean, stdev):
    """Evaluate the normal probability density at a single point.

    Parameters
    ----------
    x : number
        Point at which the density is evaluated.
    mean : number
        Mean of the distribution.
    stdev : number
        Standard deviation of the distribution; a value of zero leads to a
        division by zero.

    Returns
    -------
    float
        ``1 / (sqrt(2*pi) * stdev) * exp(-(x - mean)**2 / (2 * stdev**2))``.
    """
    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    # Bell-shaped part of the density: 1.0 at the mean, decaying with distance.
    kernel = math.exp(-(squared_deviation / twice_variance))
    # Scaling factor that makes the density integrate to one.
    normaliser = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normaliser * kernel

### Make Predictions

In [19]:
def predict(test_set, class_params=None):
    """Classify every row of ``test_set`` with a two-class Gaussian Naive Bayes rule.

    For each row the unnormalised log-posterior of both classes is computed as
    ``log(prior) + sum_j log N(x_j | mean_j, stdev_j)``. The row is labelled 1
    when class 1 scores strictly higher than class 0, otherwise 0.

    The comparison is done in log space and without dividing by the evidence
    term. The evidence is the same for both classes, so it cannot change which
    class wins, and in probability space it becomes 0 when both densities
    underflow (e.g. for far-outlying salaries), turning the posterior into 0/0.

    Parameters
    ----------
    test_set : sequence of rows
        ``row[0]`` is Age and ``row[1]`` is EstimatedSalary. Any further
        columns (such as the label) are ignored.
    class_params : dict, optional
        ``{label: (prior, [(mean, stdev), ...])}`` for the labels 0 and 1, with
        one ``(mean, stdev)`` pair per feature column, in column order. When
        omitted, the statistics computed earlier in the notebook
        (``p_purchased``, ``mean_age_purchased``, ...) are used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(test_set), 1)`` holding 1 or 0 per row.
    """
    if class_params is None:
        # Default: the notebook-level statistics, looked up when predict() is called.
        class_params = {
            1: (p_purchased, [(mean_age_purchased, std_age_purchased),
                              (mean_es_purchased, std_es_purchased)]),
            0: (p_not_purchased, [(mean_age_not_purchased, std_age_not_purchased),
                                  (mean_es_not_purchased, std_es_not_purchased)]),
        }

    def log_posterior(row, label):
        """Return log(prior) plus the summed per-feature Gaussian log-densities."""
        prior, feature_stats = class_params[label]
        # A class with a zero prior can never win the comparison.
        score = math.log(prior) if prior > 0 else -math.inf
        # zip() stops after the last (mean, stdev) pair, so extra columns are skipped.
        for value, (mean, stdev) in zip(row, feature_stats):
            # Evaluated first so that a zero stdev still raises ZeroDivisionError.
            quadratic = math.pow(value - mean, 2) / (2 * math.pow(stdev, 2))
            score += -math.log(math.sqrt(2 * math.pi) * stdev) - quadratic
        return score

    m = len(test_set)
    y_pred = np.zeros((m, 1))
    for i, row in enumerate(test_set):
        # Strict comparison: ties fall to class 0.
        if log_posterior(row, 1) > log_posterior(row, 0):
            y_pred[i, 0] = 1

    return y_pred
    

In [20]:
# NOTE: `y` is rebound here from the full label array to the test labels only.
# predict() reads just the first two columns of each row (Age, EstimatedSalary),
# so the 'Purchased' column contained in data_test.values does not influence the prediction.
y = data_test['Purchased']
y_pred = predict(data_test.values)

### Get accuracy with confusion matrix

In [21]:
# Rows are the true classes (y), columns are the predicted classes (y_pred);
# the diagonal therefore counts the correctly classified test rows.
cm = confusion_matrix(y, y_pred)
print(cm)

[[65  3]
 [ 7 25]]


### References

https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/

https://www.analyticsvidhya.com/blog/2017/09/naive-bayes-explained/