In [1]:
import math
import numpy as np
import pandas as pd

In [2]:
# calculate gaussian probability
def c_g_p(x, mu, sigma):
    return 1 / (math.sqrt(sigma ** math.pi)) * np.exp(-sigma * np.power((x - mu), 2))

In [3]:
# calculate the Probability Density Function (PDF) of ' M ' and ' W '
def p_calc(x, feature, a_train):
    
    p_f_m_m = np.mean(a_train.loc[a_train['g'] == ' M '][feature].values) # calculate men mean
    p_f_m_s = np.std(a_train.loc[a_train['g'] == ' M '][feature].values) # calculate men std deviation
    p_f_m = c_g_p(x, p_f_m_m, p_f_m_s) # pdf feature men

    p_f_w_m = np.mean(a_train.loc[a_train['g'] == ' W '][feature].values) # calculate women mean
    p_f_w_s = np.std(a_train.loc[a_train['g'] == ' W '][feature].values) # calculate women std deviation
    p_f_w = c_g_p(x, p_f_w_m, p_f_w_s) # pdf feature women

    return p_f_m, p_f_w

In [4]:
# gaussian naive bayes classification
def gnb_classification(x, a_train, drop_age):
    
    # calculate PDFs for each feature
    pdf_hm, pdf_hw = p_calc(x[0], 'h', a_train) # height for men and women
    pdf_wm, pdf_ww = p_calc(x[1], 'w', a_train) # weight for men and women

    # calculate prior probabilities of the classes    
    n_men, n_women = np.count_nonzero(np.asarray(a_train['g']) == ' M '), np.count_nonzero(np.asarray(a_train['g']) == ' W ')
    tot_classes = n_women + n_men
    man_p, woman_p = n_men / tot_classes, n_women / tot_classes

    if drop_age:
        
        p_m = pdf_hm * pdf_wm * man_p # PDF_height_men*PDF_weight_men*prior_men
        p_w = pdf_hw * pdf_ww * woman_p
    else:
        pdf_am, pdf_aw = p_calc(x[2], 'a', a_train) 

        
        p_m = pdf_hm * pdf_wm * pdf_am * man_p
        p_w = pdf_hw * pdf_ww * pdf_aw * woman_p
        
    return ' M ' if p_m > p_w else ' W '

In [5]:
a_train = pd.read_csv('Train.csv')
a_test = pd.read_csv('Sample_test.csv')
a_test = a_test.values

In [6]:
for x in a_test:
        print("For test data point = {}".format(x))
        pre1 = gnb_classification(x, a_train, drop_age=False)
        print("\tOutput is {}".format(pre1))

For test data point = [ 1.86951158 76.13764299 32.        ]
	Output is  M 
For test data point = [ 1.44964692 60.1069934  24.        ]
	Output is  W 
For test data point = [ 1.50062171 70.4236467  28.        ]
	Output is  W 
For test data point = [ 1.64928523 65.99131108 28.        ]
	Output is  W 


In [7]:
c = pd.read_csv('Test.csv')

In [8]:
p_a, p_e = 0, 0

for index, test_sample in c.iterrows():
        sample = test_sample.values[:3]
        target = test_sample.values[3]
        prediction = gnb_classification(sample, c.drop(index), drop_age=False)
        p_a += 1 if target == prediction else 0
        
        prediction = gnb_classification(sample[:2], c.drop(index),
                                                         drop_age=True)
        p_e += 1 if target == prediction else 0
        
print("GNB output")
print("{}/{} Predictions/Total using all features".format(p_a, c.shape[0]))
ac1 = p_a/c.shape[0]
print("accuracy for all features = {}".format(ac1))
print("{}/{} Predictions/Total excluding age".format(p_e, c.shape[0]))
ac2 = p_e/c.shape[0]
print("accuracy for all features = {}".format(ac2))

GNB output
69/120 Predictions/Total using all features
accuracy for all features = 0.575
70/120 Predictions/Total excluding age
accuracy for all features = 0.5833333333333334
