# Imports

In [2]:
import pandas as pd
import numpy as np

# Data Preprocessing

In [3]:
data = pd.read_csv('./data/spambase.csv').values
X = data[:,:48]
Y = data[:,-1]

In [4]:
trainX = X[:-100,]
trainY = Y[:-100,]
testX = X[-100:,]
testY = Y[-100:,]

# Training

- Naive Bayes Classification is based on the Naive Bayes Theorem :

$$  P(\textbf{Class}|\textbf{Features}) = \frac{P(\textbf{Features} | \textbf{Class}) P(\textbf{Class})}{P(\textbf{Features})} $$

- When predicting the class we choose the class with highest probability

$$  \textbf{arg max}(\textbf{Class})  P(\textbf{Class}|\textbf{Features}) = \frac{P(\textbf{Features} | \textbf{Class}) P(\textbf{Class})}{P(\textbf{Features})} $$

- We can ignore the denominator : $ P(\textbf{Features}) $ because it's the same for all classes. We are left with : 

$$  \textbf{arg max}(\textbf{Class}) P(\textbf{Class}|\textbf{Features}) = P(\textbf{Features} | \textbf{Class}) P(\textbf{Class}) $$

## Gaussian NB

- To calculate $ P(\textbf{Features}/\textbf{Class})$ we use the Gaussian Probability Formula :
<h3>
$$
P(\textbf{f1 = x}/\textbf{Class}) = {\frac {1}{\sqrt {2\pi \cdot \sigma ^{2}}}}e^{-\large{{\frac {(x-\mu )^{2}}{2 \cdot \sigma ^{2}}}}}
$$
</h3>

- Calculate Means ($ \mu $) and Standard Deviations ($ \sigma $) for each Feature for each Class
- Calculate Class Priors $ P(\textbf{Class}) $

In [11]:
all_classes = set(trainY)
nbr_train_rows = len(trainY)
nbr_elements_in_classes = {}

mean_features = {}
sd_features = {}
class_priors = {}

for c in all_classes:
    mean_features[c] = [0] * 48
    sd_features[c] = [0] * 48
    nbr_elements_in_classes[c] = len(list(filter(lambda x: x == c, trainY)))
    class_priors = nbr_elements_in_classes[c] / nbr_train_rows

print (nbr_elements_in_classes)

#print(sum_features)

for f in range(48):
    for r in range(nbr_train_rows):
        c = trainY[r]
        mean_features[c][f] += trainX[r][f]

for c in mean_features:
    for f in range(48):
        mean_features[c][f] = mean_features[c][f] / nbr_elements_in_classes[c]


#print (mean_features)

for f in range(48):
    for r in range(nbr_train_rows):
        c = trainY[r]
        sd_features[c][f] += ((trainX[r][f] - mean_features[c][f]) ** 2)

for c in mean_features:
    sd_features[c] = list(map(lambda x: sqrt(x / (nbr_elements_in_classes[c] - 1)), sd_features[c]))

#print (sd_features)

In [12]:
# Test the model with the TestX and TestY sets

def gaussian(mean, sd, x):
    exponent = exp(-( ((x-mean)**2) / (2*(sd**2)) ))
    res =  1 / sqrt( (2 * pi * (sd**2)) )
    res *= exponent
    return res

def predict(features, c):
    likelyhood = 1
    for i in range(len(features)):
        likelyhood *= gaussian (mean_features[c][i], sd_features[c][i], features[i])
    return likelyhood * class_priors[c]

results = [False] * len(testY)
score = 0

for i in range(len(testY)):
    f_vector = testX[i]
    correct_class = testY[i]
    predictions = {}
    max_val = 0
    current_class = None
    for c in all_classes:
        predictions[c] = predict(f_vector, c)
        if predictions[c] > max_val:
            max_val = predictions[c]
            current_class = c
    results[i] = current_class == correct_class
    if results[i]:
        score += 1

#print(results)
print(score/len(testY))

NameError: name 'mean_features' is not defined