In [1]:
import numpy as np
from collections import Counter
import sklearn.naive_bayes
from sklearn.metrics import accuracy_score

# Read the comma-separated Spambase table into a single 2-D array.
spambase = np.loadtxt('spambase.data', delimiter=',')

# Every column except the last is a feature; the last column is the label,
# cast to int.
# X is n x d, with n = 4601 and d = 57
X, y = spambase[:, :-1], spambase[:, -1].astype(int)
print(X.shape)

(4601, 57)


In [2]:
# Shuffle the rows and split them into a training and a test set.
# A seeded generator makes the split reproducible across kernel restarts.
SEED = 0
rng = np.random.default_rng(SEED)

# permutation() returns a row-shuffled *copy*; `spambase` itself (and any
# views taken from it, such as X and y) keeps its original row order.
X_y = rng.permutation(spambase)

X_shuffled = X_y[:, :-1]
y_shuffled = X_y[:, -1].astype(int)

n_train = 2000
# Copies rather than views, so in-place edits of the splits cannot leak back
# into X_shuffled.
X_train = X_shuffled[:n_train, :].copy()
X_test = X_shuffled[n_train:, :].copy()

In [3]:
# Quantization of feature variables: a feature becomes 1 when it is strictly
# above its training-set median and 0 otherwise. The medians come from the
# training rows only and are reused unchanged for the test rows.
#
# Both splits are derived from the raw rows of X_shuffled and bound to fresh
# arrays, so nothing is modified in place and re-running this cell always
# gives the same result.
median = np.median(X_shuffled[:n_train], axis=0)

# `median` has one entry per column, so the comparison broadcasts across rows.
X_train = (X_shuffled[:n_train] > median).astype(int)
X_test = (X_shuffled[n_train:] > median).astype(int)

In [4]:
# Estimate the class prior from the training labels, then fit a Bernoulli
# Naive Bayes classifier with scikit-learn.
train_labels = y_shuffled[:n_train]

c = Counter(train_labels)
p_y = c[1] / train_labels.size  # share of training e-mails labelled 1 (spam)
print('Probability of spam mail:', p_y)

naive_bayes = sklearn.naive_bayes.BernoulliNB()
naive_bayes.fit(X_train, train_labels)

Probability of spam mail: 0.398


BernoulliNB()

In [5]:
# Predict a label for every test e-mail and report the fraction that match
# the true labels.
classification = naive_bayes.predict(X_test)
# accuracy_score takes (y_true, y_pred): the ground truth goes first.
print('Accuracy score:', accuracy_score(y_shuffled[n_train:], classification))

Accuracy score: 0.8981161091887736


In [6]:
# Test error: the fraction of test e-mails whose predicted label differs from
# the true label. The mean of the boolean mismatch mask is that fraction.
test_error = np.mean(classification != y_shuffled[n_train:])
print('Test error:', test_error)

Test error: 0.10188389081122645


2. The majority class in the training data is 0 (not spam), since P(y=1) ≈ 0.4 < 0.5, as computed above.
If we predicted label 0 for every example in the test set, we would get the following test error:

In [7]:
# Majority-class baseline: predict label 0 (not spam) for every test e-mail.
# The predictions are integers, like the labels they are compared against.
not_spam = np.zeros(np.shape(classification), dtype=int)
# accuracy_score takes (y_true, y_pred): the ground truth goes first.
print('Majority class error:', 1 - accuracy_score(y_shuffled[n_train:], not_spam))

Majority class error: 0.39100346020761245
