In [20]:
import numpy as np
import pandas as pd
import urllib.request

from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

### Load Data

In [21]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

raw_data = urllib.request.urlopen(url)

dataset = np.loadtxt(raw_data, delimiter=",")

In [22]:
X = dataset[:, 0:48]
y = dataset[:, -1]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Bernoulli Naive Bayes
BernoulliNB implements the naive Bayes training and classification algorithms for data that is distributed according to multivariate Bernoulli distributions; i.e., there may be multiple features but each one is assumed to be a binary-valued (Bernoulli, boolean) variable. Therefore, this class requires samples to be represented as binary-valued feature vectors; if handed any other kind of data, a BernoulliNB instance may binarize its input (depending on the binarize parameter).

In [24]:
BernNB = BernoulliNB(binarize = True)
BernNB.fit(X_train, y_train)
print(BernNB)

y_pred = BernNB.predict(X_test)
print("Accuracy Score:", accuracy_score(y_pred, y_test))

BernoulliNB(alpha=1.0, binarize=True, class_prior=None, fit_prior=True)
Accuracy Score: 0.8610206297502715


### Gaussian Naive Bayes
GaussianNB implements the Gaussian Naive Bayes algorithm for classification.

In [25]:
GaussNB = GaussianNB()
GaussNB.fit(X_train, y_train)
print(GaussNB)

y_pred = GaussNB.predict(X_test)
print("Accuracy Score:", accuracy_score(y_pred, y_test))

GaussianNB(priors=None, var_smoothing=1e-09)
Accuracy Score: 0.8251900108577633


### Multinomial Naive Bayes
MultinomialNB implements the naive Bayes algorithm for multinomially distributed data, and is one of the two classic naive Bayes variants used in text classification (where the data are typically represented as word vector counts, although tf-idf vectors are also known to work well in practice). The distribution is parametrized by vectors  for each class , where  is the number of features (in text classification, the size of the vocabulary) and  is the probability  of feature  appearing in a sample belonging to class .

In [26]:
MultNB = MultinomialNB()
MultNB.fit(X_train, y_train)
print(MultNB)

y_pred = MultNB.predict(X_test)
print("Accuracy Score:", accuracy_score(y_pred, y_test))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy Score: 0.8718783930510315


### Complement Naive Bayes
ComplementNB implements the complement naive Bayes (CNB) algorithm. CNB is an adaptation of the standard multinomial naive Bayes (MNB) algorithm that is particularly suited for imbalanced data sets. Specifically, CNB uses statistics from the complement of each class to compute the model’s weights. The inventors of CNB show empirically that the parameter estimates for CNB are more stable than those for MNB. Further, CNB regularly outperforms MNB (often by a considerable margin) on text classification tasks.

In [27]:
CompNB = ComplementNB()
CompNB.fit(X_train, y_train)
print(CompNB)

y_pred = CompNB.predict(X_test)
print("Accuracy Score:", accuracy_score(y_pred, y_test))

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
Accuracy Score: 0.8697068403908795


### Hyperparameters Tunning

In [28]:
BernNB = BernoulliNB(binarize = 0.1)
BernNB.fit(X_train, y_train)
print(BernNB)

y_pred = BernNB.predict(X_test)
print("Accuracy Score:", accuracy_score(y_pred, y_test))

BernoulliNB(alpha=1.0, binarize=0.1, class_prior=None, fit_prior=True)
Accuracy Score: 0.8925081433224755
