In [None]:
import time

import pandas as pd
from sklearn.metrics import accuracy_score

In [None]:
dataset = pd.read_csv('spam_tfidf.csv', index_col=[0])
dataset.head()

In [None]:
from sklearn.model_selection import train_test_split

X = dataset.drop('targhet', axis=1)
y = dataset['targhet']  # colonna che segna se è spam o meno
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

sklearn _BaseEstimator_ and _ClassifierMixin_ classes sono usati per creare un classificatore custom
che può essere utilizzato con la libreria sklearn

In [77]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted


class GaussianNaiveBayes(BaseEstimator, ClassifierMixin):
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.priors_ = np.bincount(y) / len(y)
        self.n_classes_ = np.max(y) + 1

        self.means_ = np.array([X[np.where(y == i)].mean(axis=0) for i in range(self.n_classes_)])
        self.stds_ = np.array([X[np.where(y == i)].std(axis=0) for i in range(self.n_classes_)])

        return self

    def predict_proba(self, X):
        check_is_fitted(self)
        X = check_array(X)

        res = []
        for i in range(len(X)):
            probas = []
            for j in range(self.n_classes_):
                probas.append((1 / np.sqrt(2 * np.pi * self.stds_[j] ** 2) * np.exp(
                    -0.5 * ((X[i] - self.means_[j]) / self.stds_[j]) ** 2)).prod() * self.priors_[j])
            probas = np.array(probas)
            res.append(probas / probas.sum())

        return np.array(res)

    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)

        res = self.predict_proba(X)

        return res.argmax(axis=1)



In [78]:
# from sklearn.naive_bayes import GaussianNB
nbg = GaussianNaiveBayes()
start_time = time.time()
nbg.fit(X_train, y_train)
print('Training time: %f' % (time.time() - start_time))
start_time = time.time()
y_pred_nbg = nbg.predict(X_test)
print('Prediction time: %f' % (time.time() - start_time))
print('Missclassified examples: %d' % (y_test != y_pred_nbg).sum())
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred_nbg))

Training time: 0.016015
Prediction time: 0.079684
Missclassified examples: 169
Accuracy: 0.817


  res.append(probas / probas.sum())
