In [38]:
import pandas as pd
import time
from sklearn.metrics import accuracy_score

In [39]:
# Load the feature table from CSV, using the first CSV column as the row index.
# NOTE(review): presumably TF-IDF features of spam/ham messages, judging by the
# file name — confirm against whatever produced the CSV.
dataset = pd.read_csv('spam_tfidf.csv', index_col=[0])
# Last expression of the cell: the notebook renders the first rows as output.
dataset.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,46,47,48,49,50,51,52,53,54,targhet
0,0.0,1.04567,0.570084,0.0,0.309696,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.553776,0.0,0.0,1
1,0.309672,0.45748,0.445378,0.0,0.135492,0.427637,0.365548,0.12039,0.0,1.18663,...,0.0,0.0,0.0,0.0,0.069628,0.0,0.264787,0.214164,0.08707,1
2,0.088478,0.0,0.632436,0.0,1.190396,0.290182,0.330734,0.206383,1.1416,0.315593,...,0.131159,0.0,0.0,0.01762,0.07543,0.0,0.196455,0.218923,0.01814,1
3,0.0,0.0,0.0,0.0,0.609715,0.0,0.539619,1.083511,0.552962,0.795294,...,0.0,0.0,0.0,0.0,0.072265,0.0,0.097516,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.609715,0.0,0.539619,1.083511,0.552962,0.795294,...,0.0,0.0,0.0,0.0,0.07121,0.0,0.096092,0.0,0.0,1


In [40]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the label column.
X = dataset.drop('targhet', axis=1)
y = dataset['targhet']  # column marking whether the sample is spam or not
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the split (and every metric computed from it)
# is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [57]:
import numpy as np

class GaussianNB:
    """Gaussian Naive Bayes classifier.

    For every class label, each feature is modelled as an independent normal
    distribution described by a per-feature mean and variance. Prediction
    picks the label that maximises ``log P(label) + sum_i log N(x_i)``.

    Scores are accumulated in log space: multiplying many densities directly
    underflows to 0.0 for every label, which would turn the arg-max into a tie.

    Attributes:
        mean: dict mapping label -> per-feature means.
        variance: dict mapping label -> per-feature variances.
        priors: dict mapping label -> prior probability of that label.
    """

    # Lower bound applied to variances when evaluating the density. A feature
    # that is constant within a class has variance 0, which would otherwise
    # divide by zero and produce nan/inf scores.
    _VAR_FLOOR = 1e-9

    def __init__(self):
        # All three dictionaries are keyed by label and filled in by fit().
        self.mean = {}
        self.variance = {}
        self.priors = {}

    def fit(self, X, y):
        """Estimate per-label feature statistics and label priors.

        Args:
            X: training features, one row per sample.
            y: training labels, one per row of ``X``.
        """
        # Compute the mean and variance of each feature for each label
        self.mean, self.variance = compute_mean_variance(X, y)
        # Compute the prior probability of each label
        self.priors = compute_priors(y)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: 2-D table of features (DataFrame or array-like), one row per
                sample, with columns in the same order as the fitted statistics.

        Returns:
            A list with one predicted label per row. Ties are resolved in
            favour of the label that appears first in ``self.priors``.

        Raises:
            ValueError: if no priors are available (the model was not fitted).
        """
        if not self.priors:
            raise ValueError("This GaussianNB instance is not fitted yet; call fit() first.")

        labels = list(self.priors)
        rows = np.asarray(X, dtype=float)
        # One column of log-posterior scores per label, evaluated for all rows at once.
        log_posterior = np.column_stack([
            self._log_likelihood(rows, label, self.mean, self.variance)
            + np.log(self.priors[label])
            for label in labels
        ])
        winners = np.argmax(log_posterior, axis=1)
        return [labels[k] for k in winners]

    def _log_likelihood(self, rows, label, mean, variance):
        """Return log P(features | label), summed over the last axis of ``rows``."""
        mu = np.asarray(mean[label], dtype=float)
        var = np.maximum(np.asarray(variance[label], dtype=float), self._VAR_FLOOR)
        return -0.5 * np.sum(np.log(2.0 * np.pi * var) + (rows - mu) ** 2 / var, axis=-1)

    def prob_features_given_label(self, features, label, mean, variance):
        """Return P(features | label) under the per-feature Gaussian model.

        Args:
            features: 1-D sequence of feature values for a single sample.
            label: key into ``mean`` and ``variance``.
            mean: dict mapping label -> per-feature means.
            variance: dict mapping label -> per-feature variances.

        Returns:
            The product of the per-feature Gaussian densities. With many
            features this value can underflow to 0.0; ``predict`` therefore
            works with the log-likelihood instead.
        """
        x = np.asarray(features, dtype=float)
        return np.exp(self._log_likelihood(x, label, mean, variance))

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``.

        Args:
            X: features to predict on.
            y: true labels, one per row of ``X`` (list, array or Series).
        """
        # Predict the labels for the data
        y_pred = np.asarray(self.predict(X))
        # Convert to an array so `==` is element-wise even when y is a plain
        # list (list == list would collapse to a single bool).
        y_true = np.asarray(y)
        # Compute the accuracy of the predictions
        accuracy = np.mean(y_pred == y_true)
        return accuracy

def compute_mean_variance(X, y):
    """Compute the per-label mean and variance of every feature.

    Args:
        X: 2-D feature table (DataFrame, ndarray or nested sequence), one row
            per sample.
        y: labels, one per row of ``X`` (Series, ndarray or plain sequence).
            Rows are matched to labels by position.

    Returns:
        A ``(mean, variance)`` tuple of dictionaries, each mapping a label to
        the column-wise mean / variance of the rows carrying that label. The
        variance is computed with numpy's default ``ddof=0``.
    """
    # Plain Python sequences do not support boolean-mask row selection.
    if not hasattr(X, "shape"):
        X = np.asarray(X, dtype=float)
    # Convert once so `labels == label` is an element-wise mask even when y
    # is a plain list (list == scalar would be a single False).
    labels = np.asarray(y)

    mean = {}
    variance = {}
    for label in np.unique(labels):
        # Select the rows with the current label
        X_label = X[labels == label]
        mean[label] = np.mean(X_label, axis=0)
        variance[label] = np.var(X_label, axis=0)
    return mean, variance

def compute_priors(y):
    """Compute the prior probability of each label.

    Args:
        y: labels (Series, ndarray or plain sequence).

    Returns:
        A dictionary mapping every distinct label to its relative frequency
        in ``y``. An empty ``y`` yields an empty dictionary.
    """
    # Compute the total number of samples
    n_samples = len(y)
    # Count every label in one pass; this also works for plain lists, where
    # `y == label` would be a single bool instead of an element-wise mask.
    labels, counts = np.unique(np.asarray(y), return_counts=True)
    return dict(zip(labels, counts / n_samples))


In [60]:
# Imported under an alias so it does not replace the hand-written GaussianNB
# class defined earlier in this notebook.
from sklearn.naive_bayes import GaussianNB as SklearnGaussianNB

nbg = SklearnGaussianNB()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
nbg.fit(X_train, y_train)
print('Training time: %f' % (time.perf_counter() - start_time))
start_time = time.perf_counter()
y_pred_nbb = nbg.predict(X_test)
print('Prediction time: %f' % (time.perf_counter() - start_time))
print('Missclassified examples: %d' % (y_test != y_pred_nbb).sum())
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred_nbb))

Training time: 0.012424
Prediction time: 0.005161
Missclassified examples: 174
Accuracy: 0.811
