In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [28]:
def read_data(name1,name2):
    train = pd.read_csv(f"datasets/{name1}.txt", delim_whitespace=True, header=None)
    X_train = train.iloc[:, :-1].values  # Features
    y_train = train.iloc[:, -1].values    # Target

    test = pd.read_csv(f"datasets/{name2}.txt", delim_whitespace=True, header=None)
    X_test = test.iloc[:, :-1].values  # Features
    y_test = test.iloc[:, -1].values    # Target
    return X_train, y_train, X_test, y_test


In [29]:
class NaiveBayes:
    def fit(self, X, y):
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        self.parameters = []
        # Calculate mean and standard deviation for each class / feature
        for c in self.classes:
            X_c = X[y == c]
            self.parameters.append([(np.mean(X_c[:, i]), np.std(X_c[:, i])) for i in range(X_c.shape[1])])
        

    #NormalPDF probability of X P(x)
    def _calculate_probability(self, x, mean, stdev):
        exponent = np.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
        return (1 / (np.sqrt(2 * np.pi) * stdev)) * exponent

    #Looping through dataset
    def _calculate_class_probabilities(self, x):
        probabilities = {}
        for i, c in enumerate(self.classes):
            probabilities[c] = 1
            for j, param in enumerate(self.parameters[i]):
                mean, stdev = param
                probabilities[c] *= self._calculate_probability(x[j], mean, stdev)
        return probabilities

    def predict(self, X):
        predictions = []
        #foreach class find most likely probability datapoint is a given class
        for x in X:
            probabilities = self._calculate_class_probabilities(x)
            best_class = None
            best_prob = -1
            for c, prob in probabilities.items():
                if best_class is None or prob > best_prob:
                    best_prob = prob
                    best_class = c
            predictions.append(best_class)
        return predictions

def accuracy(y_true, y_pred):
    correct = np.sum(y_true == y_pred)
    total = len(y_true)
    return correct / total

def true_positives(y_true, y_pred, positive_label):
    return np.sum((y_true == positive_label) & (y_pred == positive_label))

def false_positives(y_true, y_pred, positive_label):
    return np.sum((y_true != positive_label) & (y_pred == positive_label))

def true_negatives(y_true, y_pred, negative_label):
    return np.sum((y_true == negative_label) & (y_pred == negative_label))

def false_negatives(y_true, y_pred, negative_label):
    return np.sum((y_true != negative_label) & (y_pred != negative_label))

def precision(y_true, y_pred, positive_label):
    tp = true_positives(y_true, y_pred, positive_label)
    fp = false_positives(y_true, y_pred, positive_label)
    return tp / (tp + fp)

def recall(y_true, y_pred, positive_label):
    tp = true_positives(y_true, y_pred, positive_label)
    fn = false_negatives(y_true, y_pred, positive_label)
    return tp / (tp + fn)

def print_all(y_test,y_pred):
    print(f'accurancy: {accuracy(y_test, y_pred)}')
    print(f'true pos: {true_positives(y_test, y_pred, positive_label=1)}')  # positive class label is 1
    print(f'false pos: {false_positives(y_test, y_pred, positive_label=1)}')
    print(f'true neg: {true_negatives(y_test, y_pred, negative_label=-1)}')  # negative class label is -1
    print(f'false neg: {false_negatives(y_test, y_pred, negative_label=-1)}')
    print(f'precision: {precision(y_test, y_pred, positive_label=1)}')
    print(f'recall: {recall(y_test, y_pred, positive_label=1)}')

In [34]:
nb=NaiveBayes()
X_train, y_train, X_test, y_test = read_data("irisTraining","irisTesting")
nb.fit(X_train,y_train)
y_pred = np.array(nb.predict(X_test))
print_all(y_test,y_pred)

accurancy: 0.94
true pos: 16
false pos: 3
true neg: 31
false neg: 16
precision: 0.8421052631578947
recall: 0.3404255319148936


In [35]:
nb=NaiveBayes()
X_train, y_train, X_test, y_test = read_data("buyTraining","buyTesting")
nb.fit(X_train,y_train)
y_pred = np.array(nb.predict(X_test))
print_all(y_test,y_pred)

accurancy: 0.75
true pos: 2
false pos: 1
true neg: 1
false neg: 2
precision: 0.6666666666666666
recall: 0.6666666666666666


In [36]:
nb=NaiveBayes()
X_train, y_train, X_test, y_test = read_data("irisPCTraining","irisPCTesting")
nb.fit(X_train,y_train)
y_pred = np.array(nb.predict(X_test))
print_all(y_test,y_pred)

accurancy: 0.92
true pos: 16
false pos: 4
true neg: 30
false neg: 16
precision: 0.8
recall: 0.34782608695652173


Accuracy represents the number of correctly predicted classifications over the total number of classifications. (.94,.75,.92)

true positive is classified positives which were positive,(16,2,16)

false positive is classified positives which were negative,(3,1,4)

true negative is classified negatives which were negative,(31,1,30)

false negative is classified negatives which were positive.(16,2,16)

Precision is looking at the accuracy of positive classifications, (.842,.66,80)

and Recall is the accuracy of all actual (base) positive classes, and their state after classification.(.34,.66,.348)

--

Our Model has a higher precision, and lower recall, meaning that it is prioritizing accurate positive predictions, over coverage of all positive instances. In the context of this dataset
