# 8510 HW6 Part1 - David Tung

In [254]:
import pandas as pd
import numpy as np

# Location of the CSV dataset, relative to the notebook's working directory.
file_path = './spam2.csv'
# Parse the CSV into a DataFrame using the pandas defaults.
spam_data = pd.read_csv(file_path)

# Plain NumPy version of the whole table; the following cells slice this
# array by row/column position rather than by column name.
np_spam_data = np.array(spam_data)

In [255]:
# Positional hold-out split with no shuffling: the leading 70% of the rows
# form the training set and the trailing 30% form the test set.
split_index = int(len(spam_data) * 0.7)
train_data, test_data = np_spam_data[:split_index], np_spam_data[split_index:]

In [256]:
# Training rows per label (column 0 is compared against the labels 0 and 1),
# followed by the matching empirical class priors.
class_count = np.array(
    [np.count_nonzero(train_data[:, 0] == label) for label in (0, 1)]
)
prior_prob = class_count / len(train_data)

In [260]:
class NaiveBayesUtils:
    """Categorical Naive Bayes classifier with add-one (Laplace) smoothing.

    Feature values are used directly as array indices, so every feature must
    hold non-negative integer codes (for example 0/1 indicator columns).
    Class labels may be any values ``np.unique`` can sort; internally each
    class is addressed by its position in ``self.classes``.

    Attributes (all populated by ``fit_model``):
        n_target_classes: number of distinct labels seen during fitting.
        classes: sorted array of the distinct labels.
        class_f: training rows per class, shape ``(n_classes,)``.
        feature_f: value counts, shape ``(n_features, n_values, n_classes)``
            where ``n_values`` is the largest feature code + 1.
        class_prob: prior probability of each class, shape ``(n_classes,)``.
        feature_p: smoothed conditional probabilities, same shape as
            ``feature_f``.
    """

    def __init__(self):
        self.n_target_classes = 0
        self.classes = None
        self.class_f = None  # per-class row frequency
        self.feature_f = None  # per-feature / per-value / per-class frequency
        self.class_prob = None
        self.feature_p = None

    @staticmethod
    def _as_codes(values):
        """Return ``values`` as an int64 array that is safe to use as indices.

        Raises:
            ValueError: if any entry is negative or not a whole number.
        """
        raw = np.asarray(values)
        if raw.size == 0:
            return raw.astype(np.int64)
        codes = raw.astype(np.int64)
        # A negative code would silently wrap around to the end of the table,
        # and a fractional one cannot index an array at all.
        if np.any(codes != raw) or codes.min() < 0:
            raise ValueError("feature values must be non-negative integers")
        return codes

    def probabilities(self, feature):
        """Turn the stored counts into priors and smoothed likelihoods.

        Args:
            feature: 2-D training matrix ``(n_rows, n_features)``; only its
                shape and the number of distinct values per column are used.

        For column ``i`` with ``k_i`` distinct training values, each slot gets
        ``(count + 1) / (class_count + k_i)``. Slots for values never observed
        in that column therefore receive the smoothing floor
        ``1 / (class_count + k_i)`` instead of 0, so a single unseen value at
        prediction time cannot zero out the posterior of every class.
        """
        feature = np.asarray(feature)
        n_rows, n_columns = feature.shape
        self.class_prob = self.class_f / n_rows
        for i in range(n_columns):
            n_observed = len(np.unique(feature[:, i]))
            # feature_f[i] is (n_values, n_classes); class_f broadcasts over
            # the class axis.
            self.feature_p[i] = (self.feature_f[i] + 1) / (self.class_f + n_observed)

    def fit_model(self, feature, y):
        """Count class and feature-value frequencies, then derive probabilities.

        Args:
            feature: 2-D array ``(n_rows, n_features)`` of non-negative
                integer codes.
            y: 1-D array of ``n_rows`` class labels.

        Raises:
            ValueError: if ``feature`` contains negative or non-integer values.
        """
        codes = self._as_codes(feature)
        labels = np.asarray(y)
        self.classes = np.unique(labels)
        self.n_target_classes = len(self.classes)
        n_rows, n_features = codes.shape
        # Tables are indexed by the raw code, so they must reach the largest
        # code; the number of distinct values is too small when codes have gaps.
        n_values = int(codes.max()) + 1 if codes.size else 0
        self.class_f = np.zeros(self.n_target_classes)
        self.feature_f = np.zeros((n_features, n_values, self.n_target_classes))
        self.class_prob = np.zeros(self.n_target_classes)
        self.feature_p = np.zeros((n_features, n_values, self.n_target_classes))
        for c, cls in enumerate(self.classes):
            rows = codes[labels == cls]
            self.class_f[c] = rows.shape[0]
            for fi in range(n_features):
                self.feature_f[fi, :, c] = np.bincount(rows[:, fi], minlength=n_values)
        self.probabilities(codes)

    def predict(self, features):
        """Return the most probable class label for every sample.

        Scores are accumulated as sums of log-probabilities rather than as a
        product of probabilities, so wide feature sets do not underflow to 0
        for every class. Ties resolve to the first class in ``self.classes``.

        Args:
            features: iterable of samples, each a 1-D sequence of feature
                codes in the same column order used for fitting.

        Returns:
            1-D array with one predicted label per sample.

        Raises:
            ValueError: if a sample contains negative or non-integer values.
            IndexError: if a code exceeds the largest code seen in training.
        """
        # log(0) = -inf is a valid score here, so silence the divide warning.
        with np.errstate(divide="ignore"):
            log_prior = np.log(self.class_prob)
            log_likelihood = np.log(self.feature_p)
        pred = []
        for sample in features:
            codes = self._as_codes(sample)
            # Picks log P(value | class) for every feature: (n_features, n_classes).
            per_feature = log_likelihood[np.arange(codes.size), codes]
            scores = log_prior + per_feature.sum(axis=0)
            pred.append(self.classes[np.argmax(scores)])
        return np.array(pred)

In [261]:
# Fit the classifier on the training split.
naiveBayesUtils = NaiveBayesUtils()
features = train_data[:, 1:]  # every column except the label
target = train_data[:, 0]     # column 0 holds the class label
naiveBayesUtils.fit_model(features, target)

# Predict labels for the held-out rows.
testing_data_features = test_data[:, 1:]
y_pred = naiveBayesUtils.predict(testing_data_features)

# Both vectors are already 1-D; a plain copy replaces the former
# wrap / transpose / slice round trip.
actual = np.array(test_data[:, 0])
pred = np.array(y_pred)

# Confusion-matrix counts with label 1 as the positive class. np.sum reduces
# the boolean masks inside NumPy instead of looping over them in Python, and
# keeps the counts as NumPy integers.
TP = np.sum((actual == 1) & (pred == 1))
TN = np.sum((actual == 0) & (pred == 0))
FN = np.sum((actual == 1) & (pred == 0))
FP = np.sum((actual == 0) & (pred == 1))

# Layout: rows are the actual class (1, then 0), columns the predicted class.
cm = np.array([[TP, FN],
               [FP, TN]])

# Results

In [262]:
# Summary metrics from the confusion-matrix counts (label 1 is the positive
# class).
# NOTE(review): none of the denominators is guarded; if the model predicts no
# positives or the test split has none, the affected metrics will not be
# usable numbers — confirm whether that case needs handling.
Accuracy = (TP+TN)/(TP+FN+FP+TN)  # share of all test rows classified correctly
Precision = TP/(TP+FP)  # of the rows predicted 1, the share that are actually 1
Recall = TP/(TP+FN)  # of the rows that are actually 1, the share predicted 1
F1 = 2*Precision*Recall/(Precision+Recall)  # harmonic mean of precision and recall
# Report the training-set class balance first, then the test-set results.
print(f"Class Counts [0, 1]:\n {class_count}\n")
print(f"Prior Probabilities [0, 1]:\n {prior_prob}\n")
print(f"Confusion Matrix[[TP,FN],[FP,TN]]: \n {cm}\n")
print(f"Accuracy= {Accuracy}")
print(f"Recall= {Recall}")
print(f"Precision= {Precision}")
print(f"F1= {F1}")


Class Counts [0, 1]:
 [460 454]

Prior Probabilities [0, 1]:
 [0.50328228 0.49671772]

Confusion Matrix[[TP,FN],[FP,TN]]: 
 [[166  33]
 [  6 187]]

Accuracy= 0.9005102040816326
Recall= 0.8341708542713567
Precision= 0.9651162790697675
F1= 0.8948787061994609
