# Naïve Bayes Model

## Prerequisites

In [None]:
import numpy as np
import time
import math 
from sklearn.model_selection import train_test_split

## Preprocessing Data

In [None]:
# Load the "data" array from the training and testing .npz archives.
# np.load on an .npz returns an archive object that keeps the file open,
# so each archive is opened in a `with` block and closed as soon as the
# array has been read out of it.
with np.load("training_data.npz") as trainArchive:
    data = trainArchive["data"]
with np.load("testing.npz") as testArchive:
    test = testArchive["data"]

In [None]:
# Read the class labels file. Each non-blank line is split on whitespace and
# stored as a list of tokens; later cells unpack every entry as a two-item
# pair whose first item is converted to the integer class id.
classes = []
with open('newsgrouplabels.txt', 'r') as labelsFile:
    for line in labelsFile:
        y = line.split()
        if y:  # a blank line would yield [] and break the pair unpacking later
            classes.append(y)

In [None]:
# Read the vocabulary file, keeping the first whitespace-separated token of
# every non-blank line.
vocabulary = []
with open('vocabulary.txt', 'r') as vocabs:
    for line in vocabs:
        vocab = line.split()
        if vocab:  # skip blank lines instead of failing on vocab[0]
            vocabulary.append(vocab[0])

In [None]:
# X is every column of `data` except the first and the last; Y is the last
# column and is used as the class label from here on.
# NOTE(review): column 0 is dropped - presumably a row/document id; confirm.
X, Y = data[:, 1:-1], data[:, -1]

# Hold out 20% of the rows for validation, stratified on the label, with a
# fixed seed so the split is reproducible.
xTrain, xVal, yTrain, yVal = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

## Training the Model

In [None]:
# Prior probability P(Y): the fraction of training rows carrying each class id.
# `prior` ends up with one [class id, probability] pair per entry of `classes`,
# in the same order as `classes`.
tCount = yTrain.size
prior = []
for i, j in classes:
    # Count matching labels with one vectorized comparison rather than a
    # Python-level scan over every training label.
    vCount = int(np.count_nonzero(yTrain == int(i)))
    prior.append([int(i), vCount / tCount])

In [None]:
# Build the per-class count matrix: row k is the column-wise sum of all
# training rows whose label equals the k-th class id in `classes`.
# The zero-row seed array keeps the result well-formed (0 rows, one column
# per feature) even when `classes` is empty.
classRows = [np.empty((0, xTrain.shape[1]), dtype=xTrain.dtype)]
for i, j in classes:
    classMask = yTrain == int(i)
    classRows.append(np.sum(xTrain[classMask], axis=0))
vData = np.vstack(classRows)

In [None]:
# Likelihood of every vocabulary entry given each class, P(X|Y), with
# additive smoothing: (alpha - 1) is added to each count and
# (alpha - 1) * vSize to the per-class total.
vSize = len(vocabulary)
# Per-class totals kept as a column so they broadcast across vData's columns.
tWords = np.sum(vData, axis=1, keepdims=True)
# The default smoothing strength is the reciprocal of each class's total.
beta = 1 / tWords
alpha = 1 + beta
likelihoods = (vData + (alpha - 1)) / (tWords + vSize * (alpha - 1))

In [None]:
# Score every validation row against every class:
#   log2 P(Y) + sum over columns of (count * log2 P(X|Y)),
# then pick the best-scoring class.
classIds = np.array([classId for classId, _ in prior])
logPrior = np.log2(np.array(prior)[:, 1])
scores = logPrior + xVal @ np.log2(likelihoods.T)
# Row k of `likelihoods` and entry k of `prior` describe the same class, so
# the winning index is mapped back to that class id instead of assuming the
# ids are exactly 1..K in order.
predict = classIds[np.argmax(scores, axis=1)]
accuracy = round(np.mean(predict == yVal), 2)
print("Accuracy:", accuracy)

## Experimentation

In [None]:
# Re-fit the likelihoods for several smoothing strengths and report the
# validation accuracy of each.
def naiveBayesTest(betaValues, roundingValue):
    """Print the validation accuracy obtained for each smoothing value.

    For every ``beta`` the likelihood table is recomputed from the
    module-level ``vData``, ``tWords`` and ``vSize``, the validation rows in
    ``xVal`` are classified, and the accuracy against ``yVal`` is printed.
    The class priors are read from the module-level ``prior``.

    Args:
        betaValues: Iterable of smoothing strengths to evaluate, in order.
        roundingValue: Number of decimal places the printed accuracy is
            rounded to.

    Returns:
        None. Results are only printed.
    """
    # Neither the class ids nor the log-priors depend on beta, so they are
    # computed once outside the loop.
    classIds = np.array([classId for classId, _ in prior])
    logPrior = np.log2(np.array(prior)[:, 1])
    for beta in betaValues:
        alpha = 1 + beta
        likelihoods = (vData + (alpha - 1)) / (tWords + ((alpha - 1) * vSize))
        scores = logPrior + xVal @ np.log2(likelihoods.T)
        # Map the winning row index back to its class id rather than
        # assuming the ids are exactly 1..K in order.
        predict = classIds[np.argmax(scores, axis=1)]
        accuracy = round(np.mean(predict == yVal), roundingValue)
        print("Beta:", beta, "| Accuracy:", accuracy)

In [None]:
# Smoothing strengths to compare. 0.0001 used to appear twice (once written
# as 1e-4), which evaluated and printed the same setting two times; it is
# listed once here.
# NOTE(review): if a smaller value such as 1e-5 was intended for the first
# entry, add it explicitly.
betaValues = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
naiveBayesTest(betaValues, 4)