In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import math

In [4]:
# Load the 20 Newsgroups corpus: one shuffled bunch per predefined split.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

In [8]:
# Preprocessing: convert the raw posts into TF-IDF features, dropping English
# stop words. The vocabulary is learned from the training documents only and
# then reused unchanged for the test documents.
vectorizer = TfidfVectorizer(stop_words='english')
# Tuple elements are evaluated left to right, so the vectorizer is fitted on
# the training data before the test data is transformed.
X_train, X_test = (
    vectorizer.fit_transform(newsgroups_train.data),
    vectorizer.transform(newsgroups_test.data),
)

In [11]:
# Reference model: scikit-learn's Multinomial Naive Bayes with default settings.
# fit() returns the estimator itself, so construction and training are chained.
model_sk = MultinomialNB().fit(X_train, newsgroups_train.target)
pred_sk = model_sk.predict(X_test)

In [17]:
# Evaluation of the sklearn model on the test split.
# Precision, recall and F1 use macro averaging.
accuracy_sk = accuracy_score(newsgroups_test.target, pred_sk)
precision_sk, recall_sk, f1_sk = (
    macro_metric(newsgroups_test.target, pred_sk, average='macro')
    for macro_metric in (precision_score, recall_score, f1_score)
)

In [20]:
# Report the sklearn model's test metrics, one metric per output line.
print(
    f"Sklearn Model - Accuracy: {accuracy_sk},\n"
    f" Precision: {precision_sk},\n"
    f" Recall: {recall_sk},\n"
    f" F1 Score: {f1_sk}"
)

Sklearn Model - Accuracy: 0.8169144981412639,
 Precision: 0.8367634643937156,
 Recall: 0.8022683859017992,
 F1 Score: 0.7998199251752561


In [22]:
# Naive Bayes from scratch
class NaiveBayes:
    """Multinomial Naive Bayes classifier with additive smoothing.

    For every class ``c`` the model estimates

    * a prior: the fraction of training rows labelled ``c``;
    * a per-feature likelihood:
      ``(sum of feature j over rows of c + alpha) /
      (sum of all features over rows of c + alpha * n_features)``.

    Prediction picks the class that maximises
    ``log(prior) + sum_j x_j * log(likelihood_j)``.

    Parameters
    ----------
    alpha : float, default 1.0
        Additive smoothing constant. The default reproduces the previously
        hard-coded "+1" (Laplace) smoothing. Must be strictly positive so that
        every likelihood is non-zero and its logarithm is finite.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly positive.
    """

    def __init__(self, alpha=1.0):
        if alpha <= 0:
            raise ValueError("alpha must be strictly positive")
        self.alpha = alpha

    def fit(self, x, y):
        """Estimate class priors and smoothed feature likelihoods.

        Parameters
        ----------
        x : 2-D matrix of shape (n_samples, n_features)
            Non-negative feature values. Must expose ``.shape``, boolean row
            indexing and ``.sum``; NumPy arrays and SciPy sparse matrices both
            do, so sparse input does not need to be densified.
        y : array-like of shape (n_samples,)
            Class label of each row of ``x``.
        """
        # Normalise labels so the boolean mask below is always an ndarray.
        y = np.asarray(y)
        n_samples, n_features = x.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        self._priors = np.zeros(n_classes)
        self._likelihoods = np.zeros((n_classes, n_features))

        for idx, c in enumerate(self._classes):
            X_c = x[y == c]
            self._priors[idx] = X_c.shape[0] / float(n_samples)
            # Sparse matrices return a (1, n_features) np.matrix from
            # sum(axis=0); flatten it so dense and sparse inputs take the same
            # path. For a dense ndarray this is a no-op.
            feature_totals = np.asarray(X_c.sum(axis=0)).ravel()
            self._likelihoods[idx, :] = (feature_totals + self.alpha) / (
                X_c.sum() + self.alpha * n_features
            )

        # The logs depend only on the fitted parameters, so compute them once
        # here instead of once per (sample, class) pair at prediction time.
        self._log_priors = np.log(self._priors)
        self._log_likelihoods = np.log(self._likelihoods)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : 2-D matrix of shape (n_samples, n_features)
            Dense array or SciPy sparse matrix with the same feature layout
            that was passed to ``fit``.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted labels, drawn from the labels seen in ``fit``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        # One matrix product yields sum_j x_j * log(likelihood_j) for every
        # (sample, class) pair; the result has shape (n_samples, n_classes).
        scores = np.asarray(X @ self._log_likelihoods.T)
        if scores.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        scores = scores + self._log_priors
        # argmax returns the first maximum, so ties resolve to the lowest class
        # index, matching a sequential scan over the classes.
        return self._classes[np.argmax(scores, axis=1)]

    def _predict(self, x):
        """Classify a single sample and return its label.

        ``x`` may be a 1-D feature vector or an already 2-D single-row matrix
        (for example one row of a sparse matrix).
        """
        sample = x if getattr(x, "ndim", 1) == 2 else np.atleast_2d(x)
        return self.predict(sample)[0]

In [23]:
# Train the from-scratch classifier on the same TF-IDF features and labels as
# the sklearn model, then predict the test split. Both feature matrices are
# converted to dense arrays before being handed to the custom class.
nb = NaiveBayes()
nb.fit(x=X_train.toarray(), y=newsgroups_train.target)
pred_custom = nb.predict(X=X_test.toarray())

In [24]:
# Evaluation of the from-scratch model, using the same metrics and macro
# averaging as for the sklearn model so the two are directly comparable.
accuracy_custom = accuracy_score(newsgroups_test.target, pred_custom)
precision_custom, recall_custom, f1_custom = (
    macro_metric(newsgroups_test.target, pred_custom, average='macro')
    for macro_metric in (precision_score, recall_score, f1_score)
)

In [26]:
# Print both models' test metrics.
print(
    f"Sklearn Model - Accuracy: {accuracy_sk},\n"
    f" Precision: {precision_sk},\n"
    f" Recall: {recall_sk},\n"
    f" F1 Score: {f1_sk}"
)
print(
    f"Custom Model - Accuracy: {accuracy_custom},\n"
    f" Precision: {precision_custom},\n"
    f" Recall: {recall_custom},\n"
    f" F1 Score: {f1_custom}"
)
# Comparing results: the same two summaries again under a "Comparison:" heading.
print(
    f"Comparison:\n"
    f" Sklearn Model - Accuracy: {accuracy_sk},\n"
    f" Precision: {precision_sk},\n"
    f" Recall: {recall_sk},\n"
    f" F1 Score: {f1_sk}"
)
print(
    f"Custom Model - Accuracy: {accuracy_custom},\n"
    f" Precision: {precision_custom},\n"
    f" Recall: {recall_custom},\n"
    f" F1 Score: {f1_custom}"
)

Sklearn Model - Accuracy: 0.8169144981412639,
 Precision: 0.8367634643937156,
 Recall: 0.8022683859017992,
 F1 Score: 0.7998199251752561
Custom Model - Accuracy: 0.8169144981412639,
 Precision: 0.8367634643937156,
 Recall: 0.8022683859017992,
 F1 Score: 0.7998199251752561
Comparison:
 Sklearn Model - Accuracy: 0.8169144981412639,
 Precision: 0.8367634643937156,
 Recall: 0.8022683859017992,
 F1 Score: 0.7998199251752561
Custom Model - Accuracy: 0.8169144981412639,
 Precision: 0.8367634643937156,
 Recall: 0.8022683859017992,
 F1 Score: 0.7998199251752561
