## Multinomial Naive Bayes Implementation from Scratch

In [256]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [257]:
# Load the pre-cleaned SMS spam dataset: semicolon-separated, with the first
# column used as the row index. The last line previews the first rows.
spam = pd.read_csv(
    "../datasets/sms_spam/sms_spam_cleaned.csv",
    sep=";",
    index_col=0,
)
spam.head()

Unnamed: 0,class,message,stemmed,not_stemmed
0,ham,"Go until jurong point, crazy.. Available only ...",go wrong point crazi avail bug n great world l...,go until wrong point crazy available only in b...
1,ham,Ok lar... Joking wif u oni...,ok lar joke,ok lar joking if you on
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri number wili come win fa cup final t...,free entry in number a wily come to win fa cup...
3,ham,U dun say so early hor... U c already then say...,u dun say earli c alreadi say,u dun say so early for you c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe us live around though,nah i don t think he goes to us he lives aroun...


In [258]:
# Inspect dtypes and non-null counts; the output below shows that 'stemmed' and
# 'not_stemmed' contain a few missing values (fewer non-null rows than entries).
spam.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   class        5572 non-null   object
 1   message      5572 non-null   object
 2   stemmed      5562 non-null   object
 3   not_stemmed  5570 non-null   object
dtypes: object(4)
memory usage: 217.7+ KB


In [259]:
# Replace missing values in both preprocessed text columns with empty strings,
# so every row holds a str (the 'stemmed' column is vectorized further down).
spam[['stemmed', 'not_stemmed']] = spam[['stemmed', 'not_stemmed']].fillna("")

In [260]:
# Class balance check: the output below shows far more 'ham' than 'spam' rows.
spam['class'].value_counts()

class
ham     4825
spam     747
Name: count, dtype: int64

In [261]:
# Bag-of-words features: fit a vocabulary on the stemmed messages and count
# each term's occurrences per message.
cv = CountVectorizer()
# .toarray() turns the vectorizer's output into a dense NumPy array, which the
# NumPy indexing and arithmetic in the cells below operate on.
X = cv.fit_transform(spam['stemmed']).toarray()

In [262]:
# (number of messages, number of vocabulary terms found by the vectorizer)
X.shape

(5572, 4905)

In [263]:
# Encode the target as integers: 'ham' -> 0, any other class value (spam) -> 1.
y = (spam['class'] != 'ham').to_numpy().astype(int)

In [264]:
def shuffle_data(X, y, seed=None):
    """Shuffle samples and labels with one shared random permutation.

    Each row of ``X`` stays paired with its entry in ``y``.

    Parameters
    ----------
    X : array supporting integer-array indexing (e.g. ``np.ndarray``)
        Samples; the first axis enumerates them.
    y : array supporting integer-array indexing
        One label per sample.
    seed : int or None, default None
        When given, the permutation is drawn from a dedicated
        ``np.random.default_rng(seed)`` generator, so the same seed always
        yields the same order. When None, NumPy's global random state is used
        (the original behaviour), which can be seeded with ``np.random.seed``.

    Returns
    -------
    tuple
        ``(X_shuffled, y_shuffled)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    n_samples = len(X)
    # A longer y would otherwise be silently truncated to len(X) entries.
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )

    if seed is None:
        shuffle_indices = np.random.permutation(n_samples)
    else:
        shuffle_indices = np.random.default_rng(seed).permutation(n_samples)

    return X[shuffle_indices], y[shuffle_indices]

In [265]:
# Seed NumPy's global random state so the shuffle (and therefore the train/test
# split and the reported metrics) is identical on every Restart & Run All.
# Note: this cell overwrites X and y, so re-running it on its own permutes the
# already-shuffled arrays again.
np.random.seed(42)
X, y = shuffle_data(X, y)

In [266]:
def split_dataset(X, y, split_ratio):
    """Split samples and labels into a leading train part and a trailing test part.

    No shuffling happens here: the first ``int(len(X) * split_ratio)`` rows
    become the training set and the remaining rows the test set.

    Parameters
    ----------
    X : sliceable sequence or array
        Samples; the first axis enumerates them.
    y : sliceable sequence or array
        One label per sample.
    split_ratio : float
        Fraction of the samples assigned to the training set, in ``[0, 1]``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or ``split_ratio`` is outside
        ``[0, 1]`` (this also rejects NaN).
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    # A negative ratio would wrap around via negative slicing and a ratio above
    # 1 would leave an empty test set, both without any error.
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be in [0, 1], got {split_ratio}")

    split_size = int(n_samples * split_ratio)

    return X[:split_size], y[:split_size], X[split_size:], y[split_size:]

In [267]:
# Fraction of the samples assigned to the training set; the rest form the test set.
split_ratio = 0.75

In [268]:
# The leading `split_ratio` share of the shuffled rows is used for training;
# the trailing rows are held out for evaluation.
X_train, y_train, X_test, y_test = split_dataset(X, y, split_ratio)

In [269]:
class MultinomialNaiveBayes:
    """Binary multinomial Naive Bayes classifier for count features.

    Labels must be encoded as 0 and 1. Per-feature likelihoods are estimated
    with additive smoothing:

        P(feature j | class c) = (count_jc + alpha) / (total_c + alpha * n_features)

    Parameters
    ----------
    alpha : float, default 1.0
        Additive smoothing strength; must be positive. The default of 1 is
        Laplace (add-one) smoothing.

    Attributes set by ``fit``
    -------------------------
    likelihoods_1, likelihoods_0 : ndarray of shape (n_features,)
        Smoothed per-feature probabilities for class 1 and class 0.
    prior : float
        Fraction of training samples labelled 1.
    """

    def __init__(self, alpha=1.0):
        if alpha <= 0:
            raise ValueError(f"alpha must be positive, got {alpha}")
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate the class prior and smoothed feature likelihoods.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Non-negative feature counts.
        y : array-like of shape (n_samples,)
            Labels, each 0 or 1.
        """
        # Convert up front so that plain lists work with the boolean masks below.
        X = np.asarray(X)
        y = np.asarray(y)
        n_features = X.shape[1]

        counts_1 = np.sum(X[y == 1], axis=0)
        counts_0 = np.sum(X[y == 0], axis=0)

        # Smoothing keeps every likelihood strictly positive, so log() in
        # predict() never sees a zero for features unseen in a class.
        self.likelihoods_1 = (counts_1 + self.alpha) / (np.sum(counts_1) + self.alpha * n_features)
        self.likelihoods_0 = (counts_0 + self.alpha) / (np.sum(counts_0) + self.alpha * n_features)
        # With 0/1 labels the sum of y is the number of class-1 samples.
        self.prior = np.sum(y) / len(y)

    def predict(self, X):
        """Predict a 0/1 label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature counts with the same columns used in ``fit``.

        Returns
        -------
        ndarray of int, shape (n_samples,)
            1 where the class-1 log posterior is strictly greater than the
            class-0 log posterior, otherwise 0.
        """
        X = np.asarray(X)
        # The matrix-vector product equals sum(X * log(likelihood), axis=1)
        # without allocating an (n_samples, n_features) float temporary.
        joint_log_likelihood_1 = X @ np.log(self.likelihoods_1)
        joint_log_likelihood_0 = X @ np.log(self.likelihoods_0)
        log_posterior_1 = joint_log_likelihood_1 + np.log(self.prior)
        log_posterior_0 = joint_log_likelihood_0 + np.log(1 - self.prior)
        y_pred = (log_posterior_1 > log_posterior_0).astype(int)

        return y_pred

In [270]:
# Estimate the class prior and the smoothed per-term likelihoods from the training split.
model = MultinomialNaiveBayes()
model.fit(X_train, y_train)

In [271]:
# Predict 0/1 labels for the held-out test split.
y_pred = model.predict(X_test)

In [272]:
# Per-class precision, recall and F1 on the test split
# (label 0 = ham, label 1 = spam, following the encoding of y).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1211
           1       0.87      0.92      0.89       182

    accuracy                           0.97      1393
   macro avg       0.93      0.95      0.94      1393
weighted avg       0.97      0.97      0.97      1393

