In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [None]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
print(df.describe)

In [None]:
df = df.drop(["Unnamed: 0", "label_num"], axis=1)
print(df.describe)

In [None]:
df.columns = ['label', 'text']
print(df.describe)

In [None]:
df['b_labels'] = df['label'].map({'ham': 0, 'spam': 1})
print(df.head())

In [None]:
y = df['b_labels'].values
x_train, x_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.33)


In [None]:
print(x_train)

In [None]:
print(y_train)

In [None]:
class NaiveBayesSpamClassifier:
    """Multinomial Naive Bayes spam filter over whitespace-separated tokens.

    Labels are expected to be 0 (ham) or 1 (spam).  After ``fit`` the instance
    exposes:

    * ``word_dict``  -- ``{token: (ham_occurrences, spam_occurrences)}``
    * ``spam_count`` / ``ham_count`` -- number of training messages per class
    * ``p_spam`` / ``p_ham`` -- class priors
    * ``spam_token_total`` / ``ham_token_total`` / ``vocab_size`` -- totals
      used as the Laplace-smoothing denominators
    """

    @staticmethod
    def _tokenize(message):
        """Split ``message`` on whitespace; non-string entries (e.g. NaN) yield no tokens."""
        return message.split() if isinstance(message, str) else []

    @staticmethod
    def _log_prior(prior):
        """Return ``log(prior)``, mapping a zero prior to ``-inf`` without a warning."""
        return float(np.log(prior)) if prior > 0 else float("-inf")

    def fit(self, x_train, y_train):
        """Count token occurrences per class and estimate the class priors.

        Parameters
        ----------
        x_train : iterable of str
            Training messages.  Any iterable works (list, pandas Series with an
            arbitrary index, ...); it is materialised positionally.
        y_train : array-like of {0, 1}
            One label per message, 0 = ham and 1 = spam.

        Raises
        ------
        ValueError
            If the inputs are empty or their lengths differ.
        """
        # Materialise positionally so a Series with a shuffled index is handled
        # correctly instead of raising (previously swallowed) KeyErrors.
        messages = list(x_train)
        labels = np.asarray(y_train)
        if len(messages) != len(labels):
            raise ValueError("x_train and y_train must have the same length")
        if len(messages) == 0:
            raise ValueError("cannot fit on an empty training set")

        self.messages = messages
        self.labels = labels

        self.word_dict = {}
        self.ham_token_total = 0
        self.spam_token_total = 0

        for message, label in zip(messages, labels):
            is_ham = int(label == 0)
            is_spam = int(label == 1)
            for word in self._tokenize(message):
                ham_seen, spam_seen = self.word_dict.get(word, (0, 0))
                self.word_dict[word] = (ham_seen + is_ham, spam_seen + is_spam)
                self.ham_token_total += is_ham
                self.spam_token_total += is_spam

        self.vocab_size = len(self.word_dict)
        self.spam_count = int(np.sum(labels == 1))
        self.ham_count = len(labels) - self.spam_count
        self.p_spam = self.spam_count / len(labels)
        self.p_ham = 1 - self.p_spam

    def predict(self, message):
        """Return True when ``message`` is more likely spam than ham.

        Scores are accumulated in log space so long messages do not underflow
        to zero.  Word likelihoods use Laplace smoothing,
        ``(count + 1) / (tokens_in_class + vocab_size)``.  Words never seen
        during training are ignored, and ties resolve to ham (False).
        """
        log_spam = self._log_prior(self.p_spam)
        log_ham = self._log_prior(self.p_ham)
        spam_denominator = self.spam_token_total + self.vocab_size
        ham_denominator = self.ham_token_total + self.vocab_size

        for word in self._tokenize(message):
            counts = self.word_dict.get(word)
            if counts is None:
                continue
            log_spam += np.log((counts[1] + 1) / spam_denominator)
            log_ham += np.log((counts[0] + 1) / ham_denominator)
        return bool(log_spam > log_ham)

    def accuracy(self, X, y):
        """Return the fraction of messages in ``X`` whose prediction matches ``y``.

        Raises
        ------
        ValueError
            If the inputs are empty or their lengths differ.
        """
        messages = list(X)
        targets = np.asarray(y)
        if len(messages) != len(targets):
            raise ValueError("X and y must have the same length")
        if len(messages) == 0:
            raise ValueError("cannot compute accuracy on an empty set")

        # Keep every prediction; comparing only the last one against all of y
        # would not measure accuracy at all.
        predictions = np.array([self.predict(message) for message in messages])
        return float(np.mean(predictions == targets))
        

In [None]:
#make x_train a list
x_train = x_train.tolist()

In [None]:
x_test = x_test.tolist()

In [None]:
print(x_train)

In [None]:
# Instantiate and train the Naive Bayes Classifier
# (the hand-written token-count model, fitted on the raw text messages)
nb = NaiveBayesSpamClassifier()
nb.fit(x_train, y_train)



In [None]:
# Inspect the per-word table built by fit (shown as the cell's output).
nb.word_dict

In [None]:
# Calculate training and test accuracy
print("train accuracy:", nb.accuracy(x_train, y_train))
print("test accuracy:", nb.accuracy(x_test, y_test))

In [None]:
# The vectorizer is fitted on the training text only; the test text is
# transformed with that same fitted vectorizer.
# NOTE: x_train / x_test are rebound here from raw text to the vectorizer's
# output, so the cells above that expect text must not be re-run after this one.
tfidf = TfidfVectorizer(decode_error='ignore')
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

In [None]:
print(x_train)

In [None]:
#classify using naive bayes
# scikit-learn's MultinomialNB on the TF-IDF features, used as a reference model.
model = MultinomialNB()
model.fit(x_train, y_train)
print("train accuracy:", model.score(x_train, y_train))
print("test score:", model.score(x_test, y_test))



In [None]:

class NaiveBayesClassifier:
    """Binary multinomial Naive Bayes over a document-term matrix.

    Rows of ``X`` are documents, columns are vocabulary terms, and labels are
    0 (ham) or 1 (spam).  Precision, recall and F1 treat 1 as the positive
    class and return 0.0 when their denominator is zero.
    """

    @staticmethod
    def _as_dense(X):
        """Return ``X`` as an ndarray, densifying objects that offer ``toarray()``."""
        if hasattr(X, "toarray"):
            X = X.toarray()
        return np.asarray(X)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is 0."""
        return float(numerator / denominator) if denominator else 0.0

    def fit(self, X, y):
        """Estimate class priors and Laplace-smoothed per-term probabilities.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_docs, n_terms)
            Non-negative term weights (counts or TF-IDF values).
        y : array-like of {0, 1}, shape (n_docs,)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``len(y)``.
        """
        X = self._as_dense(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D document-term matrix")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of rows")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")

        # Separate documents by class
        self.spam_docs = X[y == 1]
        self.ham_docs = X[y == 0]

        # Prior probabilities P(spam) and P(ham)
        n_docs = X.shape[0]
        self.p_spam = len(self.spam_docs) / n_docs
        self.p_ham = len(self.ham_docs) / n_docs

        # Per-term totals within each class, and the grand total per class
        self.spam_word_count = np.sum(self.spam_docs, axis=0)
        self.ham_word_count = np.sum(self.ham_docs, axis=0)
        self.spam_total = np.sum(self.spam_word_count)
        self.ham_total = np.sum(self.ham_word_count)

        # Vocabulary size
        self.vocab_size = X.shape[1]

        # Conditional probabilities with Laplace (add-one) smoothing
        self.spam_prob = (self.spam_word_count + 1) / (self.spam_total + self.vocab_size)
        self.ham_prob = (self.ham_word_count + 1) / (self.ham_total + self.vocab_size)

    def predict_log_proba(self, X):
        """Return unnormalised log scores with columns ``[ham, spam]``, one row per document."""
        X = self._as_dense(X)
        # A class absent from the training data has prior 0; log(0) = -inf is
        # the intended score, so silence the divide warning.
        with np.errstate(divide="ignore"):
            log_prior_spam = np.log(self.p_spam)
            log_prior_ham = np.log(self.p_ham)
        log_prob_spam = X @ np.log(self.spam_prob) + log_prior_spam
        log_prob_ham = X @ np.log(self.ham_prob) + log_prior_ham

        # Combine into a matrix of log scores for each class
        return np.vstack([log_prob_ham, log_prob_spam]).T

    def predict(self, X):
        """Return the predicted label (0 = ham, 1 = spam) for each document."""
        log_probs = self.predict_log_proba(X)
        # Column index doubles as the label because columns are ordered [ham, spam].
        return np.argmax(log_probs, axis=1)

    def _positive_counts(self, X, y):
        """Return ``(true_positives, predicted_positives, actual_positives)``."""
        y = np.asarray(y)
        predictions = self.predict(X)
        true_positives = int(np.sum((predictions == 1) & (y == 1)))
        return true_positives, int(np.sum(predictions == 1)), int(np.sum(y == 1))

    def accuracy(self, X, y):
        """Return the fraction of documents whose predicted label equals ``y``."""
        predictions = self.predict(X)
        return float(np.mean(predictions == np.asarray(y)))

    def precision(self, X, y):
        """Return TP / predicted positives (0.0 when nothing is predicted positive)."""
        true_positives, predicted_positives, _ = self._positive_counts(X, y)
        return self._ratio(true_positives, predicted_positives)

    def recall(self, X, y):
        """Return TP / actual positives (0.0 when ``y`` has no positives)."""
        true_positives, _, actual_positives = self._positive_counts(X, y)
        return self._ratio(true_positives, actual_positives)

    def f1_score(self, X, y):
        """Return the harmonic mean of precision and recall (0.0 when both are 0)."""
        # One prediction pass serves both precision and recall.
        true_positives, predicted_positives, actual_positives = self._positive_counts(X, y)
        precision = self._ratio(true_positives, predicted_positives)
        recall = self._ratio(true_positives, actual_positives)
        return self._ratio(2 * precision * recall, precision + recall)




In [None]:

class NaiveBayesClassifier:
    """Multi-class multinomial Naive Bayes over a document-term matrix.

    Rows of ``X`` are documents and columns are vocabulary terms.  The classes
    are the sorted unique values of ``y`` seen in ``fit`` (stored in
    ``classes_``); all per-class lists (``C``, ``p_C``, ``C_count``,
    ``c_total``, ``C_prob``) follow that order.  Precision, recall and F1
    treat label 1 as the positive class and return 0.0 when their denominator
    is zero.
    """

    @staticmethod
    def _as_dense(X):
        """Return ``X`` as an ndarray, densifying objects that offer ``toarray()``."""
        if hasattr(X, "toarray"):
            X = X.toarray()
        return np.asarray(X)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is 0."""
        return float(numerator / denominator) if denominator else 0.0

    def fit(self, X, y):
        """Estimate class priors and Laplace-smoothed per-term probabilities.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_docs, n_terms)
            Non-negative term weights (counts or TF-IDF values).
        y : array-like, shape (n_docs,)
            Class label of each document.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``len(y)``.
        """
        X = self._as_dense(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D document-term matrix")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of rows")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")

        # Remember the labels so prediction never has to look at training data
        # (or at a notebook-global variable) again.
        self.classes_ = np.unique(y)

        # Separate documents by class
        self.C = [X[y == label] for label in self.classes_]

        # Prior probability of each class
        n_docs = X.shape[0]
        self.p_C = [len(class_docs) / n_docs for class_docs in self.C]

        # Per-term totals within each class, and the grand total per class
        self.C_count = [np.sum(class_docs, axis=0) for class_docs in self.C]
        self.c_total = [np.sum(term_totals) for term_totals in self.C_count]

        # Vocabulary size
        self.vocab_size = X.shape[1]

        # Conditional probabilities with Laplace (add-one) smoothing
        self.C_prob = [
            (term_totals + 1) / (class_total + self.vocab_size)
            for term_totals, class_total in zip(self.C_count, self.c_total)
        ]

    def predict_log_proba(self, X):
        """Return unnormalised log scores, one row per document and one column per class."""
        X = self._as_dense(X)
        log_prob_C = [
            X @ np.log(term_probs) + np.log(prior)
            for term_probs, prior in zip(self.C_prob, self.p_C)
        ]

        # Combine into a matrix of log scores for each class
        return np.vstack(log_prob_C).T

    def predict(self, X):
        """Return the predicted class label for each document."""
        log_probs = self.predict_log_proba(X)
        # Map the winning column back to its label so non-0..K-1 label sets work.
        return self.classes_[np.argmax(log_probs, axis=1)]

    def _positive_counts(self, X, y):
        """Return ``(true_positives, predicted_positives, actual_positives)`` for label 1."""
        y = np.asarray(y)
        predictions = self.predict(X)
        true_positives = int(np.sum((predictions == 1) & (y == 1)))
        return true_positives, int(np.sum(predictions == 1)), int(np.sum(y == 1))

    def accuracy(self, X, y):
        """Return the fraction of documents whose predicted label equals ``y``."""
        predictions = self.predict(X)
        return float(np.mean(predictions == np.asarray(y)))

    def precision(self, X, y):
        """Return TP / predicted positives (0.0 when nothing is predicted positive)."""
        true_positives, predicted_positives, _ = self._positive_counts(X, y)
        return self._ratio(true_positives, predicted_positives)

    def recall(self, X, y):
        """Return TP / actual positives (0.0 when ``y`` has no positives)."""
        true_positives, _, actual_positives = self._positive_counts(X, y)
        return self._ratio(true_positives, actual_positives)

    def f1_score(self, X, y):
        """Return the harmonic mean of precision and recall (0.0 when both are 0)."""
        # One prediction pass serves both precision and recall.
        true_positives, predicted_positives, actual_positives = self._positive_counts(X, y)
        precision = self._ratio(true_positives, predicted_positives)
        recall = self._ratio(true_positives, actual_positives)
        return self._ratio(2 * precision * recall, precision + recall)




In [None]:
#transform xtrain to numpy array
x_train = x_train.toarray()
x_test = x_test.toarray()


In [None]:
# Instantiate and train the Naive Bayes Classifier
nb = NaiveBayesClassifier()
nb.fit(x_train, y_train)

# Calculate training and test accuracy
print("train accuracy:", nb.accuracy(x_train, y_train))
print("train precision:", nb.precision(x_train, y_train))
print("train recall:", nb.recall(x_train, y_train))
print("train f1 score:", nb.f1_score(x_train, y_train))
print("test accuracy:", nb.accuracy(x_test, y_test))
print("test precision:", nb.precision(x_test, y_test))
print("test recall:", nb.recall(x_test, y_test))
print("test f1 score:", nb.f1_score(x_test, y_test))
