# Exercise for Unit 4.1 Na√Øve Bayes

In [49]:
# Import necessary libraries
import re # Regular expressions for parsing input
import math # Math library for calculations
from collections import defaultdict # For creating a default dictionary to store graph data

In [50]:
class NaiveBayesManual:

    # Initialize the Naive Bayes Classifier
    def __init__(self):
        self.vocabulary = set()
        self.word_counts = defaultdict(lambda: defaultdict(int))
        self.class_counts = defaultdict(int)
        self.total_words_per_class = defaultdict(int)
    
    # ----------------------------
    # 1. Text Preprocessing
    # ----------------------------
    def tokenize(self, text):
        text = text.lower()
        text = re.sub(r"[^a-z0-9\s]", "", text)  # Keep letters, numbers, and spaces
        return text.split()
    
    # ----------------------------
    # a. Generate Bag of Words
    # ----------------------------
    def train(self, dataset):
        for item in dataset:
            text = item["text"]
            label = item["label"]
            
            self.class_counts[label] += 1
            words = self.tokenize(text)

            for word in words:
                self.vocabulary.add(word)
                self.word_counts[label][word] += 1
                self.total_words_per_class[label] += 1

    # ----------------------------
    # b. Calculate Prior
    # ----------------------------
    def calculate_prior(self, label):
        total_docs = sum(self.class_counts.values())
        return self.class_counts[label] / total_docs

    # ----------------------------
    # c. Calculate Likelihoo (Laplace smoothing)
    # ----------------------------
    def calculate_likelihood(self, word, label):
        word_count = self.word_counts[label][word]
        total_words = self.total_words_per_class[label]
        vocab_size = len(self.vocabulary)

        return (word_count + 1) / (total_words + vocab_size)

    # ----------------------------
    # d. Classify Sentence
    # ----------------------------
    def predict(self, text):
        words = self.tokenize(text)
        scores = {}

        for label in self.class_counts:
            # Start with log prior
            scores[label] = math.log(self.calculate_prior(label))

            # Add log likelihoods
            for word in words:
                likelihood = self.calculate_likelihood(word, label)
                scores[label] += math.log(likelihood)

        predicted_class = max(scores, key=scores.get)
        confidence_score = scores[predicted_class]
        
        return predicted_class, confidence_score



In [None]:
# Load the dataset
from dataset import dataset

model = NaiveBayesManual()
model.train(dataset)

print(sorted(model.vocabulary))


Updated vocabulary with numbers:
['3', '50', 'a', 'are', 'at', 'can', 'catch', 'click', 'dinner', 'for', 'free', 'get', 'here', 'hi', 'how', 'in', 'iphone', 'lets', 'limited', 'lowest', 'meds', 'meeting', 'mom', 'money', 'now', 'off', 'office', 'on', 'pm', 'price', 'prizes', 'report', 'send', 'still', 'team', 'the', 'time', 'today', 'tomorrow', 'up', 'we', 'win', 'you', 'your']


In [52]:
# a. Generate Bag of Words
print("\nVocabulary:", model.vocabulary)
print("[SPAM]", dict(model.word_counts["SPAM"]))
print("[HAM]", dict(model.word_counts["HAM"]))


Vocabulary: {'for', 'catch', 'office', 'team', 'iphone', 'now', 'lowest', 'lets', 'your', 'can', 'here', 'still', 'a', 'the', 'off', 'click', 'how', 'time', 'mom', 'hi', 'in', 'get', '50', 'are', '3', 'prizes', 'limited', 'meds', 'send', 'report', 'up', 'tomorrow', 'free', 'at', 'money', 'on', 'pm', 'price', 'win', 'dinner', 'you', 'meeting', 'today', 'we'}
[SPAM] {'free': 2, 'money': 1, 'now': 1, 'lowest': 1, 'price': 1, 'for': 2, 'your': 1, 'meds': 1, 'win': 1, 'a': 1, 'iphone': 1, 'today': 1, 'get': 1, '50': 1, 'off': 1, 'limited': 1, 'time': 1, 'click': 1, 'here': 1, 'prizes': 1}
[HAM] {'hi': 1, 'mom': 1, 'how': 1, 'are': 2, 'you': 2, 'we': 1, 'still': 1, 'on': 1, 'for': 1, 'dinner': 1, 'lets': 1, 'catch': 1, 'up': 1, 'tomorrow': 2, 'at': 2, 'the': 3, 'office': 2, 'meeting': 2, '3': 1, 'pm': 1, 'team': 1, 'in': 1, 'can': 1, 'send': 1, 'report': 1}


In [53]:
# b. Calculate Prior
print("Prior SPAM:", model.calculate_prior("SPAM"))
print("Prior HAM:", model.calculate_prior("HAM"))

Prior SPAM: 0.45454545454545453
Prior HAM: 0.5454545454545454


In [54]:
# c. Calculate Likelihood (Laplace smoothing)

print("Likelihood Calculations with Laplace Smoothing:")
print("=" * 50)
for word in model.vocabulary:
    spam_likelihood = model.calculate_likelihood(word, "SPAM")
    ham_likelihood = model.calculate_likelihood(word, "HAM")
    print(f"P({word}|SPAM) = {spam_likelihood:.6f}")
    print(f"P({word}|HAM)  = {ham_likelihood:.6f}")
    print("-" * 30)

Likelihood Calculations with Laplace Smoothing:
P(for|SPAM) = 0.045455
P(for|HAM)  = 0.025974
------------------------------
P(catch|SPAM) = 0.015152
P(catch|HAM)  = 0.025974
------------------------------
P(office|SPAM) = 0.015152
P(office|HAM)  = 0.038961
------------------------------
P(team|SPAM) = 0.015152
P(team|HAM)  = 0.025974
------------------------------
P(iphone|SPAM) = 0.030303
P(iphone|HAM)  = 0.012987
------------------------------
P(now|SPAM) = 0.030303
P(now|HAM)  = 0.012987
------------------------------
P(lowest|SPAM) = 0.030303
P(lowest|HAM)  = 0.012987
------------------------------
P(lets|SPAM) = 0.015152
P(lets|HAM)  = 0.025974
------------------------------
P(your|SPAM) = 0.030303
P(your|HAM)  = 0.012987
------------------------------
P(can|SPAM) = 0.015152
P(can|HAM)  = 0.025974
------------------------------
P(here|SPAM) = 0.030303
P(here|HAM)  = 0.012987
------------------------------
P(still|SPAM) = 0.015152
P(still|HAM)  = 0.025974
-------------------------

In [55]:
# Actual Testing
sentence1 = "Limited offer, click here!"
sentence2 = "Meeting at 2 PM with the manager."

prediction1 = model.predict(sentence1)
prediction2 = model.predict(sentence2)

print("Sentence 1:", prediction1[0], "with confidence score:", prediction1[1]) # Expected: SPAM
print("Sentence 2:", prediction2[0], "with confidence score:", prediction2[1]) # Expected: HAM


Sentence 1: SPAM with confidence score: -15.467634786790136
Sentence 2: HAM with confidence score: -26.73610763753005
