# Exercise for Unit 4.1 Naïve Bayes

In [11]:
# Import necessary libraries
import re # Regular expressions for parsing input
import math # Math library for calculations
from collections import defaultdict # For creating a default dictionary to store graph data

In [None]:
class NaiveBayesManual:
    """Bag-of-words Naive Bayes text classifier with Laplace (add-one) smoothing.

    Training accumulates, per label, the number of documents and the number of
    occurrences of every token. Prediction scores each trained label by
    ``log(prior) + sum(log(likelihood(word)))`` and returns the best one.

    The query methods (``calculate_prior``, ``calculate_likelihood``,
    ``predict``) never modify the trained counts. The count tables are
    ``defaultdict`` objects, so they are read with ``.get()``: plain indexing
    would insert a zero entry for every unseen label or word that is queried.
    """

    # Anything that is not a lowercase ASCII letter or whitespace is removed.
    _NON_LETTER = re.compile(r"[^a-z\s]")

    # Initialize the Naive Bayes Classifier
    def __init__(self):
        # Every distinct token seen during training, across all labels.
        self.vocabulary = set()
        # label -> token -> number of occurrences in that label's documents.
        self.word_counts = defaultdict(lambda: defaultdict(int))
        # label -> number of training documents.
        self.class_counts = defaultdict(int)
        # label -> total number of tokens in that label's documents.
        self.total_words_per_class = defaultdict(int)

    # ----------------------------
    # 1. Text Preprocessing
    # ----------------------------
    def tokenize(self, text):
        """Lowercase *text*, drop every character outside a-z/whitespace, and
        split on whitespace.

        Characters are deleted rather than replaced by a space, so
        "click-here" becomes the single token "clickhere".
        """
        return self._NON_LETTER.sub("", text.lower()).split()

    # ----------------------------
    # a. Generate Bag of Words
    # ----------------------------
    def train(self, dataset):
        """Add the documents in *dataset* to the model's counts.

        Each item must support ``item["text"]`` and ``item["label"]``.
        Calling ``train`` again adds to the existing counts.
        """
        for item in dataset:
            text = item["text"]
            label = item["label"]

            # A document counts towards its class prior even if it yields no tokens.
            self.class_counts[label] += 1
            words = self.tokenize(text)

            for word in words:
                self.vocabulary.add(word)
                self.word_counts[label][word] += 1
                self.total_words_per_class[label] += 1

    # ----------------------------
    # b. Calculate Prior
    # ----------------------------
    def calculate_prior(self, label):
        """Return the fraction of training documents labelled *label*.

        Returns 0.0 for a label that was never seen and for a model that has
        not been trained yet.
        """
        total_docs = sum(self.class_counts.values())
        if total_docs == 0:
            return 0.0
        # .get() keeps an unknown label from being inserted with a zero count,
        # which would later make predict() evaluate log(0).
        return self.class_counts.get(label, 0) / total_docs

    # ----------------------------
    # c. Calculate Likelihood (Laplace smoothing)
    # ----------------------------
    def calculate_likelihood(self, word, label):
        """Return the add-one smoothed estimate of P(word | label).

        Computed as ``(count(word, label) + 1) / (tokens(label) + |vocabulary|)``,
        so a word never seen with *label* still gets a non-zero value.

        Raises:
            ZeroDivisionError: if the vocabulary is empty and *label* has no
                tokens (nothing with text has been trained).
        """
        word_count = self.word_counts.get(label, {}).get(word, 0)
        total_words = self.total_words_per_class.get(label, 0)
        vocab_size = len(self.vocabulary)

        return (word_count + 1) / (total_words + vocab_size)

    # ----------------------------
    # d. Classify Sentence
    # ----------------------------
    def predict(self, text):
        """Classify *text* and return ``(predicted_label, score)``.

        ``score`` is the winning label's log prior plus the sum of the log
        likelihoods of the tokens in *text*. It is an unnormalised
        log-score (zero or negative), not a probability between 0 and 1.

        Raises:
            ValueError: if no label has any training documents.
        """
        words = self.tokenize(text)
        scores = {}

        for label, doc_count in self.class_counts.items():
            # A label without documents has prior 0, and log(0) is undefined.
            if doc_count <= 0:
                continue

            # Start with log prior
            score = math.log(self.calculate_prior(label))

            # Add log likelihoods (logs are summed to avoid floating-point underflow)
            for word in words:
                score += math.log(self.calculate_likelihood(word, label))

            scores[label] = score

        if not scores:
            raise ValueError(
                "cannot predict: the model has not been trained on any documents"
            )

        predicted_class = max(scores, key=scores.get)
        confidence_score = scores[predicted_class]

        return predicted_class, confidence_score



In [13]:
# Load the labelled training data. NaiveBayesManual.train() reads item["text"]
# and item["label"] from every entry, so `dataset` must be an iterable of
# items exposing those two keys.
from dataset import dataset

# Create an empty classifier, then accumulate its per-class document and word
# counts from the dataset. `model` is reused by the cells below.
model = NaiveBayesManual()
model.train(dataset)


In [16]:
# Prior of a class = training documents with that label / all training documents.
print("Prior SPAM:", model.calculate_prior("SPAM"))
print("Prior HAM:", model.calculate_prior("HAM"))

Prior SPAM: 0.45454545454545453
Prior HAM: 0.5454545454545454


In [17]:
# Actual Testing
# Two hand-written sentences, one expected to be classified as SPAM and one as HAM.
sentence1 = "Limited offer, click here!"
sentence2 = "Meeting at 2 PM with the manager."

# predict() returns a (label, score) tuple. The score is the winning class's
# log prior plus the summed log likelihoods of the sentence's tokens, so it is
# a negative log-score rather than a probability between 0 and 1.
prediction1 = model.predict(sentence1)
prediction2 = model.predict(sentence2)

print("Sentence 1:", prediction1[0], "with confidence score:", prediction1[1]) # Expected: SPAM
print("Sentence 2:", prediction2[0], "with confidence score:", prediction2[1]) # Expected: HAM


Sentence 1: SPAM with confidence score: -15.281554724250565
Sentence 2: HAM with confidence score: -22.153860243779278
