# Exercise for Unit 4.1 Naïve Bayes

In [41]:
# Import necessary libraries
import re # Regular expressions for parsing input
import math # Math library for calculations
from collections import defaultdict # Default dictionaries for the word and class counters

In [None]:

class NaiveBayesManual:
    """Multinomial Naive Bayes text classifier with Laplace (add-one) smoothing.

    The model is a set of plain counters that ``train`` fills in and that the
    ``calculate_*`` / ``predict`` methods only read. Read paths never insert
    keys into the counters, so querying an unseen label or word leaves the
    trained model untouched.
    """

    # Initialize the Naive Bayes Classifier
    def __init__(self):
        """Create an empty, untrained classifier."""
        # Every distinct token seen during training, across all classes.
        self.vocabulary = set()
        # word_counts[label][word] -> occurrences of `word` in `label` documents.
        self.word_counts = defaultdict(lambda: defaultdict(int))
        # class_counts[label] -> number of training documents with that label.
        self.class_counts = defaultdict(int)
        # total_words_per_class[label] -> total tokens seen for that label.
        self.total_words_per_class = defaultdict(int)

    # ----------------------------
    # 1. Text Preprocessing
    # ----------------------------
    def tokenize(self, text: str) -> list:
        """Lowercase *text*, drop everything except a-z and whitespace, and split.

        Digits, punctuation and non-ASCII letters are removed (not replaced by
        a space), so ``"Don't"`` becomes the single token ``"dont"``.

        Returns:
            The list of whitespace-separated tokens (possibly empty).
        """
        text = text.lower()
        text = re.sub(r"[^a-z\s]", "", text)
        return text.split()

    # ----------------------------
    # a. Generate Bag of Words
    # ----------------------------
    def train(self, dataset) -> None:
        """Accumulate document and word counts from *dataset*.

        Args:
            dataset: Iterable of ``(text, label)`` pairs. Calling ``train``
                again adds to the existing counts rather than resetting them.
        """
        for text, label in dataset:
            self.class_counts[label] += 1
            words = self.tokenize(text)

            for word in words:
                self.vocabulary.add(word)
                self.word_counts[label][word] += 1
                self.total_words_per_class[label] += 1

    # ----------------------------
    # b. Calculate Prior
    # ----------------------------
    def calculate_prior(self, label) -> float:
        """Return P(label): the fraction of training documents with *label*.

        A label that never appeared in training has prior ``0.0``.

        Raises:
            ZeroDivisionError: If no document has been trained on yet.
        """
        total_docs = sum(self.class_counts.values())
        # Use .get() rather than indexing: indexing a defaultdict inserts the
        # missing label with count 0, and predict() would then iterate over it
        # and fail on log(0).
        return self.class_counts.get(label, 0) / total_docs

    # ----------------------------
    # c. Calculate Likelihood (Laplace smoothing)
    # ----------------------------
    def calculate_likelihood(self, word, label) -> float:
        """Return the add-one smoothed P(word | label).

        Computed as ``(count(word, label) + 1) / (total_words(label) + |V|)``,
        so a word unseen for *label* still gets a non-zero probability.

        Raises:
            ZeroDivisionError: If the vocabulary is empty and *label* has no
                recorded words (e.g. an untrained model).
        """
        # Read-only lookups: never create entries for unseen labels or words.
        word_count = self.word_counts.get(label, {}).get(word, 0)
        total_words = self.total_words_per_class.get(label, 0)
        vocab_size = len(self.vocabulary)

        return (word_count + 1) / (total_words + vocab_size)

    # ----------------------------
    # d. Classify Sentence
    # ----------------------------
    def predict(self, text: str):
        """Return the label with the highest posterior score for *text*.

        Scores are summed in log space (log prior + log likelihood of every
        token) to avoid floating-point underflow on long inputs.

        Raises:
            ValueError: If the model has not been trained on any document.
        """
        if not self.class_counts:
            raise ValueError(
                "Cannot predict: the model has not been trained on any data"
            )

        words = self.tokenize(text)
        scores = {}

        for label in self.class_counts:
            # Start with log prior
            scores[label] = math.log(self.calculate_prior(label))

            # Add log likelihoods
            for word in words:
                likelihood = self.calculate_likelihood(word, label)
                scores[label] += math.log(likelihood)

        return max(scores, key=scores.get)



In [43]:
# Load the dataset
# NOTE(review): `dataset` comes from a local `dataset` module not shown here;
# `train` iterates it as (text, label) pairs — confirm against that module.
from dataset import dataset

# Build a fresh classifier and fit it; `model` is reused by the cells below.
model = NaiveBayesManual()
model.train(dataset)


Training on: 'Limited time offer click now' with label 'SPAM'
Training on: 'Exclusive offer just for you' with label 'SPAM'
Training on: 'Meeting at 2 PM with manager' with label 'HAM'
Training on: 'Project discussion tomorrow' with label 'HAM'
Training on: 'Click here to win prize' with label 'SPAM'
Training on: 'Team meeting schedule confirmed' with label 'HAM'


In [44]:
# Show the prior probability of each class in the trained model.
for label in ("SPAM", "HAM"):
    print(f"Prior {label}:", model.calculate_prior(label))

Prior SPAM: 0.5
Prior HAM: 0.5


In [45]:
# Actual Testing
# Test sentences to classify with the trained model.
sentence1 = "Limited offer, click here!"
sentence2 = "Meeting at 2 PM with the manager."
sentence3 = "You win! Claim your prize by submitting your bank account"
sentence4 = "Project discussion tomorrow at 10 AM"
sentence5 = "Don't miss this! You just won a free trip to the Bahamas! Click here to claim your prize now!"

# (printed label, sentence) pairs; the trailing comment is the expected class.
# Labels are reproduced exactly, including the trailing space on "Sentence 3: ".
for label, sentence in (
    ("Sentence 1:", sentence1),   # Expected: SPAM
    ("Sentence 2:", sentence2),   # Expected: HAM
    ("Sentence 3: ", sentence3),  # Expected: SPAM
    ("Sentence 4:", sentence4),   # Expected: HAM
    ("Sentence 5:", sentence5),   # Expected: SPAM
):
    print(label, model.predict(sentence))


Sentence 1: SPAM
Sentence 2: HAM
Sentence 3:  SPAM
Sentence 4: HAM
Sentence 5: SPAM
