In [6]:
from src.gmail_api import fetch_spam_and_ham
import os
import math

if os.path.exists('token.json'):
    os.remove('token.json')
    spam_mails, ham_mails = fetch_spam_and_ham(100)
else:
    spam_mails, ham_mails = fetch_spam_and_ham(100)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=628982370333-kkqj66e4850ij5qm0nr2l4b6a514s23e.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.readonly&state=aIY7xURTOwJZHIqpoyraHKM0n5x3q7&access_type=offline
Found 16 spam messages and 100 ham messages.


## Implementation 1

In [4]:
print(f"Type of ham and spam mails: {type(spam_mails)}")

Type of ham and spam mails: <class 'dict'>


In [7]:
spam_mails = {msg_id: metadata for msg_id, metadata in spam_mails.items() if metadata['content'] != ''}
ham_mails = {msg_id: metadata for msg_id, metadata in ham_mails.items() if metadata['content'] != ''}

print(f"Number of spam mails: {len(spam_mails)}")
print(f"Number of ham mails: {len(ham_mails)}")

Number of spam mails: 9
Number of ham mails: 75


In [8]:
p_spam = len(spam_mails)/(len(spam_mails) + len(ham_mails))
p_ham = len(ham_mails)/(len(spam_mails) + len(ham_mails))
print(f"Probability of spam: {p_spam}")
print(f"Probability of ham: {p_ham}")

Probability of spam: 0.10714285714285714
Probability of ham: 0.8928571428571429


In [9]:
spam_iterator = list(spam_mails.items())
ham_iterator = list(ham_mails.items())

In [10]:
from collections import Counter

spam_word_dict = {}
ham_word_dict = {}

spam_word_dict = dict(Counter(
    word.lower()
    for _, metadata_dict in spam_iterator
    for word in metadata_dict['content'].split()
))

ham_word_dict = dict(Counter(
    word.lower()
    for _, metadata_dict in ham_iterator
    for word in metadata_dict['content'].split()
))

In [11]:
spam_word_count = dict(sorted(spam_word_dict.items(), key=lambda x: x[1], reverse=True))
ham_word_count = dict(sorted(ham_word_dict.items(), key=lambda x: x[1], reverse=True))

In [12]:
total_words_in_spam = sum(spam_word_count.values())
total_words_in_ham = sum(ham_word_count.values())
vocab = set(list(spam_word_count.keys()) + list(ham_word_count.keys()))
vocab_size = len(vocab)
print(f"Total words in spam: {total_words_in_spam}")
print(f"Total words in ham: {total_words_in_ham}")
print(f"Vocabulary size (Unique words only from each set of Ham and Spam): {len(vocab)}")

Total words in spam: 8756
Total words in ham: 43874
Vocabulary size (Unique words only from each set of Ham and Spam): 9928


In [13]:
laplace_smoothing_factor = 1
epsilon = 1e-10

In [14]:
class TFNaiveBayesClassifier:
    def __init__(self, spam_word_count, ham_word_count, p_spam, p_ham, total_words_in_spam, total_words_in_ham, vocab_size, laplace_smoothing_factor):
        self.spam_word_count = spam_word_count
        self.ham_word_count = ham_word_count
        self.p_spam = p_spam
        self.p_ham = p_ham
        self.total_words_in_spam = total_words_in_spam
        self.total_words_in_ham = total_words_in_ham
        self.vocab_size = vocab_size
        self.laplace_smoothing_factor = laplace_smoothing_factor
    
    def get_word_probability(self,word,class_word_count_dict,total_words_in_class):
        """Calculate the P(word|Class) with laplace smoothing"""
        word_count = class_word_count_dict.get(word, 0)
        probability = (word_count + self.laplace_smoothing_factor)/(total_words_in_class + self.laplace_smoothing_factor*self.vocab_size)
        return probability
    
    def get_word_probability_total(self, word):
        """Calculate the P(word) occuring in the training set (with laplace smoothing)"""
        spam_probability = self.get_word_probability(word, self.spam_word_count, self.total_words_in_spam)
        ham_probability = self.get_word_probability(word, self.ham_word_count, self.total_words_in_ham)
        return spam_probability*self.p_spam + ham_probability*self.p_ham
    
    def classification_probability(self, email_content):
        """Classify the email content as spam or ham"""
        words = email_content.lower().split()
        
        # probability_spam_given_words = self.p_spam
        # probability_ham_given_words = self.p_ham
        
        log_probability_spam = math.log(self.p_spam)
        log_probability_ham = math.log(self.p_ham)
        
        for word in words:
            probability_word_given_spam = self.get_word_probability(word,self.spam_word_count,self.total_words_in_spam)
            probability_word_given_ham = self.get_word_probability(word,self.ham_word_count,self.total_words_in_ham)
            
            # probability_spam_given_words *= probability_word_given_spam
            # probability_ham_given_words *= probability_word_given_ham
            
            log_probability_spam += math.log(probability_word_given_spam)
            log_probability_ham += math.log(probability_word_given_ham)
            
        # Normalize the probabilities
        # total_probability = probability_spam_given_words + probability_ham_given_words
        max_log_probability = max(log_probability_spam, log_probability_ham)
        
        log_probability_spam -= max_log_probability
        log_probability_ham -= max_log_probability
        probability_spam_given_words = math.exp(log_probability_spam)
        probability_ham_given_words = math.exp(log_probability_ham)
        
        total_probability = probability_spam_given_words + probability_ham_given_words
        
        #Final probabilities
        probability_spam_given_words /= total_probability
        probability_ham_given_words /= total_probability
        
        return {
            'spam_probability': probability_spam_given_words,
            'ham_probability': probability_ham_given_words
        }
        
    def classify(self, email_content):
        """Classify the email content as spam or ham"""
        probabilities = self.classification_probability(email_content)
        if probabilities['spam_probability'] > probabilities['ham_probability']:
            return 'spam'
        else:
            return 'ham'

In [15]:
custom_classifier = TFNaiveBayesClassifier(spam_word_count, ham_word_count, p_spam, p_ham, total_words_in_spam, total_words_in_ham, vocab_size, laplace_smoothing_factor=1)

In [16]:
for msg_id, metadata in list(spam_iterator):
    classification = custom_classifier.classify(metadata['content'])
    classification_probabilities = custom_classifier.classification_probability(metadata['content'])
    print(f"Message ID: {msg_id}, Classification: {classification}, Probabilities: {classification_probabilities['spam_probability']:.4f} (spam), {classification_probabilities['ham_probability']:.4f} (ham)")
    if classification != 'spam':
        print(f"False negative detected: {msg_id} classified as {classification}")

Message ID: 197b7a3cac8ab5f9, Classification: spam, Probabilities: 1.0000 (spam), 0.0000 (ham)
Message ID: 197ad02fb929f34f, Classification: spam, Probabilities: 1.0000 (spam), 0.0000 (ham)
Message ID: 1977c01f5cb0dd2e, Classification: spam, Probabilities: 1.0000 (spam), 0.0000 (ham)
Message ID: 19779781bb917411, Classification: spam, Probabilities: 1.0000 (spam), 0.0000 (ham)
Message ID: 19768ccac149dafb, Classification: spam, Probabilities: 1.0000 (spam), 0.0000 (ham)
Message ID: 19766868856ed1d5, Classification: spam, Probabilities: 1.0000 (spam), 0.0000 (ham)
Message ID: 1974346a02e839a8, Classification: spam, Probabilities: 1.0000 (spam), 0.0000 (ham)
Message ID: 19734207a456b7dd, Classification: spam, Probabilities: 1.0000 (spam), 0.0000 (ham)
Message ID: 197306ab16c6c557, Classification: spam, Probabilities: 1.0000 (spam), 0.0000 (ham)


In [27]:
for msg_id, metadata in list(ham_iterator):
    classification = custom_classifier.classify(metadata['content'])
    classification_probabilities = custom_classifier.classification_probability(metadata['content'])
    print(f"Message ID: {msg_id}, Classification: {classification}, Probabilities: {classification_probabilities}")
    if classification != 'ham':
        print(f"False positive detected: {msg_id} classified as {classification}")

Message ID: 197bca55d72ab5ff, Classification: ham, Probabilities: {'spam_probability': 0.005955513964599366, 'ham_probability': 0.9940444860354006}
Message ID: 197bb484ec145cb4, Classification: ham, Probabilities: {'spam_probability': 1.5554861892973933e-86, 'ham_probability': 1.0}
Message ID: 197bb2b187253e32, Classification: ham, Probabilities: {'spam_probability': 0.005955513964599366, 'ham_probability': 0.9940444860354006}
Message ID: 197badd0831731d3, Classification: ham, Probabilities: {'spam_probability': 2.231839129424329e-128, 'ham_probability': 1.0}
Message ID: 197b9fc7b3bda8ba, Classification: ham, Probabilities: {'spam_probability': 1.1095097183049564e-103, 'ham_probability': 1.0}
Message ID: 197b8586de386758, Classification: ham, Probabilities: {'spam_probability': 1.310097505425835e-14, 'ham_probability': 0.9999999999999869}
Message ID: 197b75ebae33ad9b, Classification: ham, Probabilities: {'spam_probability': 6.167606650688457e-50, 'ham_probability': 1.0}
Message ID: 197

In [119]:
# We have inlcuded many optimizations in this code, such as:
## 1. Using log probabilities to avoid underflow issues with very small probabilities which was resulting in NaN values and thus ZeroDivisionError.
## 2. Subtract the maximum log probability to avoid overflow issues with very large probabilities.
## 3. We have not used epsilon anywhere in this code, as we are using log probabilities which avoids the underflow issues.

# This code is a simple implementation of a Naive Bayes classifier for spam detection using term frequency only

In [120]:
# Another interesting thing to note is that, the spam classification is almost perfect, while the ham classification sometimes provides higher spam probabilities than what spam emails have in ham probabilities, i.e. there is a sense of spam being detected in ham emails too, but not as much as spam emails being detected as spam. This is because the spam emails have a lot of common words which are not present in ham emails, thus making it easier to classify them as spam.
# One can conclude that some of the ham emails are actually spam, but they are not classified as spam because they do not have enough common words with the spam emails. This is a limitation of the Naive Bayes classifier, as it assumes that the words are independent of each other, which is not always the case in real-world scenarios.

## Implementation 2

In [1]:
from src.gmail_api import fetch_spam_and_ham
import os

if os.path.exists('token.json'):
    os.remove('token.json')
    spam_mails, ham_mails = fetch_spam_and_ham(100)
else:
    spam_mails, ham_mails = fetch_spam_and_ham(100)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=628982370333-kkqj66e4850ij5qm0nr2l4b6a514s23e.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.readonly&state=HONVCZiNnObUGstCmu0p9bGzGuqOOG&access_type=offline
Found 17 spam messages and 100 ham messages.


In [19]:
from src.classifier_copy import TFNaiveBayesClassifier as TFNaiveBayesClassifierCopy

new_classifier = TFNaiveBayesClassifierCopy(laplace_smoothing_factor=1)

In [20]:
new_classifier.fit(spam_mails, ham_mails)

In [21]:
spam_iterator = list(spam_mails.items())
ham_iterator = list(ham_mails.items())

In [25]:
for msg_id, metadata in list(ham_iterator):
    classification = new_classifier.classify(metadata['content'])
    classification_probabilities = new_classifier.classification_probability(metadata['content'])
    print(f"Message ID: {msg_id}, Classification: {classification}, Probabilities: {classification_probabilities}")
    if classification != 'spam':
        print(f"Wrong classification detected: {msg_id} classified as {classification}")
    else:
        print(f"True positive detected: {msg_id} classified as {classification}")

Message ID: 197bca55d72ab5ff, Classification: ham, Probabilities: {'spam_probability': 0.010853855413729085, 'ham_probability': 0.9891461445862709}
Wrong classification detected: 197bca55d72ab5ff classified as ham
Message ID: 197bb484ec145cb4, Classification: ham, Probabilities: {'spam_probability': 1.32553842772058e-58, 'ham_probability': 1.0}
Wrong classification detected: 197bb484ec145cb4 classified as ham
Message ID: 197bb2b187253e32, Classification: ham, Probabilities: {'spam_probability': 0.010853855413729085, 'ham_probability': 0.9891461445862709}
Wrong classification detected: 197bb2b187253e32 classified as ham
Message ID: 197badd0831731d3, Classification: ham, Probabilities: {'spam_probability': 7.643329985756523e-83, 'ham_probability': 1.0}
Wrong classification detected: 197badd0831731d3 classified as ham
Message ID: 197b9fc7b3bda8ba, Classification: ham, Probabilities: {'spam_probability': 1.6254528634348027e-71, 'ham_probability': 1.0}
Wrong classification detected: 197b9fc