# SPAM or HAM

In [71]:
%matplotlib inline
import math
import nltk
import polars as pl
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/mantunes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mantunes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mantunes/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [62]:
dataset_train = [
('send us your password', 'spam'),
('review our website', 'spam'),
('send your password', 'spam'),
('send us your account', 'spam'),
('Your activity report', 'ham'),
('benefits physical activity', 'ham'),
('the importance vows', 'ham'),
]

dataset_test = [
    ('renew your password', 'spam'),
    ('renew your vows', 'spam'),
    ('benefits of our account', 'ham'),
    ('the importance of physical activity', 'ham')
]

print(f'{dataset_train}')
print(f'{dataset_test}')

[('send us your password', 'spam'), ('review our website', 'spam'), ('send your password', 'spam'), ('send us your account', 'spam'), ('Your activity report', 'ham'), ('benefits physical activity', 'ham'), ('the importance vows', 'ham')]
[('renew your password', 'spam'), ('renew your vows', 'spam'), ('benefits of our account', 'ham'), ('the importance of physical activity', 'ham')]


In [63]:
class NB:
    def __init__(self, k=1):
        self.k = k
        self.lemmatizer = WordNetLemmatizer()
    
    def _vocab(self, samples):
        vocab = [token for sample in samples for token in sample]
        #print(f'{samples}/{vocab}')
        return list(set(vocab))

    def _compute_likelihood(self, samples):
        likelihood = {}
        vocab = self._vocab(samples)
        
        for w in vocab:
            count = 0
            for sentence in samples:
                if w in sentence:
                    #print(w+":", sentence)
                    count += 1
            #print(f"Number of ham emails with the word '{w}': {count}")
            #prob = (count + self.k)/(len(samples) + 2.0*self.k) # smoothing
            #print(f"Probability of the word '{w}': {prob} ")
            likelihood[w.lower()] = count
        return likelihood
    
    def _p_word_spam(self, token):
        return (self.k + self.likelihood_spam.get(token, 0.0)) / ((2.0 * self.k) + self.num_spam_messages)

    def _p_word_ham(self, token):
        return (self.k + self.likelihood_ham.get(token, 0.0)) / ((2.0 * self.k) + self.num_ham_messages)

    def train(self, dataset):
        # compute priors
        dataset_total = len(dataset)
        spam_samples = [txt for txt, label in dataset if label == 'spam']
        ham_samples = [txt for txt, label in dataset if label == 'ham']

        #print(f'{spam_samples}')
        #print(f'{ham_samples}')

        self.ps = len(spam_samples) / dataset_total
        self.ph = len(ham_samples) / dataset_total

        #print(f'{self.ps} {self.ph}')

        # Pre-process text
        spam_samples = [nltk.word_tokenize(sample) for sample in spam_samples]
        ham_samples = [nltk.word_tokenize(sample) for sample in ham_samples]

        #print(f'{spam_samples}')
        #print(f'{ham_samples}')

        spam_samples = [[self.lemmatizer.lemmatize(w).lower() for w in tokens if len(self.lemmatizer.lemmatize(w)) > 2] for tokens in spam_samples]
        ham_samples = [[self.lemmatizer.lemmatize(w).lower() for w in tokens if len(self.lemmatizer.lemmatize(w)) > 2] for tokens in ham_samples]
        
        #print(f'{spam_samples}')
        #print(f'{ham_samples}')

        # compute_likelihood
        self.likelihood_spam = self._compute_likelihood(spam_samples)
        self.num_spam_messages = len(spam_samples)
        self.likelihood_ham = self._compute_likelihood(ham_samples)
        self.num_ham_messages = len(ham_samples)

        #print(f'{self.likelihood_spam}')
        #print(f'{self.likelihood_ham}')
    
    def predict(self, txt):
        # Pre-process text (similar to the train)
        tokens = nltk.word_tokenize(txt)
        tokens = [self.lemmatizer.lemmatize(w).lower() for w in tokens if len(self.lemmatizer.lemmatize(w)) > 2]

        #print(tokens)

        log_p_spam = 0.0
        log_p_ham = 0.0

        for t in tokens:
            log_p_spam += math.log(self._p_word_spam(t))
            log_p_ham += math.log(self._p_word_ham(t))
        
        prob_spam = (math.exp(log_p_spam)*self.ps)/(math.exp(log_p_spam)*self.ps+math.exp(log_p_ham)*self.ph)
        if prob_spam >= 0.5:
            return 'spam', prob_spam
        else:
            return 'ham', prob_spam

In [68]:
clf = NB()
clf.train(dataset_train)

In [69]:
acc = 0.0
for sentence, label in dataset_train:
    predicted_label, prob_spam = clf.predict(sentence)
    print(f'{sentence} -> {predicted_label} {prob_spam}')
    if label == predicted_label:
        acc += 1.0
print(f'Accuracy = {acc/len(dataset_train)}')


send us your password -> spam 0.9487666034155597
review our website -> spam 0.8605851979345954
send your password -> spam 0.9487666034155597
send us your account -> spam 0.9250693802035151
Your activity report -> ham 0.204582651391162
benefits physical activity -> ham 0.06041565973900433
the importance vows -> ham 0.08796622097114705
Accuracy = 1.0


In [70]:
acc = 0.0
for sentence, label in dataset_test:
    predicted_label, prob_spam = clf.predict(sentence)
    print(f'{sentence} -> {predicted_label} {prob_spam}')
    if label == predicted_label:
        acc += 1.0
print(f'Accuracy = {acc/len(dataset_test)}')

renew your password -> spam 0.8223684210526315
renew your vows -> ham 0.43554006968641107
benefits of our account -> spam 0.6067961165048542
the importance of physical activity -> ham 0.026092764998121326
Accuracy = 0.5


## Real Dataset

In [75]:
df = pl.read_csv('../datasets/spam.csv')
df

Target,SMS
str,str
"""ham""","""Go until juron…"
"""ham""","""Ok lar... Joki…"
"""spam""","""Free entry in …"
"""ham""","""U dun say so e…"
"""ham""","""Nah I don't th…"
"""spam""","""FreeMsg Hey th…"
"""ham""","""Even my brothe…"
"""ham""","""As per your re…"
"""spam""","""WINNER!! As a …"
"""spam""","""Had your mobil…"


In [85]:
dataset = df.rows()
dataset = [(text, label) for (label, text) in dataset]
idx = int(len(dataset)*.8)
dataset_train = dataset[0: idx]
dataset_test = dataset[idx:]

In [86]:
clf = NB()
clf.train(dataset_train)

In [87]:
acc = 0.0
for sentence, label in dataset_train:
    predicted_label, prob_spam = clf.predict(sentence)
    #print(f'{sentence} -> {predicted_label} {prob_spam}')
    if label == predicted_label:
        acc += 1.0
print(f'Accuracy = {acc/len(dataset_train)}')

Accuracy = 0.8265398550724637


In [88]:
acc = 0.0
for sentence, label in dataset_test:
    predicted_label, prob_spam = clf.predict(sentence)
    #print(f'{sentence} -> {predicted_label} {prob_spam}')
    if label == predicted_label:
        acc += 1.0
print(f'Accuracy = {acc/len(dataset_test)}')

Accuracy = 0.7457013574660634
