# Task 5

*by Lukas Dötlinger*

In [55]:
import pandas as pd
import itertools as it
from nltk.probability import FreqDist

df = pd.read_csv('res/spam-dataset.csv')

spam_list = df[df['Category'] == 'spam']['Message'].tolist()
ham_list = df[df['Category'] == 'ham']['Message'].tolist()


filter_words = lambda words: [ w.lower() for w in words if any(c.isalpha() for c in w) ]

def dist_from_lines(lines):
    flattened_words = list(it.chain.from_iterable([ l.split() for l in lines ]))
    dist = FreqDist(filter_words(flattened_words))
    return dist

def increase_count_in_dist(dist):
    for word in dist:
        dist[word] += 1

class NBClassifier:
    def __init__(self, spam_list, ham_list):
        self.p_ham = float(len(ham_list)) / float(len(ham_list) + len(spam_list))
        self.p_spam = float(len(spam_list)) / float(len(ham_list) + len(spam_list))

        self.spam_dist = dist_from_lines(spam_list)
        self.ham_dist = dist_from_lines(ham_list)

        increase_count_in_dist(self.spam_dist)
        increase_count_in_dist(self.ham_dist)

        words_not_in_ham = FreqDist(list(set(spam_dist.keys()).difference(ham_dist.keys())))
        self.ham_dist.update(words_not_in_ham)

        words_not_in_spam = FreqDist(list(set(ham_dist.keys()).difference(spam_dist.keys())))
        self.spam_dist.update(words_not_in_spam)

    def spam_prob(self, words, debug):
        p = self.p_spam
        for w in words:
            if self.spam_dist.freq(w) > 0:
                p *= self.spam_dist.freq(w)
            else:
                if debug:
                    print(f'Word not found in spam training set: {w}')
        return p

    def ham_prob(self, words, debug):
        p = self.p_ham
        for w in words:
            if self.ham_dist.freq(w) > 0:
                p *= self.ham_dist.freq(w)
            else:
                if debug:
                    print(f'Word not found in ham training set: {w}')
        return p

    def classify(self, text, debug = False):
        words = filter_words(text.split())
        if self.ham_prob(words, debug) >= self.spam_prob(words, debug):
            return 'ham'
        else:
            return 'spam'

    def print(self):
        print(f'p_ham: {self.p_ham}, p_spam: {self.p_spam}')


classifier = NBClassifier(spam_list, ham_list)

print(classifier.classify('dear friend hello how are you'))
print(classifier.classify('money win now'))


ham
spam


The class `NBClassifier` represents a trained instance of a Naive Beyes classifier. It takes as input two training lists, one with spam sentences and one with ham sentences. A Frequency Distribution from `nltk` is used to calculate the probabilities of a word beeing in a set.

To account for values which are not present in either the spam or the ham distribution, the diefferences are added and the other words are increase by one to get the same level of distribution. This mitigates 0 values when calculation the propability for a word.

In [71]:
import numpy as np

ham_chunks = np.array_split(ham_list, 10)
spam_chunks = np.array_split(spam_list, 10)

def combine_chunks(chunks, i):
    final = []
    for j in range(1, len(chunks)):
        if j != i:
            final.extend(chunks[j - 1])
    return final

for i in range(1, 10 + 1):
    nbc = NBClassifier(combine_chunks(spam_chunks, i), combine_chunks(ham_chunks, i))

    ham_errors = []
    for s in ham_chunks[i - 1]:
        if nbc.classify(s) == 'spam':
            ham_errors.append(s)

    spam_errors = []
    for s in spam_chunks[i - 1]:
        if nbc.classify(s) == 'ham':
            spam_errors.append(s)

    tp = float(len(spam_chunks[i - 1]) - len(spam_errors))
    fn = float(len(spam_errors))
    fp = float(len(ham_errors))
    tn = float(len(ham_chunks[i - 1]) - len(ham_errors))

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    print('___________________________________________')
    print(f'Chunk {i}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F-measure: {(2 * precision * recall) / (precision + recall)}')
    print(f'Accuracy: {( + tn) / (tp + fp + tn + fn)}')
    print('___________________________________________')

___________________________________________
Chunk 1
Precision: 0.974025974025974
Recall: 1.0
F-measure: 0.9868421052631579
Accuracy: 0.8620071684587813
___________________________________________
___________________________________________
Chunk 2
Precision: 0.9866666666666667
Recall: 0.9866666666666667
F-measure: 0.9866666666666668
Accuracy: 0.8637992831541219
___________________________________________
___________________________________________
Chunk 3
Precision: 0.9493670886075949
Recall: 1.0
F-measure: 0.974025974025974
Accuracy: 0.8584229390681004
___________________________________________
___________________________________________
Chunk 4
Precision: 0.9615384615384616
Recall: 1.0
F-measure: 0.9803921568627451
Accuracy: 0.8602150537634409
___________________________________________
___________________________________________
Chunk 5
Precision: 0.9864864864864865
Recall: 0.9733333333333334
F-measure: 0.9798657718120806
Accuracy: 0.8637992831541219
_______________________________