# Task 5

*by Lukas Dötlinger*

In [36]:
import pandas as pd
import itertools as it
from nltk.probability import FreqDist

# Load the labelled message corpus; the code below relies on the CSV having
# a 'Category' column (with the values 'spam' / 'ham') and a 'Message' column.
df = pd.read_csv('res/spam-dataset.csv')

# Split the message texts by label into two plain Python lists.
spam_list = df.loc[df['Category'] == 'spam', 'Message'].tolist()
ham_list = df.loc[df['Category'] == 'ham', 'Message'].tolist()


def dist_from_lines(lines):
    """Build a word frequency distribution from an iterable of text lines.

    Every line is split on whitespace. Tokens that contain no alphabetic
    character at all are discarded; the remaining tokens are lower-cased
    and counted.

    Args:
        lines: Iterable of strings.

    Returns:
        A ``FreqDist`` mapping each lower-cased token to its count.
    """
    tokens = (token for line in lines for token in line.split())
    return FreqDist(
        token.lower() for token in tokens if any(map(str.isalpha, token))
    )

def increase_count_in_dist(dist):
    """Add one to the count of every key currently present in *dist*.

    The mapping is modified in place and nothing is returned.

    Args:
        dist: Mutable mapping from keys to numeric counts.
    """
    # Snapshot the keys first so the walk is independent of the writes below.
    for key in tuple(dist):
        dist[key] = dist[key] + 1
class NBClassifier:
    """Naive Bayes spam/ham classifier over whitespace-separated words.

    Class priors are the relative numbers of spam and ham training messages.
    Per-word likelihoods come from two word frequency distributions (one per
    class) with add-one smoothing applied, so that both distributions end up
    sharing the same vocabulary.

    NOTE: scores are plain products of probabilities, so for very long
    messages they can underflow to 0.0 for both classes; ``classify`` then
    returns "ham" because ties go to ham.
    """

    def __init__(self, spam_list, ham_list):
        """Train the classifier from labelled messages.

        Args:
            spam_list: List of message strings labelled as spam.
            ham_list: List of message strings labelled as ham.

        Raises:
            ZeroDivisionError: If both lists are empty.
        """
        total = float(len(ham_list) + len(spam_list))
        self.p_ham = float(len(ham_list)) / total
        self.p_spam = float(len(spam_list)) / total

        self.spam_dist = dist_from_lines(spam_list)
        self.ham_dist = dist_from_lines(ham_list)

        # Add-one smoothing, part 1: bump every word already seen in a class.
        increase_count_in_dist(self.spam_dist)
        increase_count_in_dist(self.ham_dist)

        # Add-one smoothing, part 2: a word seen only in the other class gets
        # count 1 here. The vocabularies must be read from the instance's own
        # distributions (not from module-level names) and are captured before
        # either distribution is extended.
        spam_vocab = set(self.spam_dist.keys())
        ham_vocab = set(self.ham_dist.keys())
        self.ham_dist.update(spam_vocab - ham_vocab)
        self.spam_dist.update(ham_vocab - spam_vocab)

    def _class_prob(self, words, prior, dist, label):
        """Return *prior* times the relative frequency in *dist* of each word.

        Words whose relative frequency in *dist* is zero are reported on
        stdout and left out of the product.
        """
        p = prior
        for w in words:
            likelihood = dist.freq(w)
            if likelihood > 0:
                p *= likelihood
            else:
                print(f'Word not found in {label} training set: {w}')
        return p

    def spam_prob(self, words):
        """Return the unnormalised spam score for an iterable of words."""
        return self._class_prob(words, self.p_spam, self.spam_dist, 'spam')

    def ham_prob(self, words):
        """Return the unnormalised ham score for an iterable of words."""
        return self._class_prob(words, self.p_ham, self.ham_dist, 'ham')

    def classify(self, text):
        """Classify *text* and return the label "ham" or "spam".

        The text is split on whitespace and lower-cased before scoring.
        Equal scores are resolved in favour of "ham".
        """
        words = [w.lower() for w in text.split()]
        if self.ham_prob(words) >= self.spam_prob(words):
            return "ham"
        return "spam"

    def print(self):
        """Print the two class priors."""
        print(f'p_ham: {self.p_ham}, p_spam: {self.p_spam}')


# Train on the full corpus, then print the predicted label for two
# hand-written sample messages.
classifier = NBClassifier(spam_list, ham_list)

for sample in ('dear friend hello how are you', 'money win now'):
    print(classifier.classify(sample))


ham
spam
