In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [324]:
from collections import defaultdict


def split_tokenize(text):
    """Break ``text`` into tokens on runs of whitespace.

    Returns the list produced by ``text.split()``; an empty or
    whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens


def normalize(probabilities):
    """Rescale the values of ``probabilities`` in place so that they sum to one.

    The mapping is modified in place and the very same object is returned.
    When the values sum to zero (including the empty mapping) no rescaling
    is possible, so the values are left as they are.
    """
    total = sum(probabilities.values())
    # A zero total cannot be inverted; fall back to the neutral factor 1.
    scale_factor = 1.0 / total if total else 1
    for label in probabilities:
        probabilities[label] *= scale_factor
    return probabilities


class NaiveBayes:
    """Small naive-Bayes-style text classifier trained on ``(label, text)`` pairs.

    Parameters:
        tokenize: callable turning a text into an iterable of words.
        alpha: pseudo-count used in place of the observed count when a word
            was never seen together with a label during training.

    Attributes set by :meth:`fit`:
        label_counts: number of training messages per label.
        labels: set of all labels seen in training.
        initial_label_probabilities: share of training messages per label.
        word_label_counts: ``word -> label -> occurrences``.
        number_of_words: per label, the number of distinct words seen with it.
    """

    def __init__(self, tokenize=split_tokenize, alpha=0.1):
        self.tokenize = tokenize
        self.alpha = alpha

    def get_label_counts(self, messages):
        """Return a mapping ``label -> number of messages`` carrying that label."""
        counts = defaultdict(int)
        for label, _text in messages:
            counts[label] += 1
        return counts

    def get_initial_label_probabilities(self, label_counts):
        """Return ``label -> share of all messages`` computed from ``label_counts``.

        An empty ``label_counts`` yields an empty dict (no division happens).
        """
        number_of_messages = sum(label_counts.values())
        # Use the argument, not ``self.label_counts``: the method must also
        # work before ``fit`` has stored anything on the instance.
        return {
            label: count / number_of_messages for label, count in label_counts.items()
        }

    def get_word_label_counts(self, messages):
        """Return ``word -> label -> occurrences`` over all tokenized messages."""
        counts = defaultdict(lambda: defaultdict(int))
        for label, text in messages:
            for word in self.tokenize(text):
                counts[word][label] += 1
        return counts

    def get_number_of_words(self, word_label_counts):
        """Return, per label, how many distinct words were seen with that label.

        NOTE(review): this is the per-label vocabulary size, not the total
        number of word occurrences. ``predict`` divides occurrence counts by
        it, which is presumably why the ratio is clamped to 1.0 there --
        confirm whether total occurrences were intended.
        """
        number_of_words = defaultdict(int)
        for counts in word_label_counts.values():
            for label in counts:
                number_of_words[label] += 1
        return number_of_words

    def fit(self, messages):
        """Learn label priors and word statistics from ``(label, text)`` pairs.

        Returns ``self`` so that construction and fitting can be chained.
        """
        # The data is traversed twice below; materialize it so that one-shot
        # iterables (generators) do not silently produce empty word counts.
        messages = list(messages)
        self.label_counts = self.get_label_counts(messages)
        self.labels = set(self.label_counts.keys())
        self.initial_label_probabilities = self.get_initial_label_probabilities(
            self.label_counts
        )
        self.word_label_counts = self.get_word_label_counts(messages)
        self.number_of_words = self.get_number_of_words(self.word_label_counts)
        return self

    def predict(self, message):
        """Return ``label -> probability`` for ``message``, normalized to sum to one.

        Starts from the label priors and multiplies in one factor per word
        and label. Unseen (word, label) combinations use ``alpha`` as count.
        """
        probabilities = dict(self.initial_label_probabilities)
        for word in self.tokenize(message):
            counts_per_label = self.word_label_counts.get(word, {})
            for label in self.labels:
                label_word_count = counts_per_label.get(label, self.alpha)
                # ``.get`` avoids inserting keys into the defaultdict while predicting.
                words_in_label = self.number_of_words.get(label, 0)
                if words_in_label:
                    frequency = min(1.0, label_word_count / words_in_label)
                else:
                    # The label was trained without any words: dividing would
                    # raise ZeroDivisionError. Use the clamp's upper bound,
                    # i.e. the word carries no evidence against this label.
                    frequency = 1.0
                probabilities[label] *= frequency
        return normalize(probabilities)

    def predict_label(self, message):
        """Return the most probable label for ``message``.

        Ties on probability are broken in favour of the greater label.
        Raises IndexError when the model knows no labels.
        """
        predicted = self.predict(message)
        if not predicted:
            raise IndexError("cannot predict a label: the model has no labels")
        # Largest (probability, label) pair; no need to sort all labels.
        return max((probability, label) for label, probability in predicted.items())[1]

<IPython.core.display.Javascript object>

In [325]:
def test_probabilities_after_words():
    """Check the predicted probabilities for a tiny hand-built training set."""
    training_data = [("spam", "spam"), ("spam", "spam"), ("ham", "ham spam")]
    # Each entry: (message, expected_probabilities)
    expectations = [
        ("spam", {"spam": 0.8, "ham": 0.2}),
    ]
    classifier = NaiveBayes()
    classifier.fit(training_data)
    for message, expected_probabilities in expectations:
        actual_probabilities = classifier.predict(message)
        print(actual_probabilities, expected_probabilities)
        assert actual_probabilities == expected_probabilities


# Run the check right away; a failing assert raises AssertionError in this cell.
test_probabilities_after_words()

{'spam': 0.8, 'ham': 0.2} {'spam': 0.8, 'ham': 0.2}


<IPython.core.display.Javascript object>

In [326]:
def test_initial_probabilities():
    """Predicting the empty message must return exactly the label priors."""
    three_labels = range(3)
    # Each entry: (train, expected_initial_probabilities)
    scenarios = [
        ([], {}),
        ([("ham", ""), ("spam", "")], {"ham": 0.5, "spam": 0.5}),
        ([(label, "") for label in three_labels], dict.fromkeys(three_labels, 1 / 3)),
    ]
    for train, expected_initial_probabilities in scenarios:
        actual_probabilities = NaiveBayes().fit(train).predict("")
        assert actual_probabilities == expected_initial_probabilities


# Run the check right away; a failing assert raises AssertionError in this cell.
test_initial_probabilities()

<IPython.core.display.Javascript object>

In [327]:
def test_normalize():
    """normalize() must rescale the given values so that they sum to one."""
    # Each entry: (probabilities, expected)
    cases = (
        ({0: 0.6, 1: 0.2}, {0: 0.75, 1: 0.25}),
    )
    for probabilities, expected in cases:
        actual = normalize(probabilities)
        print(actual, expected)
        assert actual == expected


# Run the check right away; a failing assert raises AssertionError in this cell.
test_normalize()

{0: 0.75, 1: 0.25} {0: 0.75, 1: 0.25}


<IPython.core.display.Javascript object>

In [328]:
import random

random.seed(2021)  # make sure the same messages are chosen between restarts


def split_train_test(messages, test_quote=0.1):
    """Split ``(label, text)`` pairs into train and test sets, stratified by label.

    For every label, ``int(count * test_quote)`` messages are drawn at random
    (via the module-level ``random`` generator) for the test set; the rest go
    to the train set. Within each set, messages are grouped by label in order
    of first appearance and keep their original relative order.

    Returns a ``(train, test)`` tuple of lists.
    """
    grouped = defaultdict(list)
    for label, text in messages:
        grouped[label].append((label, text))

    train, test = [], []
    for label_messages in grouped.values():
        size = len(label_messages)
        # Draw the held-out positions for this label only (stratified sampling).
        held_out = set(random.sample(range(size), int(size * test_quote)))
        for position, message in enumerate(label_messages):
            target = test if position in held_out else train
            target.append(message)
    return train, test

<IPython.core.display.Javascript object>

In [329]:
from django_comments import get_model as get_comments_model

# Resolve the comment model class through django_comments.get_model().
# NOTE(review): presumably requires a configured Django environment -- confirm.
Comment = get_comments_model()


def get_messages_from_comments():
    """Build ``(label, text)`` training pairs from all stored comments.

    A comment flagged ``is_removed`` is labelled "spam", any other "ham".
    The text is the comment's name, title and body joined by single spaces.
    """
    return [
        (
            "spam" if comment.is_removed else "ham",
            f"{comment.name} {comment.title} {comment.comment}",
        )
        for comment in Comment.objects.all()
    ]


# Load every stored comment as a (label, text) pair.
# NOTE(review): `messages` is rebound by the SMS loader cell further down, so
# this result is replaced when both cells are executed in order.
messages = get_messages_from_comments()

<IPython.core.display.Javascript object>

In [330]:
import io
import zipfile
import requests


def get_sms_messages(
    training_data_url=(
        "https://d2b7dn9rofvhjd.cloudfront.net/sms-spam-collection-dataset.zip"
    ),
    timeout=30,
):
    """Download the SMS spam archive and return its rows as ``(label, text)`` pairs.

    Parameters:
        training_data_url: location of the zip archive containing ``spam.csv``.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list of ``(label, message)`` tuples, one per non-empty data row.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.

    The file is parsed by hand: the first line is skipped as header, trailing
    commas are stripped, the first comma-separated field is the label and the
    remaining fields are joined with spaces. CSV quoting is not interpreted,
    so commas inside a message turn into spaces.
    """
    # download zip archive
    response = requests.get(training_data_url, timeout=timeout)
    # Fail loudly instead of trying to unzip an HTML error page.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        spam_text = archive.read("spam.csv").decode("latin1")

    # parse messages
    lines = iter(spam_text.split("\r\n"))
    next(lines, None)  # skip the header line

    messages = []
    for line in lines:
        line = line.rstrip(",")
        if not line:
            # A blank line (e.g. after the final line break) would otherwise
            # become a bogus sample with the empty string as its label.
            continue
        label, *message = line.split(",")
        messages.append((label, " ".join(message)))
    return messages


# Download and parse the SMS data set.
# NOTE(review): this rebinds `messages`, replacing the comment-based data that
# an earlier cell assigned to the same name.
messages = get_sms_messages()

<IPython.core.display.Javascript object>

In [331]:
import re

# A token is a run of two or more word characters; single-character words
# and punctuation are dropped.
token_pattern = re.compile(r"(?u)\b\w\w+\b")
standard_tokenizer = token_pattern.findall


def tokenize(message):
    """Lower-case ``message`` and return the list of tokens found in it."""
    lowered = message.lower()
    return standard_tokenizer(lowered)

<IPython.core.display.Javascript object>

In [332]:
%%time
# Hold out int(10% of each label's messages) for testing, report both set
# sizes, then fit the classifier with the regex tokenizer on the rest.
train_messages, test_messages = split_train_test(messages, 0.1)
print(f"{len(train_messages)} {len(test_messages)}")
model = NaiveBayes(tokenize=tokenize)
model.fit(train_messages)

5016 556
CPU times: user 64.4 ms, sys: 2.66 ms, total: 67 ms
Wall time: 65.6 ms


<IPython.core.display.Javascript object>

In [333]:
%%time

# Accuracy: share of held-out messages whose predicted label equals the true one.
all_observations = len(test_messages)
true_positives = 0  # counts every correct prediction, whatever the label

for label, message in test_messages:
    # A bool adds as 0 or 1.
    true_positives += model.predict_label(message) == label

accuracy = true_positives / all_observations
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.980
CPU times: user 19.1 ms, sys: 1.5 ms, total: 20.6 ms
Wall time: 19.5 ms


<IPython.core.display.Javascript object>

In [334]:
# Per-label confusion counts, treating every label one-vs-rest.
outcomes = (("true", "false"), ("positive", "negative"))
# Yields: true_positive, false_positive, true_negative, false_negative
possible_results = [f"{a}_{b}" for b in outcomes[1] for a in outcomes[0]]
result_template = dict.fromkeys(possible_results, 0)

labels = set(model.labels)
# Each label gets its own copy of the zeroed template.
label_results = {label: dict(result_template) for label in labels}
all_observations = len(test_messages)

for label, message in test_messages:
    predicted = model.predict_label(message)
    if label == predicted:
        label_results[label]["true_positive"] += 1
    else:
        # A miss counts against both the true label and the predicted one.
        label_results[label]["false_negative"] += 1
        label_results[predicted]["false_positive"] += 1
    # Every label that is neither the truth nor the prediction was correctly
    # rejected; without this the "true_negative" counters would stay at zero.
    for other_label in labels - {label, predicted}:
        label_results[other_label]["true_negative"] += 1

<IPython.core.display.Javascript object>

In [335]:
def precision_recall_f1(result):
    """Compute precision, recall and F1 from one label's confusion counts.

    Parameters:
        result: mapping with the keys "true_positive", "false_positive" and
            "false_negative" (further keys are ignored).

    Returns:
        A ``(precision, recall, f1)`` tuple. A metric whose denominator is
        zero (label never predicted, label never present, or no true
        positives at all) is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    tp = result["true_positive"]
    fp = result["false_positive"]
    fn = result["false_negative"]
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1


def show_result(label_results):
    """Print one line with F1, precision and recall for every label.

    ``label_results`` maps each label to the confusion counts that
    ``precision_recall_f1`` expects.
    """
    for label in label_results:
        metrics = precision_recall_f1(label_results[label])
        precision, recall, f1 = metrics
        line = f"{label: >4} f1: {f1:.3f} precision: {precision:.3f} recall: {recall:.3f}"
        print(line)

<IPython.core.display.Javascript object>

In [336]:
# Print F1, precision and recall for every label of the held-out evaluation.
show_result(label_results)

 ham f1: 0.989 precision: 0.982 recall: 0.996
spam f1: 0.922 precision: 0.970 recall: 0.878


<IPython.core.display.Javascript object>