In [144]:
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [147]:
from django_comments import get_model as get_comments_model

# Resolve the comment model class through django_comments' `get_model` helper
# (imported above as `get_comments_model`).
Comment = get_comments_model()


def get_messages_from_comments():
    """Turn every stored comment into a ``(label, text)`` training pair.

    The label is ``"spam"`` when the comment's ``is_removed`` flag is truthy and
    ``"ham"`` otherwise. The text is the comment's ``name``, ``title`` and
    ``comment`` attributes joined by single spaces.

    Returns:
        list[tuple[str, str]]: one pair per object in ``Comment.objects.all()``.
    """
    return [
        (
            "spam" if comment.is_removed else "ham",
            f"{comment.name} {comment.title} {comment.comment}",
        )
        for comment in Comment.objects.all()
    ]


# Build the (label, text) list from the stored comments.
# NOTE(review): `messages` is assigned again by the SMS loader cell further down,
# so on a top-to-bottom run that later assignment is the one later cells use.
messages = get_messages_from_comments()

<IPython.core.display.Javascript object>

In [146]:
import io
import zipfile
import requests


def get_sms_messages():
    """Download the SMS spam collection and return it as ``(label, text)`` pairs.

    The zip archive is fetched over HTTP, ``spam.csv`` is read from it and decoded
    as latin-1, and the CSV is parsed with the standard ``csv`` module so that
    quoted fields containing commas or line breaks stay intact.

    Returns:
        list[tuple[str, str]]: one pair per data row. The first column is the
        label; all remaining non-empty columns are joined with single spaces.

    Raises:
        requests.HTTPError: if the download answers with an error status.
        zipfile.BadZipFile: if the response body is not a zip archive.
        KeyError: if the archive does not contain ``spam.csv``.
    """
    import csv  # local import: only this loader needs it

    # download zip archive
    training_data_url = (
        "https://d2b7dn9rofvhjd.cloudfront.net/sms-spam-collection-dataset.zip"
    )
    response = requests.get(training_data_url)
    # fail on an HTTP error instead of feeding an error page to ZipFile
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        spam_text = archive.read("spam.csv").decode("latin1")

    # parse messages; newline="" lets the csv module handle line endings itself
    reader = csv.reader(io.StringIO(spam_text, newline=""))
    next(reader, None)  # skip header row (tolerates an empty file)

    messages = []
    for row in reader:
        # blank lines (e.g. the trailing one) would otherwise become a bogus
        # example with the empty string as its label
        if not row or not row[0]:
            continue
        label, *fields = row
        messages.append((label, " ".join(field for field in fields if field)))
    return messages


# NOTE(review): this rebinds `messages`, replacing the comment-derived list built
# in the earlier cell; later cells use whichever assignment ran last.
messages = get_sms_messages()

<IPython.core.display.Javascript object>

## Tokenizer

In [148]:
import re

# A token is a run of two or more word characters delimited by word boundaries.
token_pattern = re.compile(r"(?u)\b\w\w+\b")
# Bound method: returns all non-overlapping tokens of a string, in order.
standard_tokenizer = token_pattern.findall


def tokenize(message):
    """Split ``message`` into lower-cased tokens.

    Args:
        message (str): the raw text.

    Returns:
        list[str]: every match of ``token_pattern`` in the lower-cased text.
    """
    lowered = message.lower()
    return standard_tokenizer(lowered)

<IPython.core.display.Javascript object>

## Model

In [149]:
from collections import defaultdict


def get_initial_probabilities(total):
    """Compute each label's share of all observed examples.

    Args:
        total (dict): maps label -> number of examples seen with that label.

    Returns:
        dict: maps label -> ``examples_for_label / examples_overall``. Empty
        input yields an empty dict.
    """
    all_observations = sum(total.values())
    return {
        label: observations / all_observations
        for label, observations in total.items()
    }


def get_words_per_label(counts):
    """Count how many distinct words were observed for each label.

    Each word contributes exactly 1 to every label it appears under, regardless
    of how often it occurred there.

    Args:
        counts (dict): maps word -> {label -> occurrence count}.

    Returns:
        defaultdict(int): maps label -> number of distinct words seen with it.
    """
    distinct_words = defaultdict(int)
    for label_counts in counts.values():
        for label in label_counts:
            distinct_words[label] += 1
    return distinct_words


def get_counts(messages):
    """Tally training statistics from labelled messages.

    Args:
        messages: iterable of ``(label, text)`` pairs.

    Returns:
        dict: with two entries,
            ``"examples_by_label"`` -- defaultdict mapping label -> number of
            messages carrying that label, and
            ``"counts"`` -- nested defaultdict mapping token -> label -> number
            of times ``tokenize`` produced that token for that label.
    """
    examples_by_label = defaultdict(int)
    word_counts = defaultdict(lambda: defaultdict(int))
    for label, text in messages:
        examples_by_label[label] += 1
        for token in tokenize(text):
            word_counts[token][label] += 1
    return dict(examples_by_label=examples_by_label, counts=word_counts)


def normalize(label_results):
    """Rescale the values of ``label_results`` in place so they sum to 1.

    When the values sum to zero (including the empty dict) they are left
    unscaled, because no scale factor exists.

    Args:
        label_results (dict): maps label -> non-normalized score; mutated.

    Returns:
        dict: the same dict object that was passed in.
    """
    total = sum(label_results.values())
    # a zero total would make the division fail, so fall back to the identity scale
    scale_factor = 1.0 / total if total else 1
    for label in label_results:
        label_results[label] *= scale_factor
    return label_results


class NaiveBayes:
    """Word-count based naive Bayes text classifier.

    The model consists of per-word, per-label occurrence counts plus the number
    of training examples per label. All derived quantities are recomputed from
    these two structures in ``build_model``.
    """

    def __init__(self, tokenize=tokenize):
        """Store the callable used to split a message into tokens."""
        self.tokenize = tokenize

    def build_model(self, counts=None, examples_by_label=None):
        """Install raw statistics and derive everything ``predict`` needs.

        Args:
            counts (dict | None): maps word -> {label -> occurrence count}.
            examples_by_label (dict | None): maps label -> number of examples.

        Returns:
            NaiveBayes: ``self``, so calls can be chained.
        """
        # None instead of `{}` defaults: a default dict would be shared by every
        # instance built without arguments.
        counts = {} if counts is None else counts
        examples_by_label = {} if examples_by_label is None else examples_by_label

        self.counts = counts
        self.examples_by_label = examples_by_label
        # Earlier code stored this under the misspelled name only, which made
        # `dict()` fail; keep the old name as an alias for existing readers.
        self.example_by_label = examples_by_label
        self.labels = list(examples_by_label.keys())
        self.words_per_label = get_words_per_label(counts)
        total_words = sum(self.words_per_label.values())
        # With no words at all every token is unseen for every label, so any
        # positive constant works; 1.0 avoids dividing by zero.
        self.minimal_probability = 1.0 / total_words if total_words else 1.0
        self.initial_probabilities = get_initial_probabilities(examples_by_label)
        return self

    def dict(self):
        """Return the raw statistics in the form ``from_dict`` accepts."""
        return {
            "counts": self.counts,
            "examples_by_label": self.examples_by_label,
        }

    def from_dict(self, serialized):
        """Rebuild the model from the mapping produced by ``dict``."""
        return self.build_model(**serialized)

    def fit(self, messages):
        """Train on an iterable of ``(label, text)`` pairs and return ``self``."""
        return self.build_model(**get_counts(messages))

    def predict(self, message):
        """Return normalized per-label probabilities for ``message``.

        Scores are accumulated as sums of logarithms rather than products of
        probabilities, because multiplying many small factors underflows to 0.0
        for every label on long messages. A word not seen with a label
        contributes ``minimal_probability``. A message without tokens yields the
        normalized prior probabilities.

        Args:
            message (str): text to classify.

        Returns:
            dict: maps each known label to its probability; empty when the
            model has no labels.
        """
        import math

        def log_probability(value):
            # log(0) is undefined; -inf keeps "impossible" impossible
            return math.log(value) if value > 0 else -math.inf

        # materialize once: the tokens are walked once per label
        words = list(self.tokenize(message))
        log_scores = {}
        for label in self.labels:
            log_score = log_probability(self.initial_probabilities[label])
            for word in words:
                counts_per_label = self.counts.get(word, {})
                if label in counts_per_label:
                    label_frequency = (
                        counts_per_label[label] / self.words_per_label[label]
                    )
                else:
                    label_frequency = self.minimal_probability
                log_score += log_probability(label_frequency)
            log_scores[label] = log_score

        if not log_scores:
            return {}
        best = max(log_scores.values())
        if best == -math.inf:
            # every label is impossible; report zeros rather than dividing by zero
            return {label: 0.0 for label in log_scores}
        # subtracting the best score keeps exp() in range; normalize() removes
        # the common factor again
        return normalize(
            {label: math.exp(score - best) for label, score in log_scores.items()}
        )

    def predict_label(self, message):
        """Return the most probable label; ties go to the greater label."""
        predicted = self.predict(message)
        return sorted([(v, k) for k, v in predicted.items()], reverse=True)[0][1]

<IPython.core.display.Javascript object>

In [150]:
# Fit a first model on all of `messages`; the train/test split follows later.
model = NaiveBayes().fit(messages)

<IPython.core.display.Javascript object>

In [151]:
# Show the number of distinct words the model recorded for each label.
model.words_per_label

defaultdict(int, {'ham': 1568, 'spam': 2715})

<IPython.core.display.Javascript object>

In [152]:
# Two hand-written sample texts for probing the model; presumably each is named
# after the label it is expected to receive.
my_spam_message = "eric jones website opportunity"
my_ham_message = "das war aber eine schöne episode"

<IPython.core.display.Javascript object>

In [153]:
%%time
# Score the ham sample and print the probability assigned to each label.
result = model.predict(my_ham_message)
for label, probability in result.items():
    # label right-aligned to width 4, probability with eight decimals
    print(f"{label:>4}: {probability:.8f}")

 ham: 1.00000000
spam: 0.00000000
CPU times: user 2.92 ms, sys: 1.95 ms, total: 4.88 ms
Wall time: 3.41 ms


<IPython.core.display.Javascript object>

# Split Train/Test

In [154]:
import random

random.seed(2021)  # make sure the same messages are chosen between restarts


def split_train_test(messages, test_quote=0.1):
    """Split labelled messages into train and test sets, stratified by label.

    For every label, ``int(count * test_quote)`` messages are drawn at random
    (via the module-level ``random`` generator) for the test set; the rest go
    to the training set. Both outputs are grouped by label in order of first
    appearance and keep the input order within a label.

    Args:
        messages: iterable of ``(label, text)`` pairs.
        test_quote (float): fraction of each label's messages to hold out.

    Returns:
        tuple[list, list]: ``(train, test)`` lists of ``(label, text)`` tuples.
    """
    grouped = defaultdict(list)
    for label, text in messages:
        grouped[label].append((label, text))

    # stratified sampling: draw the held-out positions separately per label
    train, test = [], []
    for bucket in grouped.values():
        test_len = int(len(bucket) * test_quote)
        held_out = set(random.sample(range(len(bucket)), test_len))
        for position, message in enumerate(bucket):
            target = test if position in held_out else train
            target.append(message)
    return train, test

<IPython.core.display.Javascript object>

In [155]:
# Split with the default test share (test_quote=0.1) and show both set sizes.
train_messages, test_messages = split_train_test(messages)
len(train_messages), len(test_messages)

(310, 34)

<IPython.core.display.Javascript object>

# Train Model

In [156]:
%%time
# Re-split with a quarter of each label held out; this replaces the split made
# in the previous cell. The model is then refit on the training part only.
train_messages, test_messages = split_train_test(messages, test_quote=0.25)
print(len(train_messages), len(test_messages))
model = NaiveBayes().fit(train_messages)

258 86
CPU times: user 24.9 ms, sys: 2.03 ms, total: 27 ms
Wall time: 25.5 ms


<IPython.core.display.Javascript object>

# Evaluate

In [157]:
%%time

# Accuracy on the held-out set: share of test messages whose predicted label
# equals the recorded one. `true_positives` counts every correct prediction,
# whatever its label.
all_observations = len(test_messages)
true_positives = 0

for label, message in test_messages:
    true_positives += int(model.predict_label(message) == label)

accuracy = true_positives / all_observations
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.965
CPU times: user 19.5 ms, sys: 1.58 ms, total: 21 ms
Wall time: 19.7 ms


<IPython.core.display.Javascript object>

In [158]:
# Build one confusion-count dict per label, with the keys
# true_positive / false_positive / true_negative / false_negative.
outcomes = (("true", "false"), ("positive", "negative"))
possible_results = [f"{a}_{b}" for b in outcomes[1] for a in outcomes[0]]
result_template = dict.fromkeys(possible_results, 0)

labels = set(model.labels)
# each label gets its own copy so the counters are independent
label_results = {label: dict(result_template) for label in labels}
all_observations = len(test_messages)

for label, message in test_messages:
    predicted = model.predict_label(message)
    if label == predicted:
        label_results[label]["true_positive"] += 1
    else:
        # a miss for the actual label is a false alarm for the predicted one
        label_results[label]["false_negative"] += 1
        label_results[predicted]["false_positive"] += 1
    # any label that was neither the actual nor the predicted one was correctly
    # rejected; previously this counter existed but was never incremented
    for other in labels - {label, predicted}:
        label_results[other]["true_negative"] += 1

<IPython.core.display.Javascript object>

In [159]:
def precision_recall_f1(result):
    """Compute precision, recall and F1 from one label's confusion counts.

    A ratio whose denominator is zero (no positive predictions, no positive
    examples, or precision and recall both zero) is reported as 0.0 instead of
    raising ``ZeroDivisionError``.

    Args:
        result (dict): must contain the integer counts ``"true_positive"``,
            ``"false_positive"`` and ``"false_negative"``.

    Returns:
        tuple[float, float, float]: ``(precision, recall, f1)``.
    """
    tp = result["true_positive"]
    fp = result["false_positive"]
    fn = result["false_negative"]
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1


def show_result(label_results):
    """Print one line of F1, precision and recall for every label.

    Args:
        label_results (dict): maps label -> confusion-count dict as accepted by
            ``precision_recall_f1``.
    """
    for label in label_results:
        precision, recall, f1 = precision_recall_f1(label_results[label])
        line = (
            f"{label: >4} f1: {f1:.3f} precision: {precision:.3f} recall: {recall:.3f}"
        )
        print(line)

<IPython.core.display.Javascript object>

In [128]:
# Print per-label F1 / precision / recall for the current `label_results`.
show_result(label_results)

spam f1: 0.977 precision: 0.955 recall: 1.000
 ham f1: 0.930 precision: 1.000 recall: 0.870


<IPython.core.display.Javascript object>

In [129]:
# Show how many training examples the fitted model recorded for each label.
model.example_by_label

defaultdict(int, {'ham': 69, 'spam': 189})

<IPython.core.display.Javascript object>

In [143]:
# NOTE(review): repeats the `show_result(label_results)` call from an earlier
# cell, yet the saved output below differs from that cell's output, so it
# presumably stems from a run on different data. Consider removing this cell.
show_result(label_results)

spam f1: 0.944 precision: 0.994 recall: 0.898
 ham f1: 0.992 precision: 0.984 recall: 0.999


<IPython.core.display.Javascript object>

In [160]:
# NOTE(review): third `show_result(label_results)` call in a row; one cell is
# enough once the notebook is run top to bottom.
show_result(label_results)

spam f1: 0.977 precision: 0.955 recall: 1.000
 ham f1: 0.930 precision: 1.000 recall: 0.870


<IPython.core.display.Javascript object>