In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

# Vorteile

* Ist sackschnell
* Braucht wenig Trainingsdaten
* Lässt sich gut auf mehrere Kategorien erweitern
* Man benötigt kein pandas, numpy, sklearn nur eingebaute Datentypen
* Kaum Hyperparametertuning

In [48]:
import io
import zipfile
import requests


def get_sms_messages():
    # download zip archive
    training_data_url = (
        "https://d2b7dn9rofvhjd.cloudfront.net/sms-spam-collection-dataset.zip"
    )
    response = requests.get(training_data_url)
    z = zipfile.ZipFile(io.BytesIO(response.content))

    # parse messages
    spam_text = z.read("spam.csv").decode("latin1")
    lines = iter(spam_text.split("\r\n"))
    skipped = next(lines)  # skip first line

    messages = []
    for line in lines:
        line = line.rstrip(",")
        label, *message = line.split(",")
        messages.append((label, " ".join(message)))
    return messages

<IPython.core.display.Javascript object>

# Split Train/Test

In [49]:
import random


def split_train_test(messages, test_quote=0.25):
    train, test = [], []
    for label, message in messages:
        if random.random() < test_quote:
            test.append((label, message))
        else:
            train.append((label, message))
    return train, test

<IPython.core.display.Javascript object>

In [50]:
train_messages, test_messages = split_train_test(messages)
print(len(train_messages), len(test_messages))

4181 1391


<IPython.core.display.Javascript object>

# Train Model

In [46]:
import re

from collections import defaultdict

token_pattern = re.compile(r"(?u)\b\w\w+\b")
standard_tokenizer = token_pattern.findall


def tokenize(message):
    return standard_tokenizer(message.lower())

<IPython.core.display.Javascript object>

In [36]:
def get_initial_probabilities(total):
    initial_probabilities = {}
    all_observations = sum(total.values())
    for label, observations_per_label in total.items():
        initial_probabilities[label] = observations_per_label / all_observations
    return initial_probabilities


def get_words_per_label(counts):
    words_per_label = defaultdict(int)
    for word, label_counts in counts.items():
        for label, count in label_counts.items():
            words_per_label[label] += 1
    return words_per_label


def build_model(counts, total):
    words_per_label = get_words_per_label(counts)
    return {
        "counts": counts,
        "labels": list(total.keys()),
        "words_per_label": words_per_label,
        "minimal_probability": 1.0 / sum(words_per_label.values()),
        "initial_probabilities": get_initial_probabilities(total),
    }

<IPython.core.display.Javascript object>

In [37]:
def train(messages):
    total = defaultdict(int)
    counts = defaultdict(lambda: defaultdict(int))
    for label, message in messages:
        total[label] += 1
        for word in tokenize(message):
            counts[word][label] += 1
    return build_model(counts, total)

<IPython.core.display.Javascript object>

In [38]:
%%time
model = train(train_messages)

CPU times: user 54.3 ms, sys: 1.82 ms, total: 56.1 ms
Wall time: 55 ms


<IPython.core.display.Javascript object>

In [51]:
model["words_per_label"]

defaultdict(int, {'ham': 5984, 'spam': 2455})

<IPython.core.display.Javascript object>

# Predict Label

In [39]:
def normalize(label_results):
    try:
        scale_factor = 1.0 / sum(label_results.values())
    except ZeroDivisionError:
        scale_factor = 1
    for label, probability in label_results.items():
        label_results[label] = probability * scale_factor
    return label_results


def predict(
    message,
    labels=[],
    counts={},
    words_per_label={},
    minimal_probability=None,
    initial_probabilities={},
):
    label_results = {}
    for word in tokenize(message):
        for label in labels:
            counts_per_label = counts.get(word, {})
            if label in counts_per_label:
                label_frequency = counts_per_label[label] / words_per_label[label]
            else:
                label_frequency = minimal_probability
            previous_result = label_results.get(label, initial_probabilities[label])
            label_results[label] = previous_result * label_frequency
    if label_results == {}:
        label_results = dict(initial_probabilities)
    return normalize(label_results)


def predict_label(message, **model):
    result = predict(message, **model)
    return sorted([(v, k) for k, v in result.items()], reverse=True)[0][1]

<IPython.core.display.Javascript object>

In [40]:
my_spam_message = "renew your subscription for free now"
my_ham_message = "remember to go to church on sunday"

<IPython.core.display.Javascript object>

In [41]:
%%time
result = predict(my_ham_message, **model)
for label, probability in result.items():
    print(f"{label:>4}: {probability:.8f}")

 ham: 0.99995695
spam: 0.00004305
CPU times: user 2.53 ms, sys: 1.65 ms, total: 4.18 ms
Wall time: 3.01 ms


<IPython.core.display.Javascript object>

# Evaluate

In [42]:
%%time

true_positives = 0
all_observations = len(test_messages)

for label, message in test_messages:
    if predict_label(message, **model) == label:
        true_positives += 1

accuracy = true_positives / all_observations
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.977
CPU times: user 36.2 ms, sys: 1.48 ms, total: 37.7 ms
Wall time: 36.6 ms


<IPython.core.display.Javascript object>

In [43]:
outcomes = (("true", "false"), ("positive", "negative"))
possible_results = [f"{a}_{b}" for b in outcomes[1] for a in outcomes[0]]
result_template = dict.fromkeys(possible_results, 0)

labels = set(model["labels"])
label_results = {label: dict(result_template) for label in labels}
all_observations = len(test_messages)

for label, message in test_messages:
    predicted = predict_label(message, **model)
    if label == predicted:
        label_results[label]["true_positive"] += 1
    else:
        label_results[label]["false_negative"] += 1
        label_results[predicted]["false_positive"] += 1

<IPython.core.display.Javascript object>

In [44]:
def precision_recall_f1(result):
    all_observations = sum(result.values())
    tp = result["true_positive"]
    fp = result["false_positive"]
    fn = result["false_negative"]
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1


def show_result(label_results):
    for label, result in label_results.items():
        precision, recall, f1 = precision_recall_f1(result)
        print(
            f"{label: >4} f1: {f1:.3f} precision: {precision:.3f} recall: {recall:.3f}"
        )

<IPython.core.display.Javascript object>

In [45]:
# standard tokenizer
show_result(label_results)

spam f1: 0.914 precision: 0.977 recall: 0.859
 ham f1: 0.986 precision: 0.976 recall: 0.997


<IPython.core.display.Javascript object>

In [32]:
# standard + lower
show_result(label_results)

spam f1: 0.937 precision: 0.988 recall: 0.890
 ham f1: 0.990 precision: 0.983 recall: 0.998


<IPython.core.display.Javascript object>

In [56]:
class Foobar:
    blub = "blua"

<IPython.core.display.Javascript object>

In [57]:
foo = Foobar()

<IPython.core.display.Javascript object>

In [59]:
foo.blub

'blua'

<IPython.core.display.Javascript object>

In [60]:
foo.blub = "asdf"

<IPython.core.display.Javascript object>

In [61]:
x = Foobar()

<IPython.core.display.Javascript object>

In [62]:
x.blub

'blua'

<IPython.core.display.Javascript object>

In [63]:
foo.blub

'asdf'

<IPython.core.display.Javascript object>