In [10]:
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [45]:
import re

from collections import defaultdict


def split_tokenize(message):
    return message.split()


token_pattern = re.compile(r"(?u)\b\w\w+\b")
standard_tokenizer = token_pattern.findall


def tokenize(message):
    return standard_tokenizer(message.lower())


class NaiveBayes:
    def __init__(self, tokenize=tokenize):
        self.tokenize = tokenize

    def get_label_counts(self, messages):
        label_counts = defaultdict(int)
        for label, text in messages:
            label_counts[label] += 1
        return label_counts

    def get_prior_probabilities(self, label_counts):
        number_of_messages = sum(label_counts.values())
        return {
            label: count / number_of_messages for label, count in label_counts.items()
        }

    def get_word_label_counts(self, messages):
        counts = defaultdict(lambda: defaultdict(int))
        for label, text in messages:
            for word in self.tokenize(text):
                counts[word][label] += 1
        return counts

    def get_number_of_words(self, word_label_counts):
        number_of_words = defaultdict(int)
        for word, counts in word_label_counts.items():
            for label, count in counts.items():
                number_of_words[label] += 1
        return number_of_words

    def fit(self, messages):
        self.prior_probabilities = self.get_prior_probabilities(
            self.get_label_counts(messages)
        )
        self.word_label_counts = self.get_word_label_counts(messages)
        self.number_of_words = self.get_number_of_words(self.word_label_counts)
        return self

    def update_probabilities(self, probabilities, counts_per_label, number_of_words):
        updated_probabilites = {}
        for label, prior_probability in probabilities.items():
            word_count = counts_per_label.get(label, 0.5)
            word_probability = word_count / number_of_words[label]
            updated_probabilites[label] = prior_probability * word_probability
        return updated_probabilites

    def normalize(self, probabilities):
        factor = 1.0 / float(sum(probabilities.values()))
        for name, value in probabilities.items():
            probabilities[name] *= factor
        return probabilities

    def predict(self, message):
        probabilities = dict(self.prior_probabilities)
        for word in self.tokenize(message):
            counts_per_label = self.word_label_counts.get(word, {})
            probabilities = self.update_probabilities(
                probabilities, counts_per_label, self.number_of_words
            )
            probabilities = self.normalize(probabilities)
        return probabilities

    def predict_label(self, message):
        probabilities = self.predict(message)
        return sorted(
            [(prob, label) for label, prob in probabilities.items()], reverse=True
        )[0][1]

<IPython.core.display.Javascript object>

In [46]:
def test_predict():
    train = [
        ("spam", "foo bar baz"),
        ("spam", "foo asdf bsdf"),
        ("ham", "asdf csdf"),
    ]
    test_cases = [
        # message, expected_probabilities
        ("foo", {"ham": (1 / 3) * (0.5 / 2), "spam": (2 / 3) * (2 / 5)})
    ]
    model = NaiveBayes().fit(train)
    for message, expected_probabilites in test_cases:
        probabilities = model.predict(message)
        assert probabilities == expected_probabilites


test_predict()

AssertionError: 

<IPython.core.display.Javascript object>

In [47]:
def test_predict_word_probabilities():
    test_cases = [
        # (probabilities, counts_per_label, number_of_words), expected
        (({}, {}, {}), {}),
        (({"spam": 1}, {"spam": 10}, {"spam": 100}), {"spam": 0.1}),
        (
            ({"spam": 1, "ham": 0.5}, {"spam": 10, "ham": 5}, {"spam": 100, "ham": 50}),
            {"spam": 0.1, "ham": 0.05},
        ),
        # no word count -> make sure probability is > 0
        (({"spam": 1}, {}, {"spam": 100}), {"spam": 0.005}),
    ]
    model = NaiveBayes()
    for (probabilities, counts_per_label, number_of_words), expected in test_cases:
        updated = model.update_probabilities(
            probabilities, counts_per_label, number_of_words
        )
        assert updated == expected


test_predict_word_probabilities()

<IPython.core.display.Javascript object>

In [48]:
def test_initial_probabilities():
    test_cases = [
        # (train, expected_initial_probabilities)
        ([], {}),
        ([("ham", ""), ("spam", "")], {"ham": 0.5, "spam": 0.5}),
        ([(i, "") for i in range(3)], {k: 1 / 3 for k in range(3)}),
    ]
    for train, expected_initial_probabilities in test_cases:
        model = NaiveBayes().fit(train)
        probabilities = model.predict("")
        assert probabilities == expected_initial_probabilities


test_initial_probabilities()

<IPython.core.display.Javascript object>

# Evaluation

In [65]:
import random

random.seed(2021)  # make sure the same messages are choosen between restarts


def split_train_test(messages, test_quote=0.1):
    messages_by_label = defaultdict(list)
    for label, text in messages:
        messages_by_label[label].append((label, text))

    # stratified sampling
    test, train = [], []
    for label, label_messages in messages_by_label.items():
        indices = list(range(len(label_messages)))
        test_len = int(len(indices) * test_quote)
        test_indices = set(random.sample(indices, test_len))
        for index, message in enumerate(label_messages):
            if index in test_indices:
                test.append(message)
            else:
                train.append(message)
    return train, test

<IPython.core.display.Javascript object>

In [66]:
from django_comments import get_model as get_comments_model

Comment = get_comments_model()


def get_messages_from_comments():
    messages = []
    for comment in Comment.objects.all():
        label = "spam" if comment.is_removed else "ham"
        message = f"{comment.name} {comment.title} {comment.comment}"
        messages.append((label, message))
    return messages


messages = get_messages_from_comments()

<IPython.core.display.Javascript object>

In [67]:
train_messages, test_messages = split_train_test(messages, test_quote=0.1)
print(len(train_messages), len(test_messages))
model = NaiveBayes().fit(train_messages)

310 34


<IPython.core.display.Javascript object>

In [68]:
%%time

true_positives = 0
all_observations = len(test_messages)

for label, message in test_messages:
    if model.predict_label(message) == label:
        true_positives += 1

accuracy = true_positives / all_observations
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.941
CPU times: user 17.9 ms, sys: 1.75 ms, total: 19.7 ms
Wall time: 18.1 ms


<IPython.core.display.Javascript object>

In [69]:
outcomes = (("true", "false"), ("positive", "negative"))
possible_results = [f"{a}_{b}" for b in outcomes[1] for a in outcomes[0]]
result_template = dict.fromkeys(possible_results, 0)

labels = set(model.prior_probabilities.keys())
label_results = {label: dict(result_template) for label in labels}
all_observations = len(test_messages)

for label, message in test_messages:
    predicted = model.predict_label(message)
    if label == predicted:
        label_results[label]["true_positive"] += 1
    else:
        label_results[label]["false_negative"] += 1
        label_results[predicted]["false_positive"] += 1

<IPython.core.display.Javascript object>

In [70]:
def precision_recall_f1(result):
    all_observations = sum(result.values())
    tp = result["true_positive"]
    fp = result["false_positive"]
    fn = result["false_negative"]
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1


def show_result(label_results):
    for label, result in label_results.items():
        precision, recall, f1 = precision_recall_f1(result)
        print(
            f"{label: >4} f1: {f1:.3f} precision: {precision:.3f} recall: {recall:.3f}"
        )

<IPython.core.display.Javascript object>

In [71]:
show_result(label_results)

 ham f1: 0.900 precision: 0.818 recall: 1.000
spam f1: 0.958 precision: 1.000 recall: 0.920


<IPython.core.display.Javascript object>

In [55]:
show_result(label_results)

 ham f1: 0.900 precision: 0.818 recall: 1.000
spam f1: 0.958 precision: 1.000 recall: 0.920


<IPython.core.display.Javascript object>

In [64]:
show_result(label_results)

 ham f1: 0.987 precision: 0.986 recall: 0.988
spam f1: 0.912 precision: 0.918 recall: 0.905


<IPython.core.display.Javascript object>

In [38]:
show_result(label_results)

 ham f1: 0.991 precision: 0.996 recall: 0.985
spam f1: 0.941 precision: 0.911 recall: 0.973


<IPython.core.display.Javascript object>

In [56]:
import io
import zipfile
import requests


def get_sms_messages():
    # download zip archive
    training_data_url = (
        "https://d2b7dn9rofvhjd.cloudfront.net/sms-spam-collection-dataset.zip"
    )
    response = requests.get(training_data_url)
    z = zipfile.ZipFile(io.BytesIO(response.content))

    # parse messages
    spam_text = z.read("spam.csv").decode("latin1")
    lines = iter(spam_text.split("\r\n"))
    skipped = next(lines)  # skip first line

    messages = []
    for line in lines:
        line = line.rstrip(",")
        label, *message = line.split(",")
        messages.append((label, " ".join(message)))
    return messages


messages = get_sms_messages()

<IPython.core.display.Javascript object>

In [57]:
train_messages, test_messages = split_train_test(messages, test_quote=0.1)
print(len(train_messages), len(test_messages))
model = NaiveBayes().fit(train_messages)

5016 556


<IPython.core.display.Javascript object>

# False Positives Debug

In [72]:
false_positive = """
Hallo,

vielen Dank für den Tipp mit pywin32. Ich nutze auto-py-to-exe damit meine Kollegen nicht erst python installieren müssen. Ich habe ein pip Update gemacht und plötzlich  konnte ich nicht mehr die exe  Datei nutzen. Ich habe eine erzeugt bekommen. Da es im Visual Studio Code funktioniert hat, habe ich wie immer die alte exe gleich überschrieben. Ich habe drei Stunden geschwitzt um eine funktionierende Version in meinen Ordnern  zu finden. Ich werde es nach meinem Urlaub testen, hoffe euer Tipp funktioniert, da meine Kollegen sonst Python installieren müssen und eine .pyw starten müssen. Bei der exe können sie wenigstens nichts kaputt machen.

Also vielen Dank! Bleibt gesund.
Toller Podcast auch wenn ich nicht immer alles verstehe. :-)

Frage:
Würdet ihr Tkinter oder Django empfehlen wenn man ein Cockpit bauen möchte damit die Kollegen damit arbeiten können? Ich habe eure Django Folge zwar gehört aber war dann fachlich schnell raus. Wir haben keine Netzlaufwerke mehr sondern nur noch Onedrive und Sharepoints. Ich habe ein 3800 Zeilen langes Script geschrieben was tkinter nutzt. Kann man Django einfach so anderen bereitstellen oder benötigt man einen Server wo das laufen muss? Wenn ja dann muss ich bei tkinter bleiben was auch okay ist aber man will ja das Tool auch optisch anheben. :-)

Also wie gesagt, Dankeschön für die tollen Folgen, ich lerne immer etwas dazu und heute war es ganz wichtig mit dem pywin32 Hinweis!!!
"""

<IPython.core.display.Javascript object>

In [73]:
model.predict(false_positive)

{'ham': 0.9999999999999999, 'spam': 7.719642281555638e-247}

<IPython.core.display.Javascript object>

In [31]:
%%time
model.predict_label(false_positive)

CPU times: user 966 µs, sys: 0 ns, total: 966 µs
Wall time: 974 µs


'spam'

<IPython.core.display.Javascript object>

In [43]:
probabilities = dict(model.prior_probabilities)

<IPython.core.display.Javascript object>

In [28]:
probabilities

{'ham': 0.267741935483871, 'spam': 0.7322580645161291}

<IPython.core.display.Javascript object>

In [35]:
1.0 / 0.5

2.0

<IPython.core.display.Javascript object>

In [40]:
def normalize(probabilities):
    factor = 1.0 / float(sum(probabilities.values()))
    for name, value in probabilities.items():
        probabilities[name] *= factor
    return probabilities

<IPython.core.display.Javascript object>

In [37]:
normalize({"spam": 0.2, "ham": 0.0})

{'spam': 1.0, 'ham': 0.0}

<IPython.core.display.Javascript object>

In [38]:
normalize({"spam": 0.2, "ham": 0.1})

{'spam': 0.6666666666666666, 'ham': 0.3333333333333333}

<IPython.core.display.Javascript object>

In [39]:
normalize({"spam": 0.0, "ham": 0.0})

ZeroDivisionError: float division by zero

<IPython.core.display.Javascript object>

In [44]:
for word in model.tokenize(false_positive):
    counts_per_label = model.word_label_counts.get(word, {})
    probabilities = model.update_probabilities(
        probabilities, counts_per_label, model.number_of_words
    )
    print(probabilities)
    probabilities = normalize(probabilities)
    print(word, probabilities)

{'ham': 0.0012519662981877735, 'spam': 0.00014757316898753105}
hallo {'ham': 0.8945559075333699, 'spam': 0.10544409246663011}
{'ham': 0.005378091628457133, 'spam': 2.1250320932412357e-05}
vielen {'ham': 0.9960642757707141, 'spam': 0.0039357242292859295}
{'ham': 0.006653735977092279, 'spam': 7.931729603558907e-07}
dank {'ham': 0.9998808070499849, 'spam': 0.00011919295001504606}
{'ham': 0.021373537625650984, 'spam': 4.804230149739866e-08}
für {'ham': 0.9999977522582199, 'spam': 2.247741780146567e-06}
{'ham': 0.012692022239750285, 'spam': 4.529910883003964e-10}
den {'ham': 0.9999999643089916, 'spam': 3.569100838115142e-08}
{'ham': 0.0006680026481689992, 'spam': 7.192867468994644e-12}
tipp {'ham': 0.9999999892322772, 'spam': 1.0767722869452015e-08}
{'ham': 0.022044087938988075, 'spam': 2.170036853980656e-12}
mit {'ham': 0.9999999999015591, 'spam': 9.84407637899602e-11}
{'ham': 0.00033400133597246464, 'spam': 1.983892861546961e-14}
pywin32 {'ham': 0.9999999999406023, 'spam': 5.9397752277035

version {'ham': 1.0, 'spam': 3.2291404821272726e-72}
{'ham': 0.02404809619238477, 'spam': 1.2885324777533253e-73}
in {'ham': 1.0, 'spam': 5.358147553324245e-72}
{'ham': 0.0013360053440213762, 'spam': 1.0798362662886428e-75}
meinen {'ham': 0.9999999999999999, 'spam': 8.08257445317049e-73}
{'ham': 0.000334001336005344, 'spam': 1.6288944887485872e-76}
ordnern {'ham': 1.0, 'spam': 4.8769100993132705e-73}
{'ham': 0.02738810955243821, 'spam': 9.828516927273822e-77}
zu {'ham': 0.9999999999999999, 'spam': 3.5886072780802217e-75}
{'ham': 0.000668002672010688, 'spam': 7.232179117453087e-79}
finden {'ham': 1.0, 'spam': 1.0826572138827273e-75}
{'ham': 0.05744822979291917, 'spam': 2.1818968437781688e-79}
ich {'ham': 1.0, 'spam': 3.798022761785952e-78}
{'ham': 0.002004008016032064, 'spam': 7.654217577158307e-82}
werde {'ham': 1.0, 'spam': 3.8194545710019956e-79}
{'ham': 0.013360053440213761, 'spam': 7.697409453853276e-83}
es {'ham': 1.0, 'spam': 5.7615109762091765e-81}
{'ham': 0.004008016032064128, 

arbeiten {'ham': 0.9999999999999999, 'spam': 2.9820107308316193e-139}
{'ham': 0.0020040080160320635, 'spam': 6.009695144763441e-143}
können {'ham': 1.0, 'spam': 2.998837877236958e-140}
{'ham': 0.05744822979291917, 'spam': 6.043607168957997e-144}
ich {'ham': 1.0, 'spam': 1.0520092944104792e-142}
{'ham': 0.014696058784235137, 'spam': 2.1201315888965723e-146}
habe {'ham': 1.0, 'spam': 1.4426531766264405e-144}
{'ham': 0.0026720106880427524, 'spam': 2.9074026131125362e-148}
eure {'ham': 0.9999999999999999, 'spam': 1.0880954279573666e-145}
{'ham': 0.005344021376085504, 'spam': 2.1928565658149267e-149}
django {'ham': 1.0, 'spam': 4.1033828487811823e-147}
{'ham': 0.008684034736138945, 'spam': 8.269614769812943e-151}
folge {'ham': 1.0, 'spam': 9.522779469546136e-149}
{'ham': 0.0006680026720106881, 'spam': 1.919141368308371e-152}
zwar {'ham': 0.9999999999999999, 'spam': 2.8729546283576308e-149}
{'ham': 0.000334001336005344, 'spam': 5.789912592417635e-153}
gehört {'ham': 1.0, 'spam': 1.7334998301

{'ham': 0.047428189712758854, 'spam': 1.6079791213177458e-218}
das {'ham': 1.0, 'spam': 3.390344710722064e-217}
{'ham': 0.00033400133600534405, 'spam': 6.832617313023104e-221}
tool {'ham': 0.9999999999999999, 'spam': 2.045685623519117e-217}
{'ham': 0.028724114896459582, 'spam': 4.122703795886975e-221}
auch {'ham': 1.0, 'spam': 1.4352761819634424e-219}
{'ham': 0.00033400133600534405, 'spam': 2.8925356347509924e-223}
optisch {'ham': 0.9999999999999999, 'spam': 8.66025169044447e-220}
{'ham': 0.000334001336005344, 'spam': 1.7453147300371768e-223}
anheben {'ham': 1.0, 'spam': 5.225472301731308e-220}
{'ham': 0.0033400133600534404, 'spam': 3.580533217631287e-222}
also {'ham': 1.0, 'spam': 1.0720116453588071e-219}
{'ham': 0.01002004008016032, 'spam': 2.1604426548948148e-223}
wie {'ham': 0.9999999999999999, 'spam': 2.156121769585025e-221}
{'ham': 0.002672010688042752, 'spam': 4.3452675727227427e-225}
gesagt {'ham': 1.0, 'spam': 1.6262163890914868e-222}
{'ham': 0.00033400133600534405, 'spam': 3.

<IPython.core.display.Javascript object>

In [None]:
    def predict(self, message):
        probabilities = dict(self.prior_probabilities)
        for word in self.tokenize(message):
            counts_per_label = self.word_label_counts.get(word, {})
            probabilities = self.update_probabilities(
                probabilities, counts_per_label, self.number_of_words
            )
        return probabilities