In [1]:
import re
import requests

from collections import defaultdict

from getpass import getpass

from cast.models.moderation import NaiveBayes, SpamFilter, Evaluation

# Get Training Data From Production

In [2]:
# Log in to the podcast site and download the labelled comments used as
# training data. The password is prompted for and never stored in the notebook.
username = "jochen"
password = getpass()
# Use "http://localhost:8000" instead to pull the data from a local dev server.
host = "https://python-podcast.de"
token_url = f"{host}/api/api-token-auth/"
# Explicit timeout: requests waits indefinitely by default.
r = requests.post(token_url, data={"username": username, "password": password}, timeout=30)
# Surface a rejected login as an HTTP error here instead of a KeyError on the
# missing "token" key below.
r.raise_for_status()
token = r.json()["token"]

# training_path = reverse("cast:api:comment-training-data")
training_path = "/api/comment_training_data/"
training_data_url = f"{host}{training_path}"
headers = {"Authorization": f"Token {token}"}
r = requests.get(training_data_url, headers=headers, timeout=30)
print(r.status_code)
# Do not try to parse an error page as training data.
r.raise_for_status()
messages = r.json()
print(len(messages))

········
200
1865


# Evaluate Model

In [3]:
def show_performance(performance):
    """Print one line per label with its f1, precision and recall.

    Args:
        performance: mapping ``{label: {"precision": ..., "recall": ..., "f1": ...}}``.
            Values are formatted with three decimals; labels are right-aligned
            to a width of four characters.
    """
    for label, scores in performance.items():
        precision = scores["precision"]
        recall = scores["recall"]
        f1 = scores["f1"]
        print(f"{label: >4} f1: {f1:.3f} precision: {precision:.3f} recall: {recall:.3f}")

In [4]:
# Evaluate on the downloaded messages and print the per-label metrics.
# NOTE(review): Evaluation() is built without a model_class, so presumably the
# library's default model is scored — confirm in cast.models.moderation.
evaluator = Evaluation()
performance = evaluator.evaluate(messages)
show_performance(performance)

 ham f1: 0.953 precision: 0.982 recall: 0.925
spam f1: 0.997 precision: 0.995 recall: 0.999


# Retrain From Scratch

In [5]:
# Retrain the spam filter on the downloaded messages.
# NOTE(review): `SpamFilter.default` is accessed without a call — presumably a
# property-like accessor returning the stored filter; confirm in cast.models.moderation.
sf = SpamFilter.default
sf.retrain_from_scratch(messages)

In [6]:
# Display the per-label precision/recall/f1 stored on the filter after retraining.
sf.performance

{'ham': {'precision': 0.9739130434782609,
  'recall': 0.9333333333333333,
  'f1': 0.9531914893617022},
 'spam': {'precision': 0.9954233409610984,
  'recall': 0.9982788296041308,
  'f1': 0.9968490403895731}}

# Debug Model

In [36]:
# A token is a run of at least two word characters delimited by word
# boundaries, so single-character words are dropped.
token_pattern = re.compile(r"(?u)\b\w\w+\b")
standard_tokenizer = token_pattern.findall


def regex_tokenize(message):
    """Lower-case ``message`` and return the list of its word tokens."""
    lowered = message.lower()
    return standard_tokenizer(lowered)


def normalize(probabilities):
    """Rescale the values of ``probabilities`` in place so they sum to 1.

    Args:
        probabilities: dict mapping a name to a non-normalized weight.

    Returns:
        The same dict object. If the values sum to zero (including the empty
        dict) nothing can be scaled and the dict is returned unchanged.
    """
    total = float(sum(probabilities.values()))
    if total == 0.0:
        # not possible to scale -> skip
        return probabilities
    factor = 1.0 / total
    for name in probabilities:
        probabilities[name] *= factor
    return probabilities


class NaiveBayes:
    """Naive Bayes classifier for ``(label, text)`` messages.

    Attributes:
        tokenize: callable turning a text into an iterable of words.
        prior_probabilities: ``{label: share of the fitted messages}``.
        word_label_counts: ``{word: {label: number of occurrences}}``.
        number_of_words: ``{label: number of distinct words seen with it}``.
        number_of_all_words: sum of ``number_of_words`` (never 0); the common
            denominator used to turn word counts into probabilities.
    """

    def __init__(self, tokenize=regex_tokenize, prior_probabilities=None, word_label_counts=None):
        """Create an untrained model or restore one from saved state.

        Args:
            tokenize: callable mapping a text to an iterable of words.
            prior_probabilities: optional ``{label: probability}`` of an
                already trained model; a new empty dict when omitted.
            word_label_counts: optional ``{word: {label: count}}`` of an
                already trained model; empty nested counters when omitted.
        """
        self.tokenize = tokenize
        # Build the dict per instance: a literal ``{}`` default would be one
        # object shared by every instance created without this argument.
        self.prior_probabilities = {} if prior_probabilities is None else prior_probabilities
        if word_label_counts is None:
            self.word_label_counts = defaultdict(lambda: defaultdict(int))
        else:
            self.word_label_counts = word_label_counts
        self.number_of_words = self.get_number_of_words(self.word_label_counts)
        self.number_of_all_words = self._total_word_count(self.number_of_words)

    @staticmethod
    def _total_word_count(number_of_words):
        """Sum the per-label word totals, falling back to 1 so the result is a safe divisor."""
        return sum(number_of_words.values()) or 1

    @staticmethod
    def get_label_counts(messages):
        """Return ``{label: number of messages carrying that label}``."""
        label_counts = defaultdict(int)
        for label, _text in messages:
            label_counts[label] += 1
        return label_counts

    def set_prior_probabilities(self, label_counts):
        """Replace the priors with each label's share of ``label_counts``."""
        number_of_messages = sum(label_counts.values())
        self.prior_probabilities = {label: count / number_of_messages for label, count in label_counts.items()}

    def set_word_label_counts(self, messages):
        """Add every token of every message to the existing ``word_label_counts``."""
        counts = self.word_label_counts
        for label, text in messages:
            for word in self.tokenize(text):
                counts[word][label] += 1

    @staticmethod
    def get_number_of_words(word_label_counts):
        """Return ``{label: number of words that have an entry for that label}``."""
        number_of_words = defaultdict(int)
        for counts in word_label_counts.values():
            for label in counts:
                number_of_words[label] += 1
        return number_of_words

    def fit(self, messages):
        """Train on ``messages`` (iterated twice, so pass a list) and return ``self``.

        Priors are replaced by those of ``messages``, while word counts are
        added on top of whatever the instance already holds.
        """
        self.set_prior_probabilities(self.get_label_counts(messages))
        self.set_word_label_counts(messages)
        self.number_of_words = self.get_number_of_words(self.word_label_counts)
        # Never 0: otherwise predict() would divide by zero when no message
        # produced a single token.
        self.number_of_all_words = self._total_word_count(self.number_of_words)
        return self

    @staticmethod
    def update_probabilities(probabilities, counts_per_label, number_of_all_words):
        """Multiply each label's probability by the word's probability for it.

        A label without a count for the word gets the pseudo-count 0.5, so an
        unseen word never zeroes a label out.
        """
        updated_probabilities = {}
        for label, prior_probability in probabilities.items():
            word_count = counts_per_label.get(label, 0.5)
            word_probability = word_count / number_of_all_words
            updated_probabilities[label] = prior_probability * word_probability
        return updated_probabilities

    def predict(self, message):
        """Return ``{label: probability}`` for ``message``.

        Starts from a copy of the priors and, for every token, applies
        ``update_probabilities`` followed by ``normalize``.
        """
        probabilities = dict(self.prior_probabilities)
        for word in self.tokenize(message):
            counts_per_label = self.word_label_counts.get(word, {})
            probabilities = normalize(self.update_probabilities(probabilities, counts_per_label, self.number_of_all_words))
        return probabilities

    def predict_label(self, message):
        """Return the most probable label, or ``None`` for a model without labels.

        Ties in probability are resolved in favour of the greater label.
        """
        probabilities = self.predict(message)
        if not probabilities:
            return None
        # max() over (prob, label) pairs picks the same pair a descending sort
        # would put first, without sorting the whole list.
        return max((prob, label) for label, prob in probabilities.items())[1]

    def dict(self):
        """Return the model state as a dict (the stored objects, not copies)."""
        return {
            "class": "NaiveBayes",
            "prior_probabilities": self.prior_probabilities,
            "word_label_counts": self.word_label_counts,
        }

    def __eq__(self, other):
        """Two models are equal when their priors and word counts are equal."""
        try:
            return (
                self.prior_probabilities == other.prior_probabilities
                and self.word_label_counts == other.word_label_counts
            )
        except AttributeError:
            # ``other`` is not model-like: let Python fall back to its default
            # comparison instead of raising from ``==``.
            return NotImplemented

# Train the NaiveBayes class defined in this cell (it shadows the one imported
# from cast.models.moderation) on all downloaded messages.
nb = NaiveBayes().fit(messages)

In [None]:
# Inspect the per-label word totals computed by fit().
nb.number_of_words

In [37]:
# Re-run the evaluation with the notebook-local NaiveBayes class so that edits
# made to it in this notebook are reflected in the metrics.
evaluator = Evaluation(model_class=NaiveBayes)
performance = evaluator.evaluate(messages)
show_performance(performance)

 ham f1: 0.953 precision: 0.974 recall: 0.933
spam f1: 0.997 precision: 0.995 recall: 0.998


In [33]:
# nb.predict_label(messages[35][1])

In [22]:
# Share of each label among the fitted messages.
nb.prior_probabilities

{'ham': 0.06541554959785523, 'spam': 0.9345844504021448}

In [21]:
# Fit on all messages, then look for the first spam message that the model
# itself does not label as spam (an in-sample check: the same messages are
# used for fitting and predicting).
nb = NaiveBayes().fit(messages)
for i, (label, message) in enumerate(messages):
    predicted = nb.predict_label(message)
    if label == "spam" and (predicted != label):
        # Print the offending text and stop; `i` keeps its index for follow-up inspection.
        print(message)
        break

Cibulja40  преобразователь частоты кварцевых резонаторов . Модерируемый рейтинг . Электронный блок питания транзисторов на запуске двигатель постоянного тока через пару контактов . Также большое количество товаров , но несколько оборотов вентилятора частоты являются настоящими любителями аппаратуры . И благодаря интеллектуальному управлению , пригонке муфт , когда из двух головных станций различного  
<a href=https://prom-electric.ru/articles/1/71/>как работает частотный преобразователь</a>  преобразователь частоты осуществлялась с нашими менеджерами нашей компании в системе водоснабжения , устойчивого к условиям , что их на повышение эластичности кожи или равно , имеет компактные размеры , благодаря встроенному позиционеру . Периодические испытания , изготовлен из перечисленных недостатков невозможность получить контакт с контроллерами отсутствие ограничения по цене .  
преобразователь частоты . Ранее все полученные данные любая из импортных . Так как привод . Таким образом , связанны