In [None]:
%config InlineBackend.figure_format = 'retina'

In [None]:
%load_ext autoreload

%autoreload 1
%aimport ds_tutorial.datasets

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prepare Data

## Download Dataset

In [None]:
from ds_tutorial.utils import download_from_url

In [None]:
from pathlib import Path

root = Path.home()
archive_name = "sms-spam-collection-dataset.zip"
training_data_url = "https://d2b7dn9rofvhjd.cloudfront.net/{}".format(archive_name)
data_root = root / "data" / "tmp"
data_root.mkdir(parents=True, exist_ok=True)
training_data_path = data_root / archive_name

In [None]:
file_size = download_from_url(training_data_url, training_data_path)

HBox(children=(IntProgress(value=0, description='sms-spam-collection-dataset.zip', max=212825, style=ProgressS…




## Unpack Archive

In [None]:
import zipfile

In [None]:
%%time
# Unpack the downloaded archive into data_root. The context manager closes the
# zip file handle even when extraction raises.
with zipfile.ZipFile(str(training_data_path), 'r') as archive:
    archive.extractall(str(data_root))

CPU times: user 4.16 ms, sys: 1.99 ms, total: 6.15 ms
Wall time: 6.65 ms


In [None]:
training_data_path.unlink()

In [None]:
data_csv_path = list(data_root.glob("*spam*"))[0]

## Parse CSV

In [None]:
import pandas as pd

pd.set_option("max_colwidth", 150)

In [None]:
# Load only the first two columns and give them readable names.
# `header=0` discards exactly the first line of the file in favour of `names`;
# with `header=1` pandas treats the *second* line as the header row and the
# first message is silently dropped.
# NOTE(review): assumes the CSV has a single header line - confirm against the file.
df = pd.read_csv(
    data_csv_path, encoding="latin1", usecols=[0, 1],
    names=["label", "message"], header=0
)

In [None]:
# One (label, message) tuple per row. Zipping the two columns avoids
# `to_dict(orient="rows")`: "rows" is not a documented orient value and newer
# pandas versions reject it.
messages = list(zip(df["label"], df["message"]))

In [None]:
messages[:5]

[('ham', 'Ok lar... Joking wif u oni...'),
 ('spam',
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"),
 ('ham', 'U dun say so early hor... U c already then say...'),
 ('ham', "Nah I don't think he goes to usf, he lives around here though"),
 ('spam',
  "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv")]

## Split Train/Test

In [None]:
import random

def split_train_test(messages, test_quote=0.25, seed=None):
    """Randomly partition ``messages`` into a training and a test list.

    Every item is assigned independently: it lands in the test list with
    probability ``test_quote`` and in the training list otherwise, so the
    resulting sizes only approximate the requested ratio.

    Args:
        messages: iterable of ``(label, message)`` pairs.
        test_quote: probability of assigning an item to the test list.
        seed: optional seed for a private random generator, making the split
            reproducible. With the default ``None`` the module-level
            ``random`` state is used.

    Returns:
        ``(train, test)``: two lists of ``(label, message)`` tuples, each in
        the original relative order.
    """
    rng = random if seed is None else random.Random(seed)
    train, test = [], []
    for label, message in messages:
        bucket = test if rng.random() < test_quote else train
        bucket.append((label, message))
    return train, test

In [None]:
train_messages, test_messages = split_train_test(messages)
print(len(train_messages), len(test_messages))

4145 1426


# Train Model

In [None]:
import re

from collections import defaultdict

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

# Tokens are runs of two or more word characters; single characters and
# punctuation are dropped. This is the same pattern the TfidfVectorizer used
# further down reports as its `token_pattern`.
token_pattern = re.compile(r"(?u)\b\w\w+\b")
standard_tokenizer = token_pattern.findall


def tokenize(message, lowercase=False, stop_words=None):
    """Split ``message`` into word tokens.

    The defaults reproduce the plain ``standard_tokenizer`` behaviour. The
    optional switches cover the variants compared in the evaluation section so
    they no longer require editing this function.
    (A pure whitespace split would simply be ``message.split()``.)

    Args:
        message: the text to tokenize.
        lowercase: if true, every returned token is lower-cased.
        stop_words: optional collection of lower-case words to drop; tokens
            are compared case-insensitively (e.g. pass ``ENGLISH_STOP_WORDS``).

    Returns:
        list of token strings in order of appearance.
    """
    tokens = standard_tokenizer(message)
    if stop_words:
        tokens = [w for w in tokens if w.lower() not in stop_words]
    if lowercase:
        tokens = [w.lower() for w in tokens]
    return tokens

In [None]:
def get_initial_probabilities(total):
    """Convert per-label observation counts into relative frequencies.

    ``total`` maps label -> number of observations. The result maps every
    label to its share of all observations; an empty mapping gives ``{}``.
    """
    all_observations = sum(total.values())
    return {
        label: observations / all_observations
        for label, observations in total.items()
    }


def get_words_per_label(counts):
    """Count, per label, how many distinct words were observed with it.

    ``counts`` maps word -> {label: occurrences}. Each word adds exactly 1 to
    every label it appears under, no matter how often it occurred, so the
    result is a per-label vocabulary size rather than a token total.

    Returns a ``defaultdict(int)`` keyed by label.
    """
    vocabulary_sizes = defaultdict(int)
    for label_counts in counts.values():
        for label in label_counts:
            vocabulary_sizes[label] += 1
    return vocabulary_sizes


def build_model(counts, total):
    """Bundle the training statistics into the dict that ``predict`` consumes.

    Args:
        counts: mapping word -> {label: occurrences}.
        total: mapping label -> number of training messages.

    Returns:
        dict with the keys ``counts``, ``labels``, ``words_per_label``,
        ``minimal_probability`` and ``initial_probabilities``.

    Raises:
        ZeroDivisionError: if ``counts`` contains no words at all.
    """
    words_per_label = get_words_per_label(counts)
    # Fallback frequency for a word never seen with a label: one over the sum
    # of all per-label vocabulary sizes.
    minimal_probability = 1. / sum(words_per_label.values())
    model = dict(
        counts=counts,
        labels=list(total),
        words_per_label=words_per_label,
        minimal_probability=minimal_probability,
        initial_probabilities=get_initial_probabilities(total),
    )
    return model

In [None]:
#def train(messages):
#    counts, total = {}, {}    
#    for label, message in messages:
#        total[label] = total.get(label, 0) + 1
#        for word in tokenize(message):
#            label_counts = counts.get(word, {})
#            label_counts[label] = label_counts.get(label, 0) + 1
#            counts[word] = label_counts
#    return build_model(counts, total)
#
## counts.setdefault(word, {})[label] = counts.get(word, {}).get(label, 0) + 1

In [None]:
def train(messages):
    """Collect word/label statistics from ``messages`` and build the model.

    Args:
        messages: iterable of ``(label, message)`` pairs.

    Returns:
        The model dict produced by ``build_model``.
    """
    # label -> number of messages carrying that label
    label_totals = defaultdict(int)
    # word -> label -> number of times the word occurred under that label
    word_label_counts = defaultdict(lambda: defaultdict(int))
    for label, message in messages:
        label_totals[label] += 1
        for token in tokenize(message):
            word_label_counts[token][label] += 1
    return build_model(word_label_counts, label_totals)

In [None]:
%%time
model = train(train_messages)

CPU times: user 51.1 ms, sys: 3.5 ms, total: 54.6 ms
Wall time: 52.9 ms


# Predict Label

In [None]:
def normalize(label_results):
    """Rescale the values of ``label_results`` in place so they sum to 1.

    When the values sum to zero (which includes the empty dict) they are left
    unscaled. The very same dict object that was passed in is returned.
    """
    total = sum(label_results.values())
    try:
        scale_factor = 1. / total
    except ZeroDivisionError:
        # Nothing sensible to scale by; keep the values as they are.
        scale_factor = 1
    label_results.update(
        {label: probability * scale_factor for label, probability in label_results.items()}
    )
    return label_results


def predict(
    message,
    labels=[],
    counts={},
    words_per_label={},
    minimal_probability=None,
    initial_probabilities={},
):
    """Return normalized per-label scores for ``message``.

    For every token and every label the score is multiplied by the token's
    frequency under that label (``counts[word][label] / words_per_label[label]``)
    or by ``minimal_probability`` when the token was never seen with the
    label. Scores start from ``initial_probabilities[label]``.

    The product is accumulated as a sum of logarithms. Multiplying the raw
    frequencies underflows to 0.0 for long messages, which makes every label
    score 0 and the final decision arbitrary.

    Args:
        message: text to classify.
        labels: labels to score.
        counts: mapping word -> {label: occurrences}.
        words_per_label: mapping label -> divisor for the word frequency.
        minimal_probability: frequency used for unseen word/label pairs.
        initial_probabilities: mapping label -> starting probability.

    Returns:
        dict label -> score, rescaled by ``normalize``. If the message yields
        no tokens (or ``labels`` is empty) the normalized initial
        probabilities are returned.

    The mutable defaults are kept for signature compatibility; they are only
    read, never modified.
    """
    import math  # local import so the function does not rely on another cell

    def _log(probability):
        # log(0) is undefined; treat a zero probability as -infinity so the
        # label ends up with a score of exactly 0.
        return math.log(probability) if probability > 0 else float("-inf")

    log_scores = {}
    for word in tokenize(message):
        counts_per_label = counts.get(word, {})
        for label in labels:
            if label in counts_per_label:
                label_frequency = counts_per_label[label] / words_per_label[label]
            else:
                label_frequency = minimal_probability
            if label not in log_scores:
                log_scores[label] = _log(initial_probabilities[label])
            log_scores[label] += _log(label_frequency)

    if not log_scores:
        return normalize(dict(initial_probabilities))

    best = max(log_scores.values())
    if best == float("-inf"):
        # Every label has probability zero.
        label_results = dict.fromkeys(log_scores, 0.0)
    else:
        # Shift by the best score before exponentiating so the largest value
        # becomes exp(0) == 1 and cannot underflow.
        label_results = {
            label: math.exp(score - best) for label, score in log_scores.items()
        }
    return normalize(label_results)


def predict_label(message, **model):
    """Return the single highest-scoring label for ``message``.

    ``model`` is forwarded unchanged to ``predict``. When two labels have the
    same score, the label that compares greater wins.
    """
    probabilities = predict(message, **model)
    ranked = sorted(
        ((probability, label) for label, probability in probabilities.items()),
        reverse=True,
    )
    _, best_label = ranked[0]
    return best_label

In [None]:
my_spam_message = "renew your subscription for free now"
my_ham_message = "remember to go to church on sunday"

In [None]:
%%time
result = predict(my_ham_message, **model)
for label, probability in result.items():
    print(f"{label:>4}: {probability:.8f}")

 ham: 0.99998820
spam: 0.00001180
CPU times: user 73 µs, sys: 8 µs, total: 81 µs
Wall time: 83 µs


In [None]:
print(predict_label(my_ham_message, **model))

ham


In [None]:
messages

[('ham', 'Ok lar... Joking wif u oni...'),
 ('spam',
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"),
 ('ham', 'U dun say so early hor... U c already then say...'),
 ('ham', "Nah I don't think he goes to usf, he lives around here though"),
 ('spam',
  "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"),
 ('ham',
  'Even my brother is not like to speak with me. They treat me like aids patent.'),
 ('ham',
  "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"),
 ('spam',
  'WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'),
 ('spam',
  'Had your mobile 11 months

# Evaluate

In [None]:
%%time

# Despite its name this counts every correct prediction, whatever the label,
# i.e. the number of hits used for plain accuracy.
true_positives = 0
all_observations = len(test_messages)

for label, message in test_messages:
    if predict_label(message, **model) == label:
        true_positives += 1

# Accuracy: share of test messages whose predicted label equals the gold label.
accuracy = true_positives / all_observations
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.975
CPU times: user 26.9 ms, sys: 772 µs, total: 27.7 ms
Wall time: 27.1 ms


In [None]:
# Build the four counter names: true_positive, false_positive, true_negative,
# false_negative, each starting at 0.
outcomes = (("true", "false"), ("positive", "negative"))
possible_results = [f"{a}_{b}" for b in outcomes[1] for a in outcomes[0]]
result_template = dict.fromkeys(possible_results, 0)

# One independent copy of the counters per label known to the model.
labels = set(model["labels"])
label_results = {label: dict(result_template) for label in labels}
# Not read again in this cell.
all_observations = len(test_messages)

for label, message in test_messages:
    predicted = predict_label(message, **model)
    if label == predicted:
        label_results[label]["true_positive"] += 1
    else:
        # A miss is a false negative for the gold label and a false positive
        # for the label that was predicted instead.
        label_results[label]["false_negative"] += 1
        label_results[predicted]["false_positive"] += 1
# Note: "true_negative" is never incremented here; it stays 0 for every label.

In [None]:
def precision_recall_f1(result):
    """Compute precision, recall and F1 from one label's outcome counters.

    Args:
        result: mapping with the keys ``true_positive``, ``false_positive``
            and ``false_negative``.

    Returns:
        ``(precision, recall, f1)``. A metric whose denominator is zero (for
        example precision of a label that was never predicted) is reported as
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    tp = result["true_positive"]
    fp = result["false_positive"]
    fn = result["false_negative"]
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall == 0:
        return precision, recall, 0.0
    f1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1


def show_result(label_results):
    """Print one line with F1, precision and recall for every label.

    ``label_results`` maps label -> outcome counters as accepted by
    ``precision_recall_f1``.
    """
    for label in label_results:
        precision, recall, f1 = precision_recall_f1(label_results[label])
        line = f"{label: >4} f1: {f1:.3f} precision: {precision:.3f} recall: {recall:.3f}"
        print(line)

In [None]:
# just split
show_result(label_results)

spam f1: 0.933 precision: 0.949 recall: 0.918
 ham f1: 0.990 precision: 0.988 recall: 0.993


In [None]:
# standard tokenizer
show_result(label_results)

 ham f1: 0.986 precision: 0.974 recall: 0.998
spam f1: 0.907 precision: 0.983 recall: 0.842


In [None]:
# standard tokenizer
show_result(label_results)

 ham f1: 0.986 precision: 0.974 recall: 0.998
spam f1: 0.907 precision: 0.983 recall: 0.842


In [None]:
# standard tokenizer + lower
show_result(label_results)

 ham f1: 0.989 precision: 0.988 recall: 0.990
spam f1: 0.934 precision: 0.939 recall: 0.930


In [None]:
# standard tokenizer + stopword removal
show_result(label_results)

 ham f1: 0.988 precision: 0.988 recall: 0.987
spam f1: 0.925 precision: 0.920 recall: 0.930


In [None]:
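# TODO: the results above are unexpected (original note: "WTF?") - investigate.
# Reference implementations to compare against:
# https://towardsdatascience.com/spam-classifier-in-python-from-scratch-27a98ddd8e73
# https://github.com/tejank10/Spam-or-Ham/blob/master/spam_ham.ipynb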

# Tests

In [None]:
# Sanity check of `train` on a tiny hand-made corpus. The `toy_*` names keep
# this cell from overwriting the real `messages` dataset and trained `model`.
toy_messages = [
    ("ham", "foo bar baz"),
    ("ham", "foo bar blub"),
    ("spam", "bla blub blubber"),
    ("spam", "bla asdf bsdf"),
]

toy_model = train(toy_messages)
assert toy_model["counts"]["foo"]["ham"] == 2
assert toy_model["counts"]["bla"]["spam"] == 2
# Distinct words per label: ham -> foo, bar, baz, blub
assert toy_model["words_per_label"]["ham"] == 4
# spam -> bla, blub, blubber, asdf, bsdf
assert toy_model["words_per_label"]["spam"] == 5
assert toy_model["initial_probabilities"] == {'ham': 0.5, 'spam': 0.5}

# Scikit-Learn

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

In [None]:
def to_docs_and_labels(messages):
    """Split ``(label, message)`` pairs into parallel document and label lists.

    Returns:
        ``(docs, labels)`` where ``docs`` holds the message texts and
        ``labels`` holds a one-element list ``[label]`` per message.
    """
    # Materialize once so one-shot iterables are consumed a single time.
    pairs = [(message, [label]) for label, message in messages]
    docs = [doc for doc, _ in pairs]
    labels = [wrapped for _, wrapped in pairs]
    return docs, labels

train_docs, train_labels = to_docs_and_labels(train_messages)
test_docs, test_labels = to_docs_and_labels(test_messages)

# Fit the label encoding once on the training labels and reuse it for the test
# labels. Fitting a second binarizer on the test set could yield a different
# column layout if the test split lacked one of the classes.
label_binarizer = MultiLabelBinarizer()
y_train = label_binarizer.fit_transform(train_labels)
y_test = label_binarizer.transform(test_labels)

## Naive Bayes from sklearn

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

In [None]:
vectorizer = TfidfVectorizer()
#vectorizer = CountVectorizer()
vectorizer.fit(train_docs)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
X_train = vectorizer.transform(train_docs)
X_test = vectorizer.transform(test_docs)

In [None]:
#model = OneVsRestClassifier(GaussianNB())
model = OneVsRestClassifier(MultinomialNB())

In [None]:
model.fit(X_train.toarray(), y_train)
y_pred = model.predict(X_test.toarray())

In [None]:
print(classification_report(y_test, y_pred, target_names=["ham", "spam"], labels=[0, 1], digits=3))

              precision    recall  f1-score   support

         ham      0.944     1.000     0.971      1223
        spam      1.000     0.640     0.781       203

   micro avg      0.949     0.949     0.949      1426
   macro avg      0.972     0.820     0.876      1426
weighted avg      0.952     0.949     0.944      1426
 samples avg      0.949     0.949     0.949      1426



## Linear Classifiers (still kind of SOTA)

In [None]:
vectorizer = TfidfVectorizer()
vectorizer.fit(train_docs)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
X_train = vectorizer.transform(train_docs)
X_test = vectorizer.transform(test_docs)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
#model = OneVsRestClassifier(LogisticRegression(C=100, solver="liblinear", multi_class="ovr"))
model = OneVsRestClassifier(LogisticRegression(solver="liblinear", multi_class="ovr"))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, target_names=["ham", "spam"], labels=[0, 1], digits=3))

              precision    recall  f1-score   support

         ham      0.962     0.999     0.980      1223
        spam      0.994     0.764     0.864       203

   micro avg      0.966     0.966     0.966      1426
   macro avg      0.978     0.881     0.922      1426
weighted avg      0.967     0.966     0.964      1426
 samples avg      0.966     0.966     0.966      1426



### Linear Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC
model = OneVsRestClassifier(LinearSVC())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, target_names=["ham", "spam"], labels=[0, 1], digits=3))

              precision    recall  f1-score   support

         ham      0.984     0.999     0.991      1223
        spam      0.995     0.901     0.946       203

   micro avg      0.985     0.985     0.985      1426
   macro avg      0.989     0.950     0.969      1426
weighted avg      0.985     0.985     0.985      1426
 samples avg      0.985     0.985     0.985      1426



# SpaCy

In [None]:
import spacy

from spacy.util import minibatch, compounding

import thinc.extra.datasets

In [None]:
def evaluate(tokenizer, textcat, texts, cats):
    """Score ``textcat`` on held-out texts and return precision/recall/F.

    Args:
        tokenizer: callable turning a text into a doc accepted by ``textcat.pipe``.
        textcat: object whose ``pipe(docs)`` yields docs carrying a ``cats``
            mapping of label -> score.
        texts: the texts to classify.
        cats: sequence aligned with ``texts``; item ``i`` is a dict whose
            ``"cats"`` entry maps label -> gold value.

    Returns:
        dict with the keys ``textcat_p``, ``textcat_r`` and ``textcat_f``.

    A score or gold value of at least 0.5 counts as positive. The label
    "ham" and labels missing from the gold annotation are ignored, so the
    figures describe the remaining label(s) only.
    """
    true_pos, true_neg = 0.0, 0.0
    # The error counts start slightly above zero so the divisions below never
    # divide zero by zero.
    false_pos, false_neg = 1e-8, 1e-8
    tokenized = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        gold = cats[index]["cats"]
        for label, score in doc.cats.items():
            if label == "ham" or label not in gold:
                continue
            predicted_pos, predicted_neg = score >= 0.5, score < 0.5
            gold_pos, gold_neg = gold[label] >= 0.5, gold[label] < 0.5
            if predicted_pos and gold_pos:
                true_pos += 1.0
            elif predicted_pos and gold_neg:
                false_pos += 1.0
            elif predicted_neg and gold_neg:
                true_neg += 1
            elif predicted_neg and gold_pos:
                false_neg += 1
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    denominator = precision + recall
    f_score = 0.0 if denominator == 0 else 2 * (precision * recall) / denominator
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [None]:
#nlp = spacy.load("en")
nlp = spacy.load('en_vectors_web_lg')

In [None]:
textcat = nlp.create_pipe(
    "textcat",
    config={
        "exclusive_classes": True,
        "architecture": "simple_cnn",
    }
)
nlp.add_pipe(textcat, last=True)

In [None]:
textcat.add_label("ham")
textcat.add_label("spam")

1

In [None]:
dev_texts = [m for l, m in test_messages]
dev_cats = [{"cats": {"ham": l == "ham", "spam": l == "spam"}} for l, m in test_messages]

In [None]:
train_data = [(m, {"cats": {"ham": l == "ham", "spam": l == "spam"}}) for l, m in train_messages]

In [None]:
train_data[:5]

[('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}}),
 ('U dun say so early hor... U c already then say...',
  {'cats': {'ham': True, 'spam': False}}),
 ("Nah I don't think he goes to usf, he lives around here though",
  {'cats': {'ham': True, 'spam': False}}),
 ("FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv",
  {'cats': {'ham': False, 'spam': True}})]

In [None]:
def train_spacy(nlp, textcat, train_data, dev_texts, dev_cats, other_pipes, n_iter=6):
    """Train the text categorizer and print loss/precision/recall/F per epoch.

    Args:
        nlp: the spaCy pipeline containing ``textcat``.
        textcat: the text categorizer component being trained.
        train_data: sequence of ``(text, annotations)`` pairs.
        dev_texts: held-out texts passed to ``evaluate`` after every epoch.
        dev_cats: gold annotations aligned with ``dev_texts``.
        other_pipes: names of pipeline components to disable while training.
        n_iter: number of passes over the training data.

    Returns:
        None; progress is printed as a table.
    """
    # Shuffle a private copy so the caller's list keeps its original order.
    train_data = list(train_data)

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the held-out dev texts using the averaged parameters
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

In [None]:
# plain en model
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
train_spacy(nlp, textcat, train_data, dev_texts, dev_cats, other_pipes)

Training the model...
LOSS 	  P  	  R  	  F  
2.059	0.983	0.934	0.958
0.049	0.988	0.940	0.963
0.006	0.983	0.951	0.966
0.000	0.983	0.951	0.966
0.000	0.983	0.940	0.961
0.000	0.977	0.945	0.961


In [None]:
# en with embeddings model
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
train_spacy(nlp, textcat, train_data, dev_texts, dev_cats, other_pipes)

Training the model...
LOSS 	  P  	  R  	  F  
2.176	0.961	0.945	0.953
0.062	0.972	0.940	0.955
0.007	0.977	0.945	0.961
0.000	0.983	0.945	0.964
0.000	0.989	0.945	0.966
0.000	0.989	0.945	0.966


In [None]:
test_text = "winner!! as a valued customer you have been selected"
doc = nlp(test_text)
print(test_text, doc.cats)

winner!! as a valued customer you have been selected {'ham': 0.023861024528741837, 'spam': 0.9761389493942261}
