In [1]:
# Download the small English spaCy pipeline; train() below loads it with
# spacy.load("en_core_web_sm").
# NOTE(review): `!python` runs whichever interpreter is first on PATH, which is
# presumably (but not necessarily) the kernel's environment - confirm before relying on it.
!python -m spacy download en_core_web_sm

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
import pandas as pd
import numpy as np
import random
from pathlib import Path

import spacy
from spacy.util import minibatch, compounding

In [3]:
# code adapted from https://spacy.io/usage/training

def train(split_data, output_dir=None, n_iter=20):
    """Train a text classifier on top of ``en_core_web_sm`` and optionally save it.

    Written against the spaCy v2 training API (``nlp.create_pipe`` with a
    config dict and ``nlp.update(texts, annotations, ...)``).

    Args:
        split_data: ``((train_texts, train_cats), (dev_texts, dev_cats))``.
            Every ``*_cats`` entry is a dict mapping a label name to a
            true/false-like value. The label set is taken from the keys of
            ``train_cats[0]``.
        output_dir: Directory the trained pipeline is written to. It is
            created, including missing parent directories, when it does not
            exist. ``None`` (the default) skips saving.
        n_iter: Number of passes over the training data.

    Returns:
        None. Per-epoch loss, precision, recall and F-score are printed.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True: nested targets such as "blackbox-models/spam" must work
        # on a fresh checkout where the parent folder does not exist yet.
        output_dir.mkdir(parents=True, exist_ok=True)

    nlp = spacy.load("en_core_web_sm")
    print("Loaded 'en_core_web_sm' model")

    # add the text classifier to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    textcat = nlp.create_pipe(
        "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
    )
    nlp.add_pipe(textcat, last=True)

    (train_texts, train_cats), (dev_texts, dev_cats) = split_data
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            len(train_texts) + len(dev_texts), len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # add textcat labels; sorted so the label order does not depend on
    # per-process string hash randomisation
    for cat in sorted(train_cats[0].keys()):
        textcat.add_label(cat)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        # one shared generator: batch sizes keep growing across epochs
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev split using the averaged parameters
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    if output_dir is not None:
        # persist the averaged parameters rather than the last raw update
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

def evaluate(tokenizer, textcat, texts, cats):
    """Score ``textcat`` predictions against gold labels at a 0.5 threshold.

    Each text is tokenized, run through ``textcat.pipe`` and every predicted
    label that also appears in the matching gold dict ``cats[i]`` is counted
    as a true/false positive/negative. Labels missing from the gold dict are
    ignored.

    Returns:
        dict with keys ``"textcat_p"``, ``"textcat_r"`` and ``"textcat_f"``
        (precision, recall and F-score over all counted labels).
    """
    threshold = 0.5
    # fp / fn start slightly above zero so the divisions below never hit 0 / 0
    counts = {"tp": 0.0, "fp": 1e-8, "fn": 1e-8, "tn": 0.0}

    tokenized = map(tokenizer, texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[index]
        for label, score in doc.cats.items():
            if label not in expected:
                continue
            truth = expected[label]
            if score >= threshold:
                if truth >= threshold:
                    counts["tp"] += 1.0
                elif truth < threshold:
                    counts["fp"] += 1.0
            elif score < threshold:
                if truth < threshold:
                    counts["tn"] += 1
                elif truth >= threshold:
                    counts["fn"] += 1

    tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    total = precision + recall
    f_score = 0.0 if total == 0 else 2 * (precision * recall) / total
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [4]:
def load_data(folder_path):
    """Load ``train.csv`` / ``test.csv`` from a folder and one-hot encode the targets.

    Both files must provide ``text`` and ``target`` columns.

    Args:
        folder_path: Folder containing the two CSV files, given as ``str`` or
            ``os.PathLike``. A trailing path separator is optional.

    Returns:
        ``((train_texts, train_cats), (test_texts, test_cats))`` where the
        texts are the ``text`` columns and each ``*_cats`` is a list with one
        ``{str(label): bool}`` dict per row. The label set comes from the
        training targets only, so a test target unseen in training maps to
        an all-``False`` dict.
    """
    import os  # local import: keeps this cell independent of the imports cell

    # os.path.join adds the separator only when it is missing and accepts Path objects
    train_data = pd.read_csv(os.path.join(folder_path, "train.csv"))
    test_data = pd.read_csv(os.path.join(folder_path, "test.csv"))
    cats = train_data.target.unique().tolist()

    def one_hot(targets):
        # create spacy required one-hot-encoding dict
        return [{str(cat): x == cat for cat in cats} for x in targets]

    return (
        (train_data.text, one_hot(train_data.target)),
        (test_data.text, one_hot(test_data.target)),
    )

In [5]:
# Fit and save one text classifier per dataset folder under res/.
for use_case in ('fake-news', 'hate-speech', 'spam'):
    print('Training model for use case', use_case)
    train(
        load_data('res/{}/'.format(use_case)),
        output_dir='blackbox-models/{}'.format(use_case),
        n_iter=5,
    )

Training model for use case fake-news
Created blank 'en' model
Using 44898 examples (35902 training, 8996 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
0.652	0.999	0.999	0.999
0.001	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.001	1.000	1.000	1.000
0.001	1.000	1.000	1.000
Saved model to blackbox-models\fake-news
Training model for use case hate-speech
Created blank 'en' model
Using 24783 examples (19754 training, 5029 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
11.122	0.898	0.883	0.890
0.105	0.899	0.890	0.894
0.076	0.899	0.890	0.894
0.065	0.897	0.889	0.893
0.057	0.896	0.890	0.893
Saved model to blackbox-models\hate-speech
Training model for use case spam
Created blank 'en' model
Using 5572 examples (4438 training, 1134 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
2.037	0.992	0.992	0.992
0.045	0.993	0.993	0.993
0.007	0.994	0.994	0.994
0.001	0.994	0.994	0.994
0.000	0.994	0.994	0.994
Saved model to blackbox-models\spam
