In [1]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets

In [2]:
import spacy
from spacy.util import minibatch, compounding

In [3]:
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_texts=("Number of texts to train from", "option", "t", int),
    n_iter=("Number of training iterations", "option", "n", int),
    init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
)
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
    print ("train_data: ")
    print (train_data)
    # converts the data to the format:
    # (text, {'cats': {'POSITIVE': True, 'NEGATIVE': False}}))
    
    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                print ("Batch: ")
                print (batch)
                texts, annotations = zip(*batch)
 #               print ("Texts: ")
 #               print (texts)
                # Eliminación del 20% de los casos para evitar generalizaciones
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    test_text = "L'actitud de la professora em resulta molt motivadora."
    
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)



In [4]:
def load_data(limit=0, split=0.8):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    train_data, _ = thinc.extra.datasets.imdb()
    # Array of (text, label)
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    print (train_data[0])
    # split text and label
    texts, labels = zip(*train_data)
#    print ("Texts: ", end="")
#    print (texts[0])
#    print ("Labels: ", end="")
#    print (labels[0])
    # Converts: label=1 -> {"POSITIVE": True, "NEGATIVE": False}
    # label=0 -> {"POSITIVE": False, "NEGATIVE": True}
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
#    print ("cats")
#    print (cats[0])
    split = int(len(train_data) * split)
#    print ("split")
#    print (split)   
    # Return train data and test data
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [5]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}




In [6]:
if __name__ == "__main__":
    plac.call(main)


usage: ipykernel_launcher.py [-h] [-m None] [-o None] [-n 20] [-t 2000]
                             [-t2v None]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/david/.local/share/jupyter/runtime/kernel-ec4f071a-32d2-4735-8683-5506f172a9c0.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [6]:
model = "es_core_news_sm"
output_dir = "../data/processed/"
n_texts = 2
n_iter = 1
init_tok2vec=None


In [7]:
main (model, output_dir, n_iter, n_texts, init_tok2vec)


Loaded model 'es_core_news_sm'
Loading IMDB data...
Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
("Steely, powerful gangster supreme Frankie Diomede (the always terrific Lee Van Cleef in fine rugged form) has himself arrested and sent to prison so he can rub out a traitorous partner sans detection. Fawning goofball small-time hood and wiseguy wannabe Tony Breda (an amiable portrayal by Tony Lo Bianco) gets busted as well. Frank and Tony form an unlikely friendship behind bars. Tony helps Frank break out of the joint and assists him on his quest to exact revenge on a rival group of mobsters lead by the ruthless Louis Annunziata (smoothly played by Jean Rochefort). Director Michele Lupo, working from an absorbing script by Sergio Donati and Luciano Vincenzoni, relates the neat story at a constant brisk pace, sustains a suitably gritty, but occasionally lighthearted tone throughout, and stages the rousing action set pieces with considerable rip-snor

0.125	0.495	1.000	0.662
This movie sucked {'POSITIVE': 0.49531424045562744, 'NEGATIVE': 0.5046857595443726}
Saved model to ../data/processed
Loading from ../data/processed
This movie sucked {'POSITIVE': 0.4969375729560852, 'NEGATIVE': 0.5030624270439148}
