In [61]:
from __future__ import unicode_literals, print_function

import pandas as pd
import spacy
import plac
import random
from pathlib import Path
from spacy.util import minibatch, compounding

nlp = spacy.load('en_core_web_sm')

In [2]:
DATA_SET_PATH = "/opt/luciapp/data/temp/data_set.xlsx"
df = pd.read_excel(DATA_SET_PATH)
df = df.drop('Unnamed: 0', axis=1)

In [6]:
# Prepare the training data such as
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]

## Make the training data

In [62]:
TRAIN_DATA = []
tags = []
no_of_records = 204
for rc in range(0, no_of_records):
    paragraph = df.iloc[rc]['Paragraphs']
    paragraph_list = paragraph.split(' ')
    output_list = df.iloc[rc]['output'].split(' ')
    len_till = 0
    entities = []
    for word_index, word in enumerate(paragraph_list):
        end = len_till + len(word)
        tag = output_list[word_index]
        entities.append((len_till, end, tag))
        len_till += len(word)
        if tag not in tags:
            tags.append(tag)
    TRAIN_DATA.append((paragraph, {"entities": entities}))

In [65]:
# Define custom labels
LABELS = tags[1:]

In [69]:
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)


def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
        
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    for LABEL in LABELS:
        ner.add_label(LABEL)  # add new entity label to entity recognizer
    # Adding extraneous labels shouldn't mess anything up
    ner.add_label("VEGETABLE")
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


In [70]:
main('en_core_web_sm','en_core_web_sm_im' , '/opt/luciapp/data/temp', 100)

Loaded model 'en_core_web_sm'


KeyError: "[E022] Could not find a transition with the name 'B-O' in the NER model."

In [54]:
# Load the model
model = 'en_core_web_sm'
# Setting up the pipeline and entity recognizer.
if model is not None:
    nlp = spacy.load(model)  # load existing spacy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

Loaded model 'en_core_web_sm'


In [60]:
# Get names of other pipes to disable them during training to train
# only NER and update the weights
import random
from spacy.util import minibatch, compounding

n_iter = 100
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, 
                            size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch) 
            # Updating the weights
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print('Losses', losses)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print('Losses', losses)

KeyError: "[E022] Could not find a transition with the name 'U-O' in the NER model."

In [55]:
LABEL = tags[1:]
# Add new entity labels to entity recognizer
for i in LABEL:
    ner.add_label(i)
# Inititalizing optimizer
if model is None:
    optimizer = nlp.begin_training()
else:
    optimizer = nlp.entity.create_optimizer()