In [1]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

#### Training the Named Entity Recognizer (NER)
##### Adding An Additional Entity (NER)
+ Load the model
    + spacy.load('en')
     - Disable the existing pipeline components other than NER (nlp.disable_pipes)
    + spacy.blank('en')
     - Add the entity recognizer to the pipeline (nlp.create_pipe, nlp.add_pipe)
+ Add each label, e.g. ner.add_label(LABEL), then initialize the weights with nlp.begin_training()
+ Shuffle and loop over the examples
     - update the model (nlp.update)
+ Save the trained model (nlp.to_disk)
+ Test

In [2]:
# Training examples as (text, {"entities": [(start, end, label), ...]}) pairs.
# start/end are character offsets into the text (end exclusive), so that
# text[start:end] is exactly the entity string with no surrounding whitespace.
# spaCy expects each span to line up with token boundaries; a span that ends
# on a space or past the end of the text cannot be aligned to tokens.
# Every time expression is tagged TIME and every place is tagged LOCATION so
# the examples do not contradict each other.
TRAIN_DATA = [
    (
        "Please book a cab from airport to hauz khaas at 3 PM",
        {"entities": [(23, 30, "LOCATION"), (34, 44, "LOCATION"), (48, 52, "TIME")]},
    ),
    (
        "Kindly book a cab for me at 1 PM from hauz khaas to dwarka sector 23",
        {"entities": [(28, 32, "TIME"), (38, 48, "LOCATION"), (52, 68, "LOCATION")]},
    ),
    (
        "Please book a cab from dwarka sector 23 to dwarka sector 21 at 3 PM",
        {"entities": [(23, 39, "LOCATION"), (43, 59, "LOCATION"), (63, 67, "TIME")]},
    ),
    (
        "I want to go to dwarka sector 23 from dwarka sector 21 leaving at 10 AM",
        {"entities": [(16, 32, "LOCATION"), (38, 54, "LOCATION"), (66, 71, "TIME")]},
    ),
    (
        "Kindly book a cab for me at 12 AM from dwarka sector 21 to airport",
        # (28, 33) is "12 AM" and (59, 66) is "airport".
        {"entities": [(28, 33, "TIME"), (39, 55, "LOCATION"), (59, 66, "LOCATION")]},
    ),
    (
        "hauz khaas to airport at 6 PM",
        # The text is 29 characters long: "hauz khaas" is [0, 10), "6 PM" is [25, 29).
        {"entities": [(0, 10, "LOCATION"), (14, 21, "LOCATION"), (25, 29, "TIME")]},
    ),
]

In [3]:

def _print_predictions(nlp, examples):
    """Run ``nlp`` on each example's text and print its entities and per-token tags."""
    for text, _ in examples:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    Args:
        model: Name or path handed to ``spacy.load``. When None, a blank
            "en" pipeline is created and its weights are initialized with
            ``nlp.begin_training()``.
        output_dir: Directory to save the trained pipeline to. When None,
            nothing is written to disk. When given, the pipeline is saved,
            reloaded from that directory, and run over TRAIN_DATA again.
        n_iter: Number of shuffled passes over TRAIN_DATA.

    Prints the accumulated losses after every pass and the predicted
    entities/tokens for each training text after training.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add every label that occurs in the training annotations
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Shuffle a private copy so the module-level TRAIN_DATA keeps its order
    # for other cells and for the reporting below.
    train_data = list(TRAIN_DATA)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly - but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    _print_predictions(nlp, TRAIN_DATA)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path does not fail when an
            # intermediate directory is missing
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _print_predictions(nlp2, TRAIN_DATA)

In [4]:
# Train with main()'s defaults: model=None (blank 'en' pipeline), n_iter=100,
# output_dir=None.
# NOTE(review): with output_dir=None, main() skips its save step, so this run
# writes no model to disk. A later cell loads a model from a fixed directory
# with spacy.load - confirm where that saved model comes from.
main()

Created blank 'en' model
Losses {'ner': 67.61615478992462}
Losses {'ner': 65.27188408374786}
Losses {'ner': 60.89177322387695}
Losses {'ner': 54.057390093803406}
Losses {'ner': 47.42455333471298}
Losses {'ner': 38.15659460425377}
Losses {'ner': 31.4411893337965}
Losses {'ner': 25.253413321916014}
Losses {'ner': 24.792237991001457}
Losses {'ner': 27.08376437611878}
Losses {'ner': 28.38225531601347}
Losses {'ner': 26.933002838632092}
Losses {'ner': 24.60131986020133}
Losses {'ner': 25.887421667575836}
Losses {'ner': 21.633119106292725}
Losses {'ner': 23.578279107809067}
Losses {'ner': 24.97397291660309}
Losses {'ner': 24.09725451655686}
Losses {'ner': 21.32455402240157}
Losses {'ner': 25.54311622492969}
Losses {'ner': 23.96941846422851}
Losses {'ner': 19.532400651834905}
Losses {'ner': 21.45470502972603}
Losses {'ner': 23.07109503215179}
Losses {'ner': 24.241117200930603}
Losses {'ner': 26.65587829548167}
Losses {'ner': 29.74948314303765}
Losses {'ner': 26.921938360785134}
Losses {'ner':

 ## Loading the Trained spaCy NER Model

In [6]:
from pathlib import Path

# Directory holding a previously saved pipeline.
# NOTE(review): this is an absolute path on one specific Windows machine; the
# cell only works where a model has already been saved at this location.
output_dir = Path(
    "C:\\Users\\mayank singh\\Downloads\\NIT Warangal _file\\NLP\\MidTermProject2\\Models"
)

# Reload the saved pipeline; the cells below use it as `nlp2`.
nlp2 = spacy.load(output_dir)

 ### Testing on Unseen Data

In [9]:
# Same wording as the training sentence "hauz khaas to airport at 6 PM",
# with only the time changed.
docx = nlp2('hauz khaas to airport at 9 AM')
print(
    "Entities",
    [(span.text, span.label_) for span in docx.ents],
)

Entities [('hauz khaas', 'LOCATION'), ('airport', 'LOCATION'), ('9 AM', 'TIME')]


In [7]:
# "airport" directly followed by "leaving" does not occur in TRAIN_DATA.
# NOTE(review): the recorded output tags 'airport leaving' as LOCATION, i.e.
# the predicted span runs one token past the place name.
docx = nlp2('I want to go to dwarka sector 23 from airport leaving at 7 PM')
print(
    "Entities",
    [(span.text, span.label_) for span in docx.ents],
)

Entities [('dwarka sector 23', 'LOCATION'), ('airport leaving', 'LOCATION'), ('7 PM', 'TIME')]


In [8]:
# Mirrors a training sentence with the two sectors swapped and a new time.
docx = nlp2('I want to go to dwarka sector 21 from dwarka sector 23 leaving at 11 PM')
print(
    "Entities",
    [(span.text, span.label_) for span in docx.ents],
)

Entities [('dwarka sector 21', 'LOCATION'), ('dwarka sector 23', 'LOCATION'), ('11 PM', 'TIME')]
