### Updating the existing NER model

In [7]:
import pandas as pd
import numpy as np
import os
import re
import spacy
from spacy.lang.en import English
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt

In [8]:
# load spaCy model
nlp = spacy.load("en_core_web_lg")

In [681]:
# Getting the pipeline component
ner=nlp.get_pipe("ner")

In [702]:
# training data
TRAIN_DATA = [
            ("Activist Shareholders have launched more Campaigns.",{"entities":[(0,21,"ORG")]}),
            ("Appeals Court ruled for Microsoft.",{"entities":[(0,13,"ORG")]}),
            ("ARM Processor threatens Intel dominance.",{"entities":[(0,13,"PRODUCT")]}),
            ("Business Survey has warned of China protectionism.",{"entities":[(0,15,"PRODUCT")]}),
            ("Code update has crashed Bing search engine.",{"entities":[(0,4,"PRODUCT")]}),
            ("Consumer Reports has pulled its recommended rating.",{"entities":[(0,16,"ORG")]}),
            ("Dow has hit Low.",{"entities":[(0,3,"ORG")]}),
            ("Equity Markets rise by Technology stock gains.",{"entities":[(0,14,"ORG")]}),
            ("Forex Traders questions Apple tax row.",{"entities":[(0,13,"ORG")]}),
            ("Futures rise by investors earnings.",{"entities":[(0,7,"PRODUCT")]}),
            ("Lizard Squad has hacked Lenovo website.",{"entities":[(0,12,"ORG")]}),
            ("Social Media Firms have reduced online hate speech.",{"entities":[(7,18,"ORG")]}),
            ("Nasdaq falls by Technology stock loss.",{"entities":[(0,6,"PRODUCT")]}),
            ("Nikkei Index fell on profit taking.",{"entities":[(0,6,"PRODUCT")]}),
            ("Oil prices rises on OPEC deal.",{"entities":[(0,3,"PRODUCT")]}),
            ("Pounds sterling falls after UK election shock.",{"entities":[(0,6,"MONEY")]}),
            ("Security Experts find ransomware worm clues.",{"entities":[(0,16,"ORG")]}),
            ("Business Software Alliance has urged the U.S. Trade Representative.",{"entities":[(9,26,"ORG")]}),
            ("Tax Bill eats BarnesNoble profit.",{"entities":[(0,8,"PRODUCT")]}),
            ("Tax Reforms threatens Bond Market.",{"entities":[(0,11,"PRODUCT")]}),
            ("Tay bot crashed on racist tweets.",{"entities":[(0,3,"PRODUCT")]}),
            ("Tech Firms would tackle extremism.",{"entities":[(0,10,"ORG")]}),
            ("Wall Street rises by Technology stock gains.",{"entities":[(0,11,"ORG")]}),
            ("Windows upgrade has met criticism.",{"entities":[(0,7,"PRODUCT")]})
             ]

In [703]:
# Adding labels to the `ner`

for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [704]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [706]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

In [707]:
# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):

    # shuufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)

Losses {'ner': 14.266927182674408}
Losses {'ner': 23.875394094502553}
Losses {'ner': 33.928607855777955}
Losses {'ner': 47.70458920953752}
Losses {'ner': 55.095878325486865}
Losses {'ner': 65.77533700325557}
Losses {'ner': 5.159110188404156}
Losses {'ner': 14.296669818380906}
Losses {'ner': 21.32342330218671}
Losses {'ner': 27.941490478457126}
Losses {'ner': 39.898399042540404}
Losses {'ner': 44.528372298744216}
Losses {'ner': 3.697652095463127}
Losses {'ner': 9.611559730255976}
Losses {'ner': 13.920592521550134}
Losses {'ner': 19.108732437802246}
Losses {'ner': 27.399660565977683}
Losses {'ner': 39.15907766026794}
Losses {'ner': 9.930829423014075}
Losses {'ner': 15.00798585240409}
Losses {'ner': 21.80606380824611}
Losses {'ner': 23.381429198337173}
Losses {'ner': 33.44686221531538}
Losses {'ner': 35.211631144768944}
Losses {'ner': 10.010283339768648}
Losses {'ner': 20.69887478009332}
Losses {'ner': 29.16814612624512}
Losses {'ner': 35.22658518170283}
Losses {'ner': 39.963183988857054}

In [713]:
# Testing the model
doc = nlp("Wall Street rises after Brexit.")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Wall Street', 'ORG')]


In [732]:
# Save the  model to directory
output_dir = Path('/model/')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to \model


In [734]:
# Load the saved model and predict
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart" )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Loading from \model
Entities [('Fridge', 'PRODUCT')]
