<a href="https://colab.research.google.com/github/elvinaqa/NER-Python/blob/main/Spacy_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Packages

In [3]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm 

In [5]:
import en_core_web_sm
nlp1 = en_core_web_sm.load()

In [6]:
# nlp1 = spacy.load('en_core_web_lg')

## Working of NER

In [19]:
docx1 = nlp1(u"Where is Apple?")

In [20]:
for token in docx1.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

Apple 9 14 ORG


In [21]:
docx2 = nlp1(u"Who is Tim Cook?")

In [22]:
for token in docx2.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

Tim Cook 7 15 PERSON


## Train Data

In [23]:
TRAIN_DATA = [
    ('Who is Tim Cook?', {
        'entities': [(7, 15, 'PERSON')]
    }),
     ('Who is Elvin Aghammad?', {
        'entities': [(7, 21, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]

## Define our variables

In [24]:
model = None
output_dir=Path("C:\\Users\\nithi\\Documents\\ner")
n_iter=100

## Load the model

In [25]:
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

Created blank 'en' model


## Set up the pipeline

In [26]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

## Train the Recognizer

In [27]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  
                [annotations],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 27.41it/s]
100%|██████████| 3/3 [00:00<00:00, 33.54it/s]
100%|██████████| 3/3 [00:00<00:00, 35.65it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 13.22383338212967}
{'ner': 13.243457615375519}
{'ner': 11.741348505020142}


100%|██████████| 3/3 [00:00<00:00, 35.16it/s]
100%|██████████| 3/3 [00:00<00:00, 36.36it/s]
100%|██████████| 3/3 [00:00<00:00, 36.66it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 10.410413801670074}
{'ner': 8.14183023571968}
{'ner': 7.373452980071306}


100%|██████████| 3/3 [00:00<00:00, 35.29it/s]
100%|██████████| 3/3 [00:00<00:00, 35.11it/s]
100%|██████████| 3/3 [00:00<00:00, 35.53it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.832124270964414}
{'ner': 7.013361018151045}
{'ner': 7.029564581927843}


100%|██████████| 3/3 [00:00<00:00, 34.16it/s]
100%|██████████| 3/3 [00:00<00:00, 36.10it/s]
100%|██████████| 3/3 [00:00<00:00, 33.86it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 6.290621590451337}
{'ner': 5.398109335321351}
{'ner': 6.387756956319208}


100%|██████████| 3/3 [00:00<00:00, 31.99it/s]
100%|██████████| 3/3 [00:00<00:00, 35.50it/s]
100%|██████████| 3/3 [00:00<00:00, 35.75it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.676897504832596}
{'ner': 5.694663970032707}
{'ner': 5.563373059849255}


100%|██████████| 3/3 [00:00<00:00, 32.99it/s]
100%|██████████| 3/3 [00:00<00:00, 36.29it/s]
100%|██████████| 3/3 [00:00<00:00, 36.32it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 7.822900665225461}
{'ner': 5.7569833705201745}
{'ner': 6.650798753835261}


100%|██████████| 3/3 [00:00<00:00, 35.07it/s]
100%|██████████| 3/3 [00:00<00:00, 36.56it/s]
100%|██████████| 3/3 [00:00<00:00, 34.33it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.471342464792542}
{'ner': 6.051193244406022}
{'ner': 4.1085591328155715}


100%|██████████| 3/3 [00:00<00:00, 34.73it/s]
100%|██████████| 3/3 [00:00<00:00, 35.09it/s]
100%|██████████| 3/3 [00:00<00:00, 35.66it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.6497093991292786}
{'ner': 2.765557823279323}
{'ner': 1.8129306382056711}


100%|██████████| 3/3 [00:00<00:00, 35.17it/s]
100%|██████████| 3/3 [00:00<00:00, 35.70it/s]
100%|██████████| 3/3 [00:00<00:00, 36.75it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.064614453267069}
{'ner': 4.972680451333872}
{'ner': 3.767554746091264}


100%|██████████| 3/3 [00:00<00:00, 35.30it/s]
100%|██████████| 3/3 [00:00<00:00, 36.69it/s]
100%|██████████| 3/3 [00:00<00:00, 36.54it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.4372659660416502}
{'ner': 0.47808768992399564}
{'ner': 3.1008967197257036}


100%|██████████| 3/3 [00:00<00:00, 34.80it/s]
100%|██████████| 3/3 [00:00<00:00, 34.82it/s]
100%|██████████| 3/3 [00:00<00:00, 35.86it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.7672298250523237}
{'ner': 0.32648245295512446}
{'ner': 0.10079726603566153}


100%|██████████| 3/3 [00:00<00:00, 35.61it/s]
100%|██████████| 3/3 [00:00<00:00, 34.99it/s]
100%|██████████| 3/3 [00:00<00:00, 34.49it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.09276199345330713}
{'ner': 0.0069278366984825215}
{'ner': 0.3727662818854379}


100%|██████████| 3/3 [00:00<00:00, 34.36it/s]
100%|██████████| 3/3 [00:00<00:00, 36.11it/s]
100%|██████████| 3/3 [00:00<00:00, 35.54it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.03208789693805732}
{'ner': 0.013212232449870074}
{'ner': 0.007757397898805441}


100%|██████████| 3/3 [00:00<00:00, 34.98it/s]
100%|██████████| 3/3 [00:00<00:00, 35.94it/s]
100%|██████████| 3/3 [00:00<00:00, 37.18it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.137446918900503e-05}
{'ner': 0.0003485751696065069}
{'ner': 0.007949100729844366}


100%|██████████| 3/3 [00:00<00:00, 36.04it/s]
100%|██████████| 3/3 [00:00<00:00, 37.67it/s]
100%|██████████| 3/3 [00:00<00:00, 36.70it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0008251407004387062}
{'ner': 0.06612408193759371}
{'ner': 8.950840984179218e-06}


100%|██████████| 3/3 [00:00<00:00, 31.24it/s]
100%|██████████| 3/3 [00:00<00:00, 35.54it/s]
100%|██████████| 3/3 [00:00<00:00, 35.74it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 9.183361913670641e-05}
{'ner': 0.00021619653076292026}
{'ner': 4.695131823447027e-06}


100%|██████████| 3/3 [00:00<00:00, 35.01it/s]
100%|██████████| 3/3 [00:00<00:00, 36.73it/s]
100%|██████████| 3/3 [00:00<00:00, 34.10it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.2045538468927345e-05}
{'ner': 2.3124817965732813e-05}
{'ner': 3.5946712189959194e-06}


100%|██████████| 3/3 [00:00<00:00, 36.67it/s]
100%|██████████| 3/3 [00:00<00:00, 36.95it/s]
100%|██████████| 3/3 [00:00<00:00, 36.65it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.128765298689484e-05}
{'ner': 0.00023167721107166436}
{'ner': 4.169207171929684e-06}


100%|██████████| 3/3 [00:00<00:00, 35.85it/s]
100%|██████████| 3/3 [00:00<00:00, 36.62it/s]
100%|██████████| 3/3 [00:00<00:00, 36.22it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.396959296741892e-05}
{'ner': 3.8951049167531066e-07}
{'ner': 1.4158111531110093e-05}


100%|██████████| 3/3 [00:00<00:00, 34.92it/s]
100%|██████████| 3/3 [00:00<00:00, 35.33it/s]
100%|██████████| 3/3 [00:00<00:00, 36.13it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0007148397448347101}
{'ner': 2.17523053880507e-07}
{'ner': 5.983276256687924e-06}


100%|██████████| 3/3 [00:00<00:00, 35.86it/s]
100%|██████████| 3/3 [00:00<00:00, 36.48it/s]
100%|██████████| 3/3 [00:00<00:00, 36.98it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.501351727145792e-07}
{'ner': 4.5704821581088584e-05}
{'ner': 3.4300925892040745e-06}


100%|██████████| 3/3 [00:00<00:00, 33.29it/s]
100%|██████████| 3/3 [00:00<00:00, 36.19it/s]
100%|██████████| 3/3 [00:00<00:00, 35.81it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.8607641454811188e-06}
{'ner': 1.2733078872951892e-06}
{'ner': 7.33987205504303e-08}


100%|██████████| 3/3 [00:00<00:00, 33.86it/s]
100%|██████████| 3/3 [00:00<00:00, 36.07it/s]
100%|██████████| 3/3 [00:00<00:00, 33.85it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.3317494185323342e-06}
{'ner': 1.7702680521644059e-09}
{'ner': 6.830329837548008e-06}


100%|██████████| 3/3 [00:00<00:00, 33.99it/s]
100%|██████████| 3/3 [00:00<00:00, 37.31it/s]
100%|██████████| 3/3 [00:00<00:00, 36.44it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 6.125746959139984e-07}
{'ner': 3.685018736351576e-05}
{'ner': 4.486946985589728e-06}


100%|██████████| 3/3 [00:00<00:00, 36.74it/s]
100%|██████████| 3/3 [00:00<00:00, 36.88it/s]
100%|██████████| 3/3 [00:00<00:00, 33.29it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.1320375635270752e-05}
{'ner': 7.147047069998807e-08}
{'ner': 7.072801606382014e-06}


100%|██████████| 3/3 [00:00<00:00, 35.87it/s]
100%|██████████| 3/3 [00:00<00:00, 36.53it/s]
100%|██████████| 3/3 [00:00<00:00, 37.26it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 6.015393317443614e-07}
{'ner': 1.0332871437098302e-08}
{'ner': 9.453372756598404e-07}


100%|██████████| 3/3 [00:00<00:00, 36.54it/s]
100%|██████████| 3/3 [00:00<00:00, 36.96it/s]
100%|██████████| 3/3 [00:00<00:00, 37.00it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.4725850230273883e-05}
{'ner': 0.0001848262947284852}
{'ner': 1.3865434413102389e-06}


100%|██████████| 3/3 [00:00<00:00, 33.51it/s]
100%|██████████| 3/3 [00:00<00:00, 35.77it/s]
100%|██████████| 3/3 [00:00<00:00, 36.63it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.856015709040853e-09}
{'ner': 1.0874855784208067e-08}
{'ner': 0.2706799345275852}


100%|██████████| 3/3 [00:00<00:00, 35.79it/s]
100%|██████████| 3/3 [00:00<00:00, 36.15it/s]
100%|██████████| 3/3 [00:00<00:00, 35.49it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.2081168578775941e-07}
{'ner': 0.0008447281707978089}
{'ner': 1.4036583139932091e-05}


100%|██████████| 3/3 [00:00<00:00, 35.65it/s]
100%|██████████| 3/3 [00:00<00:00, 35.12it/s]
100%|██████████| 3/3 [00:00<00:00, 34.42it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.142629647397201e-08}
{'ner': 0.00020446951519048192}
{'ner': 7.164342313966741e-05}


100%|██████████| 3/3 [00:00<00:00, 33.77it/s]
100%|██████████| 3/3 [00:00<00:00, 34.61it/s]
100%|██████████| 3/3 [00:00<00:00, 34.97it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.1761898833871147e-06}
{'ner': 9.905843742297513e-07}
{'ner': 1.0051792822346135e-08}


100%|██████████| 3/3 [00:00<00:00, 34.94it/s]
100%|██████████| 3/3 [00:00<00:00, 36.72it/s]
100%|██████████| 3/3 [00:00<00:00, 37.46it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.00021411247041190347}
{'ner': 3.549238299726795e-05}
{'ner': 9.969806526858837e-07}


100%|██████████| 3/3 [00:00<00:00, 34.61it/s]
100%|██████████| 3/3 [00:00<00:00, 34.89it/s]
100%|██████████| 3/3 [00:00<00:00, 36.17it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0019722550947805697}
{'ner': 1.0566166881849238e-07}
{'ner': 2.2525544896878526e-09}


100%|██████████| 3/3 [00:00<00:00, 35.64it/s]

{'ner': 1.2817092528369279e-09}





## Test the trained model

In [28]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('Tim Cook', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Tim', 'PERSON', 3), ('Cook', 'PERSON', 1), ('?', '', 2)]
Entities [('Elvin Aghammad', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Elvin', 'PERSON', 3), ('Aghammad', 'PERSON', 1), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]


## Save the model

In [29]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)        

Saved model to C:\Users\nithi\Documents\ner


## Test the saved model

In [30]:
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from C:\Users\nithi\Documents\ner
Entities [('Tim Cook', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Tim', 'PERSON', 3), ('Cook', 'PERSON', 1), ('?', '', 2)]
Entities [('Elvin Aghammad', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Elvin', 'PERSON', 3), ('Aghammad', 'PERSON', 1), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
