In [None]:
# Import Libraries
import numpy as np
import pandas as pd

# NLP
import spacy

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext tensorboard

In [None]:
# Load Initial NLP Model from SpaCy
nlp = spacy.load('en_core_web_sm') # Is there a BILUO formatted version?

# Load NER Pipeline
ner = nlp.get_pipe('ner')

In [None]:
# For Checking if Label Addition was Successful Later On
old_labels = list(ner.labels)

In [None]:
{geo: LOC}

In [None]:
# Kaggle Tags: 8
LOC | geo = Geographical Entity
ORG | org = Organization
PERSON | per = Person
GPE | gpe = Geopolitical Entity
TIME |tim = Time indicator
art = Artifact
EVENT | eve = Event
nat = Natural Phenomenon

In [None]:
# SpaCy Tags: 18 - Suggests might be a loss of fidelity by mapping them
old_labels

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [None]:
PERSON:      People, including fictional.
NORP:        Nationalities or religious or political groups.
FAC:         Buildings, airports, highways, bridges, etc.
ORG:         Companies, agencies, institutions, etc.
GPE:         Countries, cities, states.
LOC:         Non-GPE locations, mountain ranges, bodies of water.
PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
EVENT:       Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART: Titles of books, songs, etc.
LAW:         Named documents made into laws.
LANGUAGE:    Any named language.
DATE:        Absolute or relative dates or periods.
TIME:        Times smaller than a day.
PERCENT:     Percentage, including ”%“.
MONEY:       Monetary values, including unit.
QUANTITY:    Measurements, as of weight or distance.
ORDINAL:     “first”, “second”, etc.
CARDINAL:    Numerals that do not fall under another type.

In [None]:
ner_dataset[ner_dataset.Tag.str.contains('art', regex = False)][0:20]

Unnamed: 0,Sentence #,Word,POS,Tag,Sentence_No
263,,Nuclear,NNP,B-art,12.0
264,,Non-Proliferation,NNP,I-art,12.0
3769,,Saltillo,NNP,B-art,169.0
3810,,Pentastar,NNP,B-art,171.0
3811,,V-6,NNP,I-art,171.0
3814,,Chrysler,NNP,B-art,171.0
3816,,Dodge,NNP,B-art,171.0
3818,,Jeep,NNP,B-art,171.0
3820,,Ram,NNP,B-art,171.0
3863,,Vioxx,NNP,B-art,173.0


In [None]:
# Unzip + Load NER Data
!unzip ner_data.zip

Archive:  ner_data.zip
   creating: data/
  inflating: data/.DS_Store          
  inflating: __MACOSX/data/._.DS_Store  
   creating: data/external/
   creating: data/interim/
   creating: data/processed/
   creating: data/raw/
  inflating: data/external/.gitkeep  
  inflating: data/interim/.gitkeep   
  inflating: data/processed/.gitkeep  
  inflating: data/raw/.gitkeep       
  inflating: data/raw/ner.csv        
  inflating: data/raw/ner_dataset.csv  


In [None]:
# Set RAW DATA PATH
RAW_DATA_PATH = '/content/data/raw/'

In [None]:
# Load NER Data into DF
ner_dataset = pd.read_csv(RAW_DATA_PATH + 'ner_dataset.csv', encoding='ISO-8859-1', error_bad_lines=False)

In [None]:
# NER Dataset: ~1MM Rows
ner_dataset.shape

(1048575, 4)

In [None]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
# Load NER Lemmas Dataset
ner_lemmas = pd.read_csv(RAW_DATA_PATH + 'ner.csv', encoding='ISO-8859-1', error_bad_lines=False)

b'Skipping line 281837: expected 25 fields, saw 34\n'


In [None]:
ner_lemmas.shape

(1050795, 25)

### Testing Out-of-the-Box SpaCy NER Performance on Sample Data

In [None]:
article_text="""India that previously comprised only a handful of players in the e-commerce space, is now home to many biggies and giants battling out with each other to reach the top. This is thanks to the overwhelming internet and smartphone penetration coupled with the ever-increasing digital adoption across the country. These new-age innovations not only gave emerging startups a unique platform to deliver seamless shopping experiences but also provided brick and mortar stores with a level-playing field to begin their online journeys without leaving their offline legacies.
In the wake of so many players coming together on one platform, the Indian e-commerce market is envisioned to reach USD 84 billion in 2021 from USD 24 billion in 2017. Further, with the rate at which internet penetration is increasing, we can expect more and more international retailers coming to India in addition to a large pool of new startups. This, in turn, will provide a major Philip to the organized retail market and boost its share from 12% in 2017 to 22-25% by 2021. 
Here’s a view to the e-commerce giants that are dominating India’s online shopping space:
Amazon – One of the uncontested global leaders, Amazon started its journey as a simple online bookstore that gradually expanded its reach to provide a large suite of diversified products including media, furniture, food, and electronics, among others. And now with the launch of Amazon Prime and Amazon Music Limited, it has taken customer experience to a godly level, which will remain undefeatable for a very long time. 
Flipkart – Founded in 2007, Flipkart is recognized as the national leader in the Indian e-commerce market. Just like Amazon, it started operating by selling books and then entered other categories such as electronics, fashion, and lifestyle, mobile phones, etc. And now that it has been acquired by Walmart, one of the largest leading platforms of e-commerce in the US, it has also raised its bar of customer offerings in all aspects and giving huge competition to Amazon. 
Snapdeal – Started as a daily deals platform in 2010, Snapdeal became a full-fledged online marketplace in 2011 comprising more than 3 lac sellers across India. The platform offers over 30 million products across 800+ diverse categories from over 125,000 regional, national, and international brands and retailers. The Indian e-commerce firm follows a robust strategy to stay at the forefront of innovation and deliver seamless customer offerings to its wide customer base. It has shown great potential for recovery in recent years despite losing Freecharge and Unicommerce. 
ShopClues – Another renowned name in the Indian e-commerce industry, ShopClues was founded in July 2011. It’s a Gurugram based company having a current valuation of INR 1.1 billion and is backed by prominent names including Nexus Venture Partners, Tiger Global, and Helion Ventures as its major investors. Presently, the platform comprises more than 5 lac sellers selling products in nine different categories such as computers, cameras, mobiles, etc. 
Paytm Mall – To compete with the existing e-commerce giants, Paytm, an online payment system has also launched its online marketplace – Paytm Mall, which offers a wide array of products ranging from men and women fashion to groceries and cosmetics, electronics and home products, and many more. The unique thing about this platform is that it serves as a medium for third parties to sell their products directly through the widely-known app – Paytm. 
Reliance Retail – Given Reliance Jio’s disruptive venture in the Indian telecom space along with a solid market presence of Reliance, it is no wonder that Reliance will soon be foraying into retail space. As of now, it has plans to build an e-commerce space that will be established on online-to-offline market program and aim to bring local merchants on board to help them boost their sales and compete with the existing industry leaders. 
Big Basket – India’s biggest online supermarket, Big Basket provides a wide variety of imported and gourmet products through two types of delivery services – express delivery and slotted delivery. It also offers pre-cut fruits along with a long list of beverages including fresh juices, cold drinks, hot teas, etc. Moreover, it not only provides farm-fresh products but also ensures that the farmer gets better prices. 
Grofers – One of the leading e-commerce players in the grocery segment, Grofers started its operations in 2013 and has reached overwhelming heights in the last 5 years. Its wide range of products includes atta, milk, oil, daily need products, vegetables, dairy products, juices, beverages, among others. With its growing reach across India, it has become one of the favorite supermarkets for Indian consumers who want to shop grocery items from the comforts of their homes. 
Digital Mall of Asia – Going live in 2020, Digital Mall of Asia is a very unique concept coined by the founders of Yokeasia Malls. It is designed to provide an immersive digital space equipped with multiple visual and sensory elements to sellers and shoppers. It will also give retailers exclusive rights to sell a particular product category or brand in their respective cities. What makes it unique is its zero-commission model enabling retailers to pay only a fixed amount of monthly rental instead of paying commissions. With its one-of-a-kind features, DMA is expected to bring
never-seen transformation to the current e-commerce ecosystem while addressing all the existing e-commerce worries such as counterfeiting. """


In [None]:
# Perform Standard Out-of-the-Box NER on the text
doc = nlp(article_text)

In [None]:
# Example of Retrieving NER Tags from the Entities of the submitted text in the doc object
doc.ents[0].label_

'GPE'

In [None]:
for entity in doc.ents:
  print(entity.text, entity.label_)

India GPE
one CARDINAL
Indian NORP
USD 84 billion MONEY
2021 DATE
USD 24 billion MONEY
2017 DATE
India GPE
Philip PERSON
12% PERCENT
2017 DATE
22-25% PERCENT
2021 DATE
India GPE
Amazon ORG
One CARDINAL
Amazon ORG
Amazon ORG
Amazon Music Limited ORG
Flipkart PERSON
2007 DATE
Flipkart PERSON
Indian NORP
Amazon ORG
Walmart LOC
one CARDINAL
US GPE
Amazon ORG
daily DATE
2010 DATE
2011 DATE
more than 3 CARDINAL
India GPE
over 30 million CARDINAL
over 125,000 CARDINAL
Indian NORP
recent years DATE
Freecharge PERSON
Unicommerce GPE
ShopClues PERSON
Indian NORP
ShopClues ORG
July 2011 DATE
Gurugram ORG
INR ORG
1.1 billion CARDINAL
Nexus Venture Partners ORG
Helion Ventures ORG
more than 5 CARDINAL
nine CARDINAL
Paytm Mall PERSON
Paytm ORG
Paytm Mall FAC
third ORDINAL
Paytm GPE
Indian NORP
Reliance ORG
Reliance ORG
India GPE
Big Basket ORG
two CARDINAL
One CARDINAL
2013 DATE
the last 5 years DATE
daily DATE
India GPE
Indian NORP
Digital Mall FAC
Asia LOC
2020 DATE
Digital Mall ORG
Asia LOC
Yokea

spaCy accepts training data as list of tuples.

Each tuple should contain the text and a dictionary. The dictionary should hold the start and end indices of the named enity in the text, and the category or label of the named entity.

For example, ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]})

In [None]:
ner_dataset[ner_dataset.Tag.str.contains('art', regex = False)][0:20]

Unnamed: 0,Sentence #,Word,POS,Tag,Sentence_No
263,,Nuclear,NNP,B-art,12.0
264,,Non-Proliferation,NNP,I-art,12.0
3769,,Saltillo,NNP,B-art,169.0
3810,,Pentastar,NNP,B-art,171.0
3811,,V-6,NNP,I-art,171.0
3814,,Chrysler,NNP,B-art,171.0
3816,,Dodge,NNP,B-art,171.0
3818,,Jeep,NNP,B-art,171.0
3820,,Ram,NNP,B-art,171.0
3863,,Vioxx,NNP,B-art,173.0


In [None]:
BILUO

In [None]:
ner_dataset.Tag.unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [None]:
# Extract the Integer representing the Sentence Number
ner_dataset['Sentence_No'] = ner_dataset['Sentence #'].str.split().dropna().apply(lambda row: int(row[-1]))
ner_dataset['Sentence_No'] = ner_dataset['Sentence_No'].ffill()


In [None]:
# Group Sentences Together
sentences = ner_dataset.groupby('Sentence_No')['Word'].apply(lambda grp: ' '.join(grp).strip())

In [None]:
sentences

Sentence_No
1.0        Thousands of demonstrators have marched throug...
2.0        Families of soldiers killed in the conflict jo...
3.0        They marched from the Houses of Parliament to ...
4.0        Police put the number of marchers at 10,000 wh...
5.0        The protest comes on the eve of the annual con...
                                 ...                        
47955.0    Indian border security forces are accusing the...
47956.0    Indian officials said no one was injured in Sa...
47957.0    Two more landed in fields belonging to a nearb...
47958.0    They say not all of the rockets exploded upon ...
47959.0      Indian forces said they responded to the attack
Name: Word, Length: 47959, dtype: object

In [None]:
# Wrangle Entities
tags = ner_dataset.groupby('Sentence_No')['Tag'].apply(lambda grp: ' '.join(grp).strip().split())
tags = tags.apply(lambda val: [word.upper() for word in val])

In [None]:
# Wrangle into SpaCy Training Format
TRAIN_DATA = list(zip(sentences, tags.apply(lambda val: {'entities': val})))

In [None]:
# Get the original input text
# TRAIN_DATA[0][0]

In [None]:
# Get the List of Entity Tags in BILOU Format
# TRAIN_DATA[0][1]

In [None]:
tags

Sentence_No
1.0        [O, O, O, O, O, O, B-GEO, O, O, O, O, O, B-GEO...
2.0        [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
3.0        [O, O, O, O, O, O, O, O, O, O, O, B-GEO, I-GEO...
4.0            [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
5.0        [O, O, O, O, O, O, O, O, O, O, O, B-GEO, O, O,...
                                 ...                        
47955.0    [B-GPE, O, O, O, O, O, O, B-GPE, O, O, O, O, O...
47956.0    [B-GPE, O, O, O, O, O, O, O, B-TIM, O, O, O, O...
47957.0                    [O, O, O, O, O, O, O, O, O, O, O]
47958.0                    [O, O, O, O, O, O, O, O, O, O, O]
47959.0                         [B-GPE, O, O, O, O, O, O, O]
Name: Tag, Length: 47959, dtype: object

### Try Forcing IOB Tags with Dictionary to BILUO to then map to SpaCy Format

In [None]:
from spacy.training import biluo_tags_to_offsets

doc = nlp("I like London.")
tags = ["O", "O", "U-LOC", "O"]
entities = biluo_tags_to_offsets(doc, tags)
assert entities == [(7, 13, "LOC")]

AttributeError: ignored

### Add New Labels to the NER Model

In [None]:
def add_new_training_labels_to_existing_model(ner_model, train_data_json):
  
  # The dreaded double FOR LOOP!
  for _, annotations in train_data_json:
    for entity in annotations.get('entities'):
     ner.add_label(entity)


  return ner

In [None]:
ner = add_new_training_labels_to_existing_model(ner, TRAIN_DATA)

In [None]:
[label for label in ner.labels if label not in old_labels]

['B', 'I', 'O']

### Disable Other Pipelines

In [None]:
# Import requirements
import random
from spacy.util import minibatch, compounding

# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [None]:
# unaffected_pipes

In [None]:
for batch in batches:
                texts, _ = zip(*batch)
                golds = [GoldParse(nlp.make_doc(t), entities = a) for t, a in batch]
                
                nlp.update(
                    texts,  # batch of texts
                    golds,  # batch of annotations
                    drop=0.4,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer
                )
            print(losses)
    return nlp

In [None]:
# Train the Model
with nlp.disable_pipes(*unaffected_pipes):

  # Train for 30 iterations:
  for iteration in range(30):

    # Shuffle Examples before every Iteration
    random.shuffle(TRAIN_DATA)
    losses = {}

    # Create Minibatches
    batches = minibatch(TRAIN_DATA, size = compounding(4.0, 32.0, 1.001))

    # Iterate through each batch:
    for batch in batches:
      # Confirm batch properties
      print(batch[0])

      # Confirm batch
      print(type(batch[0]))
      
      # Extract Text
      texts, _ = zip(*batch)
      
      # Confirm texts
      print(texts)

      # Confirm texts
      print(type(texts)), print(len(texts))
      

      # Create a List of GoldParse Objects to Pass Through
      golds = [GoldParse(nlp.make_doc(t), entities = a['entities']) for t, a in batch]
      
      # Training Happens Here
      nlp.update(
                texts, #X
                golds, #Y
                drop = 0.5, 
                losses = losses,)
      
      print("Losses", losses)

('The main opposition , Movement for Democratic Change said it had no confidence in the commission .', {'entities': ['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']})
<class 'tuple'>
('The main opposition , Movement for Democratic Change said it had no confidence in the commission .', 'At least one strong aftershock with a 5.2 magnitude struck about 30 minutes later .', 'In Tanzania , officials say at least 10 swimmers died when they were swept out to sea Sunday near the beach at Dar es Salaam .', "On Iraq , the president said he expects the country 's new constitution will be completed by August 15 .")
<class 'tuple'>
4


KeyError: ignored

### See `Setting Entity Annotations` at: https://spacy.io/usage/linguistic-features

### If still getting the `List Index Out of Range` Error: try writing a script that skips training examples if the length of the entities doesn't match the text length, after removing whitespace

In [None]:
ner.labels

('B',
 'CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'I',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'O',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [None]:
B-GEO I_GEO

In [None]:
# Setup a Test Batch to play with
batch = [('Iran has rejected U.S. and European economic incentives offered in exchange for abandoning its nuclear enrichment activities , saying it will not bend to external pressure .', {'entities': ['B-GEO', 'O', 'O', 'B-GEO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}), ('The International Committee of the Red Cross is calling on Israel to allow Palestinians from the Gaza Strip to resume visits with relatives held in Israeli jails .', {'entities': ['B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'B-GEO', 'O', 'O', 'B-GPE', 'O', 'O', 'B-GEO', 'I-GEO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O']}), ('Authorities said the governor of eastern Lagham province was unhurt in the attack Saturday .', {'entities': ['O', 'O', 'O', 'O', 'O', 'O', 'B-GEO', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TIM', 'O']}), ('India and Sri Lanka say peace talks with Tamil rebels should resume soon to prevent the island nation from plunging back into civil war .', {'entities': ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']})]


In [None]:
texts, _ = zip(*batch)

In [None]:
texts

('Iran has rejected U.S. and European economic incentives offered in exchange for abandoning its nuclear enrichment activities , saying it will not bend to external pressure .',
 'The International Committee of the Red Cross is calling on Israel to allow Palestinians from the Gaza Strip to resume visits with relatives held in Israeli jails .',
 'Authorities said the governor of eastern Lagham province was unhurt in the attack Saturday .',
 'India and Sri Lanka say peace talks with Tamil rebels should resume soon to prevent the island nation from plunging back into civil war .')

In [None]:
# Get Example of Unpack
# [(t, a) for t, a in batch][0]

In [None]:
type(batch[0])

tuple

In [None]:
golds = [GoldParse(nlp.make_doc(t), entities = a['entities']) for t, a in batch]

**StackOverflow**: I think i got the issue, some of the text inside train_data is having space, as a result the number of tokens and annotations are not matching.

In [None]:
golds = [GoldParse(nlp.make_doc(t), entities = a['entities']) for t,a in batch]

IndexError: ignored

In [None]:
texts, _ = zip(*batch)
golds = [GoldParse(nlp.make_doc(t), entities = a['entities']) for t, a in batch]


nlp.update(
    texts,  # batch of texts
    golds,  # batch of annotations
    drop=0.4,  # dropout - make it harder to memorise data
    losses=losses,
    sgd=optimizer
)
print(losses)

In [None]:
from spacy.gold import GoldParse  #<--- add this

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):
    print(f'Starting Iteration: {iteration}')
    
    # shuffling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))

    for batch in batches:
        print('One Batch: ', batch) # A list containing a tuple of each (text, annotation) example pair
        print(len(batch))

        texts, _ = zip(*batch)
   

        nlp.update(
                    [text],  # batch of texts
                    [gold],  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses)
        
        print("Losses", losses)
  
  print('Finished Training.')

Starting Iteration: 0
One Batch:  [('Iran has rejected U.S. and European economic incentives offered in exchange for abandoning its nuclear enrichment activities , saying it will not bend to external pressure .', {'entities': ['B-GEO', 'O', 'O', 'B-GEO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}), ('The International Committee of the Red Cross is calling on Israel to allow Palestinians from the Gaza Strip to resume visits with relatives held in Israeli jails .', {'entities': ['B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'B-GEO', 'O', 'O', 'B-GPE', 'O', 'O', 'B-GEO', 'I-GEO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O']}), ('Authorities said the governor of eastern Lagham province was unhurt in the attack Saturday .', {'entities': ['O', 'O', 'O', 'O', 'O', 'O', 'B-GEO', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TIM', 'O']}), ('India and Sri Lanka say peace talks with Tamil rebels should re

NameError: ignored

In [None]:
def train_spacy():

    TRAIN_DATA = convert_dataturks_to_spacy("C:\\Users\\akjain\\Downloads\\Entity-Recognition-In-Resumes-SpaCy-master\\traindata.json")
    TRAIN_DATA = trim_entity_spans(TRAIN_DATA)
    nlp = spacy.blank('en')  # create blank Language class
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)


    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
            print(losses)

In [None]:
from spacy import displacy
displacy.render(doc, style="ent") # if from notebook else displacy.serve(doc, style="ent") generally

# Appendix
---

In [None]:
# Wrangle Words with Entities
entities = ner_dataset[ner_dataset['Tag'] != 'O'].groupby('Sentence_No')['Word'].apply(lambda grp: ' '.join(grp).strip().split())

In [None]:
entities

Sentence_No
1.0                                 [London, Iraq, British]
2.0                                                  [Bush]
3.0                                            [Hyde, Park]
5.0              [Britain, Labor, Party, English, Brighton]
6.0                                [Britain, Iraq, British]
                                 ...                       
47953.0    [Afghan, Taleban, Bermel, Paktika, NATO, Afghan]
47954.0                      [Taleban, 2001, Afghan, Kabul]
47955.0                         [Indian, Pakistani, Punjab]
47956.0                                  [Indian, Saturday]
47959.0                                            [Indian]
Name: Word, Length: 40917, dtype: object

In [None]:
tags

Sentence_No
1.0        [O, O, O, O, O, O, B-GEO, O, O, O, O, O, B-GEO...
2.0        [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
3.0        [O, O, O, O, O, O, O, O, O, O, O, B-GEO, I-GEO...
4.0            [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
5.0        [O, O, O, O, O, O, O, O, O, O, O, B-GEO, O, O,...
                                 ...                        
47955.0    [B-GPE, O, O, O, O, O, O, B-GPE, O, O, O, O, O...
47956.0    [B-GPE, O, O, O, O, O, O, O, B-TIM, O, O, O, O...
47957.0                    [O, O, O, O, O, O, O, O, O, O, O]
47958.0                    [O, O, O, O, O, O, O, O, O, O, O]
47959.0                         [B-GPE, O, O, O, O, O, O, O]
Name: Tag, Length: 47959, dtype: object

In [None]:
# Get Word Position of Tagged Entity
# tags_by_sentence = 
tags.str.findall('(\w+-\w{3})')

Sentence_No
1.0       NaN
2.0       NaN
3.0       NaN
4.0       NaN
5.0       NaN
           ..
47955.0   NaN
47956.0   NaN
47957.0   NaN
47958.0   NaN
47959.0   NaN
Name: Tag, Length: 47959, dtype: float64

In [None]:
tags_by_sentence

Sentence_No
1.0       NaN
2.0       NaN
3.0       NaN
4.0       NaN
5.0       NaN
           ..
47955.0   NaN
47956.0   NaN
47957.0   NaN
47958.0   NaN
47959.0   NaN
Name: Tag, Length: 47959, dtype: float64

In [None]:
# Merge Words and tags Together
df = pd.concat([entities, tags_by_sentence], axis = 1)

# Concat Word Lists with Tag Lists
df['raw_words_tags'] = list(zip(df.Word, df.Tag))

# Clean Up Words_w_tags
df['Words_w_Tags'] = df['raw_words_tags'].apply(lambda val: [[i, j] for i, j in zip(val[0], val[1])] if type(val[0]) == list else np.nan)

# Drop raw_words_tags
df = df.drop(columns = 'raw_words_tags')

TypeError: ignored