In [None]:
# Named-Entity Recognition (NER)
# ------------------------------
# NER seeks to locate and classify named-entity mentions in unstructured text into
# pre-defined categories such as person names, organizations, locations, medical
# codes, time expressions, quantities, monetary values, percentages, etc.

In [1]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [2]:
def show_ents(doc):
    """Print every named entity of ``doc`` as ``text: LABEL - explanation``.

    The explanation is whatever ``spacy.explain`` returns for the label,
    rendered with ``str``. When ``doc.ents`` is empty, a single
    ``No entities found`` line is printed instead.
    """
    entities = doc.ents
    if not entities:
        print('No entities found')
        return
    for entity in entities:
        description = str(spacy.explain(entity.label_))
        print(f"{entity.text}: {entity.label_} - {description}")
        
def get_entities(doc):
    """Return the entity collection stored on ``doc`` (its ``ents`` attribute)."""
    entities = doc.ents
    return entities

In [5]:
# Showing entities identified by spaCy.
# The sample sentence repeats "May" and "Washington" so the printed labels show
# how each mention is classified. `text` and `doc` are read again by the next cell.
text = "May i go to Washington, DC next May to see the Washington Document?"
doc = nlp(text)  # run the loaded pipeline over the raw string
show_ents(doc)  # print each entity with its label and explanation

Washington, DC: GPE - Countries, cities, states
next May: DATE - Absolute or relative dates or periods
the Washington Document: ORG - Companies, agencies, institutions, etc.


In [18]:
# Showing entity attributes

# - ent.text       -> original text of the entity
# - ent.label      -> integer ID (hash) of the entity label
# - ent.label_     -> entity label as a string (e.g. 'GPE')
# - ent.start      -> index of the first token of the span in the doc
# - ent.end        -> index one past the last token of the span (exclusive)
# - ent.start_char -> character offset where the span starts in the raw text
# - ent.end_char   -> character offset one past the span's last character (exclusive)
ents = get_entities(doc)

for ent in ents:
    print('_'*40)
    print('ent.text: ', ent.text)
    print('ent.label: ', ent.label)
    print('ent.label_:', ent.label_)
    print('ent.start: ', ent.start)
    print('ent.end: ', ent.end)
    print('ent.start_char: ', ent.start_char)
    print('ent.end_char: ', ent.end_char)

# Token offsets slice the Doc (giving a Span); character offsets slice the raw
# string. Use the last entity explicitly instead of relying on the loop variable
# leaking out of the `for`, which raises NameError when there are no entities.
if ents:
    last_ent = ents[-1]
    last_span = doc[last_ent.start:last_ent.end]
    print(last_span)
    print(type(last_span))
    print(text[last_ent.start_char:last_ent.end_char])

________________________________________
ent.text:  Washington, DC
ent.label:  382
ent.label_: GPE
ent.start:  4
ent.end:  7
ent.start_char:  12
ent.end_char:  26
________________________________________
ent.text:  next May
ent.label:  388
ent.label_: DATE
ent.start:  7
ent.end:  9
ent.start_char:  27
ent.end_char:  35
________________________________________
ent.text:  the Washington Document
ent.label:  381
ent.label_: ORG
ent.start:  11
ent.end:  14
ent.start_char:  43
ent.end_char:  66
the Washington Document
<class 'spacy.tokens.span.Span'>
the Washington Document


In [3]:
# Adding a named entity to a Doc
# First, an example sentence where the model misses an entity we care about.
text = "Humber, in future he will be located in Colombia"
doc = nlp(text)
show_ents(doc)
# "Humber" is not recognized by spaCy

(Colombia,)
<class 'tuple'>
True
Colombia: GPE - Countries, cities, states


In [29]:
from spacy.tokens import Span

def add_entity_to_vocab(doc, start, end, entity_tag):
    """Append a new entity span covering tokens ``start`` to ``end`` to ``doc.ents``.

    Despite the name, nothing is added to the vocabulary: the label is only
    looked up in ``doc.vocab.strings`` and ``doc.ents`` is modified in place.

    Args:
        doc: The document whose entity list is extended.
        start: Index of the first token of the new entity.
        end: Index one past the last token of the new entity (exclusive).
        entity_tag: Label name for the new entity, e.g. ``'GPE'``.
    """
    label_id = doc.vocab.strings[entity_tag]
    # Use the caller's token range; this was previously hard-coded to (0, 1).
    new_ent = Span(doc, start, end, label=label_id)
    doc.ents = list(doc.ents) + [new_ent]

# Tag the first token (index 0 up to, but not including, 1) as a GPE entity,
# then print the updated entity list.
add_entity_to_vocab(doc, start=0, end=1, entity_tag='GPE')
show_ents(doc)

Humber: GPE - Countries, cities, states
Colombia: GPE - Countries, cities, states


In [3]:
## Adding multiple terms to our NER
# ---------------------------------
# The sentence contains two spellings of the product name; `doc` is reused by
# the matcher cells that follow.
text = """Our companyy created a brand new vacuum cleaner. This new vacuum-cleaner is the best in show."""
doc = nlp(text)
show_ents(doc) # No entities found

No entities found


In [5]:
from spacy.matcher import PhraseMatcher

# Build a phrase matcher on the pipeline's vocabulary and register both
# spellings of the product under one rule name.
matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(phrase) for phrase in phrase_list]
# NOTE(review): this is the (key, on_match, *patterns) call form; newer spaCy
# releases expect matcher.add(key, patterns) -- confirm the installed version.
matcher.add('newproduct', None, *phrase_patterns)

# Each match is a (match_id, start_token, end_token) triple.
found_matches = matcher(doc)
print("Found Matches:")
print(found_matches)

Found Matches:
[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]


In [6]:
# Create spans
from spacy.tokens import Span
# Resolve the PRODUCT label through the doc's string store.
PROD = doc.vocab.strings["PRODUCT"]

# Wrap the token range of every match in a PRODUCT span and append the spans
# to the entities already on the doc.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in found_matches]
doc.ents = [*doc.ents, *new_ents]

# Show entities again:
# "vacuum cleaner" and "vacuum-cleaner" are now reported as named entities.
show_ents(doc)

vacuum cleaner: PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner: PRODUCT - Objects, vehicles, foods, etc. (not services)


In [7]:
# Visualizing Entities
# --------------------
# Render the entities of a new sample sentence with displacy, using the
# entity style (style='ent') and notebook output (jupyter=True).

from spacy import displacy

doc = nlp("McDonalds sold 300 hundred burguers in 1 hour")
displacy.render(doc, style='ent', jupyter=True)

In [14]:
# Highlighting some entities: pass an 'ents' list naming ORG and CARDINAL and
# a 'colors' mapping that assigns yellow to ORG.
colors = dict(ORG='yellow')
options = dict(ents=['ORG', 'CARDINAL'], colors=colors)
displacy.render(doc, style='ent', jupyter=True, options=options)