# Named Entity Recognition (NER)


In [2]:
# Perform standard imports
import spacy
# Load the 'en_core_web_sm' model once; the later cells all reuse this `nlp` object
nlp = spacy.load('en_core_web_sm')

In [3]:
# Write a function to display basic entity info:
def show_ents(doc):
    """Print one line per entity in ``doc.ents`` as ``text - label - explanation``.

    The explanation is whatever ``spacy.explain`` returns for the label,
    converted with ``str``. When ``doc.ents`` is empty a fallback message
    is printed instead.
    """
    if not doc.ents:
        print('No named entities found.')
        return
    for entity in doc.ents:
        description = str(spacy.explain(entity.label_))
        print(' - '.join((entity.text, entity.label_, description)))

In [7]:
# Run the model on a sample news sentence and list the entities it detects
doc = nlp(u'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the nokia mobile phone market and ordered the company to alter its practices')

show_ents(doc)

European - NORP - Nationalities or religious or political groups
Google - ORG - Companies, agencies, institutions, etc.
$5.1 billion - MONEY - Monetary values, including unit
Wednesday - DATE - Absolute or relative dates or periods


## Entity annotations
`Doc.ents` are token spans with their own set of annotations.
<table>
<tr><td>`ent.text`</td><td>The original entity text</td></tr>
<tr><td>`ent.label`</td><td>The entity type's hash value</td></tr>
<tr><td>`ent.label_`</td><td>The entity type's string description</td></tr>
<tr><td>`ent.start`</td><td>The token span's *start* index position in the Doc (token index, inclusive)</td></tr>
<tr><td>`ent.end`</td><td>The token span's *stop* index position in the Doc (token index, exclusive)</td></tr>
<tr><td>`ent.start_char`</td><td>The entity text's *start* character offset in the Doc's text (inclusive)</td></tr>
<tr><td>`ent.end_char`</td><td>The entity text's *stop* character offset in the Doc's text (exclusive)</td></tr>
</table>



In [5]:
# Parse a sentence containing a money amount and a company name
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')

# For each entity show: text, token bounds (start, end), character bounds
# (start_char, end_char) and the string label
for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


## NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [17]:
# Import here as well: this cell sits above the cell that imports `requests`,
# so a top-to-bottom run would otherwise fail with NameError.
import requests

# Fetch the raw HTML of the article; the timeout keeps a stalled connection
# from blocking the kernel indefinitely.
res = requests.get("https://en.wikipedia.org/wiki/Machine_learning", timeout=10)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en"><head>
<meta charset="utf-8"/>
<title>Machine learning - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"03d1e600-33c4-47b1-94e2-2d0947155ec7","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Machine_learning","wgTitle":"Machine learning","wgCurRevisionId":1081891971,"wgRevisionId":1081891971,"wgArticleId":233488,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 errors: missing periodical","CS1 maint: uses authors parameter","CS1 maint: url-status","Articles with short description","Short description i

In [13]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download ``url`` and return the page's text content as one string.

    The HTML is parsed with BeautifulSoup's ``html5lib`` parser. All
    ``<script>``, ``<style>`` and ``<aside>`` elements are removed before the
    text is extracted, and every run of newlines/tabs in the extracted text is
    replaced by a single space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to ``requests.get``.

    Returns:
        The extracted page text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection failures or timeouts.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of running NER over an HTTP error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose text is not part of the readable article.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [21]:
# Download the Wikipedia "Machine learning" article as plain text
# NOTE(review): the name `ny_bb` does not describe this content — consider renaming
ny_bb = url_to_string("https://en.wikipedia.org/wiki/Machine_learning")


In [25]:
# Parse the article text with the spaCy model.
# `ny_bb` is rebound from str to Doc here, so guard on the type: re-running
# this cell after it has already executed leaves the parsed Doc untouched
# instead of feeding a Doc back into nlp().
if isinstance(ny_bb, str):
    ny_bb = nlp(ny_bb)

In [26]:
# Number of entities detected in the parsed article
len(ny_bb.ents)

1338

In [28]:
# Print every detected entity with its label (one line per entity, so the output is long)
show_ents(ny_bb)

Wikipedia - ORG - Companies, agencies, institutions, etc.
Jump - PERSON - People, including fictional
Machine Learning - PERSON - People, including fictional
Data Cleaning AutoML Association - ORG - Companies, agencies, institutions, etc.
Unsupervised - ORG - Companies, agencies, institutions, etc.
Grammar - ORG - Companies, agencies, institutions, etc.
k-NN Linear - ORG - Companies, agencies, institutions, etc.
Logistic - LOC - Non-GPE locations, mountain ranges, bodies of water
Perceptron Relevance - ORG - Companies, agencies, institutions, etc.
EM - ORG - Companies, agencies, institutions, etc.
Graphical - ORG - Companies, agencies, institutions, etc.
Bayes - ORG - Companies, agencies, institutions, etc.
Conditional - ORG - Companies, agencies, institutions, etc.
Markov Anomaly - PERSON - People, including fictional
k-NN Local - ORG - Companies, agencies, institutions, etc.
GRU ESN Restricted Boltzmann - ORG - Companies, agencies, institutions, etc.
U-Net Transformer Vision Spiking 

Software - ORG - Companies, agencies, institutions, etc.
Software - ORG - Companies, agencies, institutions, etc.
Software - ORG - Companies, agencies, institutions, etc.
Software - ORG - Companies, agencies, institutions, etc.
Software - ORG - Companies, agencies, institutions, etc.
Control - ORG - Companies, agencies, institutions, etc.
Software - ORG - Companies, agencies, institutions, etc.
Software - ORG - Companies, agencies, institutions, etc.
Software - ORG - Companies, agencies, institutions, etc.
Software - ORG - Companies, agencies, institutions, etc.
Theory - ORG - Companies, agencies, institutions, etc.
Formal language Automata - WORK_OF_ART - Titles of books, songs, etc.
Computational - ORG - Companies, agencies, institutions, etc.
Computational - ORG - Companies, agencies, institutions, etc.
Mathematics - NORP - Nationalities or religious or political groups
Mathematical - ORG - Companies, agencies, institutions, etc.
Enterprise - ORG - Companies, agencies, institutions,

___
## Adding a Named Entity to a Span


In [35]:
# Parse a sentence whose first token ('Devd') will be tagged as an entity by hand in the following cells
doc = nlp(u'Devd to build a electric cars factory in INDIA $6 million')

show_ents(doc)

INDIA - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [36]:
from spacy.tokens import Span

# Get the hash value of the ORG entity label
# (looked up in the Doc's vocab string store; used as the `label` of the new Span)
ORG = doc.vocab.strings[u'ORG']  

In [37]:
# Display the integer value obtained for the 'ORG' label
ORG

383

In [38]:
# Create a Span for the new entity
# (start=0, end=1 selects only the first token of the Doc; label is the ORG value)
new_ent = Span(doc, 0, 1, label=ORG)

In [39]:
# Display the span that was just created
new_ent

Devd

In [40]:
# Add the entity to the existing Doc object
# (doc.ents is rebuilt as a list so the new span can be appended to the existing entities)
# NOTE(review): re-running this cell appends the same span a second time —
# presumably spaCy rejects duplicate/overlapping entities; confirm before re-executing
doc.ents = list(doc.ents) + [new_ent]

In [41]:
# List the entities again; the manually added span is now part of doc.ents
show_ents(doc)

Devd - ORG - Companies, agencies, institutions, etc.
INDIA - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


___
## Adding Named Entities to All Matching Spans


In [42]:
# Two-sentence text mentioning the product in both spellings
# (the adjacent string literals are concatenated into a single string)
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum-cleaner will be our first product.')

show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [43]:
# Import PhraseMatcher and create a matcher object:
# (it is built on the same vocab as the `nlp` object that produces the Docs)
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [44]:
# Create the desired phrase patterns.
# nlp.make_doc only tokenizes the text; the tagger/parser/NER stages that
# nlp(text) would run are not needed for the matcher created above, which
# compares token text, and running them makes spaCy warn about wasted work.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [45]:
# Apply the patterns to our matcher object:
# NOTE(review): this passes None positionally before the patterns, which looks like the
# older `add(key, on_match, *patterns)` call form — confirm it is accepted by the installed spaCy
matcher.add('newproduct', None, *phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

# See what matches occur:
# (each entry is a 3-tuple; the 2nd and 3rd items are used later as Span start/end)
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 17)]

In [46]:
# Turn every PhraseMatcher hit into a PRODUCT entity and register it on the Doc
from spacy.tokens import Span

PROD = doc.vocab.strings[u'PRODUCT']

# Each match is a 3-tuple; only its last two items (the span bounds) are needed
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]

doc.ents = [*doc.ents, *new_ents]

In [47]:
# List the entities again; the matched phrases now appear with the PRODUCT label
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


___
## Counting Entities


In [52]:
# Sentence with two monetary amounts, used for the counting example
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [53]:
# Count the entities labelled MONEY without building an intermediate list
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2

# Visualizing Named Entities

In [54]:
# Import the displaCy library
from spacy import displacy

In [59]:
# Render the parsed article inline with all of its entities highlighted
displacy.render(ny_bb, style='ent', jupyter=True)

In [61]:
# Restrict the visualisation to ORG and PRODUCT entities
options = {'ents': ['ORG', 'PRODUCT']}

displacy.render(ny_bb, style='ent', jupyter=True, options=options)

# Using NLTK

In [62]:
#NER

import nltk

# Fetch every resource the NLTK example in the next cell relies on, so it also
# runs on a machine where nothing has been downloaded yet:
#   punkt                       -> nltk.sent_tokenize / nltk.word_tokenize
#   averaged_perceptron_tagger  -> nltk.pos_tag
#   maxent_ne_chunker + words   -> nltk.ne_chunk
# NOTE(review): newer NLTK releases may expect different resource names
# (e.g. 'punkt_tab') — confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to C:\Users\Devdatta
[nltk_data]     Supnekar\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Devdatta
[nltk_data]     Supnekar\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [63]:
# Alternative sample sentences (uncomment one to try it):
#my_sent = "WASHINGTON -- In the wake of a string of abuses by New York police officers in the 1990s, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement."
my_sent= "In 1999, Vajpayee laid the foundation for the GoldenQuadrilateralHighway project, which would link four major cities: Delhi, Mumbai, Chennai and Kolkata."
#my_sent="“Indians is a football country now,” FIFA president Giani Infantino declared after arriving here to chair the FIFA Council meeting on Friday and attend the U-17 World Cup final."

# Sentence split -> tokenize -> POS-tag -> NE-chunk, then print each labelled chunk
for sent in nltk.sent_tokenize(my_sent):
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
    for chunk in nltk.ne_chunk(tagged_tokens):
        # Only chunks that expose a `label` attribute are reported; the rest are skipped
        if not hasattr(chunk, 'label'):
            continue
        words = [leaf[0] for leaf in chunk]
        print(chunk.label(), ' '.join(words))

PERSON Vajpayee
ORGANIZATION GoldenQuadrilateralHighway
PERSON Delhi
GPE Mumbai
PERSON Chennai
PERSON Kolkata
