<a href="https://colab.research.google.com/github/erieglakes/erieglakes.github.io/blob/codes/NER_in_SpaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition using spaCy

In [None]:
# Standard library
from collections import Counter
from pprint import pprint

# Third-party: spaCy, its visualiser, and the small English model package
import spacy
from spacy import displacy
import en_core_web_sm

# Shared pipeline object used by every cell below.
nlp = en_core_web_sm.load()


In [None]:
# Run the pipeline on one sample news sentence, then list every entity
# span it found together with its label.
doc = nlp(
    'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
)
pprint([
    (ent.text, ent.label_)
    for ent in doc.ents
])

[('European', 'NORP'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [None]:
# Token-level view of the same sentence: each token with its IOB tag and
# entity type (the type is an empty string for tokens outside any entity).
pprint([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc
])


## Extracting Named Entities from an Article

In [None]:
# Candidate news articles (The Hindu, national section) to run NER on.
link1, link2, link3 = (
    'https://www.thehindu.com/news/national/centre-may-raise-loan-to-pay-shortfall-of-gst-compensation-amount/article31329841.ece?homepage=true',
    'https://www.thehindu.com/news/national/several-union-ministers-officials-return-to-work-at-ministries/article31329079.ece?homepage=true',
    'https://www.thehindu.com/news/national/plea-to-bring-back-to-punjab-stranded-sikh-pilgrims/article31329103.ece?homepage=true',
)

In [None]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as one string.

    The page is fetched with ``requests``, parsed with BeautifulSoup using
    the ``'html5lib'`` parser, stripped of ``<script>``, ``<style>`` and
    ``<aside>`` elements, and the remaining text is returned with every run
    of newlines/tabs collapsed into a single space.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up
            (passed straight to ``requests.get``). Defaults to 10.

    Returns:
        The page text as a single string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # Without a timeout a stalled server would block the notebook forever.
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of running NER on the text of an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose text is not part of the article body.
    for tag in soup(["script", "style", 'aside']):
        tag.decompose()

    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Download the second article and reduce it to plain text.
art = url_to_string(link2)
# Run the full spaCy pipeline over the article text.
article = nlp(art)
# Number of entity spans found in the whole article.
len(article.ents)

216

In [None]:
# Count how many entities of each label type the article contains.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'CARDINAL': 41,
         'DATE': 31,
         'GPE': 21,
         'LOC': 2,
         'NORP': 20,
         'ORDINAL': 4,
         'ORG': 53,
         'PERCENT': 2,
         'PERSON': 36,
         'PRODUCT': 1,
         'QUANTITY': 1,
         'TIME': 2,
         'WORK_OF_ART': 2})

In [None]:
# The three entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Hindu', 7), ('Coronavirus', 7), ('Indian', 6)]

In [None]:
# Materialise the article's sentence spans so they can be indexed.
sentences = list(article.sents)
# Index of the sentence inspected in the cells below; any valid index works.
sent_num = 52
print(sentences[sent_num])

Minority Affairs Minister Mukhtar Abbas Naqvi, Minister of State for Youth Affairs Kiren Rijiju, Culture and Tourism Minister Prahlad Patel were some of the early office-goers on Monday, along with other senior officials.


In [None]:
# Re-parse the chosen sentence on its own and highlight its entities inline.
displacy.render(
    nlp(str(sentences[sent_num])),
    style='ent',
    jupyter=True,
)


In [None]:
# Draw the chosen sentence together with its dependency arcs.
displacy.render(
    nlp(str(sentences[sent_num])),
    jupyter=True,
    style='dep',
    options={'distance': 120},
)

In [None]:
# Surface form, part-of-speech tag and lemma of every token in the chosen
# sentence, skipping stop words and punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[sent_num]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('Minority', 'PROPN', 'Minority'),
 ('Affairs', 'PROPN', 'Affairs'),
 ('Minister', 'PROPN', 'Minister'),
 ('Mukhtar', 'PROPN', 'Mukhtar'),
 ('Abbas', 'PROPN', 'Abbas'),
 ('Naqvi', 'PROPN', 'Naqvi'),
 ('Minister', 'PROPN', 'Minister'),
 ('State', 'PROPN', 'State'),
 ('Youth', 'PROPN', 'Youth'),
 ('Affairs', 'PROPN', 'Affairs'),
 ('Kiren', 'PROPN', 'Kiren'),
 ('Rijiju', 'PROPN', 'Rijiju'),
 ('Culture', 'PROPN', 'Culture'),
 ('Tourism', 'PROPN', 'Tourism'),
 ('Minister', 'PROPN', 'Minister'),
 ('Prahlad', 'PROPN', 'Prahlad'),
 ('Patel', 'PROPN', 'Patel'),
 ('early', 'ADJ', 'early'),
 ('office', 'NOUN', 'office'),
 ('goers', 'NOUN', 'goer'),
 ('Monday', 'PROPN', 'Monday'),
 ('senior', 'ADJ', 'senior'),
 ('officials', 'NOUN', 'official')]

In [None]:
# Map each entity string in the chosen sentence to its label.
{
    str(ent): ent.label_
    for ent in nlp(str(sentences[sent_num])).ents
}


{'Kiren Rijiju': 'PERSON',
 'Monday': 'DATE',
 'Mukhtar Abbas Naqvi': 'PERSON',
 'Prahlad Patel': 'PERSON'}

In [None]:
# Per-token IOB tag and entity type for the chosen sentence, as tagged
# when the whole article was processed.
print([
    (token, token.ent_iob_, token.ent_type_)
    for token in sentences[sent_num]
])

[(Minority, 'O', ''), (Affairs, 'O', ''), (Minister, 'O', ''), (Mukhtar, 'B', 'PERSON'), (Abbas, 'I', 'PERSON'), (Naqvi, 'I', 'PERSON'), (,, 'O', ''), (Minister, 'O', ''), (of, 'O', ''), (State, 'O', ''), (for, 'O', ''), (Youth, 'O', ''), (Affairs, 'O', ''), (Kiren, 'B', 'PERSON'), (Rijiju, 'I', 'PERSON'), (,, 'O', ''), (Culture, 'O', ''), (and, 'O', ''), (Tourism, 'O', ''), (Minister, 'O', ''), (Prahlad, 'B', 'PERSON'), (Patel, 'I', 'PERSON'), (were, 'O', ''), (some, 'O', ''), (of, 'O', ''), (the, 'O', ''), (early, 'O', ''), (office, 'O', ''), (-, 'O', ''), (goers, 'O', ''), (on, 'O', ''), (Monday, 'B', 'DATE'), (,, 'O', ''), (along, 'O', ''), (with, 'O', ''), (other, 'O', ''), (senior, 'O', ''), (officials, 'O', ''), (., 'O', '')]


In [None]:
# Highlight every recognised entity across the full article text.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)
