<a href="https://colab.research.google.com/github/diem-ai/natural-language-processing/blob/master/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**NLTK**

In [0]:
import nltk
# Fetch the NLTK data packages this notebook relies on. A package that is
# already installed is reported as up-to-date instead of being downloaded again.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [0]:
# Example sentence analysed by both the NLTK and the spaCy sections below.
sent = "European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices"

In [0]:
def preprocess(sent):
  """
  Turn a raw sentence into part-of-speech tagged tokens.

  - split `sent` into word tokens with nltk.word_tokenize
  - label every token with nltk.pos_tag
  - return the resulting list of (word, pos) tuples
  """
  return nltk.pos_tag(nltk.word_tokenize(sent))

In [0]:
# Tokenize and POS-tag the example sentence.
processed_sent = preprocess(sent)

# Show the tagged tokens as the cell's output.
processed_sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

We get a list of tuples containing the individual words in the sentence and their associated part-of-speech.

***Chunking***

Our chunk pattern consists of one rule, that a noun phrase, NP, should be formed whenever the chunker finds an optional determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN.

In [0]:
# Single chunk rule: an NP is an optional determiner (DT), any number of
# adjectives (JJ), then one noun (NN).
pattern = 'NP: {<DT>?<JJ>*<NN>}'
# Build a regular-expression chunk parser from that rule.
cp = nltk.RegexpParser(pattern)
# Apply it to the POS-tagged tokens; matching runs are grouped under NP nodes.
cs = cp.parse(processed_sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [0]:
# Flatten the chunk tree into (word, pos, chunk-tag) triples, where the chunk
# tag is B-NP / I-NP for tokens inside an NP chunk and O for tokens outside.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [0]:
# Run NLTK's named-entity chunker over the POS-tagged tokens. Recognised
# entities appear as labelled subtrees (e.g. GPE, PERSON) in the printed tree.
ne_tree = nltk.ne_chunk(processed_sent)
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


***spaCy***

In [0]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from bs4 import BeautifulSoup as bs
import requests
import re


In [0]:
# Load the en_core_web_sm spaCy pipeline.
nlp = en_core_web_sm.load()

# Process the example sentence; `doc` holds the annotated tokens and entities.
doc = nlp(sent)

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [0]:
# One line per token: text, tag (e.g. JJ), part of speech (e.g. ADJ) and
# entity type (printed as an empty string for tokens outside any entity).
for token in doc:
  print(token.text, token.tag_, token.pos_, token.ent_type_)

European JJ ADJ NORP
authorities NNS NOUN 
fined VBD VERB 
Google NNP PROPN ORG
a DT DET 
record NN NOUN 
$ $ SYM MONEY
5.1 CD NUM MONEY
billion CD NUM MONEY
on IN ADP 
Wednesday NNP PROPN DATE
for IN ADP 
abusing VBG VERB 
its PRP$ DET 
power NN NOUN 
in IN ADP 
the DT DET 
mobile JJ ADJ 
phone NN NOUN 
market NN NOUN 
and CC CCONJ 
ordered VBD VERB 
the DT DET 
company NN NOUN 
to TO PART 
alter VB VERB 
its PRP$ DET 
practices NNS NOUN 


In [0]:
# Entity spans found in the example sentence, as (text, label) pairs.
print([(span.text, span.label_) for span in doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


European is NORP (nationalities or religious or political groups), Google is an organization, $5.1 billion is a monetary value and Wednesday is a date. They are all correct.

In [0]:
def get_reuters_news(url, timeout=10):
  """
  Download a Reuters archive page and return its story summaries as one string.

  - fetch `url` (giving up after `timeout` seconds) and parse the HTML
    with BeautifulSoup's 'html.parser'
  - for every <div class="story-content">, take the text of its first <p>
  - return the collected texts joined by single spaces ("" when none are found)

  Raises requests.HTTPError on a 4xx/5xx response and requests.Timeout when
  the server does not answer in time.
  """
  # Without a timeout, requests can wait indefinitely on an unresponsive server.
  response = requests.get(url, timeout=timeout)
  # Fail loudly on an error page instead of silently returning no headlines.
  response.raise_for_status()
  soup = bs(response.text, 'html.parser')

  headline_text = []
  for content in soup.find_all('div', class_='story-content'):
    paragraph = content.find('p')
    if paragraph is None:
      continue
    # get_text() always returns a str, whereas `.string` is None when the <p>
    # contains nested tags, which would make the join below raise TypeError.
    text = paragraph.get_text(" ", strip=True)
    if text:
      headline_text.append(text)

  # Join with a space so the last word of one summary does not fuse with the
  # first word of the next before the text is tokenized.
  return " ".join(headline_text)

# Archive listing to scrape: the query string asks for page 1 with 10 stories.
url = 'https://www.reuters.com/news/archive/mcbreakingviews?view=page&page=1&pageSize=10'

# Download the page and collapse its story summaries into a single string.
headline = get_reuters_news(url)

# Run the spaCy pipeline over the combined text; later cells inspect its entities.
article = nlp(headline)

In [0]:
# Label of every entity span in the article, in document order.
ent_labels = [span.label_ for span in article.ents]

# Distinct labels that occur at least once.
set(ent_labels)

{'CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON'}

In [0]:
# Number of entities per label, shown as a plain dict.
dict(Counter(ent_labels))

{'CARDINAL': 3,
 'DATE': 6,
 'EVENT': 1,
 'FAC': 1,
 'GPE': 10,
 'MONEY': 4,
 'NORP': 9,
 'ORDINAL': 1,
 'ORG': 14,
 'PERCENT': 1,
 'PERSON': 7}

In [0]:
# Check what type Counter(...) returns before the dict() conversion.
type(Counter(ent_labels))

collections.Counter

In [0]:
# Text of every entity labelled PERSON in the article.
ent_per = [span.text for span in article.ents if span.label_ == "PERSON"]

# Tally the names, most frequent first.
Counter(ent_per).most_common()

[('Steve Easterbrook', 1),
 ('Aramco’s IPO', 1),
 ('Crown Prince Mohammed bin Salman', 1),
 ('Orsted', 1),
 ('Boris Johnson', 1),
 ('Jeremy Corbyn', 1),
 ('Donald Trump', 1)]

In [0]:
# (text, label) pairs for the entities labelled CARDINAL.
print([(span.text, span.label_) for span in article.ents if span.label_ == "CARDINAL"])

[('two', 'CARDINAL'), ('zero', 'CARDINAL'), ('two', 'CARDINAL')]


In [0]:
# Text of every entity labelled ORG in the article.
ent_org = [span.text for span in article.ents if span.label_ == "ORG"]

# Occurrence count per organisation name.
Counter(ent_org)

Counter({'AstraZeneca': 1,
         'Big Pharma': 1,
         'Cartier and Van Cleef & Arpels': 1,
         'Congress': 1,
         'European Commission': 1,
         'Fiat Chrysler’s': 1,
         'Labour': 1,
         'McDonald’s': 1,
         'Peugeot': 1,
         'Riksbank': 1,
         'Saudi Aramco’s': 1,
         'Standard Chartered and Tencent': 1,
         'Tiffany & Co’s': 1,
         'Trump': 1,
         'Vestas Wind Systems': 1})

In [0]:
# Text of every entity labelled GPE in the article.
ent_gpe = [span.text for span in article.ents if span.label_ == "GPE"]

# Tally the places, most frequent first.
Counter(ent_gpe).most_common()

[('Beijing', 2),
 ('Hong Kong', 1),
 ('Denmark', 1),
 ('New York', 1),
 ('Britain', 1),
 ('UK', 1),
 ("the People's Republic", 1),
 ('the United States', 1),
 ('China', 1)]

In [0]:
# Text of every entity labelled NORP in the article.
ent_norp = [span.text for span in article.ents if span.label_ == "NORP"]

# Occurrence count per group name.
Counter(ent_norp)

Counter({'American': 1,
         'Chinese': 2,
         'European': 1,
         'Republican': 1,
         'Saudi Arabian': 1,
         'Shakespearean': 1,
         'Swedish': 1,
         'Swiss': 1})

In [0]:
# NOTE: the triple-quoted block below is disabled code, not a docstring. It is
# a bare string expression whose value is discarded, so running this cell does
# NOT create a variable named `options`.
'''
options = {"ents": ["PERSON", "ORG", "DATE", "GPE", "NORP"]
           , "colors": {"ORG": "yellow", "PERSON" : "green", "DATE" : "pink"} }
'''
# Render the article inline with its named entities highlighted.
displacy.render(article, jupyter=True, style='ent')

In [0]:
# Look up spaCy's human-readable description of the GPE label.
spacy.explain("GPE")

'Countries, cities, states'

In [0]:
# Draw the dependency parse of the article with displaCy's default settings.
# The call previously passed `options=options`, but no variable named
# `options` is assigned anywhere in this notebook, so the cell raised
# NameError on a fresh kernel (Restart & Run All).
displacy.render(article, jupyter=True, style='dep')

***Reference***

- https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

- https://spacy.io/api/top-level#displacy.render

- https://towardsdatascience.com/a-review-of-named-entity-recognition-ner-using-automatic-summarization-of-resumes-5248a75de175
