# NLTK

In [84]:
import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Information Extraction
I took a sentence from https://futurism.com/coronavirus-app-mit-safe-paths: “The app, which is available for free, and was developed by a team of 43 tech workers and academics in their spare time, is called Private Kit: Safe Paths and the beta can be downloaded now for iOS and Android.” Note that the example actually analyzed in the cells below is a different passage, about MIT Media Lab professor Ramesh Raskar.

In [145]:
# Example passage analyzed throughout the notebook. The adjacent string
# literals are concatenated by Python into one string.
example = (
    'MIT Media Lab professor Ramesh Raskar has been rallying other researchers '
    'and tech executives to the effort, and he has been in contact with the '
    'World Health Organization WHO, the US Centers for Disease Control and '
    'Prevention, and the US Department of Health and Human Services. '
    '“They are giving us guidance on what will work,” he says, although none '
    'has yet endorsed the idea.'
)

Then we apply word tokenization and part-of-speech tagging to the sentence.

In [146]:
def preprocess(sent):
    """Tokenize a text and attach a part-of-speech tag to every token.

    Args:
        sent: The raw text to process.

    Returns:
        Whatever ``nltk.pos_tag`` returns for the tokens produced by
        ``nltk.word_tokenize`` (a list of ``(token, tag)`` pairs in the
        output shown below).
    """
    return nltk.pos_tag(nltk.word_tokenize(sent))

In [147]:
# Tokenize and POS-tag the example text; the bare name on the last line
# makes the notebook display the result.
sent = preprocess(example)
sent

[('MIT', 'NNP'),
 ('Media', 'NNP'),
 ('Lab', 'NNP'),
 ('professor', 'NN'),
 ('Ramesh', 'NNP'),
 ('Raskar', 'NNP'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('rallying', 'VBG'),
 ('other', 'JJ'),
 ('researchers', 'NNS'),
 ('and', 'CC'),
 ('tech', 'JJ'),
 ('executives', 'NNS'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('effort', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('he', 'PRP'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('in', 'IN'),
 ('contact', 'NN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('World', 'NNP'),
 ('Health', 'NNP'),
 ('Organization', 'NNP'),
 ('WHO', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('US', 'NNP'),
 ('Centers', 'NNPS'),
 ('for', 'IN'),
 ('Disease', 'NNP'),
 ('Control', 'NNP'),
 ('and', 'CC'),
 ('Prevention', 'NNP'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('US', 'NNP'),
 ('Department', 'NNP'),
 ('of', 'IN'),
 ('Health', 'NNP'),
 ('and', 'CC'),
 ('Human', 'NNP'),
 ('Services', 'NNPS'),
 ('.', '.'),
 ('“', 'IN'),
 ('They', 'PRP'),
 ('are', 'VBP'),
 ('giving', 'VBG'),
 ('us', 'PRP'),
 ('guid

We get a list of tuples containing the individual words in the sentence and their associated part-of-speech.

Now we’ll implement noun phrase chunking to identify named entities using a regular expression consisting of rules that indicate how sentences should be chunked.

Our chunk pattern consists of one rule, that a noun phrase, NP, should be formed whenever the chunker finds an optional determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN.

In [148]:
# Chunk grammar with a single rule: an NP is an optional determiner (<DT>?),
# any number of adjectives (<JJ>*), then one <NN> token. Only the NN tag is
# matched, so tokens tagged NNS or NNP are not chunked by this rule.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

# Chunking

Using this pattern, we create a chunk parser and test it on our sentence.

In [149]:
# Build a regular-expression chunk parser from the grammar and apply it to
# the POS-tagged tokens.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
# Print the parse result in its text form.
print(cs)

(S
  MIT/NNP
  Media/NNP
  Lab/NNP
  (NP professor/NN)
  Ramesh/NNP
  Raskar/NNP
  has/VBZ
  been/VBN
  rallying/VBG
  other/JJ
  researchers/NNS
  and/CC
  tech/JJ
  executives/NNS
  to/TO
  (NP the/DT effort/NN)
  ,/,
  and/CC
  he/PRP
  has/VBZ
  been/VBN
  in/IN
  (NP contact/NN)
  with/IN
  the/DT
  World/NNP
  Health/NNP
  Organization/NNP
  WHO/NNP
  ,/,
  the/DT
  US/NNP
  Centers/NNPS
  for/IN
  Disease/NNP
  Control/NNP
  and/CC
  Prevention/NNP
  ,/,
  and/CC
  the/DT
  US/NNP
  Department/NNP
  of/IN
  Health/NNP
  and/CC
  Human/NNP
  Services/NNPS
  ./.
  “/IN
  They/PRP
  are/VBP
  giving/VBG
  us/PRP
  (NP guidance/NN)
  on/IN
  what/WP
  will/MD
  work/VB
  ,/,
  ”/VB
  he/PRP
  says/VBZ
  ,/,
  although/IN
  (NP none/NN)
  has/VBZ
  yet/RB
  endorsed/VBN
  (NP the/DT idea/NN)
  ./.)


The output can be read as a tree or a hierarchy with S as the first level, denoting sentence. 

IOB tags have become the standard way to represent chunk structures in files, and we will also be using this format.

In [150]:
from nltk.chunk import conlltags2tree, tree2conlltags, ne_chunk
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('MIT', 'NNP', 'O'),
 ('Media', 'NNP', 'O'),
 ('Lab', 'NNP', 'O'),
 ('professor', 'NN', 'B-NP'),
 ('Ramesh', 'NNP', 'O'),
 ('Raskar', 'NNP', 'O'),
 ('has', 'VBZ', 'O'),
 ('been', 'VBN', 'O'),
 ('rallying', 'VBG', 'O'),
 ('other', 'JJ', 'O'),
 ('researchers', 'NNS', 'O'),
 ('and', 'CC', 'O'),
 ('tech', 'JJ', 'O'),
 ('executives', 'NNS', 'O'),
 ('to', 'TO', 'O'),
 ('the', 'DT', 'B-NP'),
 ('effort', 'NN', 'I-NP'),
 (',', ',', 'O'),
 ('and', 'CC', 'O'),
 ('he', 'PRP', 'O'),
 ('has', 'VBZ', 'O'),
 ('been', 'VBN', 'O'),
 ('in', 'IN', 'O'),
 ('contact', 'NN', 'B-NP'),
 ('with', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('World', 'NNP', 'O'),
 ('Health', 'NNP', 'O'),
 ('Organization', 'NNP', 'O'),
 ('WHO', 'NNP', 'O'),
 (',', ',', 'O'),
 ('the', 'DT', 'O'),
 ('US', 'NNP', 'O'),
 ('Centers', 'NNPS', 'O'),
 ('for', 'IN', 'O'),
 ('Disease', 'NNP', 'O'),
 ('Control', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('Prevention', 'NNP', 'O'),
 (',', ',', 'O'),
 ('and', 'CC', 'O'),
 ('the', 'DT', 'O'),
 ('US', 'NNP', '

In this representation, there is one token per line, each with its part-of-speech tag and its IOB chunk tag (here B-NP, I-NP, or O). Given a training corpus in this format, we can construct a tagger that can be used to label new sentences, and use the nltk.chunk.conlltags2tree() function to convert the tag sequences back into a chunk tree.

With the function nltk.ne_chunk(), we can recognize named entities using a classifier; the classifier adds category labels such as PERSON, ORGANIZATION, and GPE.

In [151]:
# Tokenize and POS-tag the example with the preprocess() helper defined
# earlier, then let the named-entity chunker label the tagged tokens.
ne_tree = ne_chunk(preprocess(example))
print(ne_tree)

(S
  (ORGANIZATION MIT/NNP Media/NNP Lab/NNP)
  professor/NN
  (PERSON Ramesh/NNP Raskar/NNP)
  has/VBZ
  been/VBN
  rallying/VBG
  other/JJ
  researchers/NNS
  and/CC
  tech/JJ
  executives/NNS
  to/TO
  the/DT
  effort/NN
  ,/,
  and/CC
  he/PRP
  has/VBZ
  been/VBN
  in/IN
  contact/NN
  with/IN
  the/DT
  (ORGANIZATION World/NNP)
  Health/NNP
  Organization/NNP
  WHO/NNP
  ,/,
  the/DT
  (ORGANIZATION US/NNP Centers/NNPS)
  for/IN
  (PERSON Disease/NNP Control/NNP)
  and/CC
  Prevention/NNP
  ,/,
  and/CC
  the/DT
  (ORGANIZATION US/NNP Department/NNP)
  of/IN
  (GPE Health/NNP)
  and/CC
  (ORGANIZATION Human/NNP Services/NNPS)
  ./.
  “/IN
  They/PRP
  are/VBP
  giving/VBG
  us/PRP
  guidance/NN
  on/IN
  what/WP
  will/MD
  work/VB
  ,/,
  ”/VB
  he/PRP
  says/VBZ
  ,/,
  although/IN
  none/NN
  has/VBZ
  yet/RB
  endorsed/VBN
  the/DT
  idea/NN
  ./.)


# SpaCy

SpaCy’s named entity recognition has been trained on the OntoNotes 5 corpus and it supports the following entity types:

![spacy_entity_types.png](attachment:spacy_entity_types.png)

# Entity

In [152]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the pipeline shipped in the en_core_web_sm package (imported above);
# every later cell calls it through `nlp`.
nlp = en_core_web_sm.load()

We are using the same sentence, “MIT Media Lab professor Ramesh Raskar has been rallying other researchers and tech executives to the effort, and he has been in contact with the World Health Organization WHO, the US Centers for Disease Control and Prevention, and the US Department of Health and Human Services. “They are giving us guidance on what will work,” he says, although none has yet endorsed the idea.”

One of the nice things about spaCy is that we only need to apply nlp once; the entire background pipeline runs and returns the annotated objects.

In [114]:
# Reuse the `example` text defined earlier in the notebook instead of
# repeating the literal, so the NLTK and spaCy sections are guaranteed to
# analyze exactly the same input.
doc = nlp(example)
# Show the text and label of every entity span the pipeline found.
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('MIT Media Lab', 'ORG'),
 ('Ramesh Raskar', 'PERSON'),
 ('the World Health Organization WHO', 'ORG'),
 ('US', 'GPE'),
 ('Centers for Disease Control and Prevention', 'ORG'),
 ('the US Department of Health and Human Services', 'ORG')]


MIT Media Lab is ORG, Ramesh Raskar is PERSON and US is GPE (Countries, cities, states.). They are all correct.

# Token

In the example above we were working at the entity level. In the following example we demonstrate token-level entity annotation, using each token's IOB tag (the BILUO scheme is a finer-grained variant of the same idea) to describe the entity boundaries.


![token.png](attachment:token.png)

In [153]:
# One row per token: the token itself, its IOB code, and its entity type
# (an empty string when the token is not part of an entity).
pprint([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc
])

[(MIT, 'B', 'ORG'),
 (Media, 'I', 'ORG'),
 (Lab, 'I', 'ORG'),
 (professor, 'O', ''),
 (Ramesh, 'B', 'PERSON'),
 (Raskar, 'I', 'PERSON'),
 (has, 'O', ''),
 (been, 'O', ''),
 (rallying, 'O', ''),
 (other, 'O', ''),
 (researchers, 'O', ''),
 (and, 'O', ''),
 (tech, 'O', ''),
 (executives, 'O', ''),
 (to, 'O', ''),
 (the, 'O', ''),
 (effort, 'O', ''),
 (,, 'O', ''),
 (and, 'O', ''),
 (he, 'O', ''),
 (has, 'O', ''),
 (been, 'O', ''),
 (in, 'O', ''),
 (contact, 'O', ''),
 (with, 'O', ''),
 (the, 'B', 'ORG'),
 (World, 'I', 'ORG'),
 (Health, 'I', 'ORG'),
 (Organization, 'I', 'ORG'),
 (WHO, 'I', 'ORG'),
 (,, 'O', ''),
 (the, 'O', ''),
 (US, 'B', 'GPE'),
 (Centers, 'B', 'ORG'),
 (for, 'I', 'ORG'),
 (Disease, 'I', 'ORG'),
 (Control, 'I', 'ORG'),
 (and, 'I', 'ORG'),
 (Prevention, 'I', 'ORG'),
 (,, 'O', ''),
 (and, 'O', ''),
 (the, 'B', 'ORG'),
 (US, 'I', 'ORG'),
 (Department, 'I', 'ORG'),
 (of, 'I', 'ORG'),
 (Health, 'I', 'ORG'),
 (and, 'I', 'ORG'),
 (Human, 'I', 'ORG'),
 (Services, 'I', 'ORG'),
 

"B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set.

# Extracting named entity from an article

Now let’s get serious with spaCy and extract named entities from a Futurism.com article, “This MIT and Harvard-Built App Could Slow the Spread of Coronavirus”.

In [154]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10, separator=""):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get`` so a stalled connection cannot hang the notebook.
        separator: String placed between adjacent text nodes by
            ``soup.get_text``. The default ``""`` keeps the original
            behavior; pass ``" "`` to stop words from neighbouring HTML
            elements being fused together (e.g. "KamerMarch").

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently analyzing an HTTP error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    # Remove elements whose content should not end up in the extracted text.
    for tag in soup(["script", "style", "aside"]):
        tag.extract()
    text = soup.get_text(separator=separator)
    return " ".join(re.split(r'[\n\t]+', text))
# Fetch the article text over the network, run the spaCy pipeline on it,
# and display how many entity spans were found.
ny_bb = url_to_string('https://futurism.com/coronavirus-app-mit-safe-paths')
article = nlp(ny_bb)
len(article.ents)

57

There are 57 entities in the article and they are represented as 12 unique labels:

In [156]:
# Collect the label of every entity span, then tally how often each occurs.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'ORG': 31,
         'PERSON': 4,
         'ORDINAL': 3,
         'CARDINAL': 2,
         'WORK_OF_ART': 3,
         'FAC': 1,
         'GPE': 6,
         'DATE': 3,
         'PERCENT': 1,
         'TIME': 1,
         'NORP': 1,
         'PRODUCT': 1})

The following are the 7 most frequent entities.

In [118]:
# Collect the surface text of every entity span and show the seven most
# common ones.
items = [ent.text for ent in article.ents]
Counter(items).most_common(7)

[('MIT', 2),
 ('Harvard-Built App', 2),
 ('Google', 2),
 ('Facebook', 2),
 ('first', 2),
 ('Israel', 2),
 ('Guardian', 2)]

Let’s pick one sentence to learn more.

In [162]:
# Materialize the sentence spans into a list so they can be indexed.
sentences = list(article.sents)
print(sentences[2])

The Byte/Neoscope+Videos+Newsletter+SocialTopicsSearchAboutSubmitMITShare to FacebookTweet ThisCopy LinkShare via EmailGabe Said We're Into MovementsThis MIT and Harvard-Built App Could Slow the Spread of CoronavirusThe team also included Google, Facebook, and Mayo Clinic workers, among others, who made it in their spare time.


Let’s run displacy.render to generate the raw markup.

In [163]:
# Re-run the pipeline on the sentence's text and render the result with
# displaCy's 'ent' style inside the notebook.
displacy.render(nlp(str(sentences[2])), jupyter=True, style='ent')

One misclassification here is Byte. It is hard, isn’t it?

Using spaCy’s built-in displaCy visualizer, here’s what the above sentence and its dependencies look like:

In [165]:
# Same sentence rendered with displaCy's 'dep' style; the options dict sets
# displaCy's 'distance' option to 120.
displacy.render(nlp(str(sentences[2])), style='dep', jupyter = True, options = {'distance': 120})

Next, we take another sentence and extract each token’s verbatim text, part-of-speech tag, and lemma, skipping stop words and punctuation.

In [166]:
# Verbatim text, POS tag and lemma of every token in sentence 30 that is
# neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[30]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('Clicking', 'VERB', 'click'),
 ('link', 'NOUN', 'link'),
 ('takes', 'VERB', 'take'),
 ('user', 'NOUN', 'user'),
 ('website', 'NOUN', 'website'),
 ('district', 'NOUN', 'district'),
 ('office', 'NOUN', 'office'),
 ('lists', 'VERB', 'list'),
 ('places', 'NOUN', 'place'),
 ('patient', 'NOUN', 'patient'),
 ('visited', 'VERB', 'visit'),
 ('testing', 'VERB', 'test'),
 ('positive', 'ADJ', 'positive')]

In [167]:
# Map each entity's text to its label for sentence 3 (re-parsed on its own).
{str(ent): ent.label_ for ent in nlp(str(sentences[3])).ents}

{'Foster KamerMarch': 'PERSON', '18th': 'ORDINAL'}

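Named entity extraction is mostly correct: “18th” is an ORDINAL, but the PERSON span “Foster KamerMarch” has the word “March” fused onto the name, apparently because the scraped text lost the whitespace between adjacent page elements.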

In [168]:
# Token-level view of sentence 20: token, IOB code, entity type.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[20]])

[(Another, 'O', ''), (obvious, 'O', ''), (issue, 'O', ''), (with, 'O', ''), (widespread, 'O', ''), (adoption, 'O', ''), (of, 'O', ''), (an, 'O', ''), (app, 'O', ''), (like, 'O', ''), (this, 'O', ''), (is, 'O', ''), (n’t, 'O', ''), (a, 'O', ''), (matter, 'O', ''), (of, 'O', ''), (choice, 'O', ''), (so, 'O', ''), (much, 'O', ''), (as, 'O', ''), (resources, 'O', ''), (., 'O', '')]


Finally, we visualize the entities of the entire article.

In [169]:
# Render the whole parsed article with displaCy's 'ent' style inside the notebook.
displacy.render(article, jupyter=True, style='ent')