In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
# Sample news sentence fed to the tokenizing, tagging and entity examples.
ex = (
    'European authorities fined Google a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market '
    'and ordered the company to alter its practices'
)

In [3]:
def preprocess(sent):
    """Tokenize a sentence and tag every token with its part of speech.

    The text in ``sent`` is split with ``nltk.word_tokenize`` and the
    resulting tokens are handed to ``nltk.pos_tag``; whatever that call
    returns is returned unchanged.
    """
    tokens = nltk.word_tokenize(sent)
    tagged_tokens = nltk.pos_tag(tokens)
    return tagged_tokens

In [4]:
# Tokenize and POS-tag the sample sentence via preprocess(); the bare `sent`
# on the last line makes the notebook display the resulting (token, tag) list.
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [6]:
# Chunk grammar for nltk.RegexpParser: an NP chunk is an optional <DT> tag,
# followed by any number of <JJ> tags, followed by exactly one <NN> tag.
# Only the literal NN tag matches, so NNS/NNP tokens are left unchunked.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [7]:
# Build a chunk parser from the NP grammar and apply it to the tagged sentence.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
# Printing the result shows the parse as a bracketed tree rooted at S.
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [8]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [12]:
from nltk import ne_chunk

In [17]:
# Named-entity chunking with NLTK: tokenize and POS-tag the sample sentence
# through preprocess(), then let ne_chunk group the tagged tokens.
ne_tree = ne_chunk(preprocess(ex))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


# Using spaCy

In [18]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm model package once; `nlp` is the pipeline object
# that the spaCy cells below call on raw text.
nlp = en_core_web_sm.load()

In [19]:
# Run the spaCy pipeline on the same sample sentence the NLTK cells used.
# Reusing `ex` (instead of a second copy of the literal) keeps both halves of
# the comparison on identical input.
doc = nlp(ex)
# List each detected entity span as (text, label).
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [20]:
# Token-level view of the same document: each token with its IOB entity
# flag and its entity type (empty string for tokens outside any entity).
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [21]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get``. Without one the
        request can block forever on an unresponsive server. Defaults to 30.

    Returns
    -------
    str
        Text of the parsed page with ``<script>``, ``<style>`` and
        ``<aside>`` elements removed, and every run of newlines/tabs
        replaced by a single space.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx), so that an
        error page is never mistaken for the article text.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing the error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose content should not appear in the extracted text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the article text, run it through the spaCy pipeline, and show how many
# entity spans were found in the whole document.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

154

In [22]:
# Tally how often each entity label occurs in the article.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'PERSON': 85,
         'DATE': 21,
         'GPE': 10,
         'ORG': 28,
         'CARDINAL': 5,
         'WORK_OF_ART': 2,
         'NORP': 2,
         'ORDINAL': 1})

In [23]:
# The three most frequently occurring entity strings in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 34), ('F.B.I.', 15), ('Trump', 11)]

In [32]:
# Materialize the article's sentence spans so one can be picked by position.
sentences = list(article.sents)
print(sentences[10])

Mr. Trump’s victory traces back to June, when Mr. Strzok’s conduct was laid out in a wide-ranging inspector general’s report on how the F.B.I. handled the investigation of Hillary Clinton’s emails in the run-up to the 2016 election.


In [33]:
# Re-parse sentence 10 as its own document and render its entities inline.
displacy.render(nlp(str(sentences[10])), jupyter=True, style='ent')

In [36]:
# Render the dependency parse of the same sentence; the 'distance' setting is
# passed through displaCy's options dict.
displacy.render(nlp(str(sentences[10])), style='dep', jupyter = True, options = {'distance': 120})

In [38]:
# Text, POS tag and lemma for every token of sentence 10 that is neither a
# stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[10]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]


[('Mr.', 'PROPN', 'Mr.'),
 ('Trump', 'PROPN', 'Trump'),
 ('victory', 'NOUN', 'victory'),
 ('traces', 'VERB', 'trace'),
 ('June', 'PROPN', 'June'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('conduct', 'NOUN', 'conduct'),
 ('laid', 'VERB', 'lay'),
 ('wide', 'ADV', 'wide'),
 ('ranging', 'VERB', 'range'),
 ('inspector', 'NOUN', 'inspector'),
 ('general', 'NOUN', 'general'),
 ('report', 'NOUN', 'report'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('handled', 'VERB', 'handle'),
 ('investigation', 'NOUN', 'investigation'),
 ('Hillary', 'PROPN', 'Hillary'),
 ('Clinton', 'PROPN', 'Clinton'),
 ('emails', 'NOUN', 'email'),
 ('run', 'NOUN', 'run'),
 ('2016', 'NUM', '2016'),
 ('election', 'NOUN', 'election')]

In [40]:
# Map each entity's text in sentence 10 to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[10])).ents}

{'Trump': 'PERSON',
 'June': 'DATE',
 'Strzok': 'PERSON',
 'F.B.I.': 'ORG',
 'Hillary Clinton': 'PERSON',
 '2016': 'DATE'}

In [41]:
# Per-token IOB flag and entity type for sentence 10, taken straight from the
# already-parsed article span.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[10]])

[(Mr., 'O', ''), (Trump, 'B', 'PERSON'), (’s, 'O', ''), (victory, 'O', ''), (traces, 'O', ''), (back, 'O', ''), (to, 'O', ''), (June, 'B', 'DATE'), (,, 'O', ''), (when, 'O', ''), (Mr., 'O', ''), (Strzok, 'B', 'PERSON'), (’s, 'O', ''), (conduct, 'O', ''), (was, 'O', ''), (laid, 'O', ''), (out, 'O', ''), (in, 'O', ''), (a, 'O', ''), (wide, 'O', ''), (-, 'O', ''), (ranging, 'O', ''), (inspector, 'O', ''), (general, 'O', ''), (’s, 'O', ''), (report, 'O', ''), (on, 'O', ''), (how, 'O', ''), (the, 'O', ''), (F.B.I., 'B', 'ORG'), (handled, 'O', ''), (the, 'O', ''), (investigation, 'O', ''), (of, 'O', ''), (Hillary, 'B', 'PERSON'), (Clinton, 'I', 'PERSON'), (’s, 'O', ''), (emails, 'O', ''), (in, 'O', ''), (the, 'O', ''), (run, 'O', ''), (-, 'O', ''), (up, 'O', ''), (to, 'O', ''), (the, 'O', ''), (2016, 'B', 'DATE'), (election, 'O', ''), (., 'O', '')]


In [42]:
# Highlight entities across the whole article. Render the already-parsed
# `article` Doc: str(sentences) would stringify the Python list (adding
# brackets and ", " separators to the text) and force a second full parse.
displacy.render(article, jupyter=True, style='ent')