## NER with focus on locations

Packages to test:
- nltk
- spaCy
- neuralcoref (maybe; it is built on spaCy, so it may not be worth testing separately)

For `nltk` and `spaCy`, starting with this nice post using both: https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

## NLTK

Start with the post's examples to make sure everything works as advertised.

In [1]:
# nltk imports
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# not in post, but trying to run `word_tokenize` told me to run this:
# nltk.download('punkt')

# as well as this:
# nltk.download('averaged_perceptron_tagger')

In [2]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [3]:
def preprocess(sent):
    """Tokenize a sentence and attach a part-of-speech tag to every token.

    Parameters
    ----------
    sent : str
        Raw sentence text.

    Returns
    -------
    Whatever ``pos_tag`` returns for the ``word_tokenize`` output; with NLTK
    this is a list of ``(token, tag)`` tuples.
    """
    tokens = word_tokenize(sent)
    return pos_tag(tokens)

In [4]:
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [5]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [6]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [7]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [8]:
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

In [9]:
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


## spaCy

In [10]:
import spacy
from spacy import displacy
from collections import Counter

nlp = spacy.load('en_core_web_sm')

In [11]:
doc = nlp(ex)

In [12]:
doc.ents

(European, Google, $5.1 billion, Wednesday)

In [13]:
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [14]:
doc.print_tree()

[{'word': 'fined',
  'lemma': 'fin',
  'NE': '',
  'POS_fine': 'VBD',
  'POS_coarse': 'VERB',
  'arc': 'ROOT',
  'modifiers': [{'word': 'authorities',
    'lemma': 'authority',
    'NE': '',
    'POS_fine': 'NNS',
    'POS_coarse': 'NOUN',
    'arc': 'nsubj',
    'modifiers': [{'word': 'European',
      'lemma': 'European',
      'NE': 'NORP',
      'POS_fine': 'JJ',
      'POS_coarse': 'ADJ',
      'arc': 'amod',
      'modifiers': []}]},
   {'word': 'Google',
    'lemma': 'Google',
    'NE': 'ORG',
    'POS_fine': 'NNP',
    'POS_coarse': 'PROPN',
    'arc': 'dative',
    'modifiers': []},
   {'word': 'record',
    'lemma': 'record',
    'NE': '',
    'POS_fine': 'NN',
    'POS_coarse': 'NOUN',
    'arc': 'dobj',
    'modifiers': [{'word': 'a',
      'lemma': 'a',
      'NE': '',
      'POS_fine': 'DT',
      'POS_coarse': 'DET',
      'arc': 'det',
      'modifiers': []},
     {'word': '$ 5.1 billion',
      'lemma': '$ 5.1 billion',
      'NE': 'MONEY',
      'POS_fine': 'CD',
    

In [15]:
labels = [x.label_ for x in doc.ents]
Counter(labels)

Counter({'NORP': 1, 'ORG': 1, 'MONEY': 1, 'DATE': 1})

In [16]:
displacy.render(nlp(ex), jupyter=True, style='ent')

In [17]:
ex

'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [18]:
displacy.serve(doc, style='dep') 

# error - known issue: https://github.com/explosion/spaCy/issues/2868 - incompatible msgpack version, it seems

TypeError: __init__() got an unexpected keyword argument 'encoding'

In [23]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as a single string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server; passed straight to ``requests.get``.
        Without a timeout a stalled connection could block indefinitely.

    Returns
    -------
    str
        Page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page
        is never silently parsed as if it were the requested article.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop non-content elements before extracting the text.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

186

In [24]:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'PERSON': 83,
         'GPE': 35,
         'ORG': 24,
         'CARDINAL': 6,
         'DATE': 29,
         'EVENT': 1,
         'NORP': 5,
         'ORDINAL': 1,
         'WORK_OF_ART': 1,
         'LOC': 1})

In [25]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[('Strzok', 32), ('F.B.I.', 17), ('Trump', 10)]

In [26]:
sentences = [x for x in article.sents]
print(sentences[20])

Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president’s ire.


In [27]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [29]:
# displacy.render(nlp(str(sentences[20])), jupyter=True, style='dep')

In [30]:
# For sentence 20, list (surface form, part-of-speech, lemma) for every token
# that is neither a stop word nor punctuation.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[20]))
 if not (tok.is_stop or tok.pos_ == 'PUNCT')]


[('Firing', 'VERB', 'fire'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Strzok', 'PROPN', 'strzok'),
 ('removes', 'VERB', 'remove'),
 ('favorite', 'ADJ', 'favorite'),
 ('target', 'NOUN', 'target'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Trump', 'PROPN', 'trump'),
 ('ranks', 'NOUN', 'rank'),
 ('F.B.I.', 'PROPN', 'f.b.i.'),
 ('gives', 'VERB', 'give'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Bowdich', 'PROPN', 'bowdich'),
 ('F.B.I.', 'PROPN', 'f.b.i.'),
 ('director', 'NOUN', 'director'),
 ('Christopher', 'PROPN', 'christopher'),
 ('A.', 'PROPN', 'a.'),
 ('Wray', 'PROPN', 'wray'),
 ('chance', 'NOUN', 'chance'),
 ('president', 'NOUN', 'president'),
 ('’s', 'PART', '’s'),
 ('ire', 'NOUN', 'ire')]

In [31]:
dict([(str(x), x.label_) for x in nlp(str(sentences[20])).ents])

{'Strzok': 'PERSON',
 'Trump': 'PERSON',
 'F.B.I.': 'GPE',
 'Bowdich': 'PERSON',
 'Christopher A. Wray': 'PERSON'}

In [36]:
# Run the pipeline on each sentence's text separately and collect its
# (entity text, entity label) pairs, one inner list per sentence.
sentences_ents = [
    [(str(entity), entity.label_) for entity in nlp(str(sentence)).ents]
    for sentence in sentences
]

In [46]:
sentences_ents

[[('Peter Strzok', 'PERSON'),
  ('Who Criticized Trump', 'PERSON'),
  ('Fired', 'GPE')],
 [('The New York Times', 'ORG')],
 [('InToday', 'DATE'),
  ('PaperSupported byF.B.I. Agent Peter Strzok', 'ORG'),
  ('Who Criticized Trump', 'PERSON'),
  ('F.B.I.', 'GPE'),
  ('Trump', 'PERSON')],
 [('CreditCreditT.J. Kirkpatrick', 'PERSON'),
  ('The New York TimesBy Adam Goldman', 'ORG'),
  ('Michael S. SchmidtAug', 'PERSON')],
 [('13', 'CARDINAL'), ('2018WASHINGTON', 'CARDINAL')],
 [('Peter Strzok', 'PERSON'),
  ('F.B.I.', 'GPE'),
  ('Trump', 'PERSON'),
  ('Hillary Clinton', 'PERSON'),
  ('Russia', 'GPE'),
  ('Strzok', 'PERSON'),
  ('Monday', 'DATE')],
 [],
 [('2016', 'DATE'),
  ('F.B.I.', 'GPE'),
  ('Lisa Page — in', 'PERSON'),
  ('Russia', 'GPE')],
 [],
 [('Strzok', 'PERSON'),
  ('20 years', 'DATE'),
  ('F.B.I.', 'GPE'),
  ('the early months', 'DATE')],
 [('Strzok', 'PERSON')],
 [('F.B.I.', 'GPE'),
  ('Trump', 'PERSON'),
  ('Strzok', 'PERSON'),
  ('last summer', 'DATE'),
  ('Robert S. Mueller I

In [45]:
# Print one line for every collected entity whose label is 'GPE'.
for s in sentences_ents:
    for ents in s:
        # Each entry is expected to be a (text, label) pair; ignore anything shorter.
        if len(ents) >= 2 and ents[1] == 'GPE':
            print('spaCy says \033[1m{}\033[0m is a Geopolitical Entity'.format(ents[0]))

spaCy says [1mFired[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mRussia[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mRussia[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mTwitter[0m is a Geopolitical Entity
spaCy says [1mRussia[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mRussia[0m is a 

## Real articles

Test NER against 2009 News Crawl data provided at [kylebgorman/LING83600-mp00.md](https://gist.github.com/kylebgorman/3e28fc834962017c9ac01f7434485519)
