## NER with focus on locations

Packages to test:
- nltk
- spaCy
- neuralcoref (maybe; it is built on spaCy, so it may not be worth testing separately)

For `nltk` and `spaCy`, starting with this nice post using both: https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

## NLTK

Start with the post's examples to make sure everything works as advertised.

In [9]:
# nltk imports
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# not in post, but trying to run `word_tokenize` told me to run this:
# nltk.download('punkt')

# as well as this:
# nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/clayton/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
# Example sentence used as the shared input for both the NLTK and spaCy sections below.
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [10]:
def preprocess(sent):
    """Tokenize a raw sentence and tag every token with its part of speech.

    Parameters
    ----------
    sent : str
        The raw sentence text.

    Returns
    -------
    list
        ``(token, tag)`` pairs, as returned by ``pos_tag``.
    """
    tokens = word_tokenize(sent)
    return pos_tag(tokens)

In [11]:
# POS-tag the example sentence; `sent` is reused by the chunk parser below.
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [12]:
# Chunk grammar for a noun phrase (NP): an optional determiner (DT),
# followed by any number of adjectives (JJ), ending in a single noun (NN).
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [13]:
# Build a regexp-based chunk parser from the NP grammar above.
cp = nltk.RegexpParser(pattern)
# Parse the POS-tagged sentence into a tree whose NP subtrees are the matched chunks.
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [17]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples:
# B-NP starts a chunk, I-NP continues it, O is outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [22]:
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

[nltk_data] Downloading package words to /home/clayton/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [23]:
# Run NLTK's named-entity chunker on the POS-tagged example sentence.
# Use preprocess() instead of repeating the tokenize -> tag pipeline inline,
# so there is a single definition of how a sentence is prepared.
ne_tree = nltk.ne_chunk(preprocess(ex))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


## spaCy

In [25]:
import spacy
from spacy import displacy
from collections import Counter

# Load the 'en_core_web_sm' pipeline; it must be installed separately from spaCy itself.
nlp = spacy.load('en_core_web_sm')

In [26]:
# Run the spaCy pipeline on the example sentence; `doc` is reused by the cells below.
doc = nlp(ex)

In [27]:
# Entity spans spaCy recognised in the sentence.
doc.ents

(European, Google, $5.1 billion, Wednesday)

In [28]:
# Show each recognised entity's text next to its label.
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [30]:
# Dump the dependency parse as nested dicts (word, lemma, NE, POS, arc, modifiers).
# NOTE(review): Doc.print_tree() appears to be deprecated since spaCy 2.1 in favour of
# Doc.to_json() and is absent from spaCy 3 — confirm against the installed version.
doc.print_tree()

[{'word': 'fined',
  'lemma': 'fin',
  'NE': '',
  'POS_fine': 'VBD',
  'POS_coarse': 'VERB',
  'arc': 'ROOT',
  'modifiers': [{'word': 'authorities',
    'lemma': 'authority',
    'NE': '',
    'POS_fine': 'NNS',
    'POS_coarse': 'NOUN',
    'arc': 'nsubj',
    'modifiers': [{'word': 'European',
      'lemma': 'European',
      'NE': 'NORP',
      'POS_fine': 'JJ',
      'POS_coarse': 'ADJ',
      'arc': 'amod',
      'modifiers': []}]},
   {'word': 'Google',
    'lemma': 'Google',
    'NE': 'ORG',
    'POS_fine': 'NNP',
    'POS_coarse': 'PROPN',
    'arc': 'dative',
    'modifiers': []},
   {'word': 'record',
    'lemma': 'record',
    'NE': '',
    'POS_fine': 'NN',
    'POS_coarse': 'NOUN',
    'arc': 'dobj',
    'modifiers': [{'word': 'a',
      'lemma': 'a',
      'NE': '',
      'POS_fine': 'DT',
      'POS_coarse': 'DET',
      'arc': 'det',
      'modifiers': []},
     {'word': '$ 5.1 billion',
      'lemma': '$ 5.1 billion',
      'NE': 'MONEY',
      'POS_fine': 'CD',
    

In [31]:
# Count how many entities were found for each label.
labels = [ent.label_ for ent in doc.ents]
Counter(labels)

Counter({'NORP': 1, 'ORG': 1, 'MONEY': 1, 'DATE': 1})

In [32]:
# Highlight the recognised entities inline in the notebook.
# Reuse the already-parsed `doc` instead of running the pipeline on `ex` a second time.
displacy.render(doc, jupyter=True, style='ent')

In [35]:
# Show the raw sentence again for comparison with the rendered entities.
ex

'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [38]:
displacy.serve(doc, style='dep') 

# Raises the TypeError shown below. Known issue (https://github.com/explosion/spaCy/issues/2868): it seems to be caused by an incompatible msgpack version.

TypeError: __init__() got an unexpected keyword argument 'encoding'

## Real articles

Test NER against 2009 News Crawl data provided at [kylebgorman/LING83600-mp00.md](https://gist.github.com/kylebgorman/3e28fc834962017c9ac01f7434485519)
