## NER with focus on locations

packages to test
- nltk
- spaCy
- neuralcoref (maybe, built on spaCy so maybe not worth it)

For `nltk` and `spaCy`, starting with this nice post using both: https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

## NLTK

start with post's examples to make sure it all works as advertised

In [1]:
# nltk imports
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# not in post, but trying to run `word_tokenize` told me to run this:
# nltk.download('punkt')

# as well as this:
# nltk.download('averaged_perceptron_tagger')

In [2]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [3]:
def preprocess(sent):
    """Tokenize a sentence and tag every token with its part of speech.

    Parameters
    ----------
    sent : str
        Raw sentence text.

    Returns
    -------
    The result of ``pos_tag`` applied to the ``word_tokenize`` output, i.e.
    a list of ``(token, tag)`` tuples.
    """
    tokens = word_tokenize(sent)
    return pos_tag(tokens)

In [4]:
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [5]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [6]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [7]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [8]:
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

In [9]:
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


## spaCy

In [10]:
import spacy
from spacy import displacy
from collections import Counter

nlp = spacy.load('en_core_web_sm')

In [11]:
doc = nlp(ex)

In [12]:
doc.ents

(European, Google, $5.1 billion, Wednesday)

In [13]:
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [14]:
doc.print_tree()

[{'word': 'fined',
  'lemma': 'fin',
  'NE': '',
  'POS_fine': 'VBD',
  'POS_coarse': 'VERB',
  'arc': 'ROOT',
  'modifiers': [{'word': 'authorities',
    'lemma': 'authority',
    'NE': '',
    'POS_fine': 'NNS',
    'POS_coarse': 'NOUN',
    'arc': 'nsubj',
    'modifiers': [{'word': 'European',
      'lemma': 'European',
      'NE': 'NORP',
      'POS_fine': 'JJ',
      'POS_coarse': 'ADJ',
      'arc': 'amod',
      'modifiers': []}]},
   {'word': 'Google',
    'lemma': 'Google',
    'NE': 'ORG',
    'POS_fine': 'NNP',
    'POS_coarse': 'PROPN',
    'arc': 'dative',
    'modifiers': []},
   {'word': 'record',
    'lemma': 'record',
    'NE': '',
    'POS_fine': 'NN',
    'POS_coarse': 'NOUN',
    'arc': 'dobj',
    'modifiers': [{'word': 'a',
      'lemma': 'a',
      'NE': '',
      'POS_fine': 'DT',
      'POS_coarse': 'DET',
      'arc': 'det',
      'modifiers': []},
     {'word': '$ 5.1 billion',
      'lemma': '$ 5.1 billion',
      'NE': 'MONEY',
      'POS_fine': 'CD',
    

In [15]:
labels = [x.label_ for x in doc.ents]
Counter(labels)

Counter({'NORP': 1, 'ORG': 1, 'MONEY': 1, 'DATE': 1})

In [16]:
displacy.render(nlp(ex), jupyter=True, style='ent')

In [17]:
ex

'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [18]:
displacy.serve(doc, style='dep') 

# error - known issue: https://github.com/explosion/spaCy/issues/2868 - incompatible msgpack version, it seems

TypeError: __init__() got an unexpected keyword argument 'encoding'

In [23]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    ``<script>``, ``<style>`` and ``<aside>`` elements are removed before the
    text is extracted, and every run of newlines/tabs is replaced by a single
    space.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Without a timeout a stalled connection would hang
        the notebook indefinitely.

    Returns
    -------
    str
        The page text with the unwanted elements stripped.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently running NER over an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose text is not part of the article body.
    for unwanted in soup(["script", "style", 'aside']):
        unwanted.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

186

In [24]:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'PERSON': 83,
         'GPE': 35,
         'ORG': 24,
         'CARDINAL': 6,
         'DATE': 29,
         'EVENT': 1,
         'NORP': 5,
         'ORDINAL': 1,
         'WORK_OF_ART': 1,
         'LOC': 1})

In [25]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[('Strzok', 32), ('F.B.I.', 17), ('Trump', 10)]

In [26]:
# Materialise the sentence spans so they can be indexed by position.
sentences = list(article.sents)
print(sentences[20])

Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president’s ire.


In [27]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [29]:
# displacy.render(nlp(str(sentences[20])), jupyter=True, style='dep')

In [30]:
# (surface form, coarse POS tag, lemma) for every token of sentence 20 that is
# neither a stop word nor punctuation.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[20]))
 if not tok.is_stop and tok.pos_ != 'PUNCT']


[('Firing', 'VERB', 'fire'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Strzok', 'PROPN', 'strzok'),
 ('removes', 'VERB', 'remove'),
 ('favorite', 'ADJ', 'favorite'),
 ('target', 'NOUN', 'target'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Trump', 'PROPN', 'trump'),
 ('ranks', 'NOUN', 'rank'),
 ('F.B.I.', 'PROPN', 'f.b.i.'),
 ('gives', 'VERB', 'give'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Bowdich', 'PROPN', 'bowdich'),
 ('F.B.I.', 'PROPN', 'f.b.i.'),
 ('director', 'NOUN', 'director'),
 ('Christopher', 'PROPN', 'christopher'),
 ('A.', 'PROPN', 'a.'),
 ('Wray', 'PROPN', 'wray'),
 ('chance', 'NOUN', 'chance'),
 ('president', 'NOUN', 'president'),
 ('’s', 'PART', '’s'),
 ('ire', 'NOUN', 'ire')]

In [31]:
dict([(str(x), x.label_) for x in nlp(str(sentences[20])).ents])

{'Strzok': 'PERSON',
 'Trump': 'PERSON',
 'F.B.I.': 'GPE',
 'Bowdich': 'PERSON',
 'Christopher A. Wray': 'PERSON'}

In [36]:
sentences_ents = [[(str(x), x.label_) for x in nlp(str(s)).ents] for s in sentences]

In [46]:
sentences_ents

[[('Peter Strzok', 'PERSON'),
  ('Who Criticized Trump', 'PERSON'),
  ('Fired', 'GPE')],
 [('The New York Times', 'ORG')],
 [('InToday', 'DATE'),
  ('PaperSupported byF.B.I. Agent Peter Strzok', 'ORG'),
  ('Who Criticized Trump', 'PERSON'),
  ('F.B.I.', 'GPE'),
  ('Trump', 'PERSON')],
 [('CreditCreditT.J. Kirkpatrick', 'PERSON'),
  ('The New York TimesBy Adam Goldman', 'ORG'),
  ('Michael S. SchmidtAug', 'PERSON')],
 [('13', 'CARDINAL'), ('2018WASHINGTON', 'CARDINAL')],
 [('Peter Strzok', 'PERSON'),
  ('F.B.I.', 'GPE'),
  ('Trump', 'PERSON'),
  ('Hillary Clinton', 'PERSON'),
  ('Russia', 'GPE'),
  ('Strzok', 'PERSON'),
  ('Monday', 'DATE')],
 [],
 [('2016', 'DATE'),
  ('F.B.I.', 'GPE'),
  ('Lisa Page — in', 'PERSON'),
  ('Russia', 'GPE')],
 [],
 [('Strzok', 'PERSON'),
  ('20 years', 'DATE'),
  ('F.B.I.', 'GPE'),
  ('the early months', 'DATE')],
 [('Strzok', 'PERSON')],
 [('F.B.I.', 'GPE'),
  ('Trump', 'PERSON'),
  ('Strzok', 'PERSON'),
  ('last summer', 'DATE'),
  ('Robert S. Mueller I

In [45]:
# Report every entity that spaCy labelled as a geopolitical entity (GPE).
# Each element of `sentences_ents` is a list of (text, label) pairs, so the
# pair can be unpacked directly; no length check is needed.
for sentence_entities in sentences_ents:
    for ent_text, ent_label in sentence_entities:
        if ent_label == 'GPE':
            # \033[1m ... \033[0m are ANSI escapes that render the name in bold.
            print('spaCy says \033[1m{}\033[0m is a Geopolitical Entity'.format(ent_text))

spaCy says [1mFired[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mRussia[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mRussia[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mTwitter[0m is a Geopolitical Entity
spaCy says [1mRussia[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mF.B.I.[0m is a Geopolitical Entity
spaCy says [1mRussia[0m is a 

## Compare to simple list search
As mentioned [here](https://stackoverflow.com/questions/30150047/find-all-locations-cities-places-in-a-text), using NLP may be unnecessary. In the above example FBI, Twitter, Fired, Candor, guidemultimediaphotographyvideoNYT, and Ad ChoicesPrivacyTerms all get misclassified as GPE.

Try Geonames with `Feature Class` in ('A', 'P') (see [feature codes](http://www.geonames.org/export/codes.html))

In [47]:
# download geonames AllCountries.zip file:
# !cd data # oops forgot to go up a folder..
!wget http://download.geonames.org/export/dump/allCountries.zip

/bin/sh: 1: cd: can't cd to data
--2018-11-10 10:52:25--  http://download.geonames.org/export/dump/allCountries.zip
Resolving download.geonames.org (download.geonames.org)... 188.40.33.19
Connecting to download.geonames.org (download.geonames.org)|188.40.33.19|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 356408587 (340M) [application/zip]
Saving to: ‘allCountries.zip’


2018-11-10 10:53:30 (5.22 MB/s) - ‘allCountries.zip’ saved [356408587/356408587]



In [48]:
# move zipped file to ../data/ folder
!mv allCountries.zip ../data/

In [54]:
from itertools import islice
from zipfile import ZipFile

# Peek at the first five raw (bytes) lines of the GeoNames dump, reading
# straight from the archive without extracting it.
with ZipFile('../data/allCountries.zip') as z:
    with z.open('allCountries.txt') as f:
        # islice stops cleanly if the file has fewer than five lines, whereas
        # repeated next() calls would raise StopIteration.
        head = list(islice(f, 5))

pprint(head)

[b'2986043\tPic de Font Blanca\tPic de Font Blanca\tPic de Font Blanca,Pic du '
 b'Port\t42.64991\t1.53335\tT\tPK\tAD\t\t00\t\t\t\t0\t\t2860\tEurope/Andorra\t'
 b'2014-11-05\n',
 b'2994701\tRoc M\xc3\xa9l\xc3\xa9\tRoc Mele\tRoc Mele,Roc Meler,Roc M'
 b'\xc3\xa9l\xc3\xa9\t42.58765\t1.74028\tT\tMT\tAD\tAD,FR\t00\t\t\t\t0\t\t28'
 b'03\tEurope/Andorra\t2014-11-05\n',
 b'3007683\tPic des Langounelles\tPic des Langounelles\tPic des Langounelles\t4'
 b'2.61203\t1.47364\tT\tPK\tAD\tAD,FR\t00\t\t\t\t0\t\t2685\tEurope/Andorra\t2'
 b'014-11-05\n',
 b'3017832\tPic de les Abelletes\tPic de les Abelletes\tPic de la Font-Negre,P'
 b'ic de la Font-N\xc3\xa8gre,Pic de les Abelletes\t42.52535\t1.73343\tT\tPK\t'
 b'AD\tFR\tA9\t66\t663\t66146\t0\t\t2411\tEurope/Andorra\t2014-11-05\n',
 b'3017833\tEstany de les Abelletes\tEstany de les Abelletes\tEstany de les Ab'
 b'elletes,Etang de Font-Negre,\xc3\x89tang de Font-N\xc3\xa8gre\t42.52915\t1'
 b'.73362\tH\tLK\tAD\tFR\tA9\t\t\t\t0\t\t2260\tEurope/Andorra\t20

In [55]:
len(head)

5

In [59]:
for i, line in enumerate(head):
    print(i, line)

0 b'2986043\tPic de Font Blanca\tPic de Font Blanca\tPic de Font Blanca,Pic du Port\t42.64991\t1.53335\tT\tPK\tAD\t\t00\t\t\t\t0\t\t2860\tEurope/Andorra\t2014-11-05\n'
1 b'2994701\tRoc M\xc3\xa9l\xc3\xa9\tRoc Mele\tRoc Mele,Roc Meler,Roc M\xc3\xa9l\xc3\xa9\t42.58765\t1.74028\tT\tMT\tAD\tAD,FR\t00\t\t\t\t0\t\t2803\tEurope/Andorra\t2014-11-05\n'
2 b'3007683\tPic des Langounelles\tPic des Langounelles\tPic des Langounelles\t42.61203\t1.47364\tT\tPK\tAD\tAD,FR\t00\t\t\t\t0\t\t2685\tEurope/Andorra\t2014-11-05\n'
3 b'3017832\tPic de les Abelletes\tPic de les Abelletes\tPic de la Font-Negre,Pic de la Font-N\xc3\xa8gre,Pic de les Abelletes\t42.52535\t1.73343\tT\tPK\tAD\tFR\tA9\t66\t663\t66146\t0\t\t2411\tEurope/Andorra\t2014-11-05\n'
4 b'3017833\tEstany de les Abelletes\tEstany de les Abelletes\tEstany de les Abelletes,Etang de Font-Negre,\xc3\x89tang de Font-N\xc3\xa8gre\t42.52915\t1.73362\tH\tLK\tAD\tFR\tA9\t\t\t\t0\t\t2260\tEurope/Andorra\t2014-11-05\n'


In [70]:
# dictionary from http://download.geonames.org/export/dump/readme.txt

# Data dictionary for the GeoNames dump: one (column name, description) pair
# per column, in file order. The column names are reused as DataFrame headers.
geonames_dd = [
    ('geonameid', 'integer id of record in geonames database'),
    ('name', 'name of geographical point (utf8) varchar(200)'),
    ('asciiname', 'name of geographical point in plain ascii characters, varchar(200)'),
    ('alternatenames', 'alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)'),
    ('latitude', 'latitude in decimal degrees (wgs84)'),
    ('longitude', 'longitude in decimal degrees (wgs84)'),
    ('feature class', 'see http://www.geonames.org/export/codes.html, char(1)'),
    ('feature code', 'see http://www.geonames.org/export/codes.html, varchar(10)'),
    ('country code', 'ISO-3166 2-letter country code, 2 characters'),
    ('cc2', 'alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters'),
    ('admin1 code', 'fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)'),
    ('admin2 code', 'code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)'),
    ('admin3 code', 'code for third level administrative division, varchar(20)'),
    ('admin4 code', 'code for fourth level administrative division, varchar(20)'),
    ('population', 'bigint (8 byte int)'),
    ('elevation', 'in meters, integer'),
    # Double-quoted on purpose: inside a single-quoted literal the '' pairs
    # would end and restart the string, silently dropping the arc-second marks.
    ('dem', "digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat."),
    ('timezone', 'the iana timezone id (see file timeZone.txt) varchar(40)'),
    ('modification date', 'date of last modification in yyyy-MM-dd format'),
]

In [71]:
print([c[0] for c in geonames_dd])

['geonameid', 'name', 'asciiname', 'alternatenames', 'latitude', 'longitude', 'feature class', 'feature code', 'country code', 'cc2', 'admin1 code', 'admin2 code', 'admin3 code', 'admin4 code', 'population', 'elevation', 'dem', 'timezone', 'modification date']


In [69]:
# from https://stackoverflow.com/questions/11059390/parsing-a-tab-separated-file-in-python
# and to read from zip: https://stackoverflow.com/questions/5627954/py3k-how-do-you-read-a-file-inside-a-zip-file-as-text-not-bytes

import csv, io
from itertools import islice

# Print the first five rows of the dump whose feature class is 'A' or 'P'.
with ZipFile('../data/allCountries.zip') as z:
    with z.open("allCountries.txt", 'r') as tsv:
        # The GeoNames dump is UTF-8, tab-delimited and unquoted: decode it
        # explicitly (not with the locale default) and switch off csv quote
        # handling so a literal " in a name cannot swallow following fields.
        file_items = io.TextIOWrapper(tsv, encoding='utf-8', newline='')
        rows = csv.reader(file_items, delimiter="\t", quoting=csv.QUOTE_NONE)
        # feature class is the 7th column
        wanted_rows = (row for row in rows if row[6] in ('A', 'P'))
        # islice stops reading as soon as five matching rows have been seen.
        for line in islice(wanted_rows, 5):
            print(line)

['3038816', 'Xixerella', 'Xixerella', '', '42.55327', '1.48736', 'P', 'PPL', 'AD', '', '04', '', '', '', '0', '', '1417', 'Europe/Andorra', '2009-04-24']
['3038832', 'Vila', 'Vila', 'Casas Vila,Vila', '42.53176', '1.56654', 'P', 'PPL', 'AD', '', '03', '', '', '', '0', '', '1318', 'Europe/Andorra', '2012-04-13']
['3038899', 'Tossalet i Vinyals', 'Tossalet i Vinyals', 'Tossalet i Vinyals', '42.48597', '1.4891', 'P', 'PPLL', 'AD', '', '07', '', '', '', '0', '', '998', 'Europe/Andorra', '2018-09-05']
['3038987', 'Sornàs', 'Sornas', 'Sornas,Sornàs', '42.56461', '1.52757', 'P', 'PPL', 'AD', '', '05', '', '', '', '0', '', '1328', 'Europe/Andorra', '2014-11-05']
['3038999', 'Soldeu', 'Soldeu', '', '42.57688', '1.66769', 'P', 'PPL', 'AD', '', '02', '', '', '', '602', '', '1832', 'Europe/Andorra', '2017-11-06']


In [73]:
# extract all the location names we're likely to care about

# list to hold 'A' and 'P' geonames features
# Collect every row of the dump whose feature class is 'A' or 'P'; each row is
# kept as the list of string fields produced by the csv reader.
with ZipFile('../data/allCountries.zip') as z:
    with z.open("allCountries.txt", 'r') as tsv:
        # The GeoNames dump is UTF-8, tab-delimited and unquoted: decode it
        # explicitly (not with the locale default) and switch off csv quote
        # handling so a literal " in a name cannot swallow following fields.
        file_items = io.TextIOWrapper(tsv, encoding='utf-8', newline='')
        rows = csv.reader(file_items, delimiter="\t", quoting=csv.QUOTE_NONE)
        # feature class is the 7th column
        geonames_locations = [row for row in rows if row[6] in ('A', 'P')]

In [74]:
# make a dataframe
import pandas as pd

df_geonames = pd.DataFrame(geonames_locations, columns=[c[0] for c in geonames_dd])

In [76]:
df_geonames.head()

Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,3038816,Xixerella,Xixerella,,42.55327,1.48736,P,PPL,AD,,4,,,,0,,1417,Europe/Andorra,2009-04-24
1,3038832,Vila,Vila,"Casas Vila,Vila",42.53176,1.56654,P,PPL,AD,,3,,,,0,,1318,Europe/Andorra,2012-04-13
2,3038899,Tossalet i Vinyals,Tossalet i Vinyals,Tossalet i Vinyals,42.48597,1.4891,P,PPLL,AD,,7,,,,0,,998,Europe/Andorra,2018-09-05
3,3038987,Sornàs,Sornas,"Sornas,Sornàs",42.56461,1.52757,P,PPL,AD,,5,,,,0,,1328,Europe/Andorra,2014-11-05
4,3038999,Soldeu,Soldeu,,42.57688,1.66769,P,PPL,AD,,2,,,,602,,1832,Europe/Andorra,2017-11-06


In [77]:
df_geonames.shape

(5141957, 19)

In [78]:
df_geonames['feature class'].value_counts()

P    4709154
A     432803
Name: feature class, dtype: int64

In [79]:
df_geonames['asciiname'].nunique()

2892663

### raw text search of place names in article sentences

In [93]:
# searching through 2.89M records for every document isn't going to be efficient...
geonames_ascii_unique = sorted(df_geonames['asciiname'].unique())

In [94]:
len(geonames_ascii_unique)

2892663

In [88]:
type(sentences[20].text)

str

In [89]:
sentences[20].text

'Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president’s ire.'

In [95]:
geonames_ascii_unique

['',
 '!Kheis',
 "'A'amanu",
 "'Ab al 'Ali",
 "'Ababayli",
 "'Abadat Hasan",
 "'Abarat Darah",
 "'Abasirah",
 "'Abbad Maqbal",
 "'Abbadi 'Idan",
 "'Abbadi Mayyas",
 "'Abbar Muhsin",
 "'Abbarah",
 "'Abbas",
 "'Abbas 'Abd Allah",
 "'Abbas 'Abud",
 "'Abbas 'Ajawi Zanad",
 "'Abbas 'Ali",
 "'Abbas 'Alush",
 "'Abbas 'Arak",
 "'Abbas Agha",
 "'Abbas Ahmad",
 "'Abbas Ahmad Hasun",
 "'Abbas Al Makun",
 "'Abbas Bin Ta'mah",
 "'Abbas Da'ish",
 "'Abbas Dalli",
 "'Abbas Dawud",
 "'Abbas Dhiyab",
 "'Abbas Fayhan",
 "'Abbas Habniri",
 "'Abbas Hajji Laylu",
 "'Abbas Hamadi",
 "'Abbas Hamdan",
 "'Abbas Hammadi",
 "'Abbas Hamzah",
 "'Abbas Hanzal",
 "'Abbas Hasan",
 "'Abbas Husayn",
 "'Abbas Ibrahim",
 "'Abbas Isma'il",
 "'Abbas Jabr",
 "'Abbas Jaqal",
 "'Abbas Jasim",
 "'Abbas Jasim 'Uwayd",
 "'Abbas Jassam",
 "'Abbas Jawad",
 "'Abbas Jawu'",
 "'Abbas Jud",
 "'Abbas Kaddum",
 "'Abbas Kala",
 "'Abbas Kazim",
 "'Abbas Khad",
 "'Abbas Khalaf",
 "'Abbas Khan Kala",
 "'Abbas Khel",
 "'Abbas Khel Shamali",
 

In [96]:
# how long does it take to find matches for just one sentence
import time

start_time = time.time()

# NOTE: `x in text` is a plain substring test run against every one of the
# ~2.89M unique names, so the empty string and fragments of longer words
# also count as matches.
matches = [x for x in geonames_ascii_unique if x in sentences[20].text]

print('search of all places in 1 sentence took {:,.1f} seconds'.format(time.time()-start_time))

search of all places in 1 sentence took 72.1 seconds


In [98]:
# what did that find:
print(matches)

['', 'A', 'Bo', 'Bow', 'Chri', 'Christ', 'Christophe', 'Christopher', 'Fi', 'Fir', 'Firi', 'Trum', 'Trump', 'Wray']


In [103]:
# and 3 sentences:
import time

start_time = time.time()

# One list of matching place names per sentence (sentences 19, 20 and 21),
# using a substring test of every unique name against the sentence text.
matches = [
    [place for place in geonames_ascii_unique if place in sent.text]
    for sent in sentences[19:22]
]

elapsed = time.time() - start_time
print('search of all places in 3 sentences took {:,.1f} seconds'.format(elapsed))

search of all places in 1 sentence took 136.4 seconds


In [104]:
matches

[['', 'A'],
 ['',
  'A',
  'Bo',
  'Bow',
  'Chri',
  'Christ',
  'Christophe',
  'Christopher',
  'Fi',
  'Fir',
  'Firi',
  'Trum',
  'Trump',
  'Wray'],
 ['', 'A', 'Ai', 'Ait', 'Aita', 'Go', 'Goe', 'Goel', 'Goelm']]

In [None]:
# results seem pretty crappy, Geonames likely too messy for this effort.

# to try once performance matters: https://github.com/WojciechMula/pyahocorasick

## Many articles

Test NER against 2009 News Crawl data provided at [kylebgorman/LING83600-mp00.md](https://gist.github.com/kylebgorman/3e28fc834962017c9ac01f7434485519)
