In [1]:
import geonamescache
import pandas as pd
import numpy as np
import re
import sys
from unidecode import unidecode

In [2]:
# Load the GeoNames cache once; both lookup tables below are derived from it.
gc = geonamescache.GeonamesCache()
# Names are passed through unidecode so accented spellings are stored in a
# plain-ASCII form (e.g. "Sao Paulo") before being compared with headlines.
cities = [
    unidecode(city_record['name'])
    for city_record in gc.get_cities().values()
]
countries = [
    unidecode(country_name)
    for country_name in gc.get_countries_by_names()
]
# Filled later: row index -> {'headline', 'countries', 'cities'}.
headlines_dict = {}

In [3]:
def country_lookup(countries, line):
    """Return the set of country names that are mentioned in ``line``.

    Args:
        countries: Iterable of country-name strings to search for.
        line: Text to scan (for example a single headline).

    Returns:
        A ``set`` containing every name from ``countries`` that occurs in
        ``line`` as a whole word or phrase, or ``np.nan`` when none occur.
    """
    # re.escape keeps names containing regex metacharacters literal; the
    # lookarounds stop "Niger" from matching inside "Nigeria" while still
    # working for names that end in punctuation (where \b would fail).
    matches = {
        name for name in countries
        if name and re.search(rf'(?<!\w){re.escape(name)}(?!\w)', line)
    }
    return matches if matches else np.nan

In [4]:
def lookup(lookuptype, line):
    """Find a known place name among the capitalized words of ``line``.

    Only words that start with an uppercase letter followed by at least one
    more word character are considered; a bare "St" is expanded to "St." so
    that it can match names spelled with the abbreviation.

    Candidates are runs of consecutive capitalized words joined by single
    spaces. They are tried in this order, longest first within each step:

    1. runs starting at the first capitalized word,
    2. runs ending at the last capitalized word,
    3. interior runs, scanning from left to right.

    Matching is done on whole words, so a short name cannot match merely
    because it is the beginning or end of a longer word, and the longest
    matching name always wins (the result is deterministic).

    Args:
        lookuptype: Collection of name strings to match against.
        line: Text to scan (for example a single headline).

    Returns:
        The matching name as a string, or ``np.nan`` when nothing matches.
    """
    words = [
        "St." if word == "St" else word
        for word in re.findall(r'[A-Z]\w+', line)
    ]
    # A set gives constant-time membership tests and ignores duplicate names.
    known_names = set(lookuptype)
    total = len(words)

    # 1. Longest run anchored at the start of the capitalized words.
    for end in range(total, 0, -1):
        phrase = " ".join(words[:end])
        if phrase in known_names:
            return phrase

    # 2. Longest run anchored at the end (start == 0 was covered above).
    for start in range(1, total):
        phrase = " ".join(words[start:])
        if phrase in known_names:
            return phrase

    # 3. Interior runs: leftmost start first, longest run first.
    for start in range(1, total):
        for end in range(total - 1, start, -1):
            phrase = " ".join(words[start:end])
            if phrase in known_names:
                return phrase

    return np.nan

In [6]:
# Read every headline (one per line) into a list. The ``with`` block closes
# the file on exit, so no explicit close() is needed.
try:
    with open("data/headlines.txt") as f:
        headlines = f.readlines()
except OSError as e:
    print("Error: ", e)
    # Keep ``headlines`` defined so the cells below run (over zero headlines)
    # instead of failing with a NameError when the file cannot be read.
    headlines = []

In [7]:
# Build one record per headline, keyed by its position in the input list:
# the stripped headline text plus the country and city found in it.
for counter, raw_headline in enumerate(headlines):
    record = headlines_dict[counter] = {}
    line = raw_headline.strip()
    record['headline'] = line
    country = lookup(countries, line)
    record['countries'] = country
    city = lookup(cities, line)
    record['cities'] = city

In [8]:
# Collect (headline, country, city) rows in insertion order. The rows are
# materialized as a list rather than left as a one-shot zip iterator, so the
# cell that builds the DataFrame can be re-run without getting an empty table.
f = [
    (entry['headline'], entry['countries'], entry['cities'])
    for entry in headlines_dict.values()
]

In [9]:
# Tabulate the collected rows: one row per headline, three labelled columns.
my_df = pd.DataFrame(
    data=f,
    columns=["Headline", "Country", "City"],
)

In [10]:
from IPython.display import HTML, display

# Render the complete table as HTML in the notebook output area.
display(
    HTML(my_df.to_html())
)

Unnamed: 0,Headline,Country,City
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
5,Trinidad confirms first Zika case,,Trinidad
6,Zika Concerns are Spreading in Houston,,Houston
7,Geneve Scientists Battle to Find Cure,,Geneve
8,The CDC in Atlanta is Growing Worried,,Atlanta
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo
