In [55]:
import re

import pandas as pd
from unidecode import unidecode
import geonamescache


def get_longest_match(matches):
    """Return the longest string in ``matches``.

    When several entries share the maximum length, the one that appears
    first in ``matches`` is returned. An empty ``matches`` raises
    ``ValueError``.
    """
    return max(matches, key=len)


gc = geonamescache.GeonamesCache()
countries = gc.get_countries()
cities = gc.get_cities().values()

# Index the cities by their ASCII-folded, lower-cased name so that a name
# matched in an (equally ASCII-folded) headline can be mapped back to the
# geonamescache record, whose own 'name' field keeps the original spelling.
for c in cities:
    c['unidecoded_name'] = unidecode(c['name']).lower()
cities_by_unidecoded_name = {c['unidecoded_name']: c for c in cities}

# Alternation of every known (ASCII-folded) city name.
# - Each name is passed through re.escape so that characters such as '(',
#   ')' or '.' inside a name are matched literally instead of being read as
#   regex syntax (which would also add stray capture groups).
# - Names are de-duplicated and ordered longest first: regex alternation
#   takes the first alternative that matches at a position, so a short name
#   must not be listed before a longer name that starts with it.
# - Empty names are dropped; an empty alternative would match everywhere.
city_names = '|'.join(
    re.escape(name)
    for name in sorted(
        {unidecode(c['name']) for c in cities} - {''},
        key=lambda name: (-len(name), name),
    )
)

# regex to find a city name in text
city_list = f'({city_names})'
# Lookarounds behave like \b for names that start and end with a word
# character, and still work for names that end in a non-word character
# (e.g. a closing parenthesis), where \b could never match.
re_city = re.compile(rf'(?<!\w){city_list}(?!\w)', flags=re.IGNORECASE)

# data structure: three parallel columns, one entry per headline
d = {'headline': [], 'city': [], 'country': []}

# analyzing headlines
with open('data/headlines.txt', encoding='utf-8') as data:
    for hl in data:
        # work on a unidecoded string
        hl = unidecode(hl.strip())

        # Take the full text of every match. finditer()/group(0) is used
        # instead of indexing findall() results: with a single capture group
        # findall() returns plain strings, so m[0] would only be the first
        # character of each match.
        matches = [m.group(0) for m in re_city.finditer(hl)]
        if matches:
            # get the longest match...heuristic...
            match = get_longest_match(matches)
            # fetch the entire city and country data from geonamescache
            city_unicode_name = cities_by_unidecoded_name[match.lower()]['name']
            # NOTE(review): several cities can share a name; the first record
            # returned by geonamescache is used, as before.
            city_dict = list(gc.get_cities_by_name(city_unicode_name)[0].values())[0]
            country = countries[city_dict['countrycode']]

            # add data
            d['headline'].append(hl)
            d['city'].append(city_dict)
            d['country'].append(country)

        else:
            print('No city found in', hl)
            # we still add the headline to our structure for deeper analysis
            d['headline'].append(hl)
            d['city'].append(None)
            d['country'].append(None)

no_city_num = sum(c is None for c in d['city'])
print('Could not find a city mention in', no_city_num, 'headlines')
# put everything into a DataFrame
df = pd.DataFrame(data=d)
df.to_pickle('./data/task_1.pkl')

No city found in Louisiana Zika cases up to 26
No city found in Zika infects pregnant woman in Cebu
No city found in Spanish Flu Sighted in Antigua
No city found in Zika case reported in Oton
No city found in Hillsborough uses innovative trap against Zika 20 minutes ago
No city found in Maka City Experiences Influenza Outbreak
No city found in West Nile Virus Outbreak in Saint Johns
No city found in Malaria Exposure in Sussex
No city found in Greenwich Establishes Zika Task Force
No city found in Will West Nile Virus vaccine help Parsons?
No city found in Zika case reported in Los Fresnos
No city found in More people in Boucau are infected with HIV every year
No city found in Bronchitis Outbreak in Manhasset
No city found in Rumors about Influenza Spreading in Dobbs Ferry have been Refuted
No city found in More people in Huron are infected with Dengue every year
No city found in Will Tuberculosis vaccine help Cherry Creek?
No city found in Gympie Patient in Critical Condition after Con