In [2]:
import json
import geonamescache
import pandas as pd
import numpy as np
import re
import unidecode as unidecode


def get_headlines(source):
    with open(source) as handle:
        for line in handle:
            yield unidecode.unidecode(line.strip('\n'))


def find_match(text, regex):
    found = re.search(regex, text)
    if found:
        return found.group(0)
    return None


def get_city_name(headline, regex):
    return find_match(headline, regex)


def get_country_name(headline, regex):
    return find_match(headline, regex)


gc = geonamescache.GeonamesCache()
countries = [country["name"] for country in gc.get_countries().values()]
cities = [city["name"] for city in gc.get_cities().values()]
country_code_lat_lang = {
    city['name']: {
        'countrycode': city['countrycode'],
        'lat': city['latitude'],
        'lon': city['longitude']
    } for city in gc.get_cities().values()
}

countries_accent_map = {
    unidecode.unidecode(country): country for country in countries
}

cities_accent_map = {
    unidecode.unidecode(city): city for city in cities
}

unaccented_countries = set(countries_accent_map.keys())
unaccented_cities = list(cities_accent_map.keys())

unaccented_countries = sorted(unaccented_countries, key=lambda x: len(x), reverse=True)
unaccented_cities = sorted(unaccented_cities, key=lambda x: len(x), reverse=True)

city_regex = r'\b|\b'.join(unaccented_cities)
country_regex = r'\b|\b'.join(unaccented_countries)

headlines = get_headlines('../data/headlines.txt')
headlines_cities_countries = [
    dict(headline=headline,
         countries=get_country_name(headline, country_regex),
         cities=get_city_name(headline, city_regex)) for headline in headlines]

with open('../data/headlines_cities_countries.json', 'w') as fout:
    fout.write(json.dumps(headlines_cities_countries))

with open('../data/unaccented_countries.json', 'w') as fout:
    fout.write(json.dumps(unaccented_countries))

with open('../data/unaccented_cities.json', 'w') as fout:
    fout.write(json.dumps(unaccented_cities))

data = pd.read_json('../data/headlines_cities_countries.json')
data = data.replace({None: np.nan})
data.head()

# add city countrycode, latitude and longitude
def get_country_code(cities):
    return country_code_lat_lang.get(cities, {}).get('countrycode')

def get_latitude(cities):
    return country_code_lat_lang.get(cities, {}).get('lat')

def get_longitude(cities):
    return country_code_lat_lang.get(cities, {}).get('lon')

data['countrycode'] = data.apply(lambda row: get_country_code(
    row['cities']
), axis=1)
data['latitude'] = data.apply(lambda row: get_latitude(
    row['cities']
), axis=1)

data['longitude'] = data.apply(lambda row: get_longitude(
    row['cities']
), axis=1)

ValueError: Wrong number of items passed 0, placement implies 1