In [None]:
import json
import geonamescache
import pandas as pd
import numpy as np
import re
import unidecode as unidecode
import matplotlib
%matplotlib inline


def get_headlines(source):
    """Lazily yield the lines of ``source`` transliterated to ASCII.

    Each line has newline characters stripped from both ends and is then
    passed through ``unidecode.unidecode``. The file stays open until the
    generator is exhausted or closed.
    """
    with open(source) as lines:
        yield from (unidecode.unidecode(raw.strip('\n')) for raw in lines)


def find_match(text, regex):
    """Return the first part of ``text`` matched by ``regex``.

    ``regex`` is handed to ``re.search``. The full matched text (group 0) is
    returned, or ``None`` when the pattern does not occur in ``text``.
    """
    match = re.search(regex, text)
    return None if match is None else match.group(0)


def get_city_name(headline, regex):
    """Return what ``find_match`` finds for ``regex`` in ``headline``.

    Thin wrapper kept so call sites read as "city" lookups; the result is
    the matched text, or ``None`` when there is no match.
    """
    city = find_match(text=headline, regex=regex)
    return city


def get_country_name(headline, regex):
    """Return what ``find_match`` finds for ``regex`` in ``headline``.

    Thin wrapper kept so call sites read as "country" lookups; the result is
    the matched text, or ``None`` when there is no match.
    """
    country = find_match(text=headline, regex=regex)
    return country


gc = geonamescache.GeonamesCache()
countries = [country["name"] for country in gc.get_countries().values()]

# Fetch the city records once and reuse them for both structures below.
_city_records = list(gc.get_cities().values())
cities = [city["name"] for city in _city_records]

# Lookup table: city name -> country code, coordinates and population.
# A name is not a unique key: when two records share a name, the one written
# last overwrites the other. Feeding the records in ascending population
# order makes the most populous city of each name the one that is kept,
# instead of whichever record happened to be iterated last.
country_code_lat_lang = {
    city['name']: {
        'countrycode': city.get('countrycode'),
        'lat': float(city['latitude']),
        'lon': float(city['longitude']),
        'population': int(city['population']),
    }
    for city in sorted(_city_records,
                       key=lambda record: int(record['population']))
}

# Map each ASCII-transliterated name back to the spelling it came from.
countries_accent_map = {
    unidecode.unidecode(country): country for country in countries
}

cities_accent_map = {
    unidecode.unidecode(city): city for city in cities
}

# Longest names first, so that when the names are joined into a regex
# alternation a longer name is tried before any shorter name it contains.
# Dict keys are already unique, so they are sorted directly: going through a
# set would make the order of equal-length names (and therefore the JSON
# written from these lists) vary from run to run.
unaccented_countries = sorted(countries_accent_map, key=len, reverse=True)
unaccented_cities = sorted(cities_accent_map, key=len, reverse=True)

def _names_to_regex(names):
    """Build one pattern that matches any of ``names`` as a whole word.

    Every name is escaped so characters such as ``.`` or ``(`` are matched
    literally, and the whole alternation is wrapped in look-arounds so that
    *every* alternative (including the first and the last) must not be
    preceded or followed by a word character. Look-arounds are used rather
    than ``\\b`` because ``\\b`` can never succeed right after a name that
    ends in punctuation followed by a space.

    The order of ``names`` is preserved; earlier names take precedence.
    Empty names are skipped because an empty alternative matches everywhere.
    """
    escaped = [re.escape(name) for name in names if name]
    if not escaped:
        # Nothing to look for: return a pattern that can never match.
        return r'(?!)'
    return r'(?<!\w)(?:' + '|'.join(escaped) + r')(?!\w)'


city_regex = _names_to_regex(unaccented_cities)
country_regex = _names_to_regex(unaccented_countries)

headlines = get_headlines('../data/headlines.txt')

# One record per headline holding the first country and first city match
# (each is None when the corresponding pattern does not occur).
headlines_cities_countries = [
    {
        'headline': headline,
        'countries': get_country_name(headline, country_regex),
        'cities': get_city_name(headline, city_regex),
    }
    for headline in headlines
]

# Persist the extraction result and the two name lists as JSON.
with open('../data/headlines_cities_countries.json', 'w') as fout:
    json.dump(headlines_cities_countries, fout)

with open('../data/unaccented_countries.json', 'w') as fout:
    json.dump(unaccented_countries, fout)

with open('../data/unaccented_cities.json', 'w') as fout:
    json.dump(unaccented_cities, fout)

# Load the extracted records and represent missing matches as NaN.
data = (
    pd.read_json('../data/headlines_cities_countries.json')
    .replace({None: np.nan})
)
# Bare expressions such as the next line only display a value when run as
# the last statement of a notebook cell; they have no other effect here.
data.head()
# Each row holds at most one country and one city, so use singular names.
data = data.rename(columns={'countries': 'country', 'cities': 'city'})
data.describe()

# See whether any headline occurs more than once: the most repeated
# headlines end up at the tail of the ascending counts.
print(data["headline"].value_counts().sort_values().tail())

print(f'There were {len(data)} total rows before dropping duplicates.')
# Drop rows that are identical in every column.
data = data.drop_duplicates()
# This count is taken after the drop, so the message must say "after".
print(f'There were {len(data)} total rows after dropping duplicates.')

data.info()
data.columns
# Exploratory plot: number of headlines per matched country.
country_counts = data['country'].value_counts()
country_counts

_ = country_counts.plot.bar(title='Count of Countries')
# Helpers to look up a city's country code, latitude, longitude and population
def _city_record(city):
    """Return the stored attributes for ``city``, or ``{}`` when unknown.

    ``country_code_lat_lang`` is keyed by the original (possibly accented)
    city name, whereas the names matched in the headlines are the
    unaccented forms. The name is therefore translated back through
    ``cities_accent_map`` first; names missing from that map (including
    NaN/None for headlines without a city) are looked up unchanged.
    """
    original_name = cities_accent_map.get(city, city)
    return country_code_lat_lang.get(original_name, {})


def get_country_code(city):
    """Return the country code recorded for ``city``, or ``None``."""
    return _city_record(city).get('countrycode')


def get_latitude(city):
    """Return the latitude recorded for ``city`` as a float, or ``None``."""
    return _city_record(city).get('lat')


def get_longitude(city):
    """Return the longitude recorded for ``city`` as a float, or ``None``."""
    return _city_record(city).get('lon')


def get_population(city):
    """Return the population recorded for ``city`` as an int, or ``None``."""
    return _city_record(city).get('population')

# Derive the geographic columns from the matched city of each row.
city_names = data['city']
data['countrycode'] = city_names.apply(get_country_code)
data['latitude'] = city_names.apply(get_latitude)
data['longitude'] = city_names.apply(get_longitude)
data['population'] = city_names.apply(get_population)

print(f"There are {data['country'].nunique()} different countries.")
print(f"There are {data['city'].nunique()} different cities.")

# 10 most common cities (ascending, so the most frequent is last)
top_cities = data["city"].value_counts().sort_values().tail(10)
top_cities
_ = top_cities.plot.bar(title="10 most common cities.")

# Histogram of how many times each city is mentioned.
_ = data['city'].value_counts().plot.hist(
    title="distribution of city occurrences."
)