In [2]:
import numpy as np
import pandas as pd

# Load the headline table and turn every None entry into NaN in one chained
# expression, so missing values can later be handled with pandas' NaN tooling.
data = (
    pd.read_json("./data/headline_cities_and_countries.json")
    .replace({None: np.nan})
)

# Preview the first ten rows.
data.head(n=10)

Unnamed: 0,headline,countries,cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
5,Trinidad confirms first Zika case,,Trinidad
6,Zika Concerns are Spreading in Houston,,Houston
7,Geneve Scientists Battle to Find Cure,,Geneve
8,The CDC in Atlanta is Growing Worried,,Atlanta
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo


In [3]:
# Spot-check the city stored in the row labelled 0.
data.loc[0, 'cities']

'Miami'

In [4]:
import json

# Load the accent lookup tables. The city table maps a plain spelling to its
# accented form (e.g. "Asmar" -> "Āsmār"); the country table presumably does
# the same for country names -- TODO confirm.
#
# The files hold non-ASCII text, so decode them explicitly as UTF-8 instead of
# relying on the platform's default encoding, and let json parse the file
# object directly.
with open("./data/city_accent_mapping.json", "r", encoding="utf-8") as fin:
    city_accent_mapping = json.load(fin)

with open("./data/country_accent_mapping.json", "r", encoding="utf-8") as fin:
    country_accent_mapping = json.load(fin)

In [5]:
# Sanity check: look up the accented spelling recorded for "Asmar"
# (None would be shown if the key were absent).
city_accent_mapping.get("Asmar", None)

'Āsmār'

In [6]:
import geonamescache

# Shared geonamescache handle; the cells below query it by city name through
# `gc.get_cities_by_name(...)`.
# NOTE(review): `gc` is also the name of a standard-library module, so this
# binding would shadow it if that module were ever imported in this notebook.
gc = geonamescache.GeonamesCache()

In [7]:
def pop(p):
    """Return the 'population' of the first record held in the mapping ``p``.

    ``p`` is expected to look like ``{some_id: {'population': ..., ...}}``;
    only the first value of the mapping is inspected. Used as a sort key for
    city matches.
    """
    records = list(p.values())
    first_record = records[0]
    return first_record['population']


In [8]:
# Demo lookup: fetch every city whose name equals the accented spelling of
# "Asmar" and order the matches from most to least populous. A new list is
# built with sorted() so the list returned by the lookup is left untouched.
tmp = sorted(
    gc.get_cities_by_name(city_accent_mapping.get("Asmar")),
    key=pop,
    reverse=True,
)

In [9]:
# Show the record of the most populous match (first value of its mapping).
[*tmp[0].values()][0]

{'geonameid': 1148205,
 'name': 'Āsmār',
 'latitude': 35.03333,
 'longitude': 71.35809,
 'countrycode': 'AF',
 'population': 15708,
 'timezone': 'Asia/Kabul',
 'admin1code': '34'}

In [10]:
def _most_populous_match(city):
    """Return the record of the most populous city named ``city``, or None.

    None is returned when ``city`` is not a string (e.g. NaN for a headline
    without a city) or when the lookup yields no match, so callers can map
    both cases to NaN instead of failing on an empty result list.
    """
    if not isinstance(city, str):
        return None
    # Prefer the accented spelling; fall back to the raw name when the
    # accent mapping has no entry for it.
    accented_name = city_accent_mapping.get(city, city)
    matches = gc.get_cities_by_name(accented_name)
    if not matches:
        return None
    # max() yields the first record with the highest population, i.e. the
    # same element a stable descending sort would place first.
    best = max(matches, key=pop)
    return list(best.values())[0]


def lat(city):
    """Latitude of the most populous city named ``city``.

    Returns ``np.nan`` when ``city`` is not a string or no city matches.
    """
    match = _most_populous_match(city)
    return np.nan if match is None else match['latitude']


def lon(city):
    """Longitude of the most populous city named ``city``.

    Returns ``np.nan`` when ``city`` is not a string or no city matches.
    """
    match = _most_populous_match(city)
    return np.nan if match is None else match['longitude']


def co_code(city):
    """Country code of the most populous city named ``city``.

    Returns ``np.nan`` when ``city`` is not a string or no city matches.
    """
    match = _most_populous_match(city)
    return np.nan if match is None else match['countrycode']

In [11]:
# Geocode each headline's city: one new column per looked-up attribute.
data = data.assign(
    latitude=list(map(lat, data['cities'])),
    longitude=list(map(lon, data['cities'])),
    countrycode=list(map(co_code, data['cities'])),
)
data.head(n=30)

Unnamed: 0,headline,countries,cities,latitude,longitude,countrycode
0,Zika Outbreak Hits Miami,,Miami,25.77427,-80.19366,US
1,Could Zika Reach New York City?,,New York City,40.71427,-74.00597,US
2,First Case of Zika in Miami Beach,,Miami Beach,25.79065,-80.13005,US
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife,-8.05389,-34.88111,BR
4,Dallas man comes down with case of Zika,,Dallas,32.78306,-96.80667,US
5,Trinidad confirms first Zika case,,Trinidad,-14.83333,-64.9,BO
6,Zika Concerns are Spreading in Houston,,Houston,29.76328,-95.36327,US
7,Geneve Scientists Battle to Find Cure,,Geneve,46.20222,6.14569,CH
8,The CDC in Atlanta is Growing Worried,,Atlanta,33.749,-84.38798,US
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo,-23.5475,-46.63611,BR


In [12]:
# The 'countries' column is no longer needed; discard it.
data = data.drop(columns=['countries'])

In [15]:
# Keep only the rows in which every remaining column has a value.
data = data.dropna(axis='index', how='any')
data.head(n=30)

Unnamed: 0,headline,cities,latitude,longitude,countrycode
0,Zika Outbreak Hits Miami,Miami,25.77427,-80.19366,US
1,Could Zika Reach New York City?,New York City,40.71427,-74.00597,US
2,First Case of Zika in Miami Beach,Miami Beach,25.79065,-80.13005,US
3,"Mystery Virus Spreads in Recife, Brazil",Recife,-8.05389,-34.88111,BR
4,Dallas man comes down with case of Zika,Dallas,32.78306,-96.80667,US
5,Trinidad confirms first Zika case,Trinidad,-14.83333,-64.9,BO
6,Zika Concerns are Spreading in Houston,Houston,29.76328,-95.36327,US
7,Geneve Scientists Battle to Find Cure,Geneve,46.20222,6.14569,CH
8,The CDC in Atlanta is Growing Worried,Atlanta,33.749,-84.38798,US
9,Zika Infested Monkeys in Sao Paulo,Sao Paulo,-23.5475,-46.63611,BR


In [16]:
# Persist the geocoded headlines (row index included) as CSV.
data.to_csv(path_or_buf="./data/headlines-cities-lat-lon.csv")