In [132]:
# !pip install geonamescache

In [12]:
import pandas as pd
import numpy as np
import geonamescache
import re
from unidecode import unidecode
%matplotlib inline

In [13]:
# get and view dataset; convert to pandas dataframe
data = pd.read_csv("data/headlines.txt", sep="\n", names=['headline', 'countries', 'cities'])
data

Unnamed: 0,headline,countries,cities
0,Zika Outbreak Hits Miami,,
1,Could Zika Reach New York City?,,
2,First Case of Zika in Miami Beach,,
3,"Mystery Virus Spreads in Recife, Brazil",,
4,Dallas man comes down with case of Zika,,
5,Trinidad confirms first Zika case,,
6,Zika Concerns are Spreading in Houston,,
7,Geneve Scientists Battle to Find Cure,,
8,The CDC in Atlanta is Growing Worried,,
9,Zika Infested Monkeys in Sao Paulo,,


In [14]:
# get names oc countries and cities to match against; convert to dataframe
gc = geonamescache.GeonamesCache()
countries = gc.get_countries()

countries = pd.DataFrame.from_dict(countries, orient='index')

cities = gc.get_cities()
cities = pd.DataFrame.from_dict(cities, orient='index')
cities.head()

Unnamed: 0,geonameid,name,latitude,longitude,countrycode,population,timezone,admin1code
1000501,1000501,Grahamstown,-33.30422,26.53276,ZA,91548,Africa/Johannesburg,5
1000543,1000543,Graaff-Reinet,-32.25215,24.53075,ZA,62896,Africa/Johannesburg,5
100077,100077,Abū Ghurayb,33.30563,44.18477,IQ,900000,Asia/Baghdad,7
1001860,1001860,Giyani,-23.30246,30.71868,ZA,37024,Africa/Johannesburg,9
1002851,1002851,Ga-Rankuwa,-25.61692,27.99471,ZA,68767,Africa/Johannesburg,10


In [15]:
# 1. clean up country names
country_names = unidecode(countries['name'].str.cat(sep = "|")
                          .replace(" ", "\s")
                          .replace("-", "\-"))

# 2. build a regex with all country names
regexp_country = f"(?i)({country_names})"

# 3. extract matches using the above regex and populate countries column in dataframe
data['countries'] = data['headline'].astype('str').str.extract(regexp_country, expand=True)

# repeat above 3 steps for cities column
city_names = unidecode(cities['name'].str.cat(sep = "|")
                       .replace(" ", "\s")
                       .replace("-", "\-")
                      .replace("`", "")
                      .replace("|", r"\b|\b"))
regexp_city = f"({city_names})"
data['cities'] = data['headline'].astype('str').str.extract(regexp_city, expand=True)
data.describe()

Unnamed: 0,headline,countries,cities
count,650,19,608
unique,648,12,566
top,Spanish Flu Outbreak in Lisbon,Brazil,Miami
freq,2,3,5


In [16]:
# view final result
# data
# import numpy as np
# # np.random.seed(100)
# test_headlines = np.random.choice(data['headline'], 20)
# for test_headline in test_headlines:
#     print(test_headline)
#     print(re.search(regexp_city, test_headline))
#     print(re.search(regexp_country, test_headline))

In [17]:
data.drop('countries', axis=1, inplace=True)
data.dropna(axis=0, how ='any', inplace=True)
data.drop_duplicates(subset='headline', inplace=True)
data.describe()


Unnamed: 0,headline,cities
count,606,606
unique,606,566
top,Schools in Bentonville Closed Due to Hepatitis...,Miami
freq,1,5


In [102]:
latitude = []
longitude = []
countrycode = []

for city in data['cities'].values:
    city_alternatives = gc.get_cities_by_name(unidecode(city))   
    try:
        best_alternative = max(city_alternatives, key=lambda x: list(x.values())[0]['population'])
    except:
        best_alternative = {'No result': {'geonameid': None, 'name': None, 'latitude': None, 'longitude': None, 'countrycode': None, 'population': None, 'timezone': None, 'admin1code': None}}

    best_city = list(best_alternative.values())[0]
#     print(best_alternative)
    
    latitude.append(best_city['latitude'])
    longitude.append(best_city['longitude'])
    countrycode.append(best_city['countrycode'])
data['latitude'] = latitude
data['longitude'] = longitude
data['countrycode'] = countrycode

# # print(data['latitude'])
data.head(600)


 



Unnamed: 0,headline,cities,latitude,longitude,countrycode
0,Zika Outbreak Hits Miami,Miami,25.77427,-80.19366,US
1,Could Zika Reach New York City?,New York City,40.71427,-74.00597,US
2,First Case of Zika in Miami Beach,Miami,25.77427,-80.19366,US
3,"Mystery Virus Spreads in Recife, Brazil",Recife,-8.05389,-34.88111,BR
4,Dallas man comes down with case of Zika,Dallas,32.78306,-96.80667,US
5,Trinidad confirms first Zika case,Trinidad,-14.83333,-64.90000,BO
6,Zika Concerns are Spreading in Houston,Houston,29.76328,-95.36327,US
7,Geneve Scientists Battle to Find Cure,Geneve,,,
8,The CDC in Atlanta is Growing Worried,Atlanta,33.74900,-84.38798,US
9,Zika Infested Monkeys in Sao Paulo,Sao Paulo,,,
