In [1]:
# Display the value of every top-level expression in a cell, not only the last one.
# (Scrolling in output areas is handled separately by the %%javascript cell that follows.)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%%javascript
// Replace the output area's scroll check with one that always answers "no",
// so long outputs are never placed in a scrolling box.
// NOTE(review): depends on the `IPython` JavaScript global being available in the
// notebook front end - confirm for the environment this notebook is run in.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

<h2>Discovering Disease Outbreaks from News Headlines - Part 2</h2>

In [3]:
# key libraries
import os
import re
import numpy as np
import pandas as pd

from unidecode import unidecode
from geonamescache import GeonamesCache
gc = GeonamesCache()

<h4>1. Load the pandas DataFrame (from Part 1) containing the headlines, countries, and cities</h4>

In [4]:
# Read the pipe-delimited table of headlines, cities and countries produced in Part 1
df = pd.read_csv("data/df_headlines_cities_countries.txt", sep="|")

# Preview the first rows to confirm the file loaded as expected
df.head()

Unnamed: 0,Headline,City,Country
0,Zika Outbreak Hits Miami,Miami,
1,Could Zika Reach New York City?,New York City,
2,First Case of Zika in Miami Beach,Miami Beach,
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,


In [5]:
# Brief check of the retrieved data: preview the first rows, then summarise the
# object-dtype columns (count / unique / top / freq) to see how complete each one is.
df.head()
df.describe(include=object)

Unnamed: 0,Headline,City,Country
0,Zika Outbreak Hits Miami,Miami,
1,Could Zika Reach New York City?,New York City,
2,First Case of Zika in Miami Beach,Miami Beach,
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,


Unnamed: 0,Headline,City,Country
count,611,611,15
unique,608,577,10
top,Spanish Flu Outbreak in Lisbon,Madrid,Malaysia
freq,2,4,3


The above description reiterates that every headline (of the remaining 611) mentions a city name, while the overwhelming majority are missing a country name. The implication is that the search for an associated latitude and longitude will have to be based on the city locations, not the country locations.

<h4>2. For each city/country, match the name to the latitude and longitude in geonamescache.</h4>

The geonamescache library provides a series of methods for retrieving a small set of geographic properties for continents, countries and cities of the world.  All of these methods return the associated info in the form of a Python 'dictionary'. In the case of the cities, there are two retrieval methods: 'get_cities' and 'get_cities_by_name'. In either case, the information provided includes: geonameid, name (accented name), latitude, longitude, countrycode (unique abbreviation), population, timezone, and admin1code (i.e. this is the administrative subdivision where the city is located - e.g. in US it could be the state). Since we have the city name, we'll use the latter method to retrieve the associated dictionary and subsequently the associated latitude and longitude.

<h5>Some issues with retrieving by city name</h5>

<i>Issue 1: In the geonamescache city name dictionary the keys are not decoded</i>

The first issue to be addressed is that the city names used as 'keys' in the geonamescache (gc) city dictionary are 'undecoded' and may or may not have accents, while the city names in the headlines dataframe have all been decoded so there are no accents. Take, for instance, a city like Sao Paulo which is one of the cities in the headlines. If you try to retrieve Sao Paulo's geographic info from the gc city dictionary, you won't find it because there the city name is São Paulo. On the other hand, with a city like Dallas there is no issue because the decoded and 'undecoded' names are the same. So, to overcome this problem, we need a way to determine for a given city name whether the decoded and undecoded versions are the same or not, and if not, what the accented ('undecoded') name is for a given decoded name.

The supporting literature suggests the way to do this is to:

<ol>
    <li>Create a list (labeled 'accented_names') of all the accented names in the gc city name dictionary</li>
    <li>For the list of accented_names, create a dictionary ('alternative_names') of key-value pairs ('unidecode(name):name') linking the decoded name to the name in the gc city names dictionary.</li>
    <li>Finally, before a request is made to retrieve info (e.g 'latitude') for a (decoded) city name in a headline, you would use the 'alternative_names' dictionary to see whether the decoded city name had an accented counterpart. If so, the counterpart would be used in making the request. If not, you'd simply use the decoded city name. </li>
</ol>

Note: An alternative way is to create a lookup dictionary of 'unidecode(name):name' pairs for all the cities in the gc city name database (not just the accented city names). This eliminates the need for steps 1 and 3 and is negligibly faster.

In [6]:
# City names in the gc city dictionary that change when passed through unidecode,
# i.e. the names that carry accents or other non-ASCII characters.
accented_names = [
    name
    for name in (entry['name'] for entry in gc.get_cities().values())
    if unidecode(name) != name
]

# Map each decoded (accent-free) spelling back to the accented spelling used by gc.
alternative_names = dict(zip(map(unidecode, accented_names), accented_names))

print('Retrieving the alternative names for small sample of cities in headlines\n')

for city_name in df.loc[0:10, 'City']:
    # Fall back to the headline spelling when no accented counterpart exists.
    city_name = alternative_names.get(city_name, city_name)
    print(city_name)


Retrieving the alternative names for small sample of cities in headlines

Miami
New York City
Miami Beach
Recife
Dallas
Trinidad
Houston
Genève
Atlanta
São Paulo
Brownsville


In [7]:
# Create a lookup dictionary pairing the decoded and undecoded city names
# for all the city names in the gc city name dictionary.
# Keys are the unidecode'd names, values are the names exactly as gc stores them.
# The city records are read straight from .values(), so the dictionary is fetched
# once rather than being re-queried by key on every iteration.
# When several gc cities decode to the same key, the one iterated last is kept.
gc_names = {unidecode(city['name']): city['name'] for city in gc.get_cities().values()}

print('Retrieving the gc names for small sample of cities in headlines\n')
for city_name in df.loc[0:10,'City']: #find the gc_names for all the decoded names in the headlines
    gc_name = gc_names[city_name]
    print(city_name, gc_name)
    

Retrieving the gc names for small sample of cities in headlines

Miami Miami
New York City New York City
Miami Beach Miami Beach
Recife Recife
Dallas Dallas
Trinidad Trinidad
Houston Houston
Geneve Genève
Atlanta Atlanta
Sao Paulo São Paulo
Brownsville Brownsville


Once we have the lookup name, we can use the 'get_cities_by_name' method to retrieve the associated entry in the city dictionary. The results are shown for the two lookup names.

In [8]:
# Look up two sample names; each call's return value is shown as cell output.
# 'São Paulo' is passed in its accented gc spelling; 'Dallas' has no accents to restore.
print('Properties for sample city accessed by city name:')
gc.get_cities_by_name('São Paulo')
gc.get_cities_by_name('Dallas')

Properties for sample city accessed by city name:


[{'3448439': {'geonameid': 3448439,
   'name': 'São Paulo',
   'latitude': -23.5475,
   'longitude': -46.63611,
   'countrycode': 'BR',
   'population': 10021295,
   'timezone': 'America/Sao_Paulo',
   'admin1code': '27'}}]

[{'4684888': {'geonameid': 4684888,
   'name': 'Dallas',
   'latitude': 32.78306,
   'longitude': -96.80667,
   'countrycode': 'US',
   'population': 1300092,
   'timezone': 'America/Chicago',
   'admin1code': 'TX'}},
 {'5722064': {'geonameid': 5722064,
   'name': 'Dallas',
   'latitude': 44.91928,
   'longitude': -123.31705,
   'countrycode': 'US',
   'population': 15277,
   'timezone': 'America/Los_Angeles',
   'admin1code': 'OR'}}]

<i>Issue 2: Some cities have multiple entries in the city name dictionary</i>

For São Paulo there is only one entry in the city_name dictionary. For Dallas there are two. The reason there are two is because the dictionary has info on two cities named Dallas. The question becomes: how do we decide which entry is the one that matches the 'Dallas' in the headlines? If the headline mentioned the country and/or the administrative subdivision (e.g. 'TX'), then we could more easily decide.  However, virtually none of the headlines provide this extra information. One potential solution is to select the largest among the possibilities. The theory being that the alternative with the largest population is much more likely to appear in a headline. The code below illustrates one way of using 'population' to select among the alternatives with the same names.

In [9]:
for city_name in ['Sao Paulo', 'Dallas']:
    print(f'Selected Dictionary Entry for {city_name} Based on Population Size')

    # 1. swap in the accented gc spelling when the decoded name has one
    city_name = alternative_names.get(city_name, city_name)

    # 2. each match is a single-entry {geonameid: properties} dict; unwrap the
    #    properties and keep the candidate with the largest population
    city = max(
        (list(match.values())[0] for match in gc.get_cities_by_name(city_name)),
        key=lambda properties: properties['population'],
    )

    # display dictionary entry
    print(city, '\n')

Selected Dictionary Entry for Sao Paulo Based on Population Size
{'geonameid': 3448439, 'name': 'São Paulo', 'latitude': -23.5475, 'longitude': -46.63611, 'countrycode': 'BR', 'population': 10021295, 'timezone': 'America/Sao_Paulo', 'admin1code': '27'} 

Selected Dictionary Entry for Dallas Based on Population Size
{'geonameid': 4684888, 'name': 'Dallas', 'latitude': 32.78306, 'longitude': -96.80667, 'countrycode': 'US', 'population': 1300092, 'timezone': 'America/Chicago', 'admin1code': 'TX'} 



<i>Retrieving the Latitude and Longitude along with the Country and Administrative Codes</i>

Once we have the appropriate dictionary for each city, it's a straightforward task to retrieve any of the geographic properties of interest and append them to the existing headline dataframe. In our case, we're interested in the latitude and longitude. Additionally, since few headlines have a matching country name, we might add the countrycode to our dataframe. Potentially, the countrycode could be used for other validation and mapping purposes.

In [10]:
# Geographic properties collected per headline, in the same row order as df.
latitudes, longitudes, countrycode, admin1code = [], [], [], []

for city_name in df.City.values:
    # gc is keyed by the accented spelling, so swap it in when one exists
    city_name = alternative_names.get(city_name, city_name)
    # several cities can share a name; keep the one with the largest population
    city = max(gc.get_cities_by_name(city_name),key=lambda x: list(x.values())[0]['population'])
    # each match is a single-entry {geonameid: properties} dict; unwrap the properties
    city = list(city.values())[0]
    latitudes.append(city['latitude'])
    longitudes.append(city['longitude'])
    countrycode.append(city['countrycode'])
    # admin1code was declared but never filled; .get() keeps the list aligned
    # with the others (None) if an entry lacks the field
    admin1code.append(city.get('admin1code'))

print('Sample Properties:\n')
print('latitudes', latitudes[:5])
print('longitudes', longitudes[:5])
print('countrycode', countrycode[:5])
    

Sample Properties:

latitudes [25.77427, 40.71427, 25.79065, -8.05389, 32.78306]
longitudes [-80.19366, -74.00597, -80.13005, -34.88111, -96.80667]
countrycode ['US', 'US', 'US', 'BR', 'US']


<h4>3. Add longitude and latitude coordinates to your DataFrame for each headline.</h4>

Given the lists for the additional properties of interest, we can simply 'assign' the new additions to the existing dataframe. Using the 'info' method we can see that with the exception of the 'Country' column, the resulting dataframe has entries for every row in every column.


In [11]:
# Attach the collected per-headline lists as new columns
# (the lists were built by iterating df.City, so they follow df's row order)
df = df.assign(Latitude=latitudes, Longitude=longitudes, CountryCode=countrycode)

# Preview the result, then check the non-null count of every column
df.head()
df.info()

Unnamed: 0,Headline,City,Country,Latitude,Longitude,CountryCode
0,Zika Outbreak Hits Miami,Miami,,25.77427,-80.19366,US
1,Could Zika Reach New York City?,New York City,,40.71427,-74.00597,US
2,First Case of Zika in Miami Beach,Miami Beach,,25.79065,-80.13005,US
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil,-8.05389,-34.88111,BR
4,Dallas man comes down with case of Zika,Dallas,,32.78306,-96.80667,US


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Headline     611 non-null    object 
 1   City         611 non-null    object 
 2   Country      15 non-null     object 
 3   Latitude     611 non-null    float64
 4   Longitude    611 non-null    float64
 5   CountryCode  611 non-null    object 
dtypes: float64(2), object(4)
memory usage: 28.8+ KB


<h4>Saving the dataframe for future analysis</h4>

In [12]:
# Save the enriched dataframe as a pipe-delimited file alongside the Part 1 input.
# The path is built with os.path.join rather than a hard-coded backslash: outside
# Windows, r'data\...' is not a directory separator and would create a file literally
# named "data\df_headlines_complete_info_p2.txt" in the working directory.
# index=False leaves the row index out of the file; missing values are written as 'NaN'.
df.to_csv(os.path.join('data', 'df_headlines_complete_info_p2.txt'), na_rep='NaN', index=False, sep='|')