# Adding Latitude and Longitude Coordinates

## Objective
Find the geographic location of each headline in latitude and longitude coordinates from the city/country names.

## Workflow


1. Load in the pandas DataFrame with headline, countries, and cities.
    * If a headline contains multiple cities/countries, decide which single one to keep.
2. For each city/country, match the name to the latitude and longitude in geonamescache.
    * You can use the function `gc.get_cities_by_name("city_name")`.
    * With the previous function, some city names will return multiple matches in different countries. You'll have to decide which city to keep based on a heuristic (rule of thumb).
    * If you have trouble, work with a single problematic city until you figure it out, then write a function to apply on all headlines.
3. Add longitude and latitude coordinates to your DataFrame for each headline.
    * It will be helpful to get the countrycode of each headline at this point.
    * If you were not able to find many countries, think about dropping the column. You also need to decide what to do with headlines that have no coordinates.
    * You should end up with over 600 headlines that have geographic coordinates.

In [1]:
import pandas as pd
import numpy as np
import geonamescache

# Shared GeonamesCache instance used for the city lookups in this notebook.
# NOTE: `gc` is also the name of Python's standard-library garbage-collector
# module; this binding would shadow it if that module were ever imported here.
gc = geonamescache.GeonamesCache()

In [2]:
# Setup helpers
from geonamescache.mappers import country
# Mapper that translates a country name ('name' key) into its ISO code ('iso' key).
isomapper = country(from_key='name', to_key='iso')

In [3]:
# Read previous file: one row per headline, with the country and city names
# that were extracted for it.
df = pd.read_json('geo-headlines.json')
# Preview the first rows.
df[:25]

Unnamed: 0,headline,country,city
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
5,Trinidad confirms first Zika case,,Trinidad
6,Zika Concerns are Spreading in Houston,,Houston
7,Geneve Scientists Battle to Find Cure,,Geneve
8,The CDC in Atlanta is Growing Worried,,Atlanta
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo


In [4]:
def lookup_city_info(city):
    """Collect the geonamescache entries whose name equals ``city``.

    Every match is reduced to a ``(countrycode, latitude, longitude,
    population)`` tuple. The tuples are returned as a list ordered from the
    most populous match to the least populous one.
    """
    # Each hit is a mapping; its first value is the city record itself.
    records = (list(hit.values())[0] for hit in gc.get_cities_by_name(city))
    matches = [
        (rec['countrycode'], rec['latitude'], rec['longitude'], rec['population'])
        for rec in records
    ]
    # Largest population first.
    matches.sort(key=lambda item: item[3], reverse=True)
    return matches

# Example: a city name that exists in several countries yields multiple candidates.
lookup_city_info('San Antonio')

[('US', 29.42412, -98.49363, 1469845),
 ('CL', -33.59473, -71.60746, 85651),
 ('PY', -25.42126, -57.54725, 55754),
 ('PH', 14.94659, 120.08673, 34217),
 ('PH', 15.3062, 120.856, 26247)]

In [5]:

def determine_city_coords(citytuples, countrycode=None):
    """Select the best city tuple from a list of candidates.

    Args:
        citytuples: Candidate tuples whose first element is a country code.
            When no candidate matches ``countrycode`` the first entry is
            returned, so the list should be in preference order.
        countrycode: Optional country code used to disambiguate candidates.

    Returns:
        The first candidate whose country code equals ``countrycode``;
        otherwise the first candidate; or an empty list when ``citytuples``
        is empty.
    """
    # Stop at the first country match instead of building a list of all of them.
    for candidate in citytuples:
        if candidate[0] == countrycode:
            return candidate
    return citytuples[0] if len(citytuples) > 0 else []

def lookup_geo_coords(row, debug=False):
    """Resolve one headline row to a city tuple.

    Reads ``row.country`` and ``row.city`` (and ``row.headline`` when
    ``debug`` is set). If a country is present it is translated with
    ``isomapper`` and used to pick among the cities found for ``row.city``.

    Returns the tuple chosen by ``determine_city_coords``, or
    ``(None, None, None, None)`` when no city was found.
    """
    if debug:
        print(f'\nLooking up {row.headline}')

    iso_code = None
    if row.country:
        # An extracted country helps choose between same-named cities.
        iso_code = isomapper(row.country)
        if debug:
            print(f'Found country: {iso_code}')

    candidates = lookup_city_info(row.city)
    if debug:
        print(f'Found cities: {candidates}')

    # Guard clause: nothing matched this city name.
    if not len(candidates) > 0:
        return (None, None, None, None)

    chosen = determine_city_coords(candidates, iso_code)
    if debug:
        print(f'Selected city info: {chosen}')
    return chosen
    
# Resolve every headline; axis=1 passes each row to lookup_geo_coords.
coords = df.apply(lookup_geo_coords, axis=1)
# Preview the first ten results.
coords[:10]

0     (US, 25.77427, -80.19366, 441003)
1    (US, 40.71427, -74.00597, 8175133)
2      (US, 25.79065, -80.13005, 92312)
3    (BR, -8.05389, -34.88111, 1478098)
4    (US, 32.78306, -96.80667, 1300092)
5         (BO, -14.83333, -64.9, 84259)
6    (US, 29.76328, -95.36327, 2296224)
7              (None, None, None, None)
8       (US, 33.749, -84.38798, 463878)
9              (None, None, None, None)
dtype: object

# Clean up results

In [6]:
# Add geo info to data frame. Each entry of `coords` is a
# (countrycode, latitude, longitude, population) tuple; population is not stored.
df['countrycode'] = [cc for (cc, _, _, _) in coords]
df['latitude'] = [lat for (_, lat, _, _) in coords]
df['longitude'] = [lng for (_, _, lng, _) in coords]

# Drop headlines without coordinates. dropna() returns a new frame instead of
# modifying df in place, so its result has to be assigned back.
df = df.dropna(subset=['latitude', 'longitude'])
df[:10]




Unnamed: 0,headline,country,city,countrycode,latitude,longitude
0,Zika Outbreak Hits Miami,,Miami,US,25.77427,-80.19366
1,Could Zika Reach New York City?,,New York City,US,40.71427,-74.00597
2,First Case of Zika in Miami Beach,,Miami Beach,US,25.79065,-80.13005
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife,BR,-8.05389,-34.88111
4,Dallas man comes down with case of Zika,,Dallas,US,32.78306,-96.80667
5,Trinidad confirms first Zika case,,Trinidad,BO,-14.83333,-64.9
6,Zika Concerns are Spreading in Houston,,Houston,US,29.76328,-95.36327
8,The CDC in Atlanta is Growing Worried,,Atlanta,US,33.749,-84.38798
10,Brownsville teen contracts Zika virus,,Brownsville,US,25.90175,-97.49748
11,Mosquito control efforts in St. Louis take new...,,St. Louis,US,38.62727,-90.19789
