Geocode the *place* field for each brewery to lat-long with the Google Maps geocoding API, falling back to the OpenStreetMap Nominatim API for any places Google cannot resolve.

In [1]:
import requests, time, pandas as pd, numpy as np

In [2]:
# set the pause duration between api requests, in seconds
# (this value is handed to time.sleep() before every geocoding request)
pause = 0.25

In [3]:
# read the untappd details csv into a dataframe, then preview the name and
# place columns for the first 10 rows
preview_columns = ['brewery_name', 'brewery_place']
df = pd.read_csv('data/untappd_details.csv', encoding='utf-8')
df[preview_columns].head(10)

Unnamed: 0,brewery_name,brewery_place
0,Angel City Brewery,"Los Angeles, CA United States"
1,Cascade Brewery Co. (Australia),"South Hobart, Tas. Australia"
2,Naked For Satan,Australia
3,Venom Brewing,Australia
4,Matilda Bay Brewing Company,"Port Melbourne, Vic. Australia"
5,Kosciuszko Brewing Company (Lion Nathan),"Jindabyne, NSW Australia"
6,Little Creatures Brewing,"Fremantle, W.A. Australia"
7,Great Northern Brewing Co. (CUB),"Abbotsfort, Vic. Australia"
8,Batch Brewing Company,"Marrickville, New South Wales Australia"
9,McLaren Vale Orchards,Australia


In [4]:
# how many total brewery places are there, and how many unique places are there?
# print is called with a single parenthesized argument so the cell runs
# unchanged on both Python 2 and Python 3 kernels
print(len(df['brewery_place']))
print(len(df['brewery_place'].unique()))

1430
270


In [5]:
# strip any trailing qualifier from each place, like myanmar and china have:
# keep only the text before the first ' (' and then before the first ' /'
df['brewery_place'] = df['brewery_place'].map(
    lambda place: place.split(' (')[0].split(' /')[0]
)

In [6]:
# build a sorted series of the distinct places, leaving out the 'Other' placeholder
brewery_places = pd.Series(df['brewery_place'].unique())
brewery_places = brewery_places[brewery_places != 'Other'].sort_values()

In [7]:
# function that accepts an address string, sends it to the Google API, and returns the lat-long API result
def geocode_google(address):
    """
    Geocode an address string with the Google Maps geocoding API.

    Sleeps for `pause` seconds before sending the request.

    Parameters
    ----------
    address : string, the place name or address to look up

    Returns
    -------
    string 'lat,lng' taken from the first result, or None if the response
    contains no results
    """
    time.sleep(pause) #pause for some duration before each request, to not hammer their server
    url = u'http://maps.googleapis.com/maps/api/geocode/json'

    # hand the query to requests as params so the address is percent-encoded;
    # formatting it straight into the url breaks on characters like & # or +
    params = {'address': address, 'sensor': 'false'}
    response = requests.get(url, params=params)
    data = response.json()

    # .get() tolerates a response body that has no 'results' key at all
    results = data.get('results')
    if results: #if google was able to geolocate our address, extract lat-long from result
        location = results[0]['geometry']['location']
        return '{},{}'.format(location['lat'], location['lng']) #return lat-long as a string in the format google likes
    return None

In [8]:
def geocode_nominatim(address):
    """
    Geocode an address string with the OpenStreetMap Nominatim search API.

    Sleeps for `pause` seconds before sending the request.

    Parameters
    ----------
    address : string, the place name or address to look up

    Returns
    -------
    string 'lat,lon' taken from the first match, or None if the response
    contains no matches
    """
    # NOTE(review): Nominatim's public usage policy asks for an identifying
    # User-Agent and a low request rate - confirm `pause` and the default
    # requests headers are acceptable before running this at volume
    time.sleep(pause)
    url = u'https://nominatim.openstreetmap.org/search'

    # hand the query to requests as params so the address is percent-encoded;
    # formatting it straight into the url breaks on characters like & # or +
    params = {'format': 'json', 'q': address}
    response = requests.get(url, params=params)
    data = response.json()

    if data: #an empty response means nominatim found no match
        return '{},{}'.format(data[0]['lat'], data[0]['lon'])
    return None

In [9]:
# geocode all the unique brewery places using the google maps api
# the result keeps the same index labels as brewery_places; geocode_google
# returns None for any place it could not resolve
# each call sleeps for `pause` seconds first, so this cell takes a while to run
brewery_latlngs = brewery_places.map(geocode_google)

In [10]:
# how many places failed to geocode?
# collect the index labels whose lat-long is null, then pull those places out
failed_labels = brewery_latlngs[pd.isnull(brewery_latlngs)].index
brewery_places_failed = brewery_places[failed_labels]

# print is called with a single parenthesized argument so the cell runs
# unchanged on both Python 2 and Python 3 kernels
print('after google, {} places lack lat-long'.format(len(brewery_places_failed)))

after google, 2 places lack lat-long


In [11]:
# re-try any that failed to geocode, but this time use the nominatim api
brewery_latlngs_nominatim = brewery_places_failed.map(geocode_nominatim)

# whatever is still null after nominatim is looked up by index label in the
# full series of places, so the labels keep matching brewery_latlngs
still_failed_labels = brewery_latlngs_nominatim[pd.isnull(brewery_latlngs_nominatim)].index
brewery_places_failed = brewery_places[still_failed_labels]

# print is called with a single parenthesized argument so the cell runs
# unchanged on both Python 2 and Python 3 kernels
print('after nominatim, {} places lack lat-long'.format(len(brewery_places_failed)))

after nominatim, 1 places lack lat-long


In [12]:
# copy every nominatim result into brewery_latlngs, matching rows by index label
nominatim_pairs = zip(brewery_latlngs_nominatim.index, brewery_latlngs_nominatim)
for label, latlng in nominatim_pairs:
    brewery_latlngs[label] = latlng

In [13]:
# build a lookup dict of place name -> lat-long, pairing the two series by index label
place_latlng = {
    brewery_places[label]: brewery_latlngs[label]
    for label in brewery_places.index
}

In [14]:
def get_latlng(brewery_place):
    """
    Look up the lat-long string for a brewery place.

    Parameters
    ----------
    brewery_place : the place name to look up in place_latlng

    Returns
    -------
    the value stored in place_latlng for that place, or None if the place
    is not a key in the dict
    """
    # dict.get returns None for a missing key, so no bare except is needed
    # (a bare except would also hide unrelated errors and KeyboardInterrupt)
    return place_latlng.get(brewery_place)

df['brewery_latlng'] = df['brewery_place'].map(get_latlng)

In [15]:
# break the 'lat,lon' string into one column per part (nan where there is no
# lat-long), then discard the combined column
for position, column in enumerate(['brewery_lat', 'brewery_lon']):
    df[column] = df['brewery_latlng'].map(
        lambda latlng, i=position: latlng.split(',')[i] if pd.notnull(latlng) else np.nan
    )
df = df.drop('brewery_latlng', axis=1)

In [16]:
# look at the first 10 breweries and their lat-longs
# (brewery_lat and brewery_lon hold the string halves of the 'lat,lon' value, or nan)
df[['brewery_name', 'brewery_place', 'brewery_lat', 'brewery_lon']].head(10)

Unnamed: 0,brewery_name,brewery_place,brewery_lat,brewery_lon
0,Angel City Brewery,"Los Angeles, CA United States",34.0522342,-118.2436849
1,Cascade Brewery Co. (Australia),"South Hobart, Tas. Australia",-42.8945228,147.3094914
2,Naked For Satan,Australia,-25.274398,133.775136
3,Venom Brewing,Australia,-25.274398,133.775136
4,Matilda Bay Brewing Company,"Port Melbourne, Vic. Australia",-37.836926,144.94455
5,Kosciuszko Brewing Company (Lion Nathan),"Jindabyne, NSW Australia",-36.41479,148.6110247
6,Little Creatures Brewing,"Fremantle, W.A. Australia",-32.0560399,115.7471797
7,Great Northern Brewing Co. (CUB),"Abbotsfort, Vic. Australia",-37.8002232,144.9961625
8,Batch Brewing Company,"Marrickville, New South Wales Australia",-33.91063,151.15646
9,McLaren Vale Orchards,Australia,-25.274398,133.775136


In [17]:
# save to csv
# write the geocoded dataframe to a new utf-8 file, leaving out the index column
df.to_csv('data/untappd_details_geocoded.csv', index=False, encoding='utf-8')