In [None]:
%pylab inline

import seaborn as sns
import pickle

# Load the DataFrame saved by the previous notebook.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created or trust.
# The `with` block closes the file handle as soon as the load finishes.
with open('ssoc_df_2.p', 'rb') as f:
    df = pickle.load(f)

In [None]:
df.head()

## Geohash

<img width=500 src="http://kamliu.co.uk/wp-content/uploads/2015/11/map-1.png"/>

A geohash is a hierarchical spatial data structure that subdivides space into grid-shaped buckets. Geohashes offer arbitrary precision, and characters can be removed from the end of the code one at a time to shorten it (gradually losing precision).

The main usages of Geohashes are:

- As a unique identifier.
- To represent point data, e.g. in databases.


In [None]:
import geohash # pip install python-geohash

# Full geohash: encode each row's (lat, lon) pair using the library's default precision.
coords = df[['lat', 'lon']]
df['geohash'] = coords.apply(lambda row: geohash.encode(row['lat'], row['lon']), axis=1)

df.head()

### Top Locations / by geohash

In [None]:
# Count rows per full geohash and rank locations from most to least frequent.
daily_locs = df.groupby('geohash').size().sort_values(ascending=False)

# show the 20 most frequent geohashes
print(daily_locs[:20])

In [None]:
# Less precise key: keep only the first 10 characters of each geohash
# (a 10-character cell is roughly 1.2 m x 0.6 m).
# The full hash is already stored in df['geohash'], so truncate that column
# instead of re-encoding every row.
df['geohash10d'] = df['geohash'].str[:10]

# group by the 10-character geohash and rank by number of rows
daily_locs = df.groupby('geohash10d').size().sort_values(ascending=False)
print(daily_locs[:20])

In [None]:
# counts of the ten most frequent locations (daily_locs is sorted descending)
top_counts = daily_locs.values[:10]
plot(top_counts)
title('location distribution')

In [None]:
# pip install gmaps
import gmaps
import gmaps.datasets

import os

# Read the Google API key from the GOOGLE_API_KEY environment variable so the
# secret is never pasted into (and saved with) the notebook.  Falls back to the
# original empty string when the variable is not set.
api_key = os.environ.get('GOOGLE_API_KEY', '')

# hand the key to gmaps
gmaps.configure(api_key=api_key)


In [None]:
# (latitude, longitude) pairs decoded from the first 100 entries of the ranked counts
top_hashes = daily_locs[:100].to_dict()
data = [geohash.decode(ghash) for ghash in top_hashes]

In [None]:
# plot top locations

def show_data_on_map(data):
    """Build a gmaps map carrying one white-to-red heatmap layer drawn from `data`.

    `data` is passed unchanged to ``gmaps.Heatmap``; the callers in this
    notebook supply a list of (latitude, longitude) tuples.  Returns the
    ``gmaps.Map`` object so the calling cell can display it.
    """
    gmap = gmaps.Map()

    # heatmap layer built from the supplied points
    layer = gmaps.Heatmap(data=data)

    # styling: colour gradient, point size and intensity cap
    layer.gradient = ['white', 'red']
    layer.point_radius = 3
    layer.max_intensity = 1

    gmap.add_layer(layer)
    return gmap

In [None]:
# keep only the geohashes whose count is 3 or 4 (strictly between 2 and 5)
daily_locs_sm = {
    ghash: count
    for ghash, count in daily_locs.to_dict().items()
    if 2 < count < 5
}
len(daily_locs_sm)

In [None]:
# let's look at locations that were logged only 3-4 times
# (daily_locs_sm was built with the filter val>2 and val<5)

# only the geohash keys are needed here; the counts are not used
data = [geohash.decode(ghash) for ghash in daily_locs_sm]
m = show_data_on_map(data)
m

In [None]:
# different level of precision

In [None]:
# Coarser key: keep only the first 8 characters of each geohash
# (an 8-character cell is roughly 38 m x 19 m, i.e. about building level).
# The full hash is already stored in df['geohash'], so truncate that column
# instead of re-encoding every row.
df['geohash8d'] = df['geohash'].str[:8]

# group by the 8-character geohash and rank by number of rows.
# Series.sort() has been removed from pandas; sort_values() is what the other
# ranking cells in this notebook already use.
daily_locs = df.groupby('geohash8d').size().sort_values(ascending=False)
print(daily_locs[:20])

In [None]:
# counts of the ten most frequent 8-character geohash cells
top_counts = daily_locs.values[:10]
plot(top_counts)
title('location distribution')

## Google Places API

In [None]:
# pip install python-google-places

from googleplaces import GooglePlaces, types, lang

# Places API client; reuses the api_key variable defined in the gmaps configuration cell
google_places = GooglePlaces(api_key)

In [None]:
# pip install geopy

from geopy.distance import great_circle
import geohash

# sample coordinates to probe
lat, lng = 40.7453019, -73.99033025

# ask the Places API for everything within radius=50 of that point
search_point = dict(lat=lat, lng=lng)
query_result = google_places.nearby_search(lat_lng=search_point, radius=50)

In [None]:
query_result.places

In [None]:
# take the first place returned by the nearby search as a worked example
place = query_result.places[0]

In [None]:
# fetch the detail record for this place; the following cells read place.details
place.get_details()

In [None]:
place.details

In [None]:
place.details.keys()

In [None]:
place.details['name']

In [None]:
place.details['types']

- Great-circle distance: the shortest distance between two points on the surface of a sphere: https://en.wikipedia.org/wiki/Great-circle_distance

In [None]:
# calculate distance from given location

# (lat, lng) of the place as a tuple of floats
cur_ll = (float(place.geo_location['lat']), float(place.geo_location['lng']))

# great-circle distance in meters, rounded to 2 decimals -- the same unit
# get_goog_places() stores in its 'dist' field, so the numbers are comparable
cur_dist = round(great_circle((lat, lng), cur_ll).meters, 2)
cur_dist

In [None]:
# The fields of the response that look most useful to us, gathered into one record
dict(
    name=place.name,
    types=place.types,
    address=place.formatted_address,
    ll=cur_ll,
    dist=cur_dist,
)

In [None]:
# let's put this all together into a helper function

def get_goog_places(lat, lng, radius=50, max_dist=500):
    """Look up Google Places near a coordinate.

    Parameters
    ----------
    lat, lng : number
        Latitude and longitude of the query point.
    radius : number, optional
        Search radius in meters passed to ``nearby_search`` (default 50).
    max_dist : number, optional
        Places whose great-circle distance from the query point exceeds this
        many meters are dropped (default 500).

    Returns
    -------
    list of dict
        One dict per kept place, in the order the API returned them, with keys
        ``name``, ``types``, ``address``, ``ll`` (a ``(lat, lng)`` tuple of
        floats) and ``dist`` (meters, rounded to 2 decimals).
    """
    # progress trace; this form prints the same text under Python 2 and Python 3
    print('%s %s' % (lat, lng))

    # radius is in meters
    query_result = google_places.nearby_search(lat_lng={'lat': lat, 'lng': lng}, radius=radius)
    to_return = []

    for place in query_result.places:
        cur_ll = (float(place.geo_location['lat']), float(place.geo_location['lng']))
        cur_dist = round(great_circle((lat, lng), cur_ll).meters, 2)

        # we don't care about identified places that are far away
        if cur_dist > max_dist:
            continue

        # get_details() costs an extra API request per place, so it is issued
        # only for places that passed the distance filter; the location used
        # above is already part of the search result.
        place.get_details()

        to_return.append({
            'name': place.name,
            'types': place.types,
            'address': place.formatted_address,
            'll': cur_ll,
            'dist': cur_dist,
        })

    return to_return

In [None]:
get_goog_places(40.743384,-73.992103)

In [None]:
lat, lng = geohash.decode('9q8yywedtv')

In [None]:
ans = get_goog_places(lat, lng)

In [None]:
sorted(ans, key=lambda x: x['dist'])

In [None]:
# now lets take our list from before and find likely venues
# venues maps each geohash to its nearby places, closest first
venues = {}

for ghash in daily_locs_sm:
    lat, lng = geohash.decode(ghash)
    ans = get_goog_places(lat, lng)
    # the helper returns a fresh list, so sorting it in place is safe
    ans.sort(key=lambda found: found['dist'])
    venues[ghash] = ans

In [None]:
venues

In [None]:
# flatten the per-geohash lists, keeping only places whose 'dist' is below 50
d = [v for vals in venues.values() for v in vals if v['dist'] < 50]

In [None]:
import pandas as pd

# one row per venue record collected in d; the dict keys
# (name, types, address, ll, dist) become the columns
df_venues = pd.DataFrame.from_records(d)
df_venues.head()

In [None]:
# how often each venue name appears, most frequent first
venue_counts = df_venues.groupby('name').size()
venue_counts.sort_values(ascending=False)

In [None]:
# save our data

# the `with` block guarantees the pickle is flushed and the file handle closed
with open('ssoc_df_3.p', 'wb') as f:
    pickle.dump(df, f)

## Questions

1. Describe your target's interests - use geohashes to group your target's locations and identify the most likely venues using the Google Places API.

2. What type of venues does your target tend to visit? Plot out a few data points (ex: histogram of breakdown of venue types). 

3. Over the last two weeks we did this manually. This week, use code and the other tools we've learned in class to go over a larger dataset and draw conclusions about your target.