# Pizza pizza pizza

Get top venues by keyword and location by querying Google, Yelp, and Foursquare.

#### Google

 - Needs a Google API key and module
 - [Create Google Cloud credentials and give access to Places APIs](https://console.cloud.google.com/google/maps-apis/credentials) (also, restrict IP or set other restrictions)
 - `conda install -c conda-forge -y gmaps`
 - put key in `apikeys/apikey.txt`
 - `gmaps` Jupyter nbextension to show maps in notebook, with marker pins etc.

```
conda install -c conda-forge -y jupyter_contrib_nbextensions
jupyter nbextension enable --py gmaps
jupyter notebook
```

#### Yelp
 - needs Yelp API key and module
 - https://www.yelp.com/developers/documentation/v3
 - https://github.com/gfairchild/yelpapi
 - put key in `apikeys/yelpkey.txt`
 
#### Foursquare
- Needs Foursquare API key and module
- https://developer.foursquare.com/docs/places-api/getting-started/
- https://github.com/mLewisLogic/foursquare
- OAuth id in `apikeys/foursquare_id.txt`
- OAuth secret in `apikeys/foursquare_secret.txt`

See `requirements.txt` for versions used, other requirements (requests, folium, Flask)

I also considered querying OpenTable and TripAdvisor, but their terms appear to limit API key access to approved commercial partners.
- https://dev.opentable.com/affiliate-partners/
- https://www.tripadvisor.com/APIAccessSupport

In [30]:
import time
from pprint import pprint
from ipywidgets import widgets, interact
from itertools import product
from os import path

import traceback
import pdb

import numpy as np
import pandas as pd
import pandas_dedupe

from sklearn.preprocessing import StandardScaler

import requests, json 

# for haversine distance
from geopy.distance import distance

import gmaps
# https://github.com/gfairchild/yelpapi
from yelpapi import YelpAPI
from foursquare import Foursquare, FoursquareException


def _read_key(key_path):
    """Return the first line of the file at key_path with surrounding whitespace stripped."""
    # the with-block closes the file on exit, so no explicit close() is needed
    with open(key_path) as f:
        return f.readline().strip()


# Google API key, used by the gmaps widgets and the Places requests
api_key = _read_key('apikeys/apikey.txt')
gmaps.configure(api_key=api_key)

# Yelp API key and client
yelp_key = _read_key('apikeys/yelpkey.txt')
yelp_api = YelpAPI(yelp_key)

# Foursquare OAuth id and secret
foursquare_id = _read_key('apikeys/foursquare_id.txt')
foursquare_secret = _read_key('apikeys/foursquare_secret.txt')

import folium


## Google Maps

In [2]:
# pick a search term
# options are (label, value) pairs; `keyword` holds the currently selected value
keyword_options = [('Pizza', 'pizza'), ('Coffee', 'coffee')]
keyword = 'pizza'

_keyword_dropdown = widgets.Dropdown(
    options=keyword_options,
    value=keyword,
    description='Search term:  ',
    disabled=False,
)


@interact
def get_kw(kw=_keyword_dropdown):
    """Store the dropdown's selected search term in the global `keyword`."""
    global keyword
    keyword = kw


interactive(children=(Dropdown(description='Search term:  ', options=(('Pizza', 'pizza'), ('Coffee', 'coffee')…

In [3]:
# pick a location
def _parse_latlng(latlng):
    """Parse a 'lat,lng' string (optional spaces) into a (lat, lng) tuple of floats."""
    # explicit float parsing replaces eval(): same result for these strings,
    # but it cannot execute arbitrary code
    lat, lng = (float(part) for part in latlng.split(','))
    return (lat, lng)


location = '40.7484, -73.9857'
location_coords = _parse_latlng(location)
# options are (label, 'lat,lng' string) pairs
location_options = [('Midtown', '40.7484, -73.9857'),
                    ('Downtown', '40.7077443,-74.0139089'),
                    ('Upper East Side', '40.7711473,-73.9661166'),
                    ('Upper West Side', '40.778794,-73.984257'),
                    ('Brooklyn Heights', '40.6915812,-73.9954095'),
                    ('Grand Army Plaza', '40.671872,-73.972544'),
                    ('Bay Ridge', '40.624468,-74.0487134'),
                    ('Williamsburg', '40.7144609,-73.9553373'),
                  ]


@interact
def get_loc(loc = widgets.Dropdown(
    options=location_options,
    value=location,
    description='Location:',
)):
    """Store the selected location string and its parsed coordinates in globals.

    Sets `location` to the selected 'lat,lng' string and `location_coords`
    to the matching (lat, lng) float tuple; returns the selected string.
    """
    global location
    global location_coords
    location = loc
    location_coords = _parse_latlng(loc)
    return loc

interactive(children=(Dropdown(description='Location:', options=(('Midtown', '40.7484, -73.9857'), ('Downtown'…

In [4]:
# Walk every (keyword, location) combination and stop at the first one whose
# pickle file does not exist yet. The globals below are left set to that
# combination (or to the last combination if every file already exists).
for test_keyword, test_location in product(keyword_options, location_options):
    keyword = test_keyword[1]
    location_name, location = test_location
    # parse the 'lat,lng' string explicitly instead of eval()-ing it
    location_coords = tuple(float(part) for part in location.split(','))
    test_pickle_filename = keyword + "_" + location_name.replace(' ', '').lower() + ".pkl"

    if not path.exists(test_pickle_filename):
        break

print(location_name, keyword, location, location_coords)
print(test_pickle_filename)

Midtown pizza 40.7484, -73.9857 (40.7484, -73.9857)
pizza_midtown.pkl


In [5]:
# size and border settings passed to the gmaps figure
figure_layout = dict(
    width='800px',
    height='800px',
    border='1px solid black',
    padding='1px',
)

# map centred on the selected location, with one marker at that point
fig = gmaps.figure(center=location_coords, zoom_level=12, layout=figure_layout)
location_marker_layer = gmaps.marker_layer([location_coords])
fig.add_layer(location_marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [56]:
# global options for all search services APIs
RADIUS = 1000           # search radius passed to each service (presumably metres -- confirm)
NRESULTS = 50           # result limit passed to the Yelp and Foursquare queries
MIN_RATING = 3          # venues rated below this are filtered out
MIN_USER_RATINGS = 40   # venues with fewer ratings/reviews than this are filtered out


In [7]:
# gmaps options
# References:
#   https://developers.google.com/places/web-service/search#TextSearchRequests
#   https://developers.google.com/places/web-service/supported_types
GMAPS_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
ltype = 'establishment'
rankby = 'distance'  # alternative: 'prominence'


In [57]:
def gmaps_get_first_page(api_key, location, **kwargs):
    """Get the first page of results from the gmaps nearby-search endpoint.

    Args:
        api_key: Google API key, sent as the `key` query parameter.
        location: search centre, sent as the `location` query parameter.
        **kwargs: further query parameters describing the search
            (use either rankby or radius).

    Returns:
        The decoded JSON body of the response.
    """
    # Let requests build the query string so that values containing spaces,
    # '&', '#', etc. are percent-encoded instead of being spliced in raw.
    params = {'key': api_key, 'location': location}
    params.update({name: str(val) for name, val in kwargs.items()})
    response = requests.get(GMAPS_URL, params=params)
    return response.json()


def gmaps_get_next_page(api_key, next_page_token, max_attempts=10, wait_seconds=5):
    """Get the next results page for a page token, retrying until it is available.

    The request is re-issued on every attempt. Previously it was sent once
    before the loop, so each retry re-read the same response and the loop
    could never succeed after a first empty page.

    Args:
        api_key: Google API key.
        next_page_token: token taken from the previous page's JSON.
        max_attempts: how many times to request the page before giving up.
        wait_seconds: pause between attempts.

    Returns:
        The decoded JSON of the first response with a non-empty 'results'
        list. If every attempt comes back empty, the last response is
        returned (its 'results' are empty or missing) instead of None.
    """
    j = None
    for attempt in range(max_attempts):
        r = requests.get(GMAPS_URL, params={'pagetoken': next_page_token, 'key': api_key})
        j = r.json()
        if j.get('results'):
            return j
        # wait for next page to be available (no point sleeping after the last try)
        if attempt < max_attempts - 1:
            time.sleep(wait_seconds)
    return j


def gmaps_get_all_df(api_key, location, **kwargs):
    """Return a dataframe of all result pages for a gmaps search.

    Args:
        api_key: Google API key.
        location: search centre string passed to gmaps_get_first_page.
        **kwargs: search parameters forwarded to gmaps_get_first_page.

    Returns:
        DataFrame built from the 'results' of every page, in page order.
        Row labels restart at 0 on each page, as before.
    """
    # get first page
    j = gmaps_get_first_page(api_key, location, **kwargs)
    pages = [pd.json_normalize(j.get('results', []))]

    # get pages while additional pages available
    while 'next_page_token' in j:
        next_page_token = j['next_page_token']
        time.sleep(5)
        j = gmaps_get_next_page(api_key, next_page_token)
        if not j:
            # next page never became available; keep what we have
            break
        pages.append(pd.json_normalize(j.get('results', [])))

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pd.concat(pages)


def _drop_trailing_locality(address):
    """Drop the last comma-separated part of an address (e.g. a trailing ", Brooklyn").

    Addresses without a comma, and non-string values, are returned unchanged
    rather than being reduced to an empty string.
    """
    if not isinstance(address, str):
        return address
    parts = address.split(',')
    if len(parts) < 2:
        return address
    return " ".join(parts[:-1])


def gmaps_get_df(location_coords, keyword):
    """Search gmaps around a point and return the filtered, ranked venues.

    Args:
        location_coords: (lat, lng) pair of floats.
        keyword: search term.

    Returns:
        DataFrame with columns name, address, rating, nratings, lat, lng and
        distance (km from location_coords), limited to venues meeting
        MIN_USER_RATINGS and MIN_RATING and sorted by rating then number of
        ratings, descending. None if the search returned nothing usable.
    """
    # use either rankby or radius
    location = "%.7f,%.7f" % tuple(location_coords)
    raw_df = gmaps_get_all_df(api_key, location, keyword=keyword, ltype=ltype, radius=RADIUS)

    # json_normalize only creates a column for keys that occur in the results,
    # so the rating columns can be absent even when rows came back
    if raw_df.empty or not {'user_ratings_total', 'rating'}.issubset(raw_df.columns):
        return None

    keep = (raw_df['user_ratings_total'] >= MIN_USER_RATINGS) & (raw_df['rating'] >= MIN_RATING)
    ranked = raw_df.loc[keep] \
                   .sort_values(['rating', 'user_ratings_total'], ascending=False) \
                   .reset_index(drop=True)

    # copy so the column assignments below do not write into a slice
    gmaps_df = ranked[['name', 'vicinity', 'rating', 'user_ratings_total',
                       'geometry.location.lat', 'geometry.location.lng']].copy()
    gmaps_df.columns = ['name', 'address', 'rating', 'nratings', 'lat', 'lng']
    gmaps_df['address'] = gmaps_df['address'].apply(_drop_trailing_locality)
    # a list comprehension also works when the filter left no rows
    gmaps_df['distance'] = [distance((lat, lng), location_coords).km
                            for lat, lng in zip(gmaps_df['lat'], gmaps_df['lng'])]
    return gmaps_df


In [58]:
%%time
gmaps_df = gmaps_get_df(location_coords, keyword)

# apply the rating-count / rating thresholds and ordering on the renamed columns
enough_ratings = gmaps_df['nratings'] >= MIN_USER_RATINGS
good_rating = gmaps_df['rating'] >= MIN_RATING
gmaps_df = gmaps_df.loc[enough_ratings & good_rating]
gmaps_df = gmaps_df.sort_values(['rating', 'nratings'], ascending=False)
gmaps_df = gmaps_df.reset_index(drop=True)
gmaps_df


CPU times: user 94.8 ms, sys: 4.25 ms, total: 99.1 ms
Wall time: 13.4 s


Unnamed: 0,name,address,rating,nratings,lat,lng,distance
0,NY Pizza Suprema,413 8th Ave,4.6,3976,40.750144,-73.995224,0.827348
1,&pizza - Nomad,15 W 28th St,4.6,997,40.745159,-73.98815,0.415094
2,Joe's Pizza,1435 Broadway,4.5,8891,40.754629,-73.986999,0.700342
3,Marta,29 E 29th St,4.5,1389,40.744476,-73.984583,0.445817
4,99 Cent Fresh Pizza,151 E 43rd St,4.5,1286,40.751767,-73.974428,1.022707
5,Vezzo,178 Lexington Ave,4.5,1280,40.744488,-73.981331,0.56996
6,Capizzi,547 9th Ave,4.5,1136,40.757482,-73.993645,1.211331
7,Tappo Thin Crust Pizza,49 W 24th St,4.5,640,40.743375,-73.99155,0.74529
8,800 Degrees Woodfired Kitchen,1 E 33rd St,4.5,314,40.747698,-73.984433,0.132423
9,Dj pizza,120 E 34th St,4.5,136,40.746403,-73.980013,0.528992


In [60]:
# plot on google map

# one (lat, lng) marker per venue, each with a "name: rating (count)" hover text
markers = [(row.lat, row.lng) for row in gmaps_df.itertuples()]
marker_hover = ["%s: %s (%s)" % (row.name, row.rating, row.nratings) for row in gmaps_df.itertuples()]

# HTML passed to gmaps as each marker's info box content
info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
<dt>Google rating</dt><dd>{rating}</dd>
<dt>Google reviews</dt><dd>{nratings}</dd>
</dl>
"""
marker_info = [info_box_template.format(**row) for i, row in gmaps_df.iterrows()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

# centre on the already-parsed coordinates, as the other map cells do,
# instead of eval()-ing the location string again
fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [61]:
# pair every (lat, lng) marker with its HTML info text
folium_markers = [(lat, lng, info) for (lat, lng), info in zip(markers, marker_info)]

# same venues drawn as blue circles on a folium map centred on the search location
venues_map = folium.Map(location=list(location_coords), zoom_start=14)
_circle_style = dict(radius=8, color='blue', fill=True, fill_color='blue', fill_opacity=0.5)
for lat, lng, label in folium_markers:
    circle = folium.CircleMarker([lat, lng], tooltip=label, **_circle_style)
    circle.add_to(venues_map)

venues_map

## Yelp

In [79]:
def yelp_get_df(location_coords, keyword):
    """Search Yelp around a point and return the filtered, ranked businesses.

    Args:
        location_coords: (lat, lng) pair.
        keyword: passed to Yelp as the `categories` search parameter.

    Returns:
        DataFrame with columns name, address, rating, nratings, lat, lng, url
        and distance (km from location_coords), limited to businesses meeting
        MIN_USER_RATINGS and MIN_RATING and sorted by rating then review
        count, descending. None if the search returned no businesses.
    """
    lat, lng = location_coords
    response = yelp_api.search_query(categories=keyword, latitude=lat, longitude=lng,
                                     radius=RADIUS, sort_by=rankby, limit=NRESULTS)

    businesses_df = pd.json_normalize(response['businesses'])
    if businesses_df.empty:
        return None

    # keep well-reviewed businesses, best first
    enough_reviews = businesses_df['review_count'] >= MIN_USER_RATINGS
    good_rating = businesses_df['rating'] >= MIN_RATING
    ranked = businesses_df.loc[enough_reviews & good_rating]
    ranked = ranked.sort_values(['rating', 'review_count'], ascending=False)
    ranked = ranked.reset_index(drop=True)

    # Yelp field -> common column name shared with the other sources
    column_names = {
        'name': 'name',
        'location.address1': 'address',
        'rating': 'rating',
        'review_count': 'nratings',
        'coordinates.latitude': 'lat',
        'coordinates.longitude': 'lng',
        'url': 'url',
    }
    yelp_df = ranked[list(column_names)].rename(columns=column_names)
    yelp_df['distance'] = yelp_df.apply(
        lambda venue: distance((venue['lat'], venue['lng']), location_coords).km,
        axis=1)
    return yelp_df


yelp_df = yelp_get_df(location_coords, keyword)
yelp_df

Unnamed: 0,name,address,rating,nratings,lat,lng,url,distance
0,&pizza - Nomad,15 W 28th St,4.5,437,40.745306,-73.988099,https://www.yelp.com/biz/and-pizza-nomad-new-y...,0.398885
1,Stone Bridge Pizza & Salad,16 E 41st St,4.5,274,40.75225,-73.98061,https://www.yelp.com/biz/stone-bridge-pizza-an...,0.606275
2,Trenta Tre Pizzeria,29 East 33rd St,4.5,118,40.74706,-73.98311,https://www.yelp.com/biz/trenta-tre-pizzeria-n...,0.264556
3,Vezzo,178 Lexington Ave,4.0,1297,40.7445,-73.98139,https://www.yelp.com/biz/vezzo-new-york-2?adju...,0.565748
4,Marta,29 E 29th St,4.0,893,40.744588,-73.984794,https://www.yelp.com/biz/marta-new-york-7?adju...,0.430163
5,Waldy's Wood Fired Pizza & Penne,800 6th Ave,4.0,822,40.745767,-73.99064,https://www.yelp.com/biz/waldys-wood-fired-piz...,0.509512
6,Cafe Rustico II,62 E 34th St,4.0,132,40.74719,-73.98202,https://www.yelp.com/biz/cafe-rustico-ii-new-y...,0.338598
7,Mani In Pasta,14 E 37th St,4.0,94,40.74985,-73.98229,https://www.yelp.com/biz/mani-in-pasta-new-yor...,0.329945
8,Cafe 37,47 W 37th St,4.0,41,40.751439,-73.985401,https://www.yelp.com/biz/cafe-37-new-york?adju...,0.338422
9,Stella 34 Trattoria,151 W 34th St,3.5,670,40.750382,-73.9883,https://www.yelp.com/biz/stella-34-trattoria-n...,0.310864


In [63]:
# marker positions and "name: rating (count)" hover texts for the Yelp venues
markers = list(zip(yelp_df['lat'], yelp_df['lng']))
marker_hover = ["%s: %s (%s)" % venue
                for venue in zip(yelp_df['name'], yelp_df['rating'], yelp_df['nratings'])]

# HTML passed to gmaps as each marker's info box content
info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
<dt>Yelp rating</dt><dd>{rating}</dd>
<dt>Yelp reviews</dt><dd>{nratings}</dd>
</dl>
"""

marker_info = [info_box_template.format(**venue) for _, venue in yelp_df.iterrows()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = dict(
    width='800px',
    height='800px',
    border='1px solid black',
    padding='1px',
)

fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [64]:
# pair every (lat, lng) marker with its HTML info text
folium_markers = [(lat, lng, info) for (lat, lng), info in zip(markers, marker_info)]

# Yelp venues drawn as blue circles on a folium map centred on the search location
venues_map = folium.Map(location=list(location_coords), zoom_start=14)
_circle_style = dict(radius=8, color='blue', fill=True, fill_color='blue', fill_opacity=0.5)
for lat, lng, label in folium_markers:
    circle = folium.CircleMarker([lat, lng], tooltip=label, **_circle_style)
    circle.add_to(venues_map)
venues_map

## Foursquare

In [80]:
def foursquare_get_df(location_coords, keyword):
    """Search Foursquare around a point and return the filtered, ranked venues.

    Runs one venue search, then one venue-details call per result to obtain
    its rating and number of rating signals.

    Args:
        location_coords: (lat, lng) pair of floats.
        keyword: search query.

    Returns:
        DataFrame with columns name, address, rating, nratings, lat, lng, url
        and distance (km from location_coords), limited to venues meeting
        MIN_USER_RATINGS and MIN_RATING and sorted by rating then nratings,
        descending. None if the search returned no venues.
    """
    location_str = "%.7f,%.7f" % location_coords
    client = Foursquare(client_id=foursquare_id,
                        client_secret=foursquare_secret,
                        redirect_uri='http://streeteye.com/')
    response = client.venues.search(params={'query': keyword, 'll': "%s" % location_str,
                                            'radius': RADIUS, 'limit': NRESULTS})

    columns = ['name', 'address', 'rating', 'nratings', 'lat', 'lng', 'url']
    rows = []

    for _, venue in pd.json_normalize(response['venues']).iterrows():
        # default these to -1
        venue_rating = -1
        venue_nratings = -1
        try:
            # get rating, nratings with another API call for venue details
            venue_details = client.venues(venue['id'])['venue']
            venue_rating = venue_details['rating']
            venue_nratings = venue_details['ratingSignals']
        except FoursquareException as e:
            print("Foursquare exception", type(e), str(e))
        except KeyError:
            # sometimes no rating ... probably not popular enough; keep the -1 defaults
            pass

        rows.append([
            venue['name'],
            # address and delivery url are optional: json_normalize creates no
            # column at all when no venue in the response carries the field
            venue.get('location.address'),
            venue_rating,
            venue_nratings,
            venue['location.lat'],
            venue['location.lng'],
            venue.get('delivery.url'),
        ])

    if not rows:
        return None

    foursquare_df = pd.DataFrame(rows, columns=columns)
    keep = (foursquare_df['nratings'] >= MIN_USER_RATINGS) & (foursquare_df['rating'] >= MIN_RATING)
    foursquare_df = foursquare_df.loc[keep] \
                                 .sort_values(['rating', 'nratings'], ascending=False) \
                                 .reset_index(drop=True)
    # a list comprehension also works when the filter left no rows
    foursquare_df['distance'] = [distance((lat, lng), location_coords).km
                                 for lat, lng in zip(foursquare_df['lat'], foursquare_df['lng'])]
    return foursquare_df


foursquare_df = foursquare_get_df(location_coords, keyword)
foursquare_df

Unnamed: 0,name,address,rating,nratings,lat,lng,url,distance
0,Little Italy Pizza,2 E 33rd St,6.8,198,40.747689,-73.984883,https://www.seamless.com/menu/little-italy-piz...,0.104866
1,Joe's Pizza,1435 Broadway,8.9,462,40.754679,-73.987029,https://www.seamless.com/menu/joes-pizza-1435-...,0.706308
2,Waldy’s Wood Fired Pizza & Penne,800 Avenue of the Americas,8.5,582,40.745583,-73.990768,https://www.seamless.com/menu/waldys-wood-fire...,0.530154
3,Penn Pizza,33rd St Penn Station,-1.0,-1,40.74843,-73.988917,,0.271713
4,Café Bonjour Deli & Pizza,18 W 38th St,6.1,13,40.75135,-73.983762,,0.366179
5,Pizza 33,489 3rd Ave,6.1,87,40.745098,-73.978585,https://www.seamless.com/menu/pizza-33-489-3rd...,0.703943
6,New York Pizza Suprema,413 8th Ave,8.8,1265,40.750124,-73.994992,https://www.seamless.com/menu/new-york-pizza-s...,0.807737
7,Brick Oven Pizza 33,527 Avenue of the Americas,6.0,16,40.745046,-73.978372,,0.722352
8,La Pizza & La Pasta,200 5th Ave,8.4,281,40.742449,-73.989983,,0.753359
9,2 Bros. Pizza,755 Avenue of the Americas,5.7,260,40.744248,-73.991887,https://www.seamless.com/menu/2-bros-pizza-755...,0.696922


In [66]:
# marker positions and "name: rating (count)" hover texts for the Foursquare venues
markers = list(zip(foursquare_df['lat'], foursquare_df['lng']))
marker_hover = [
    "%s: %s (%s)" % venue
    for venue in zip(foursquare_df['name'], foursquare_df['rating'], foursquare_df['nratings'])
]

# HTML passed to gmaps as each marker's info box content
info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
<dt>Foursquare rating</dt><dd>{rating}</dd>
<dt>Foursquare reviews</dt><dd>{nratings}</dd>
</dl>
"""
marker_info = [info_box_template.format(**venue) for _, venue in foursquare_df.iterrows()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = dict(
    width='800px',
    height='800px',
    border='1px solid black',
    padding='1px',
)

fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [67]:
# pair every (lat, lng) marker with its HTML info text
folium_markers = [(lat, lng, info) for (lat, lng), info in zip(markers, marker_info)]

# Foursquare venues drawn as blue circles on a folium map centred on the search location
venues_map = folium.Map(location=list(location_coords), zoom_start=14)
_circle_style = dict(radius=8, color='blue', fill=True, fill_color='blue', fill_opacity=0.5)
for lat, lng, label in folium_markers:
    circle = folium.CircleMarker([lat, lng], tooltip=label, **_circle_style)
    circle.add_to(venues_map)
venues_map

## Deduplicate and merge

In [68]:
# show every row when displaying dataframes
pd.set_option('display.max_rows', None)

# tag a copy of each source's frame; the numeric prefix makes the sources
# sort as gmaps, yelp, foursquare
gmaps_df_copy = gmaps_df.assign(source='0_gmaps')
yelp_df_copy = yelp_df.assign(source='1_yelp')
foursquare_df_copy = foursquare_df.assign(source='2_foursquare')

# stack all sources; reset_index keeps each row's per-source position in an 'index' column
all_sources = [gmaps_df_copy, yelp_df_copy, foursquare_df_copy]
venues_df = pd.concat(all_sources).reset_index()
# (lat, lng) tuple per row, used as the LatLong field when deduplicating
venues_df['latlong'] = venues_df[['lat', 'lng']].apply(tuple, axis=1)

venues_df.sort_values('name')

Unnamed: 0,index,name,address,rating,nratings,lat,lng,distance,source,url,latlong
18,18,$1 Pizza,832 6th Ave,4.3,425,40.746471,-73.989986,0.420594,0_gmaps,,"(40.7464708, -73.9899856)"
121,48,&pizza,15 W 28th St,8.6,148,40.745205,-73.988231,0.414277,2_foursquare,https://www.seamless.com/menu/pizza-15-w-28th-...,"(40.74520457937066, -73.98823142051697)"
1,1,&pizza - Nomad,15 W 28th St,4.6,997,40.745159,-73.98815,0.415094,0_gmaps,,"(40.7451595, -73.9881498)"
54,0,&pizza - Nomad,15 W 28th St,4.5,437,40.745306,-73.988099,0.398885,1_yelp,https://www.yelp.com/biz/and-pizza-nomad-new-y...,"(40.745306, -73.9880992)"
66,12,2 Bros Pizza,755 6th Ave,3.5,269,40.744353,-73.991992,0.696023,1_yelp,https://www.yelp.com/biz/2-bros-pizza-new-york...,"(40.7443525488053, -73.9919924442138)"
31,31,2 Bros. Pizza,755 6th Ave,4.1,1544,40.744337,-73.992005,0.697966,0_gmaps,,"(40.7443371, -73.9920054)"
83,10,2 Bros. Pizza,755 Avenue of the Americas,5.7,260,40.744248,-73.991887,0.696922,2_foursquare,https://www.seamless.com/menu/2-bros-pizza-755...,"(40.74424753245427, -73.99188721843028)"
23,23,2 Bros. Pizza,31 W 46th St,4.2,1404,40.756689,-73.98029,1.027662,0_gmaps,,"(40.7566892, -73.9802897)"
85,12,2 Bros. Pizza,557 8th Ave,7.6,296,40.754722,-73.991751,0.86836,2_foursquare,https://www.seamless.com/menu/2-bros-pizza-557...,"(40.754722463446925, -73.99175064209689)"
8,8,800 Degrees Woodfired Kitchen,1 E 33rd St,4.5,314,40.747698,-73.984433,0.132423,0_gmaps,,"(40.7476976, -73.9844329)"


In [69]:
# run dedupe algorithm using name, address as default texts, latlong as latlong
dedupe_fields = ['name', 'address', ('latlong', 'LatLong')]
venues_df2 = pandas_dedupe.dedupe_dataframe(venues_df, dedupe_fields)


Importing data ...
Reading from dedupe_dataframe_learned_settings
Clustering...
# duplicate sets 99


In [70]:
# view clustering: copy the cluster ids back onto venues_df, then order rows
# so members of the same cluster sit together (gmaps, yelp, foursquare)
venues_df['cluster'] = venues_df2['cluster id']
cluster_view_columns = ['cluster', 'name', 'address', 'rating', 'nratings', 'lat', 'lng', 'source']
venues_df = venues_df.sort_values(['cluster', 'source'])
venues_df = venues_df[cluster_view_columns]
venues_df

Unnamed: 0,cluster,name,address,rating,nratings,lat,lng,source
0,0,NY Pizza Suprema,413 8th Ave,4.6,3976,40.7501438,-73.9952244,0_gmaps
79,0,New York Pizza Suprema,413 8th Ave,8.8,1265,40.750124384298815,-73.99499165569237,2_foursquare
1,1,&pizza - Nomad,15 W 28th St,4.6,997,40.7451595,-73.9881498,0_gmaps
54,1,&pizza - Nomad,15 W 28th St,4.5,437,40.745306,-73.9880992,1_yelp
121,1,&pizza,15 W 28th St,8.6,148,40.74520457937066,-73.98823142051697,2_foursquare
2,2,Joe's Pizza,1435 Broadway,4.5,8891,40.7546287,-73.9869994,0_gmaps
74,2,Joe's Pizza,1435 Broadway,8.9,462,40.75467949999999,-73.9870291,2_foursquare
3,3,Marta,29 E 29th St,4.5,1389,40.7444764,-73.9845825,0_gmaps
58,3,Marta,29 E 29th St,4.0,893,40.7445882,-73.9847936,1_yelp
5,4,Vezzo,178 Lexington Ave,4.5,1280,40.744488,-73.9813314,0_gmaps


In [71]:
# group by clusters: one row per cluster, taking the first entry of each column
representative_columns = ['name', 'address', 'lat', 'lng', 'source']
clusters = venues_df.groupby('cluster')[representative_columns]
cluster_df = clusters.first()
cluster_df = cluster_df.reset_index()
cluster_df

Unnamed: 0,cluster,name,address,lat,lng,source
0,0,NY Pizza Suprema,413 8th Ave,40.7501438,-73.9952244,0_gmaps
1,1,&pizza - Nomad,15 W 28th St,40.7451595,-73.9881498,0_gmaps
2,2,Joe's Pizza,1435 Broadway,40.7546287,-73.9869994,0_gmaps
3,3,Marta,29 E 29th St,40.7444764,-73.9845825,0_gmaps
4,4,Vezzo,178 Lexington Ave,40.744488,-73.9813314,0_gmaps
5,5,Little Italy Pizza,55 W 45th St,40.7564807,-73.9814849,0_gmaps
6,6,99 Cent Fresh Pizza,473 Lexington Ave,40.7537679,-73.9741317,0_gmaps
7,7,Lombardi’s,290 8th Ave,40.7462426,-73.9974421,0_gmaps
8,8,Bravo Kosher Pizza,107 W 37th St,40.7521189,-73.9870459,0_gmaps
9,9,Little Italy Pizza,2 E 33rd St,40.747607,-73.9848854,0_gmaps


In [72]:
# make markers on clusters
# add ratings on all rows

markers = [(float(row.lat), float(row.lng)) for row in cluster_df.itertuples()]
marker_hover = ["%s" % (row.name) for row in cluster_df.itertuples()]

# make a dict by cluster, initialize rating string to ''
# Keyed by the cluster id itself (not the positional row index), because the
# loop below looks entries up by venues_df['cluster']; the two only coincide
# when cluster ids happen to be exactly 0..n-1.
marker_dict = {row['cluster']: {'name': row['name'],
                                'address': row['address'],
                                'lat': row['lat'],
                                'lng': row['lng'],
                                'rate_str': '',
                               } for i, row in cluster_df.iterrows()}

# source name (after the 2-character sort prefix) -> label shown in the info box
rating_labels = {'gmaps': 'Google', 'yelp': 'Yelp', 'foursquare': 'Foursquare'}

# add all rating strings
for i, row in venues_df.iterrows():
    label = rating_labels.get(row.source[2:])
    if label is None:
        # unknown source: contributes no rating line
        continue
    marker_dict[row['cluster']]['rate_str'] += \
        "<dt>%s rating</dt><dd>%s (%s reviews)</dd>\n" % (label, row.rating, row.nratings)

info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
{rate_str}

</dl>
"""

# dict order follows cluster_df row order, so marker_info lines up with markers
marker_info = [info_box_template.format(**d_item) for d_item in marker_dict.values()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig


Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [73]:
# pair every cluster's (lat, lng) marker with its HTML info text
folium_markers = [(lat, lng, info) for (lat, lng), info in zip(markers, marker_info)]

# deduplicated venues drawn as blue circles on a folium map centred on the search location
venues_map = folium.Map(location=list(location_coords), zoom_start=14)
_circle_style = dict(radius=8, color='blue', fill=True, fill_color='blue', fill_opacity=0.5)
for lat, lng, label in folium_markers:
    circle = folium.CircleMarker([lat, lng], tooltip=label, **_circle_style)
    circle.add_to(venues_map)
venues_map


In [74]:
# For each source in turn: outer-join its ratings onto the clusters by cluster
# id, name the column <source>_rating, and add a StandardScaler-standardized
# copy as <source>_rating_std.
merge_df = cluster_df
for source_tag, prefix in [('0_gmaps', 'gmaps'), ('1_yelp', 'yelp'), ('2_foursquare', 'foursquare')]:
    rating_col = prefix + '_rating'
    source_ratings = venues_df.loc[venues_df['source'] == source_tag, ['cluster', 'rating']]
    merge_df = merge_df.merge(source_ratings, on='cluster', how='outer')
    merge_df = merge_df.rename(columns={'rating': rating_col})
    merge_df[rating_col + '_std'] = StandardScaler().fit_transform(merge_df[[rating_col]])

merge_df

Unnamed: 0,cluster,name,address,lat,lng,source,gmaps_rating,gmaps_rating_std,yelp_rating,yelp_rating_std,foursquare_rating,foursquare_rating_std
0,0,NY Pizza Suprema,413 8th Ave,40.7501438,-73.9952244,0_gmaps,4.6,1.582608,,,8.8,1.18682
1,1,&pizza - Nomad,15 W 28th St,40.7451595,-73.9881498,0_gmaps,4.6,1.582608,4.5,1.742843,8.6,1.125958
2,2,Joe's Pizza,1435 Broadway,40.7546287,-73.9869994,0_gmaps,4.5,1.218945,,,8.9,1.217251
3,3,Marta,29 E 29th St,40.7444764,-73.9845825,0_gmaps,4.5,1.218945,4.0,0.516398,,
4,4,Vezzo,178 Lexington Ave,40.744488,-73.9813314,0_gmaps,4.5,1.218945,4.0,0.516398,,
5,5,Little Italy Pizza,55 W 45th St,40.7564807,-73.9814849,0_gmaps,4.4,0.855282,,,7.2,0.69992
6,6,99 Cent Fresh Pizza,473 Lexington Ave,40.7537679,-73.9741317,0_gmaps,4.4,0.855282,,,6.6,0.517332
7,7,Lombardi’s,290 8th Ave,40.7462426,-73.9974421,0_gmaps,4.3,0.491619,,,-1.0,-1.795446
8,8,Bravo Kosher Pizza,107 W 37th St,40.7521189,-73.9870459,0_gmaps,4.2,0.127956,,,5.9,0.304313
9,9,Little Italy Pizza,2 E 33rd St,40.747607,-73.9848854,0_gmaps,4.2,0.127956,3.5,-0.710047,6.8,0.578194


In [78]:
def _km_from_search_location(venue):
    """Return geopy's distance in km between the venue's (lat, lng) and location_coords."""
    venue_coords = (venue['lat'], venue['lng'])
    return distance(venue_coords, location_coords).km


merge_df['distance'] = merge_df.apply(_km_from_search_location, axis=1)
merge_df


Unnamed: 0,cluster,name,address,lat,lng,source,gmaps_rating,gmaps_rating_std,yelp_rating,yelp_rating_std,foursquare_rating,foursquare_rating_std,meanrating,nratings,w,R,bayes_score,distance
0,0,NY Pizza Suprema,413 8th Ave,40.7501438,-73.9952244,0_gmaps,4.6,1.582608,,,8.8,1.18682,1.384714,2,0.616822,1.384714,0.854123,0.827348
1,1,&pizza - Nomad,15 W 28th St,40.7451595,-73.9881498,0_gmaps,4.6,1.582608,4.5,1.742843,8.6,1.125958,1.483803,3,0.707143,1.483803,1.049261,0.415094
2,2,Joe's Pizza,1435 Broadway,40.7546287,-73.9869994,0_gmaps,4.5,1.218945,,,8.9,1.217251,1.218098,2,0.616822,1.218098,0.75135,0.700342
3,3,Marta,29 E 29th St,40.7444764,-73.9845825,0_gmaps,4.5,1.218945,4.0,0.516398,,,0.867671,2,0.616822,0.867671,0.535199,0.445817
4,4,Vezzo,178 Lexington Ave,40.744488,-73.9813314,0_gmaps,4.5,1.218945,4.0,0.516398,,,0.867671,2,0.616822,0.867671,0.535199,0.56996
5,5,Little Italy Pizza,55 W 45th St,40.7564807,-73.9814849,0_gmaps,4.4,0.855282,,,7.2,0.69992,0.777601,2,0.616822,0.777601,0.479642,0.965378
6,6,99 Cent Fresh Pizza,473 Lexington Ave,40.7537679,-73.9741317,0_gmaps,4.4,0.855282,,,6.6,0.517332,0.686307,2,0.616822,0.686307,0.423329,1.144454
7,7,Lombardi’s,290 8th Ave,40.7462426,-73.9974421,0_gmaps,4.3,0.491619,,,-1.0,-1.795446,-0.651914,2,0.616822,-0.651914,-0.402115,1.020216
8,8,Bravo Kosher Pizza,107 W 37th St,40.7521189,-73.9870459,0_gmaps,4.2,0.127956,,,5.9,0.304313,0.216134,2,0.616822,0.216134,0.133316,0.428337
9,9,Little Italy Pizza,2 E 33rd St,40.747607,-73.9848854,0_gmaps,4.2,0.127956,3.5,-0.710047,6.8,0.578194,-0.001299,3,0.707143,-0.001299,-0.000919,0.111749


In [75]:
# simple average score: row-wise mean of the standardized ratings, ignoring
# sources that have no rating for the venue
std_rating_cols = ['gmaps_rating_std', 'yelp_rating_std', 'foursquare_rating_std']
merge_df['meanrating'] = np.nanmean(merge_df[std_rating_cols], axis=1)
ranked_by_mean = merge_df.sort_values('meanrating', ascending=False)
ranked_by_mean[['name', 'address', 'gmaps_rating', 'yelp_rating', 'foursquare_rating']]


Unnamed: 0,name,address,gmaps_rating,yelp_rating,foursquare_rating
55,Stone Bridge Pizza & Salad,16 E 41st St,,4.5,
56,Trenta Tre Pizzeria,29 East 33rd St,,4.5,
1,&pizza - Nomad,15 W 28th St,4.6,4.5,8.6
0,NY Pizza Suprema,413 8th Ave,4.6,,8.8
24,800 Degrees Woodfired Kitchen,1 E 33rd St,4.5,,
25,Dj pizza,120 E 34th St,4.5,,
23,Tappo Thin Crust Pizza,49 W 24th St,4.5,,
22,Capizzi,547 9th Ave,4.5,,
21,99 Cent Fresh Pizza,151 E 43rd St,4.5,,
2,Joe's Pizza,1435 Broadway,4.5,,8.9


In [76]:
# bayes score: shrink each venue's mean standardized rating (R) toward the
# overall average, trusting R more (weight w) the more sources rated the venue
rating_cols = ['gmaps_rating_std', 'yelp_rating_std', 'foursquare_rating_std']
std_ratings = merge_df[rating_cols]

# number of sources with a rating for each venue, and its mean over all venues
merge_df['nratings'] = std_ratings.count(axis=1)
nratings_mean = np.mean(merge_df['nratings'])
# average over every available standardized rating
rating_avg = np.nanmean(std_ratings)

source_count = merge_df['nratings']
merge_df['w'] = source_count / (source_count + nratings_mean)
merge_df['R'] = np.mean(std_ratings, axis=1)
print('mean number of ratings', nratings_mean)
print('average rating', rating_avg)

weighted_own = merge_df['w'] * merge_df['R']
weighted_prior = (1 - merge_df['w']) * rating_avg
merge_df['bayes_score'] = weighted_own + weighted_prior

bayes_view_columns = ['name', 'address', 'gmaps_rating', 'yelp_rating', 'foursquare_rating', 'nratings', 'bayes_score']
merge_df.sort_values('bayes_score', ascending=False)[bayes_view_columns]




mean number of ratings 1.2424242424242424
average rating 4.621416167545368e-16


Unnamed: 0,name,address,gmaps_rating,yelp_rating,foursquare_rating,nratings,bayes_score
1,&pizza - Nomad,15 W 28th St,4.6,4.5,8.6,3,1.049261
0,NY Pizza Suprema,413 8th Ave,4.6,,8.8,2,0.854123
56,Trenta Tre Pizzeria,29 East 33rd St,,4.5,,1,0.777214
55,Stone Bridge Pizza & Salad,16 E 41st St,,4.5,,1,0.777214
2,Joe's Pizza,1435 Broadway,4.5,,8.9,2,0.75135
24,800 Degrees Woodfired Kitchen,1 E 33rd St,4.5,,,1,0.543584
21,99 Cent Fresh Pizza,151 E 43rd St,4.5,,,1,0.543584
25,Dj pizza,120 E 34th St,4.5,,,1,0.543584
23,Tappo Thin Crust Pizza,49 W 24th St,4.5,,,1,0.543584
22,Capizzi,547 9th Ave,4.5,,,1,0.543584


In [None]:
def dedupe(dedupe_list):
    """Cluster duplicate venues across sources and score each cluster.

    Args:
        dedupe_list: list of DataFrames with name, address, rating, nratings,
            lat and lng columns. A frame's position in the list is its source
            id: 0 is reported as gmaps, 1 as yelp, 2 as foursquare.

    Returns:
        DataFrame built from one representative row per cluster, outer-joined
        with each source's rating. For every source it has <source>_rating and
        <source>_rating_std columns, plus nratings (number of sources with a
        rating), w, R and bayes_score. Sorted by bayes_score, descending.
    """
    # tag copies with the source id so the caller's frames are not modified
    tagged = [source_df.assign(source=i) for i, source_df in enumerate(dedupe_list)]
    venues_df = pd.concat(tagged).reset_index()
    venues_df['latlong'] = venues_df[['lat', 'lng']].apply(tuple, axis=1)
    venues_df['shortname'] = venues_df['name'].apply(lambda n: n[:25])

    # dedupe and assign cluster id
    venues_df2 = pandas_dedupe.dedupe_dataframe(venues_df, ['shortname', 'address', ('latlong', 'LatLong')])
    venues_df['cluster'] = venues_df2['cluster id']
    venues_df = venues_df.sort_values(['cluster', 'source'])[['cluster', 'name', 'address', 'rating', 'nratings', 'lat', 'lng', 'source']]

    # group by clusters, uniquify name
    cluster_df = venues_df.groupby('cluster')[['name', 'address', 'lat', 'lng', 'source']] \
                          .first() \
                          .reset_index()

    # merge ratings by source
    # NOTE(review): the source ids are assigned as ints but were compared with
    # '0'/'1'/'2', which only matches if the column has become strings by this
    # point (pandas_dedupe appears to cast columns to str in place -- confirm).
    # Comparing string forms works in both cases.
    source_key = venues_df['source'].astype(str)
    merge_df = cluster_df
    for source_id, prefix in enumerate(('gmaps', 'yelp', 'foursquare')):
        rating_col = prefix + '_rating'
        source_ratings = venues_df.loc[source_key == str(source_id), ['cluster', 'rating']]
        merge_df = merge_df \
            .merge(source_ratings, on='cluster', how='outer') \
            .rename(columns={'rating': rating_col})
        merge_df[rating_col + '_std'] = StandardScaler().fit_transform(merge_df[[rating_col]])

    # bayes score: weight each venue's mean standardized rating (R) against the
    # overall average, trusting R more the more sources rated the venue
    rating_cols = ['gmaps_rating_std', 'yelp_rating_std', 'foursquare_rating_std']
    merge_df['nratings'] = merge_df[rating_cols].count(axis=1)
    nratings_mean = np.mean(merge_df['nratings'])
    rating_avg = np.nanmean(merge_df[rating_cols])
    merge_df['w'] = merge_df['nratings']/(merge_df['nratings'] + nratings_mean)
    merge_df['R'] = np.mean(merge_df[rating_cols], axis=1)
    merge_df['bayes_score'] = merge_df['w'] * merge_df['R'] + (1 - merge_df['w']) * rating_avg
    return merge_df.sort_values('bayes_score', ascending=False)


# only dedupe the sources that actually returned results
dedupe_list = [df for df in [gmaps_df, yelp_df, foursquare_df] if df is not None]

dedupe(dedupe_list)

In [None]:
# app.py

location = '40.6915812,-73.9954095'
# the *_get_df helpers take a (lat, lng) tuple, not the 'lat,lng' string
location_coords = tuple(float(part) for part in location.split(','))
keyword = 'pizza'
ltype = 'establishment'
rankby = 'distance'

gmaps_df = gmaps_get_df(location_coords, keyword)
yelp_df = yelp_get_df(location_coords, keyword)
foursquare_df = foursquare_get_df(location_coords, keyword)

# each helper returns None when its search came back empty
source_dfs = [gmaps_df, yelp_df, foursquare_df]
dedupe_list = [df for df in source_dfs if df is not None]
print(*(0 if df is None else len(df) for df in source_dfs))
print("Deduping %d dataframes" % len(dedupe_list))
dedupe(dedupe_list).to_json()
