# Pizza pizza pizza

Get top venues by keyword and location, querying Google, Yelp, Foursquare

## 1. Intro
### Motivation
 - Gmaps and Yelp GUIs sorting and filtering are limited.
 - Query all 3 services and bring merged results into sortable table and map widget

### Google Maps setup

 - Needs a Google API key and gmaps python module
 - [Create Google Cloud credentials and give access to Places APIs](https://console.cloud.google.com/google/maps-apis/credentials) (also, restrict IP or set other restrictions)
 - `conda install -c conda-forge -y gmaps`
 - put key in `secrets/apikey.txt`
 - `gmaps` Jupyter nbextension to show maps in notebook, with marker pins etc.

```
conda install -c conda-forge -y jupyter_contrib_nbextensions
jupyter nbextension enable --py gmaps
jupyter notebook
```

### Yelp setup
 - needs Yelp API key and module
 - https://www.yelp.com/developers/documentation/v3
 - https://github.com/gfairchild/yelpapi
 - put key in `secrets/yelpkey.txt`
 
### Foursquare setup
- Needs Foursquare API key and module
- https://developer.foursquare.com/docs/places-api/getting-started/
- https://github.com/mLewisLogic/foursquare
- OAuth id in `secrets/foursquare_id.txt`
- OAuth secret in `secrets/foursquare_secret.txt`

See `requirements.txt` for versions used, other requirements (requests, folium, Flask)

Was going to try OpenTable and TripAdvisor but their language seems to limit API key access to approved commercial partners.
- https://dev.opentable.com/affiliate-partners/
- https://www.tripadvisor.com/APIAccessSupport

In [1]:
import time
from pprint import pprint
import ipywidgets
from ipywidgets import widgets, interact
from itertools import product
from os import path

import multiprocessing
from multiprocessing import Pool

import tqdm
from tqdm import tqdm

import traceback
import pdb

import numpy as np
import pandas as pd

# cluster similar entities
import pandas_dedupe

import sklearn
from sklearn.preprocessing import StandardScaler

import requests, json 

import qgrid
from qgrid import show_grid

# convert coords to km using haversine distance 
import geopy
from geopy.distance import distance

# put API key in this file
# recommend restricting to your IP https://console.cloud.google.com/apis/credentials?project=myproject
import gmaps
with open('secrets/apikey.txt') as f:
    api_key = f.readline().strip()
    f.close
gmaps.configure(api_key=api_key)

# https://github.com/gfairchild/yelpapi
import yelpapi
from yelpapi import YelpAPI
with open('secrets/yelpkey.txt') as f:
    yelp_key = f.readline().strip()
    f.close
yelp_api = YelpAPI(yelp_key)

import foursquare
from foursquare import Foursquare, FoursquareException
with open('secrets/foursquare_id.txt') as f:
    foursquare_id = f.readline().strip()
    f.close
with open('secrets/foursquare_secret.txt') as f:
    foursquare_secret = f.readline().strip()
    f.close

# interactive maps
import folium

print(f"numpy                {np.__version__:<20}")
print(f"pandas               {pd.__version__:<20}")
print(f"ipywidgets           {ipywidgets.__version__:<20}")
print(f"qgrid                {qgrid.__version__:<20}")
print(f"sklearn              {sklearn.__version__:<20}")
print(f"requests             {requests.__version__:<20}")
print(f"geopy                {geopy.__version__:<20}")
print(f"gmaps                {gmaps.__version__:<20}")
print(f"foursquare           {foursquare.__version__:<20}")
print(f"numpy                {folium.__version__:<20}")


numpy                1.20.2              
pandas               1.2.4               
ipywidgets           7.6.3               
qgrid                1.3.1               
sklearn              0.24.2              
requests             2.25.1              
geopy                2.1.0               
gmaps                0.9.0               
foursquare           1!2020.1.30         
numpy                0.12.1              


## 2. Global setup, keyword, location

In [2]:
# pick a search term
keyword = 'pizza'
keyword_options = [('Pizza', 'pizza'), ('Coffee', 'coffee'), ('Ice Cream', 'icecream')]

@interact
def get_kw(kw = widgets.Dropdown(
    options=keyword_options,
    value=keyword,
    description='Search term:  ',
    disabled=False,
)):
    global keyword
    keyword = kw
    return None

interactive(children=(Dropdown(description='Search term:  ', options=(('Pizza', 'pizza'), ('Coffee', 'coffee')…

In [3]:
keyword = 'miniature golf'


In [4]:
# pick a location
# location = '40.7484, -73.9857'
# location_coords = tuple(eval(location))
location_options = [('Midtown', '40.7484, -73.9857'),
                    ('Downtown', '40.7077443,-74.0139089'),
                    ('Upper East Side', '40.7711473,-73.9661166'),
                    ('Upper West Side', '40.778794,-73.984257'),
                    ('Brooklyn Heights', '40.6915812,-73.9954095'), 
                    ('Grand Army Plaza', '40.671872,-73.972544'),
                    ('Bay Ridge', '40.6292633,-74.0309554'),
                    ('Williamsburg', '40.7144609,-73.9553373'),
                  ]

@interact
def get_loc(loc = widgets.Dropdown(
    options=location_options,
    # value=myloc,
    description='Location:',
)):
    global location
    global location_coords
    location = loc
    location_coords = tuple(eval(loc))
    return None

interactive(children=(Dropdown(description='Location:', options=(('Midtown', '40.7484, -73.9857'), ('Downtown'…

In [5]:
# location='40.655001,-74.0059557'
# location = '40.7611383,-73.9166489' # Astoria
location_coords = tuple(eval(location))
location, location_coords

# test_pickle_filename = keyword + "_" + location.replace(' ', '').lower() + ".pkl"
# test_pickle_filename


('40.7484, -73.9857', (40.7484, -73.9857))

In [6]:
# pick anything we don't have a pickle file for

for test_keyword, test_location in product(keyword_options, location_options):
    print(test_keyword, test_location)
    keyword = test_keyword[1]
    location_name = test_location[0]
    location = test_location[1]
    location_coords = tuple(eval(test_location[1]))
    test_pickle_filename = keyword + "_" + location_name.replace(' ', '').lower() + ".pkl"
    print(test_pickle_filename)
    if not path.exists(test_pickle_filename):
        break
        
print(location_name, keyword, location, location_coords)
print(test_pickle_filename)

('Pizza', 'pizza') ('Midtown', '40.7484, -73.9857')
pizza_midtown.pkl
('Pizza', 'pizza') ('Downtown', '40.7077443,-74.0139089')
pizza_downtown.pkl
('Pizza', 'pizza') ('Upper East Side', '40.7711473,-73.9661166')
pizza_uppereastside.pkl
('Pizza', 'pizza') ('Upper West Side', '40.778794,-73.984257')
pizza_upperwestside.pkl
('Pizza', 'pizza') ('Brooklyn Heights', '40.6915812,-73.9954095')
pizza_brooklynheights.pkl
('Pizza', 'pizza') ('Grand Army Plaza', '40.671872,-73.972544')
pizza_grandarmyplaza.pkl
('Pizza', 'pizza') ('Bay Ridge', '40.6292633,-74.0309554')
pizza_bayridge.pkl
('Pizza', 'pizza') ('Williamsburg', '40.7144609,-73.9553373')
pizza_williamsburg.pkl
('Coffee', 'coffee') ('Midtown', '40.7484, -73.9857')
coffee_midtown.pkl
('Coffee', 'coffee') ('Downtown', '40.7077443,-74.0139089')
coffee_downtown.pkl
('Coffee', 'coffee') ('Upper East Side', '40.7711473,-73.9661166')
coffee_uppereastside.pkl
('Coffee', 'coffee') ('Upper West Side', '40.778794,-73.984257')
coffee_upperwestside.pk

## 3. Google Maps

In [7]:
location_coords

(40.7144609, -73.9553373)

In [8]:
# gmap of chosen location

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}
fig = gmaps.figure(center=location_coords, zoom_level=12, layout=figure_layout)
fig.add_layer(gmaps.marker_layer([location_coords]))
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [9]:
# global options for all search services APIs
MIN_USER_RATINGS = 20
MIN_RATING = 0
NRESULTS = 50
RADIUS = 1000


In [10]:
# gmaps options
GMAPS_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
# https://developers.google.com/places/web-service/search#TextSearchRequests
#https://developers.google.com/places/web-service/supported_types
# rankby='prominence'
rankby='distance'
ltype='establishment'


In [11]:
def gmaps_get_first_page(api_key, location, **kwargs):
    """get first page of results from gmaps using api_key, location, kwargs for search spec"""
    # use either rankby or radius kwarg
    request_url = GMAPS_URL + '?key=' + api_key
    request_url += '&location=' + location
    for name, val in kwargs.items():
        request_url += '&' + name + '=' + str(val)
    r = requests.get(request_url)
    j = r.json()
    return j


def gmaps_get_next_page(api_key, next_page_token):
    """get next search engine results page page using search token, waiting until available"""
    r = requests.get(GMAPS_URL + '?pagetoken=' + next_page_token +
                        '&key=' + api_key)
    for i in range(10):
        j = r.json()
        if not j['results']: # wait for next page to be available
            time.sleep(5)
            continue
        else:
            return j


def gmaps_get_all_df(api_key, location, **kwargs):
    """return dataframe of all results using api_key, location, search kwargs"""
    # get first page
    j = gmaps_get_first_page(api_key, location, **kwargs)
    venues_df = pd.json_normalize(j['results'])

    # get pages while additional pages available
    while 'next_page_token' in j:
        next_page_token = j['next_page_token']
        time.sleep(5)
        j = gmaps_get_next_page(api_key, next_page_token)
        venues_df = venues_df.append(pd.json_normalize(j['results']))
        
    return venues_df


def gmaps_get_df(location_coords, keyword):

    # use either rankby or radius
    location = "%.7f,%.7f" % location_coords
    gmaps_df = gmaps_get_all_df(api_key, location, keyword=keyword, ltype=ltype, radius=RADIUS)
    if gmaps_df.empty:
        return None
    else:
        # gmaps_get_df(api_key, location, keyword=keyword, ltype=ltype, radius=RADIUS)
        gmaps_df = gmaps_df.loc[(gmaps_df['user_ratings_total'] >= MIN_USER_RATINGS) & (gmaps_df['rating'] >= MIN_RATING)] \
                           .sort_values(['rating', 'user_ratings_total'], ascending=False) \
                           .reset_index(drop=True)
        gmaps_df = gmaps_df[['name', 'vicinity', 'rating', 'user_ratings_total', 'geometry.location.lat', 'geometry.location.lng']]
        gmaps_df.columns = ['name', 'address', 'rating', 'nratings', 'lat', 'lng']
        # drop trailing ", Brooklyn"
        gmaps_df['address'] = gmaps_df['address'].apply(lambda address: " ".join(address.split(',')[:-1]))
        gmaps_df['distance'] = gmaps_df.apply(lambda row: distance((row['lat'], row['lng']), location_coords).km,
                                              axis=1)
        gmaps_df['category'] = keyword
        return gmaps_df


In [12]:
%%time
gmaps_df = gmaps_get_df(location_coords, keyword)
gmaps_df = gmaps_df.loc[(gmaps_df['nratings'] >= MIN_USER_RATINGS) & (gmaps_df['rating'] >= MIN_RATING)] \
        .sort_values(['rating', 'nratings'], ascending=False) \
        .reset_index(drop=True)
gmaps_df.to_pickle('gmaps_' + test_pickle_filename)
gmaps_df


CPU times: user 46.1 ms, sys: 10.7 ms, total: 56.8 ms
Wall time: 845 ms


Unnamed: 0,name,address,rating,nratings,lat,lng,distance,category
0,Serendipity bk,229 S 3rd St,4.9,109,40.711055,-73.958244,0.451033,icecream
1,The Screen Door,145 Driggs Ave,4.8,94,40.722987,-73.944345,1.326333,icecream
2,Tipsy Scoop,270 Metropolitan Ave,4.7,266,40.714639,-73.958565,0.273468,icecream
3,Gelateria Gentile - Williamsburg,253 Wythe Ave,4.6,759,40.716225,-73.963831,0.743962,icecream
4,Van Leeuwen Ice Cream,620 Manhattan Ave,4.5,498,40.723578,-73.950543,1.090451,icecream
5,Taiyaki NYC - Williamsburg,294 Bedford Ave,4.5,465,40.714367,-73.961531,0.523421,icecream
6,Van Leeuwen Ice Cream,204 Wythe Ave,4.4,978,40.718381,-73.961845,0.701326,icecream
7,16 Handles,139 N 7th St,4.3,284,40.718454,-73.95825,0.507097,icecream
8,Milk Bar,382 Metropolitan Ave,4.1,548,40.71396,-73.955485,0.05701,icecream
9,Uncle Louie G,341 Graham Ave,3.8,80,40.714765,-73.94459,0.908775,icecream


In [13]:
# sort grid by clicking on header (can also click on filter button)
gmaps_df

Unnamed: 0,name,address,rating,nratings,lat,lng,distance,category
0,Serendipity bk,229 S 3rd St,4.9,109,40.711055,-73.958244,0.451033,icecream
1,The Screen Door,145 Driggs Ave,4.8,94,40.722987,-73.944345,1.326333,icecream
2,Tipsy Scoop,270 Metropolitan Ave,4.7,266,40.714639,-73.958565,0.273468,icecream
3,Gelateria Gentile - Williamsburg,253 Wythe Ave,4.6,759,40.716225,-73.963831,0.743962,icecream
4,Van Leeuwen Ice Cream,620 Manhattan Ave,4.5,498,40.723578,-73.950543,1.090451,icecream
5,Taiyaki NYC - Williamsburg,294 Bedford Ave,4.5,465,40.714367,-73.961531,0.523421,icecream
6,Van Leeuwen Ice Cream,204 Wythe Ave,4.4,978,40.718381,-73.961845,0.701326,icecream
7,16 Handles,139 N 7th St,4.3,284,40.718454,-73.95825,0.507097,icecream
8,Milk Bar,382 Metropolitan Ave,4.1,548,40.71396,-73.955485,0.05701,icecream
9,Uncle Louie G,341 Graham Ave,3.8,80,40.714765,-73.94459,0.908775,icecream


In [14]:
# plot on google map

markers = [(row.lat, row.lng) for row in gmaps_df.itertuples()]
marker_hover = ["%s: %s (%s)" % (row.name, row.rating, row.nratings) for row in gmaps_df.itertuples()]
info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
<dt>Google rating</dt><dd>{rating}</dd>
<dt>Google reviews</dt><dd>{nratings}</dd>
</dl>
"""
marker_info = [info_box_template.format(**row) for i, row in gmaps_df.iterrows()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

fig = gmaps.figure(layout=figure_layout, center=eval(location), zoom_level=14)
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [15]:
folium_markers = [(a[0], a[1], b) for a, b in zip(markers, marker_info)]

venues_map = folium.Map(location=[*location_coords], zoom_start=14)
for lat, lng, label in folium_markers:
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        tooltip=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.5
    ).add_to(venues_map)
    
venues_map

## 4. Yelp

In [16]:
def yelp_get_df(location_coords, keyword):
    lat, lng = location_coords
    response = yelp_api.search_query(categories=keyword, latitude=lat, longitude=lng, 
                                     radius=RADIUS, sort_by=rankby, limit=NRESULTS)

    yelp_df = pd.json_normalize(response['businesses'])
    if not yelp_df.empty:    
        yelp_df = yelp_df.loc[(yelp_df['review_count'] >= MIN_USER_RATINGS) & (yelp_df['rating'] >= MIN_RATING)] \
                         .sort_values(['rating', 'review_count'], ascending=False) \
                         .reset_index(drop=True)
        display_columns = ['name', 'location.address1', 'rating', 'review_count', 'coordinates.latitude', 'coordinates.longitude', 'url']
        yelp_df = yelp_df[display_columns]
        yelp_df.columns = ['name', 'address', 'rating', 'nratings', 'lat', 'lng', 'url']
        yelp_df['distance'] = yelp_df.apply(lambda row: distance((row['lat'], row['lng']), location_coords).km,
                                              axis=1)
        yelp_df['category'] = keyword
        
        if yelp_df.empty:
            return None
        else:
            return yelp_df
    else:
        return None

yelp_df = yelp_get_df(location_coords, keyword)
yelp_df

Unnamed: 0,name,address,rating,nratings,lat,lng,url,distance,category
0,Kitsby,186 Grand St,4.5,336,40.71411,-73.960977,https://www.yelp.com/biz/kitsby-brooklyn-2?adj...,0.478131,icecream
1,Wowfulls,90 Kent Ave,4.5,331,40.721231,-73.962162,https://www.yelp.com/biz/wowfulls-brooklyn-9?a...,0.947486,icecream
2,Taiyaki NYC - Williamsburg,294 Bedford Ave,4.5,284,40.714369,-73.961554,https://www.yelp.com/biz/taiyaki-nyc-williamsb...,0.525359,icecream
3,Tipsy Scoop,270 Metropolitan Ave,4.5,121,40.714596,-73.958516,https://www.yelp.com/biz/tipsy-scoop-brooklyn?...,0.26902,icecream
4,Roll'n Chill,90 Kent Ave,4.5,42,40.721369,-73.960766,https://www.yelp.com/biz/rolln-chill-brooklyn-...,0.893813,icecream
5,The Screen Door,145 Driggs Ave,4.5,40,40.72299,-73.94431,https://www.yelp.com/biz/the-screen-door-brook...,1.3286,icecream
6,Van Leeuwen Ice Cream - Williamsburg,204 Wythe Ave,4.0,271,40.71832,-73.961891,https://www.yelp.com/biz/van-leeuwen-ice-cream...,0.700211,icecream
7,Uncle Louie G,341 Graham Ave,4.0,66,40.71482,-73.94455,https://www.yelp.com/biz/uncle-louie-g-brookly...,0.912366,icecream
8,OddFellows Ice Cream - Domino Park,40 River St,4.0,33,40.716011,-73.966789,https://www.yelp.com/biz/oddfellows-ice-cream-...,0.982794,icecream
9,16 Handles,139 N 7th St,3.5,95,40.718453,-73.95825,https://www.yelp.com/biz/16-handles-brooklyn-4...,0.507095,icecream


In [17]:
# test_pickle_filename = 'z.pkl'
yelp_df.to_pickle('yelp_' + test_pickle_filename)
yelp_df

Unnamed: 0,name,address,rating,nratings,lat,lng,url,distance,category
0,Kitsby,186 Grand St,4.5,336,40.71411,-73.960977,https://www.yelp.com/biz/kitsby-brooklyn-2?adj...,0.478131,icecream
1,Wowfulls,90 Kent Ave,4.5,331,40.721231,-73.962162,https://www.yelp.com/biz/wowfulls-brooklyn-9?a...,0.947486,icecream
2,Taiyaki NYC - Williamsburg,294 Bedford Ave,4.5,284,40.714369,-73.961554,https://www.yelp.com/biz/taiyaki-nyc-williamsb...,0.525359,icecream
3,Tipsy Scoop,270 Metropolitan Ave,4.5,121,40.714596,-73.958516,https://www.yelp.com/biz/tipsy-scoop-brooklyn?...,0.26902,icecream
4,Roll'n Chill,90 Kent Ave,4.5,42,40.721369,-73.960766,https://www.yelp.com/biz/rolln-chill-brooklyn-...,0.893813,icecream
5,The Screen Door,145 Driggs Ave,4.5,40,40.72299,-73.94431,https://www.yelp.com/biz/the-screen-door-brook...,1.3286,icecream
6,Van Leeuwen Ice Cream - Williamsburg,204 Wythe Ave,4.0,271,40.71832,-73.961891,https://www.yelp.com/biz/van-leeuwen-ice-cream...,0.700211,icecream
7,Uncle Louie G,341 Graham Ave,4.0,66,40.71482,-73.94455,https://www.yelp.com/biz/uncle-louie-g-brookly...,0.912366,icecream
8,OddFellows Ice Cream - Domino Park,40 River St,4.0,33,40.716011,-73.966789,https://www.yelp.com/biz/oddfellows-ice-cream-...,0.982794,icecream
9,16 Handles,139 N 7th St,3.5,95,40.718453,-73.95825,https://www.yelp.com/biz/16-handles-brooklyn-4...,0.507095,icecream


In [18]:
markers = [(row.lat, row.lng) for row in yelp_df.itertuples()]
marker_hover = ["%s: %s (%s)" % (row.name, row.rating, row.nratings) for row in yelp_df.itertuples()]

info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
<dt>Yelp rating</dt><dd>{rating}</dd>
<dt>Yelp reviews</dt><dd>{nratings}</dd>
</dl>
"""

marker_info = [info_box_template.format(**row) for i, row in yelp_df.iterrows()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [19]:
folium_markers = [(a[0], a[1], b) for a, b in zip(markers, marker_info)]

venues_map = folium.Map(location=[*location_coords], zoom_start=14)
for lat, lng, label in folium_markers:
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        tooltip=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.5
    ).add_to(venues_map)
venues_map

## 5. Foursquare

In [20]:
def foursquare_get_df(location_coords, keyword):
    location_str = "%.7f,%.7f" % location_coords
    client = Foursquare(client_id=foursquare_id, 
                                   client_secret=foursquare_secret, 
                                   redirect_uri='http://streeteye.com/')
    response = client.venues.search(params={'query': keyword, 'll': "%s" % location_str, 
                                            'radius': RADIUS, 'limit': NRESULTS})

    foursquare_array = []

    for i, venue in pd.json_normalize(response['venues']).iterrows():
        venue_id = venue['id']
        # query detailed venue info from foursquare
        venue_name = venue['name']
        venue_address = venue['location.address']
        # sometimes no URL
        try:
            venue_url = venue['delivery.url']
        except:
            venue_url = ''
        venue_lat = venue['location.lat']
        venue_lng = venue['location.lng']
        # default these to -1
        try:
            # get rating, nratings with another API call for venue details
            venue_details = client.venues(venue_id)['venue']
            venue_rating = venue_details['rating']
            venue_nratings = venue_details['ratingSignals']
        except FoursquareException as e:
            print("Foursquare exception", type(e), str(e))
        except Exception as e:
            continue
            # sometimes no rating ... probably not popular enough
            # print(type(e), str(e))
            # print(traceback.format_exc())
            # print("No rating for %s" % venue_name)

        foursquare_array.append([venue_name, venue_address, venue_rating, venue_nratings, venue_lat, venue_lng, venue_url])
            
    foursquare_df = pd.DataFrame(foursquare_array)
    display(foursquare_df)
    if foursquare_df.empty:
        return None
    else:
        foursquare_df.columns = ['name', 'address', 'rating', 'nratings', 'lat', 'lng', 'url']
        foursquare_df = foursquare_df.loc[(foursquare_df['nratings'] >= MIN_USER_RATINGS) & (foursquare_df['rating'] >= MIN_RATING)] \
                                     .sort_values(['rating', 'nratings'], ascending=False) \
                                     .reset_index(drop=True)
        try:
            foursquare_df['distance'] = foursquare_df.apply(lambda row: geopy.distance.geodesic((row['lat'], row['lng']), location_coords).km,
                                                        axis=1)
        except:
            pass
        foursquare_df['category'] = keyword
        
        return foursquare_df

foursquare_df = foursquare_get_df(location_coords, keyword)
foursquare_df

Unnamed: 0,0,1,2,3,4,5,6
0,Van Leeuwen Ice Cream,204 Wythe Ave,9.0,467,40.718381,-73.961845,
1,Oddfellows Ice Cream Co.,40 S River St,8.1,16,40.71604,-73.96677,https://www.seamless.com/menu/oddfellows-ice-c...
2,Blossom Ice Cream & The Poké,54 N 6th St,6.4,46,40.719645,-73.962121,


Unnamed: 0,name,address,rating,nratings,lat,lng,url,distance,category
0,Van Leeuwen Ice Cream,204 Wythe Ave,9.0,467,40.718381,-73.961845,,0.701326,icecream
1,Blossom Ice Cream & The Poké,54 N 6th St,6.4,46,40.719645,-73.962121,,0.812413,icecream


In [21]:
if foursquare_df is not None and not foursquare_df.empty:
    foursquare_df.to_pickle('foursquare_' + test_pickle_filename)
    
foursquare_df

Unnamed: 0,name,address,rating,nratings,lat,lng,url,distance,category
0,Van Leeuwen Ice Cream,204 Wythe Ave,9.0,467,40.718381,-73.961845,,0.701326,icecream
1,Blossom Ice Cream & The Poké,54 N 6th St,6.4,46,40.719645,-73.962121,,0.812413,icecream


In [22]:
markers = [(row.lat, row.lng) for row in foursquare_df.itertuples()]
marker_hover = ["%s: %s (%s)" % (row.name, row.rating, row.nratings) for row in foursquare_df.itertuples()]

info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
<dt>Foursquare rating</dt><dd>{rating}</dd>
<dt>Foursquare reviews</dt><dd>{nratings}</dd>
</dl>
"""
marker_info = [info_box_template.format(**d_item) for i, d_item in foursquare_df.iterrows()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [23]:
folium_markers = [(a[0], a[1], b) for a, b in zip(markers, marker_info)]

venues_map = folium.Map(location=[*location_coords], zoom_start=14)
for lat, lng, label in folium_markers:
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        tooltip=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.5
    ).add_to(venues_map)
venues_map

## 6. Merge dataframes after deduplication

In [24]:
# put everything in one big df
pd.set_option('display.max_rows', None)

gmaps_df_copy, yelp_df_copy, foursquare_df_copy = None, None, None

try:
    gmaps_df_copy = gmaps_df.copy()
    gmaps_df_copy['source'] = '0_gmaps'
except: 
    pass

try:
    yelp_df_copy = yelp_df.copy() 
    yelp_df_copy['source'] = '1_yelp'
except:
    pass

try:
    foursquare_df_copy = foursquare_df.copy()
    foursquare_df_copy['source'] = '2_foursquare'
except:
    pass


venues_df = pd.concat(list(filter(lambda df: df is not None, [gmaps_df_copy, yelp_df_copy, foursquare_df_copy]))).reset_index()
venues_df['latlong'] = venues_df[['lat','lng']].apply(tuple, axis=1)

venues_df.sort_values('name')

Unnamed: 0,index,name,address,rating,nratings,lat,lng,distance,category,source,url,latlong
20,9,16 Handles,139 N 7th St,3.5,95,40.718453,-73.95825,0.507095,icecream,1_yelp,https://www.yelp.com/biz/16-handles-brooklyn-4...,"(40.71845346713176, -73.95825003004205)"
7,7,16 Handles,139 N 7th St,4.3,284,40.718454,-73.95825,0.507097,icecream,0_gmaps,,"(40.7184535, -73.95824999999999)"
24,1,Blossom Ice Cream & The Poké,54 N 6th St,6.4,46,40.719645,-73.962121,0.812413,icecream,2_foursquare,,"(40.71964523671903, -73.96212134508906)"
3,3,Gelateria Gentile - Williamsburg,253 Wythe Ave,4.6,759,40.716225,-73.963831,0.743962,icecream,0_gmaps,,"(40.7162246, -73.9638314)"
11,0,Kitsby,186 Grand St,4.5,336,40.71411,-73.960977,0.478131,icecream,1_yelp,https://www.yelp.com/biz/kitsby-brooklyn-2?adj...,"(40.71411, -73.960977)"
8,8,Milk Bar,382 Metropolitan Ave,4.1,548,40.71396,-73.955485,0.05701,icecream,0_gmaps,,"(40.7139599, -73.95548459999999)"
22,11,New York Ice Cream Truck,80 Wythe Ave,1.5,34,40.722301,-73.957465,0.888997,icecream,1_yelp,https://www.yelp.com/biz/new-york-ice-cream-tr...,"(40.7223009797052, -73.9574650919071)"
19,8,OddFellows Ice Cream - Domino Park,40 River St,4.0,33,40.716011,-73.966789,0.982794,icecream,1_yelp,https://www.yelp.com/biz/oddfellows-ice-cream-...,"(40.71601067638834, -73.96678879398446)"
10,10,OddFellows Ice Cream Co. Domino Park,40 River St,3.7,51,40.716005,-73.966791,0.982828,icecream,0_gmaps,,"(40.7160051, -73.9667905)"
15,4,Roll'n Chill,90 Kent Ave,4.5,42,40.721369,-73.960766,0.893813,icecream,1_yelp,https://www.yelp.com/biz/rolln-chill-brooklyn-...,"(40.7213689, -73.9607664)"


In [25]:
# manual fix 
venues_df.loc[venues_df['address']=='267 A Smith St', 'address'] = '267 Smith St'
venues_df.loc[venues_df['name']=='Fornino', 'address'] = 'Pier 6 Brooklyn Bridge Park'
venues_df.loc[venues_df['name']=='Fornino', 'lat'] = 40.6931806
venues_df.loc[venues_df['name']=='Fornino', 'lng'] = -74.0011955
venues_df.loc[venues_df['name']=='Brado', 'lat'] = 40.690557
venues_df.loc[venues_df['name']=='Brado', 'lng'] = -73.994911
venues_df['latlong'] = venues_df[['lat','lng']].apply(tuple, axis=1)

venues_df.loc[venues_df['address']=='71 Smith St', 'address'] = '57 Smith St'
venues_df.loc[venues_df['address']=='71 Smith St', 'lat'] = 40.6896242
venues_df.loc[venues_df['address']=='71 Smith St', 'lng'] = -73.9882489

venues_df.loc[venues_df['address']=='212b Pacific St', 'address'] = '212 Pacific St'
venues_df.loc[venues_df['address']=='212 Pacific St', 'lat'] = 40.6888305
venues_df.loc[venues_df['address']=='212 Pacific St', 'lng'] = -73.9921739

# venues_df


In [26]:
# run dedupe algorithm using name, address as default texts, latlong as latlong
venues_df['shortname'] = venues_df['name'].apply(lambda n: n[:25])
venues_df2 = pandas_dedupe.dedupe_dataframe(venues_df, ['category', 'name', 'address', ('latlong', 'LatLong')])


Importing data ...
Reading from dedupe_dataframe_learned_settings
Clustering...


  df[i] = df[i].str.replace('[^\w\s\.\-\(\)\,\:\/\\\\]','')


# duplicate sets 17


In [27]:
# view clustering
venues_df['cluster'] = venues_df2['cluster id']
clustercols = ['cluster', 'name', 'address', 'rating', 'category', 'nratings', 'lat', 'lng', 'distance', 'source']
venues_df = venues_df.sort_values(['cluster', 'source'])[clustercols]
venues_df

Unnamed: 0,cluster,name,address,rating,category,nratings,lat,lng,distance,source
1,0,The Screen Door,145 Driggs Ave,4.8,icecream,94,40.7229874,-73.9443448,1.3263332525152245,0_gmaps
16,0,The Screen Door,145 Driggs Ave,4.5,icecream,40,40.72299,-73.94431,1.3285996945497376,1_yelp
2,1,Tipsy Scoop,270 Metropolitan Ave,4.7,icecream,266,40.7146394,-73.9585652,0.2734675833756646,0_gmaps
14,1,Tipsy Scoop,270 Metropolitan Ave,4.5,icecream,121,40.7145963,-73.9585161,0.2690200376068553,1_yelp
5,2,Taiyaki NYC - Williamsburg,294 Bedford Ave,4.5,icecream,465,40.7143671,-73.9615306,0.523420707022425,0_gmaps
13,2,Taiyaki NYC - Williamsburg,294 Bedford Ave,4.5,icecream,284,40.7143691,-73.9615536,0.5253593967701266,1_yelp
6,3,Van Leeuwen Ice Cream,204 Wythe Ave,4.4,icecream,978,40.718381,-73.961845,0.7013255197266905,0_gmaps
17,3,Van Leeuwen Ice Cream - Williamsburg,204 Wythe Ave,4.0,icecream,271,40.7183199,-73.9618911,0.7002109000674294,1_yelp
23,3,Van Leeuwen Ice Cream,204 Wythe Ave,9.0,icecream,467,40.718381,-73.961845,0.7013255197266905,2_foursquare
7,4,16 Handles,139 N 7th St,4.3,icecream,284,40.7184535,-73.95824999999999,0.5070973412534319,0_gmaps


In [28]:
# group by clusters
cluster_df = venues_df.groupby('cluster')[['name', 'address', 'lat', 'lng', 'distance', 'source', 'category']] \
    .first() \
    .reset_index()
cluster_df

Unnamed: 0,cluster,name,address,lat,lng,distance,source,category
0,0,The Screen Door,145 Driggs Ave,40.7229874,-73.9443448,1.3263332525152245,0_gmaps,icecream
1,1,Tipsy Scoop,270 Metropolitan Ave,40.7146394,-73.9585652,0.2734675833756646,0_gmaps,icecream
2,2,Taiyaki NYC - Williamsburg,294 Bedford Ave,40.7143671,-73.9615306,0.523420707022425,0_gmaps,icecream
3,3,Van Leeuwen Ice Cream,204 Wythe Ave,40.718381,-73.961845,0.7013255197266905,0_gmaps,icecream
4,4,16 Handles,139 N 7th St,40.7184535,-73.95824999999999,0.5070973412534319,0_gmaps,icecream
5,5,Uncle Louie G,341 Graham Ave,40.7147646,-73.9445896,0.9087748019620068,0_gmaps,icecream
6,6,OddFellows Ice Cream Co. Domino Park,40 River St,40.7160051,-73.9667905,0.982827843539974,0_gmaps,icecream
7,7,Serendipity bk,229 S 3rd St,40.7110545,-73.95824429999999,0.4510331912240236,0_gmaps,icecream
8,8,Gelateria Gentile - Williamsburg,253 Wythe Ave,40.7162246,-73.9638314,0.7439619339815131,0_gmaps,icecream
9,9,Van Leeuwen Ice Cream,620 Manhattan Ave,40.7235777,-73.95054259999999,1.0904506465766466,0_gmaps,icecream


In [29]:
# make markers on clusters
# add ratings on all rows

markers = [(float(row.lat), float(row.lng)) for row in cluster_df.itertuples()]
marker_hover = ["%s" % (row.name) for row in cluster_df.itertuples()]

# make a dict by cluster, initialize rating string to ''
marker_dict = {i: {'name': row['name'],
                   'address': row['address'],
                   'lat': row['lat'],
                   'lng': row['lng'],
                   'rate_str': '',
                  } for i, row in cluster_df.iterrows()}

# add all rating strings
for i, row in venues_df.iterrows():
    cluster = row['cluster']
    if row.source[2:] == 'gmaps':
        marker_dict[cluster]['rate_str'] += "<dt>Google rating</dt><dd>%s (%s reviews)</dd>\n" % (row.rating, row.nratings)
    elif row.source[2:] == 'yelp':
        marker_dict[cluster]['rate_str'] += "<dt>Yelp rating</dt><dd>%s (%s reviews)</dd>\n" % (row.rating, row.nratings)
    elif row.source[2:] == 'foursquare':
        marker_dict[cluster]['rate_str'] += "<dt>Foursquare rating</dt><dd>%s (%s reviews)</dd>\n" % (row.rating, row.nratings)
    
info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
{rate_str}

</dl>
"""    

marker_info = [info_box_template.format(**d_item) for i, d_item in marker_dict.items()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig


Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [30]:
folium_markers = [(a[0], a[1], b) for a, b in zip(markers, marker_info)]

venues_map = folium.Map(location=[*location_coords], zoom_start=14)
for lat, lng, label in folium_markers:
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        tooltip=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.5
    ).add_to(venues_map)
venues_map


In [31]:
merge_df = cluster_df \
    .merge(venues_df.loc[venues_df['source']=='0_gmaps'][['cluster','rating', 'nratings']], on='cluster', how='outer') \
    .rename(columns={'rating': 'gmaps_rating', 'nratings': 'gmaps_nratings'})

merge_df = merge_df \
    .merge(venues_df.loc[venues_df['source']=='1_yelp'][['cluster','rating', 'nratings']], on='cluster', how='outer') \
    .rename(columns={'rating': 'yelp_rating', 'nratings': 'yelp_nratings'})

merge_df = merge_df \
    .merge(venues_df.loc[venues_df['source']=='2_foursquare'][['cluster','rating', 'nratings']], on='cluster', how='outer') \
    .rename(columns={'rating': 'foursquare_rating', 'nratings': 'foursquare_nratings'})


In [32]:
# merge_df.loc[merge_df['name']=='Brado', 'gmaps_rating'] = 4.5
# merge_df.loc[merge_df['name']=='Brado', 'gmaps_nratings'] = 251

In [33]:
merge_df['gmaps_rating_std'] = StandardScaler().fit_transform(merge_df[['gmaps_rating']])

merge_df['yelp_rating_std'] = StandardScaler().fit_transform(merge_df[['yelp_rating']])

merge_df['foursquare_rating_std'] = StandardScaler().fit_transform(merge_df[['foursquare_rating']])

merge_df 

Unnamed: 0,cluster,name,address,lat,lng,distance,source,category,gmaps_rating,gmaps_nratings,yelp_rating,yelp_nratings,foursquare_rating,foursquare_nratings,gmaps_rating_std,yelp_rating_std,foursquare_rating_std
0,0,The Screen Door,145 Driggs Ave,40.7229874,-73.9443448,1.3263332525152245,0_gmaps,icecream,4.8,94.0,4.5,40.0,,,1.104482,0.676716,
1,1,Tipsy Scoop,270 Metropolitan Ave,40.7146394,-73.9585652,0.2734675833756646,0_gmaps,icecream,4.7,266.0,4.5,121.0,,,0.834497,0.676716,
2,2,Taiyaki NYC - Williamsburg,294 Bedford Ave,40.7143671,-73.9615306,0.523420707022425,0_gmaps,icecream,4.5,465.0,4.5,284.0,,,0.294528,0.676716,
3,3,Van Leeuwen Ice Cream,204 Wythe Ave,40.718381,-73.961845,0.7013255197266905,0_gmaps,icecream,4.4,978.0,4.0,271.0,9.0,467.0,0.024544,0.096674,1.0
4,4,16 Handles,139 N 7th St,40.7184535,-73.95824999999999,0.5070973412534319,0_gmaps,icecream,4.3,284.0,3.5,95.0,,,-0.24544,-0.483368,
5,5,Uncle Louie G,341 Graham Ave,40.7147646,-73.9445896,0.9087748019620068,0_gmaps,icecream,3.8,80.0,4.0,66.0,,,-1.595362,0.096674,
6,6,OddFellows Ice Cream Co. Domino Park,40 River St,40.7160051,-73.9667905,0.982827843539974,0_gmaps,icecream,3.7,51.0,4.0,33.0,,,-1.865347,0.096674,
7,7,Serendipity bk,229 S 3rd St,40.7110545,-73.95824429999999,0.4510331912240236,0_gmaps,icecream,4.9,109.0,,,,,1.374466,,
8,8,Gelateria Gentile - Williamsburg,253 Wythe Ave,40.7162246,-73.9638314,0.7439619339815131,0_gmaps,icecream,4.6,759.0,,,,,0.564513,,
9,9,Van Leeuwen Ice Cream,620 Manhattan Ave,40.7235777,-73.95054259999999,1.0904506465766466,0_gmaps,icecream,4.5,498.0,,,,,0.294528,,


In [34]:
# simple average score
merge_df['meanrating'] = np.nanmean(merge_df[['gmaps_rating_std', 'yelp_rating_std', 'foursquare_rating_std']], axis=1)
merge_df.sort_values('meanrating', ascending=False)[['name', 'address', 'gmaps_rating', 'yelp_rating', 'foursquare_rating', 'meanrating']]


Unnamed: 0,name,address,gmaps_rating,yelp_rating,foursquare_rating,meanrating
7,Serendipity bk,229 S 3rd St,4.9,,,1.374466
0,The Screen Door,145 Driggs Ave,4.8,4.5,,0.890599
1,Tipsy Scoop,270 Metropolitan Ave,4.7,4.5,,0.755606
13,Roll'n Chill,90 Kent Ave,,4.5,,0.676716
12,Wowfulls,90 Kent Ave,,4.5,,0.676716
11,Kitsby,186 Grand St,,4.5,,0.676716
8,Gelateria Gentile - Williamsburg,253 Wythe Ave,4.6,,,0.564513
2,Taiyaki NYC - Williamsburg,294 Bedford Ave,4.5,4.5,,0.485622
3,Van Leeuwen Ice Cream,204 Wythe Ave,4.4,4.0,9.0,0.373739
9,Van Leeuwen Ice Cream,620 Manhattan Ave,4.5,,,0.294528


## 7. Bayesian estimated score

In [35]:
# bayes score
rating_cols = ['gmaps_rating_std', 'yelp_rating_std', 'foursquare_rating_std']
merge_df['nratings'] = merge_df[rating_cols].count(axis=1)
nratings_mean = np.mean(merge_df['nratings'])
rating_avg = np.nanmean(merge_df[rating_cols])
merge_df['w'] = merge_df['nratings']/(merge_df['nratings'] + nratings_mean)
merge_df['R'] = np.mean(merge_df[rating_cols], axis=1)
print('mean number of ratings', nratings_mean)
print('average rating', rating_avg)
merge_df['bayes_score'] = merge_df['w'] * merge_df['R'] + (1 - merge_df['w']) * rating_avg

merge_df.sort_values('bayes_score', ascending=False)[['name', 'address', 'category', 'distance', 'gmaps_rating', 'yelp_rating', 'foursquare_rating', 'nratings', 'bayes_score']]


mean number of ratings 1.4705882352941178
average rating 3.730349362740526e-16


Unnamed: 0,name,address,category,distance,gmaps_rating,yelp_rating,foursquare_rating,nratings,bayes_score
7,Serendipity bk,229 S 3rd St,icecream,0.4510331912240236,4.9,,,1,0.556331
0,The Screen Door,145 Driggs Ave,icecream,1.3263332525152245,4.8,4.5,,2,0.513226
1,Tipsy Scoop,270 Metropolitan Ave,icecream,0.2734675833756646,4.7,4.5,,2,0.435434
2,Taiyaki NYC - Williamsburg,294 Bedford Ave,icecream,0.523420707022425,4.5,4.5,,2,0.27985
13,Roll'n Chill,90 Kent Ave,icecream,0.8938129229365911,,4.5,,1,0.273909
12,Wowfulls,90 Kent Ave,icecream,0.947485930331656,,4.5,,1,0.273909
11,Kitsby,186 Grand St,icecream,0.478130783707259,,4.5,,1,0.273909
3,Van Leeuwen Ice Cream,204 Wythe Ave,icecream,0.7013255197266905,4.4,4.0,9.0,3,0.250799
8,Gelateria Gentile - Williamsburg,253 Wythe Ave,icecream,0.7439619339815131,4.6,,,1,0.228493
9,Van Leeuwen Ice Cream,620 Manhattan Ave,icecream,1.0904506465766466,4.5,,,1,0.119214


In [36]:
z = merge_df.sort_values('bayes_score', ascending=False).reset_index()[['name', 'address', 'gmaps_rating', 'yelp_rating', 'foursquare_rating', 'bayes_score']]


In [37]:
z

Unnamed: 0,name,address,gmaps_rating,yelp_rating,foursquare_rating,bayes_score
0,Serendipity bk,229 S 3rd St,4.9,,,0.556331
1,The Screen Door,145 Driggs Ave,4.8,4.5,,0.513226
2,Tipsy Scoop,270 Metropolitan Ave,4.7,4.5,,0.435434
3,Taiyaki NYC - Williamsburg,294 Bedford Ave,4.5,4.5,,0.27985
4,Roll'n Chill,90 Kent Ave,,4.5,,0.273909
5,Wowfulls,90 Kent Ave,,4.5,,0.273909
6,Kitsby,186 Grand St,,4.5,,0.273909
7,Van Leeuwen Ice Cream,204 Wythe Ave,4.4,4.0,9.0,0.250799
8,Gelateria Gentile - Williamsburg,253 Wythe Ave,4.6,,,0.228493
9,Van Leeuwen Ice Cream,620 Manhattan Ave,4.5,,,0.119214


In [38]:
def dedupe(dedupe_list):

    for i, source_df in enumerate(dedupe_list):
        source_df['source'] = i
    venues_df = pd.concat(dedupe_list).reset_index()
    venues_df['latlong'] = venues_df[['lat','lng']].apply(tuple, axis=1)
    venues_df['shortname'] = venues_df['name'].apply(lambda n: n[:25])

    # dedupe and assign cluster id
    venues_df2 = pandas_dedupe.dedupe_dataframe(venues_df, ['category', 'shortname', 'address', ('latlong', 'LatLong')])
    venues_df['cluster'] = venues_df2['cluster id']
    venues_df = venues_df.sort_values(['cluster', 'source'])[['cluster', 'name', 'address', 'category', 'rating', 'nratings', 'lat', 'lng', 'distance', 'source']]

    # group by clusters, uniquify name
    cluster_df = venues_df.groupby('cluster')[['name', 'address', 'lat', 'lng', 'distance', 'source', 'category']] \
                          .first() \
                          .reset_index()

    # merge ratings by source
    merge_df = cluster_df \
        .merge(venues_df.loc[venues_df['source']=='0'][['cluster','rating', 'nratings', 'category']], on='cluster', how='outer') \
        .rename(columns={'rating': 'gmaps_rating', 'nratings': 'gmaps_nratings'})
    merge_df['gmaps_rating_std'] = StandardScaler().fit_transform(merge_df[['gmaps_rating']])

    merge_df = merge_df \
        .merge(venues_df.loc[venues_df['source']=='1'][['cluster','rating', 'nratings', 'category']], on='cluster', how='outer') \
        .rename(columns={'rating': 'yelp_rating', 'nratings': 'yelp_nratings'})
    merge_df['yelp_rating_std'] = StandardScaler().fit_transform(merge_df[['yelp_rating']])

    merge_df = merge_df \
        .merge(venues_df.loc[venues_df['source']=='2'][['cluster','rating', 'nratings', 'category']], on='cluster', how='outer') \
        .rename(columns={'rating': 'foursquare_rating', 'nratings': 'foursquare_nratings'})
    merge_df['foursquare_rating_std'] = StandardScaler().fit_transform(merge_df[['foursquare_rating']])

    # bayes score
    rating_cols = ['gmaps_rating_std', 'yelp_rating_std', 'foursquare_rating_std']
    merge_df['nratings'] = merge_df[rating_cols].count(axis=1)
    nratings_mean = np.mean(merge_df['nratings'])
    rating_avg = np.nanmean(merge_df[rating_cols])
    merge_df['w'] = merge_df['nratings']/(merge_df['nratings'] + nratings_mean)
    merge_df['R'] = np.mean(merge_df[rating_cols], axis=1)
    merge_df['bayes_score'] = merge_df['w'] * merge_df['R'] + (1 - merge_df['w']) * rating_avg
    merge_df = merge_df.sort_values('bayes_score', ascending=False)
    return merge_df

    
dedupe_list = list(filter(lambda df: df is not None, [gmaps_df, yelp_df, foursquare_df]))

dedupe_df = dedupe(dedupe_list)
dedupe_df


Importing data ...
Reading from dedupe_dataframe_learned_settings
Clustering...


  df[i] = df[i].str.replace('[^\w\s\.\-\(\)\,\:\/\\\\]','')


# duplicate sets 17


Unnamed: 0,cluster,name,address,lat,lng,distance,source,category_x,gmaps_rating,gmaps_nratings,...,category_x.1,yelp_rating_std,foursquare_rating,foursquare_nratings,category_y,foursquare_rating_std,nratings,w,R,bayes_score
7,7,Serendipity bk,229 S 3rd St,40.7110545,-73.95824429999999,0.4510331912240236,0,icecream,4.9,109.0,...,,,,,,,1,0.404762,1.374466,0.556331
0,0,The Screen Door,145 Driggs Ave,40.7229874,-73.9443448,1.3263332525152245,0,icecream,4.8,94.0,...,icecream,0.676716,,,,,2,0.576271,0.890599,0.513226
1,1,Tipsy Scoop,270 Metropolitan Ave,40.7146394,-73.9585652,0.2734675833756646,0,icecream,4.7,266.0,...,icecream,0.676716,,,,,2,0.576271,0.755606,0.435434
2,2,Taiyaki NYC - Williamsburg,294 Bedford Ave,40.7143671,-73.9615306,0.523420707022425,0,icecream,4.5,465.0,...,icecream,0.676716,,,,,2,0.576271,0.485622,0.27985
13,13,Roll'n Chill,90 Kent Ave,40.7213689,-73.9607664,0.8938129229365911,1,icecream,,,...,icecream,0.676716,,,,,1,0.404762,0.676716,0.273909
12,12,Wowfulls,90 Kent Ave,40.721231,-73.962162,0.947485930331656,1,icecream,,,...,icecream,0.676716,,,,,1,0.404762,0.676716,0.273909
11,11,Kitsby,186 Grand St,40.71411,-73.960977,0.478130783707259,1,icecream,,,...,icecream,0.676716,,,,,1,0.404762,0.676716,0.273909
3,3,Van Leeuwen Ice Cream,204 Wythe Ave,40.718381,-73.961845,0.7013255197266905,0,icecream,4.4,978.0,...,icecream,0.096674,9.0,467.0,icecream,1.0,3,0.671053,0.373739,0.250799
8,8,Gelateria Gentile - Williamsburg,253 Wythe Ave,40.7162246,-73.9638314,0.7439619339815131,0,icecream,4.6,759.0,...,,,,,,,1,0.404762,0.564513,0.228493
9,9,Van Leeuwen Ice Cream,620 Manhattan Ave,40.7235777,-73.95054259999999,1.0904506465766466,0,icecream,4.5,498.0,...,,,,,,,1,0.404762,0.294528,0.119214


In [39]:
test_pickle_filename

'icecream_williamsburg.pkl'

In [40]:
dedupe_df.to_pickle(test_pickle_filename)
pd.read_pickle(test_pickle_filename)

Unnamed: 0,cluster,name,address,lat,lng,distance,source,category_x,gmaps_rating,gmaps_nratings,...,category_x.1,yelp_rating_std,foursquare_rating,foursquare_nratings,category_y,foursquare_rating_std,nratings,w,R,bayes_score
7,7,Serendipity bk,229 S 3rd St,40.7110545,-73.95824429999999,0.4510331912240236,0,icecream,4.9,109.0,...,,,,,,,1,0.404762,1.374466,0.556331
0,0,The Screen Door,145 Driggs Ave,40.7229874,-73.9443448,1.3263332525152245,0,icecream,4.8,94.0,...,icecream,0.676716,,,,,2,0.576271,0.890599,0.513226
1,1,Tipsy Scoop,270 Metropolitan Ave,40.7146394,-73.9585652,0.2734675833756646,0,icecream,4.7,266.0,...,icecream,0.676716,,,,,2,0.576271,0.755606,0.435434
2,2,Taiyaki NYC - Williamsburg,294 Bedford Ave,40.7143671,-73.9615306,0.523420707022425,0,icecream,4.5,465.0,...,icecream,0.676716,,,,,2,0.576271,0.485622,0.27985
13,13,Roll'n Chill,90 Kent Ave,40.7213689,-73.9607664,0.8938129229365911,1,icecream,,,...,icecream,0.676716,,,,,1,0.404762,0.676716,0.273909
12,12,Wowfulls,90 Kent Ave,40.721231,-73.962162,0.947485930331656,1,icecream,,,...,icecream,0.676716,,,,,1,0.404762,0.676716,0.273909
11,11,Kitsby,186 Grand St,40.71411,-73.960977,0.478130783707259,1,icecream,,,...,icecream,0.676716,,,,,1,0.404762,0.676716,0.273909
3,3,Van Leeuwen Ice Cream,204 Wythe Ave,40.718381,-73.961845,0.7013255197266905,0,icecream,4.4,978.0,...,icecream,0.096674,9.0,467.0,icecream,1.0,3,0.671053,0.373739,0.250799
8,8,Gelateria Gentile - Williamsburg,253 Wythe Ave,40.7162246,-73.9638314,0.7439619339815131,0,icecream,4.6,759.0,...,,,,,,,1,0.404762,0.564513,0.228493
9,9,Van Leeuwen Ice Cream,620 Manhattan Ave,40.7235777,-73.95054259999999,1.0904506465766466,0,icecream,4.5,498.0,...,,,,,,,1,0.404762,0.294528,0.119214


In [41]:
# %%time
# # run all in parallel

# services = ['gmaps', 'yelp', 'foursquare']
# args = list(product(services, [location_coords], [keyword]))

# def generic_get_df(argslist):
#     """Query specified service for location and keyword, return dataframe"""
    
#     service, location_coords, keyword = argslist
    
#     if service=="gmaps":
#         retdf = gmaps_get_df(location_coords, keyword)
#     elif service=="yelp":
#         retdf = yelp_get_df(location_coords, keyword)
#     elif service=="foursquare":
#         retdf = foursquare_get_df(location_coords, keyword)
        
#     return retdf

# def all_get_df(location_coords, keyword):
#     """Query all services for location and keyword, return list of dataframes"""

#     services = ['gmaps', 'yelp', 'foursquare']
#     args = list(product(services, [location_coords], [keyword]))
#     with Pool() as pool:
#         # pool.map will block till all return
#         # pool.imap_unordered should make each available as returned … seems to not return, incompatible with underlying apis I guess
#         df_list = pool.map(generic_get_df, args)
#     return df_list

# print("Querying…")
# df_list = all_get_df(location_coords, keyword) 
# print("Deduping…")
# dedupe_df =  dedupe([df for df in df_list if df is not None])
# dedupe_df

In [42]:
# # save a file to retrain with train_dedupe2.py
# tempdf = None
# for k, l, j in product(['pizza', 'coffee', 'icecream'], 
#                     ['midtown','downtown','uppereastside','upperwestside','brooklynheights','grandarmyplaza','bayridge','williamsburg',],
#                     ['gmaps', 'yelp', 'foursquare']):
#     filename = "cache/%s_%s_%s.pkl" % (j, k, l)
#     try:
#         if tempdf is None:
#             tempdf = pd.read_pickle(filename)
#             tempdf['keyword']=k
#             print(filename)
#         else:
#             tempdf2 = pd.read_pickle(filename)
#             tempdf2['keyword']=k
#             tempdf = pd.concat([tempdf, tempdf2])
#             print(filename)
#     except:
#         print("missing ", filename)
# tempdf.to_csv('train_df.csv', index=False)