# Pizza pizza pizza

Get top venues by keyword and location, querying Google, Yelp, Foursquare

#### Google

 - Needs a Google API key and module
 - [Create Google Cloud credentials and give access to Places APIs](https://console.cloud.google.com/google/maps-apis/credentials)
 - `conda install -c conda-forge -y gmaps`
 - put key in `apikey.txt`
 - `gmaps` Jupyter nbextension to show maps in notebook, with marker pins etc.

```
conda install -c conda-forge -y jupyter_contrib_nbextensions
jupyter nbextension enable --py gmaps
jupyter notebook
```

#### Yelp
 - needs Yelp API key and module
 - https://www.yelp.com/developers/documentation/v3
 - https://github.com/gfairchild/yelpapi
 - put key in `yelpkey.txt`
 
#### Foursquare
- Needs Foursquare API key and module
- https://developer.foursquare.com/docs/places-api/getting-started/
- https://github.com/mLewisLogic/foursquare
- OAuth id in `foursquare_id.txt`
- OAuth secret in `foursquare_secret.txt`

See `requirements.txt` for versions used, other requirements (requests, folium, Flask)


In [1]:
import time
from pprint import pprint
from ipywidgets import widgets, interact
import pdb

import numpy as np
import pandas as pd
import pandas_dedupe

from sklearn.preprocessing import StandardScaler

import requests, json 

import gmaps
with open('apikey.txt') as f:
    api_key = f.readline().strip()
    f.close
gmaps.configure(api_key=api_key)

# https://github.com/gfairchild/yelpapi
from yelpapi import YelpAPI
with open('yelpkey.txt') as f:
    yelp_key = f.readline().strip()
    f.close
yelp_api = YelpAPI(yelp_key)

import foursquare
with open('foursquare_id.txt') as f:
    foursquare_id = f.readline().strip()
    f.close
with open('foursquare_secret.txt') as f:
    foursquare_secret = f.readline().strip()
    f.close

gmaps.configure(api_key=api_key)

import folium


## Google Maps

In [2]:
# pick a search term
keyword_options = [('Pizza', 'pizza'), ('Coffee', 'coffee')]
keyword = 'pizza'

@interact
def get_kw(kw = widgets.Dropdown(
    options=keyword_options,
    value=keyword,
    description='Search term:',
    disabled=False,
)):
    global keyword
    keyword = kw
    return kw


interactive(children=(Dropdown(description='Search term:', options=(('Pizza', 'pizza'), ('Coffee', 'coffee')),…

In [5]:
# pick a location
location = '40.7484, -73.9857'
location_coords = eval(location)
location_options = [('Midtown', '40.7484, -73.9857'),
                    ('Downtown', '40.7077443,-74.0139089'),
                    ('Upper East Side', '40.7711473,-73.9661166'),
                    ('Upper West Side', '40.778794,-73.984257'),
                    ('Brooklyn Heights', '40.6915812,-73.9954095'), 
                    ('Grand Army Plaza', '40.671872,-73.972544'),
                    ('Bay Ridge', '40.624468,-74.0487134'),
                    ('Williamsburg', '40.7144609,-73.9553373'),
                  ]

@interact
def get_loc(loc = widgets.Dropdown(
    options=location_options,
    value=location,
    description='Location:',
)):
    global location
    global location_coords
    location = loc
    location_coords = eval(loc)
    return loc

interactive(children=(Dropdown(description='Location:', options=(('Midtown', '40.7484, -73.9857'), ('Downtown'…

In [6]:
figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}
fig = gmaps.figure(center=location_coords, zoom_level=12, layout=figure_layout)
fig.add_layer(gmaps.marker_layer([location_coords]))
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [7]:
# global options for all 
MIN_USER_RATINGS = 40
MIN_RATING = 3
NRESULTS = 50
RADIUS = 3000


In [8]:
# gmaps options
GMAPS_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
# https://developers.google.com/places/web-service/search#TextSearchRequests
#https://developers.google.com/places/web-service/supported_types
# rankby='prominence'
rankby='distance'
ltype='establishment'


In [9]:
def get_first_page(api_key, location, **kwargs):
    """get first page of results from gmaps using api_key, location, kwargs for search spec"""
    request_url = GMAPS_URL + '?key=' + api_key
    request_url += '&location=' + location
    for name, val in kwargs.items():
        request_url += '&' + name + '=' + val
    r = requests.get(request_url)
    j = r.json()
    return j


def get_next_page(api_key, next_page_token):
    """get next search engine results page page using search token, waiting until available"""
    r = requests.get(GMAPS_URL + '?pagetoken=' + next_page_token +
                        '&key=' + api_key)
    for i in range(10):
        j = r.json()
        if not j['results']: # wait for next page to be available
            time.sleep(5)
            continue
        else:
            return j


def runquery(api_key, location, **kwargs):
    """return dataframe of all results using api_key, location, search kwargs"""
    # get first page
    j = get_first_page(api_key, location, **kwargs)
    venues_df = pd.json_normalize(j['results'])

    # get pages while additional pages available
    while 'next_page_token' in j:
        next_page_token = j['next_page_token']
        time.sleep(5)
        j = get_next_page(api_key, next_page_token)
        venues_df = venues_df.append(pd.json_normalize(j['results']))
        
    return venues_df        

In [10]:
# use either rankby or radius
gmaps_df = runquery(api_key, location, keyword=keyword, ltype=ltype, rankby=rankby)
# runquery(api_key, location, keyword=keyword, ltype=ltype, radius=RADIUS)
gmaps_df = gmaps_df.loc[(gmaps_df['user_ratings_total'] >= MIN_USER_RATINGS) & (gmaps_df['rating'] >= MIN_RATING)] \
        .sort_values(['rating', 'user_ratings_total'], ascending=False) \
        .reset_index(drop=True)
gmaps_df = gmaps_df[['name', 'vicinity', 'rating', 'user_ratings_total', 'geometry.location.lat', 'geometry.location.lng']]
gmaps_df.columns = ['name', 'address', 'rating', 'nratings', 'lat', 'lng']
# drop trailing ", Brooklyn"
gmaps_df['address'] = gmaps_df['address'].apply(lambda address: " ".join(address.split(',')[:-1]))
gmaps_df


Unnamed: 0,name,address,rating,nratings,lat,lng
0,L'Arte Della Pizza Brooklyn,172 5th Ave,4.8,71,40.677606,-73.979957
1,Juliana's,19 Old Fulton St,4.6,3956,40.702747,-73.993435
2,Joe’s Pizza,124 Fulton St,4.6,954,40.710057,-74.007649
3,Sottocasa,298 Atlantic Ave,4.6,672,40.688307,-73.988978
4,Dellarocco's,214 Hicks St,4.6,433,40.695009,-73.996108
5,The House of Pizza & Calzone,132 Union St,4.6,316,40.683997,-74.002281
6,&pizza - Wall Street,63 Wall St,4.6,113,40.705765,-74.008547
7,La Villa Pizzeria,261 5th Ave,4.5,729,40.674325,-73.981687
8,Patsy’s Pizzeria,450 Dean St,4.5,593,40.681828,-73.976196
9,Table 87,87 Atlantic Ave,4.5,476,40.691282,-73.997321


In [11]:
# plot on google map

markers = [(row.lat, row.lng) for row in gmaps_df.itertuples()]
marker_hover = ["%s: %s (%s)" % (row.name, row.rating, row.nratings) for row in gmaps_df.itertuples()]
info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
<dt>Google rating</dt><dd>{rating}</dd>
<dt>Google reviews</dt><dd>{nratings}</dd>
</dl>
"""
marker_info = [info_box_template.format(**row) for i, row in gmaps_df.iterrows()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

fig = gmaps.figure(layout=figure_layout, center=eval(location), zoom_level=14)
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [12]:
folium_markers = [(a[0], a[1], b) for a, b in zip(markers, marker_info)]

venues_map = folium.Map(location=[*location_coords], zoom_start=14)
for lat, lng, label in folium_markers:
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        tooltip=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.5
    ).add_to(venues_map)
    
venues_map

## Yelp

In [13]:
lat, lng = location_coords
response = yelp_api.search_query(categories=keyword, latitude=lat, longitude=lng, 
                                 radius=RADIUS, sort_by=rankby, limit=NRESULTS)

yelp_df = pd.json_normalize(response['businesses'])
yelp_df = yelp_df.loc[(yelp_df['review_count'] >= MIN_USER_RATINGS) & (yelp_df['rating'] >= MIN_RATING)] \
    .sort_values(['rating', 'review_count'], ascending=False) \
    .reset_index(drop=True)
display_columns = ['name', 'location.address1', 'rating', 'review_count', 'coordinates.latitude', 'coordinates.longitude', 'url']
yelp_df = yelp_df[display_columns]
yelp_df.columns = ['name', 'address', 'rating', 'nratings', 'lat', 'lng', 'url']
yelp_df


Unnamed: 0,name,address,rating,nratings,lat,lng,url
0,Juliana's Pizza,19 Old Fulton St,4.5,2199,40.702615,-73.993416,https://www.yelp.com/biz/julianas-pizza-brookl...
1,Lucali,575 Henry St,4.5,1560,40.6818,-74.00024,https://www.yelp.com/biz/lucali-brooklyn-3?adj...
2,Sottocasa Pizzeria,298 Atlantic Ave,4.5,628,40.688285,-73.989006,https://www.yelp.com/biz/sottocasa-pizzeria-br...
3,Piz-zetta,90 Livingston St,4.5,288,40.691283,-73.990603,https://www.yelp.com/biz/piz-zetta-brooklyn-2?...
4,Forcella Fried Pizza,445 Albee square W,4.5,43,40.69088,-73.982868,https://www.yelp.com/biz/forcella-fried-pizza-...
5,Forno Rosso,327 Gold St,4.0,680,40.694467,-73.98293,https://www.yelp.com/biz/forno-rosso-brooklyn?...
6,Front Street Pizza,80 Front St,4.0,359,40.70244,-73.98943,https://www.yelp.com/biz/front-street-pizza-br...
7,La Cigogne,215 Union St,4.0,356,40.683501,-73.999304,https://www.yelp.com/biz/la-cigogne-brooklyn?a...
8,Circa Brewing,141 Lawrence St,4.0,316,40.69165,-73.98603,https://www.yelp.com/biz/circa-brewing-brookly...
9,Table 87 - Brooklyn Heights,87 Atlantic Ave,4.0,270,40.691219,-73.997345,https://www.yelp.com/biz/table-87-brooklyn-hei...


In [14]:
markers = [(row.lat, row.lng) for row in yelp_df.itertuples()]
marker_hover = ["%s: %s (%s)" % (row.name, row.rating, row.nratings) for row in yelp_df.itertuples()]

info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
<dt>Yelp rating</dt><dd>{rating}</dd>
<dt>Yelp reviews</dt><dd>{nratings}</dd>
</dl>
"""

marker_info = [info_box_template.format(**row) for i, row in yelp_df.iterrows()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [15]:
folium_markers = [(a[0], a[1], b) for a, b in zip(markers, marker_info)]

venues_map = folium.Map(location=[*location_coords], zoom_start=14)
for lat, lng, label in folium_markers:
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        tooltip=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.5
    ).add_to(venues_map)
venues_map

## Foursquare

In [16]:
client = foursquare.Foursquare(client_id=foursquare_id, 
                               client_secret=foursquare_secret, 
                               redirect_uri='http://streeteye.com/oauth/authorize')


In [17]:
response = client.venues.search(params={'query': keyword, 'll': "%.7f,%.7f" % location_coords, 
                                        'radius': RADIUS, 'limit': NRESULTS})
foursquare_df = pd.json_normalize(response['venues'])
foursquare_df


Unnamed: 0,id,name,categories,referralId,hasPerk,location.address,location.crossStreet,location.lat,location.lng,location.labeledLatLngs,...,location.country,location.formattedAddress,delivery.id,delivery.url,delivery.provider.name,delivery.provider.icon.prefix,delivery.provider.icon.sizes,delivery.provider.icon.name,venuePage.id,location.neighborhood
0,4aad3536f964a520035f20e3,MontyQ's Brick Oven Pizza,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,158 Montague St,At Clinton,40.69437,-73.992879,"[{'label': 'display', 'lat': 40.69436997583063...",...,United States,"[158 Montague St (At Clinton), Brooklyn, NY 11...",85392.0,https://www.seamless.com/menu/monty-qs-158-mon...,seamless,https://fastly.4sqi.net/img/general/cap/,"[40, 50]",/delivery_provider_seamless_20180129.png,98269509.0,
1,50ca6337e4b04e1f3135689c,Juliana's Pizza,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,19 Old Fulton St,,40.702769,-73.993616,"[{'label': 'display', 'lat': 40.702769, 'lng':...",...,United States,"[19 Old Fulton St, Brooklyn, NY 11201, United ...",,,,,,,,
2,49bfceb0f964a52028551fe3,Ignazio's Pizza,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,4 Water St,btwn Old Fulton & Dock St,40.703299,-73.994029,"[{'label': 'display', 'lat': 40.70329900425840...",...,United States,"[4 Water St (btwn Old Fulton & Dock St), Brook...",294372.0,https://www.seamless.com/menu/ignazios-pizza-4...,seamless,https://fastly.4sqi.net/img/general/cap/,"[40, 50]",/delivery_provider_seamless_20180129.png,,
3,5c18213c28374e002cb23f56,Jay St. Fresh 99¢ Pizza,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,408 Jay St,between Fulton & Willoughby Sts.,40.691787,-73.987462,"[{'label': 'display', 'lat': 40.691787, 'lng':...",...,United States,[408 Jay St (between Fulton & Willoughby Sts.)...,,,,,,,,
4,4d879b2ad5fab60cd043229c,99 Cent Pizza,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,51D Willoughby St,Willoughby Betw. Jay & Lawrence,40.692165,-73.986972,"[{'label': 'display', 'lat': 40.69216532940540...",...,United States,[51D Willoughby St (Willoughby Betw. Jay & Law...,1716716.0,https://www.seamless.com/menu/99-cent-supreme-...,seamless,https://fastly.4sqi.net/img/general/cap/,"[40, 50]",/delivery_provider_seamless_20180129.png,,Downtown Brooklyn
5,49c7ce82f964a520c3571fe3,Front Street Pizza,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,80 Front St,at Washington St,40.702523,-73.98964,"[{'label': 'display', 'lat': 40.70252299593588...",...,United States,"[80 Front St (at Washington St), Brooklyn, NY ...",66767.0,https://www.seamless.com/menu/front-street-piz...,seamless,https://fastly.4sqi.net/img/general/cap/,"[40, 50]",/delivery_provider_seamless_20180129.png,80597749.0,
6,4aa69424f964a5203c4a20e3,Fascati Pizza,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,80 Henry St,btwn Orange & Pineapple St,40.698355,-73.992522,"[{'label': 'display', 'lat': 40.69835547472359...",...,United States,"[80 Henry St (btwn Orange & Pineapple St), Bro...",,,,,,,94535902.0,
7,5428807b498e538336103286,99 Cent Pizza,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,255 Livingston St,at Bond St,40.688831,-73.983299,"[{'label': 'display', 'lat': 40.6888313293457,...",...,United States,"[255 Livingston St (at Bond St), Brooklyn, NY ...",1754276.0,https://www.seamless.com/menu/99-cents-hot-piz...,seamless,https://fastly.4sqi.net/img/general/cap/,"[40, 50]",/delivery_provider_seamless_20180129.png,,
8,4eb5985f2c5b53141a674e2c,Mario's Pizza & Chicken,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,222 Hoyt St,btwn Baltic & Butler St,40.683542,-73.990013,"[{'label': 'display', 'lat': 40.68354199999999...",...,United States,"[222 Hoyt St (btwn Baltic & Butler St), Brookl...",,,,,,,,
9,4df6088da809141629aabfd2,2 Bros. Pizza,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",v-1595779546,False,395 Flatbush Ave,at Fulton St,40.689394,-73.981125,"[{'label': 'display', 'lat': 40.68939371833308...",...,United States,"[395 Flatbush Ave (at Fulton St), Brooklyn, NY...",,,,,,,,Fort Greene


In [18]:
# iterate through venues to get ratings, nratings
def parse_foursquare_results(response):
    """process foursquare response, query details for each row to get rating and nratings"""
    retarray = []

    for i, venue in pd.json_normalize(response['venues']).iterrows():
        venue_id = venue['id']
        # query detailed venue info from foursquare
        venue_details = client.venues(venue_id)['venue']
        try:
            venue_name = venue['name']
            venue_address = venue['location.address']
            venue_rating = venue_details['rating']
            venue_nratings = venue_details['ratingSignals']
            venue_url = venue['delivery.url']
            venue_lat = venue['location.lat']
            venue_lng = venue['location.lng']
            retarray.append([venue_name, venue_address, venue_rating, venue_nratings, venue_lat, venue_lng, venue_url])
            
        except Exception as e:
            # sometimes no rating ... probably not popular enough
            print(type(e), str(e))
            # print(traceback.format_exc())
            print("No rating for %s" % venue_name)
            continue

    print(len(retarray))
    print(pd.json_normalize(response['venues']).shape)
    retdf = pd.DataFrame(retarray)
    retdf.columns = ['name', 'address', 'rating', 'nratings', 'lat', 'lng', 'url']
    return retdf

foursquare_df = parse_foursquare_results(response)
foursquare_df = foursquare_df.loc[(foursquare_df['nratings'] >= MIN_USER_RATINGS) & (foursquare_df['rating'] >= MIN_RATING)] \
        .sort_values(['rating', 'nratings'], ascending=False) \
        .reset_index(drop=True)
foursquare_df

<class 'KeyError'> 'rating'
No rating for Jay St. Fresh 99¢ Pizza
<class 'KeyError'> 'rating'
No rating for 99 Cent Pizza
<class 'KeyError'> 'rating'
No rating for Mario's Pizza & Chicken
<class 'KeyError'> 'rating'
No rating for Texas Fried Chicken & Pizza
<class 'KeyError'> 'rating'
No rating for S&S Brooklyn Pizza
<class 'KeyError'> 'rating'
No rating for Torpedo Pizza
<class 'KeyError'> 'rating'
No rating for Pizza Rustica
<class 'KeyError'> 'rating'
No rating for Pizza a Casa Pizza School
<class 'KeyError'> 'rating'
No rating for Pizza Cafe
<class 'KeyError'> 'rating'
No rating for Pizza Moto
<class 'KeyError'> 'rating'
No rating for Primo New York Pizza
<class 'KeyError'> 'rating'
No rating for pizza vn
38
(50, 25)


Unnamed: 0,name,address,rating,nratings,lat,lng,url
0,Prince Street Pizza,27 Prince St,9.2,1860,40.723093,-73.994527,https://www.seamless.com/menu/prince-st-pizza-...
1,Juliana's Pizza,19 Old Fulton St,8.9,1365,40.702769,-73.993616,
2,Adrienne's Pizza Bar,54 Stone St,8.6,880,40.704348,-74.010223,
3,Patrizia's Pizza and Pasta,35 Broadway,8.2,217,40.710902,-73.96764,https://www.seamless.com/menu/patrizias-pizza-...
4,Lombardi's Coal Oven Pizza,32 Spring St,8.1,2030,40.721636,-73.995635,
5,Fatoosh BBQ & Pizza,330 Hicks St,8.1,63,40.691328,-73.997739,
6,Harry's Italian Pizza Bar,2 Gold St,8.0,513,40.707634,-74.006996,
7,Brooklyn Pizza Market,267 Smith St,8.0,54,40.682696,-73.993191,https://www.seamless.com/menu/brooklyn-pizza-m...
8,La Pizza & La Pasta,4 World Trade Ctr,7.9,270,40.710567,-74.012494,
9,Pizza Secret,72 5th Ave,7.9,50,40.68081,-73.977636,https://www.seamless.com/menu/pizza-secret-nea...


In [19]:
markers = [(row.lat, row.lng) for row in foursquare_df.itertuples()]
marker_hover = ["%s: %s (%s)" % (row.name, row.rating, row.nratings) for row in foursquare_df.itertuples()]

info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
<dt>Foursquare rating</dt><dd>{rating}</dd>
<dt>Foursquare reviews</dt><dd>{nratings}</dd>
</dl>
"""
marker_info = [info_box_template.format(**d_item) for i, d_item in foursquare_df.iterrows()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [20]:
folium_markers = [(a[0], a[1], b) for a, b in zip(markers, marker_info)]

venues_map = folium.Map(location=[*location_coords], zoom_start=14)
for lat, lng, label in folium_markers:
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        tooltip=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.5
    ).add_to(venues_map)
venues_map

## Deduplicate and merge

In [21]:
pd.set_option('display.max_rows', None)

gmaps_df_copy = gmaps_df.copy()
gmaps_df_copy['source'] = '0_gmaps'

yelp_df_copy = yelp_df.copy() 
yelp_df_copy['source'] = '1_yelp'

foursquare_df_copy = foursquare_df.copy()
foursquare_df_copy['source'] = '2_foursquare'

venues_df = pd.concat([gmaps_df_copy, yelp_df_copy, foursquare_df_copy]).reset_index()
venues_df['latlong'] = venues_df[['lat','lng']].apply(tuple, axis=1)

venues_df.sort_values('name')

Unnamed: 0,index,name,address,rating,nratings,lat,lng,source,url,latlong
6,6,&pizza - Wall Street,63 Wall St,4.6,113,40.705765,-74.008547,0_gmaps,,"(40.7057647, -74.0085474)"
87,34,2 Bros Pizza,395 Flatbush Ave,3.0,84,40.689339,-73.980896,1_yelp,https://www.yelp.com/biz/2-bros-pizza-brooklyn...,"(40.689338684082, -73.9808959960938)"
37,37,2 Bros. Pizza,395 Flatbush Ave Ext #5321,4.1,452,40.689439,-73.98094,0_gmaps,,"(40.6894392, -73.9809399)"
104,15,2 Bros. Pizza,395 Flatbush Ave,7.1,62,40.689394,-73.981125,2_foursquare,,"(40.68939371833308, -73.98112529498947)"
44,44,99 Cents Hot Pizza,255 Livingston St,4.0,119,40.688824,-73.983285,0_gmaps,,"(40.68882420000001, -73.98328459999999)"
91,2,Adrienne's Pizza Bar,54 Stone St,8.6,880,40.704348,-74.010223,2_foursquare,,"(40.704348091685645, -74.01022261314714)"
15,15,Adrienne's Pizzabar,54 Stone St,4.4,1630,40.704294,-74.010065,0_gmaps,,"(40.7042941, -74.0100646)"
48,48,Angelica,332 Livingston St,3.9,88,40.687885,-73.981566,0_gmaps,,"(40.687885, -73.981566)"
47,47,Antonio's Pizzeria,32 Court St,3.9,109,40.692952,-73.991008,0_gmaps,,"(40.6929516, -73.99100829999999)"
82,29,Antonio's Pizzeria & Cafe,32 Court St,3.5,72,40.692966,-73.991046,1_yelp,https://www.yelp.com/biz/antonios-pizzeria-and...,"(40.6929657361683, -73.9910462092274)"


In [22]:
# run dedupe algorithm using name, address as default texts, latlong as latlong
venues_df2 = pandas_dedupe.dedupe_dataframe(venues_df, ['name', 'address', ('latlong', 'LatLong')])


Importing data ...
Reading from dedupe_dataframe_learned_settings
Clustering...
# duplicate sets 78


In [23]:
# view clustering
venues_df['cluster'] = venues_df2['cluster id']
venues_df = venues_df.sort_values(['cluster', 'source'])[['cluster', 'name', 'address', 'rating', 'nratings', 'lat', 'lng', 'source']]
venues_df

Unnamed: 0,cluster,name,address,rating,nratings,lat,lng,source
1,0,Juliana's,19 Old Fulton St,4.6,3956,40.7027467,-73.9934349,0_gmaps
53,0,Juliana's Pizza,19 Old Fulton St,4.5,2199,40.7026153030093,-73.9934159993549,1_yelp
90,0,Juliana's Pizza,19 Old Fulton St,8.9,1365,40.702769,-73.993616,2_foursquare
3,1,Sottocasa,298 Atlantic Ave,4.6,672,40.6883065,-73.9889778,0_gmaps
55,1,Sottocasa Pizzeria,298 Atlantic Ave,4.5,628,40.688285,-73.989006,1_yelp
4,2,Dellarocco's,214 Hicks St,4.6,433,40.6950091,-73.9961082,0_gmaps
63,2,Dellarocco's,214 Hicks St,4.0,265,40.6950309,-73.9961252,1_yelp
5,3,The House of Pizza & Calzone,132 Union St,4.6,316,40.6839972,-74.00228129999999,0_gmaps
70,3,House of Pizza & Calzones,132 Union St,4.0,141,40.683944,-74.00225,1_yelp
101,3,House of Pizza & Calzones,132 Union St,7.3,48,40.68403941172705,-74.00215525440561,2_foursquare


In [38]:
# group by clusters
cluster_df = venues_df.groupby('cluster')[['name', 'address', 'lat', 'lng', 'source']] \
    .first() \
    .reset_index()
cluster_df

Unnamed: 0,cluster,name,address,lat,lng,source
0,0,Juliana's,19 Old Fulton St,40.7027467,-73.9934349,0_gmaps
1,1,Sottocasa,298 Atlantic Ave,40.6883065,-73.9889778,0_gmaps
2,2,Dellarocco's,214 Hicks St,40.6950091,-73.9961082,0_gmaps
3,3,The House of Pizza & Calzone,132 Union St,40.6839972,-74.00228129999999,0_gmaps
4,4,Table 87,87 Atlantic Ave,40.6912825,-73.9973211,0_gmaps
5,5,Pizza Town,85 5th Ave,40.68,-73.9777778,0_gmaps
6,6,Luzzo's,145 Atlantic Ave,40.690696,-73.99522499999999,0_gmaps
7,7,Brado,155 Atlantic Ave,40.69064520000001,-73.9949106,0_gmaps
8,8,Pizza Secret,72 5th Ave,40.6808089,-73.9777158,0_gmaps
9,9,Adrienne's Pizzabar,54 Stone St,40.7042941,-74.0100646,0_gmaps


In [25]:
# make markers on clusters
# add ratings on all rows

markers = [(float(row.lat), float(row.lng)) for row in cluster_df.itertuples()]
marker_hover = ["%s" % (row.name) for row in cluster_df.itertuples()]

# make a dict by cluster, initialize rating string to ''
marker_dict = {i: {'name': row['name'],
                   'address': row['address'],
                   'lat': row['lat'],
                   'lng': row['lng'],
                   'rate_str': '',
                  } for i, row in cluster_df.iterrows()}

# add all rating strings
for i, row in venues_df.iterrows():
    cluster = row['cluster']
    if row.source[2:] == 'gmaps':
        marker_dict[cluster]['rate_str'] += "<dt>Google rating</dt><dd>%s (%s reviews)</dd>\n" % (row.rating, row.nratings)
    elif row.source[2:] == 'yelp':
        marker_dict[cluster]['rate_str'] += "<dt>Yelp rating</dt><dd>%s (%s reviews)</dd>\n" % (row.rating, row.nratings)
    elif row.source[2:] == 'foursquare':
        marker_dict[cluster]['rate_str'] += "<dt>Foursquare rating</dt><dd>%s (%s reviews)</dd>\n" % (row.rating, row.nratings)
    
info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Address</dt><dd>{address}</dd>
{rate_str}

</dl>
"""    

marker_info = [info_box_template.format(**d_item) for i, d_item in marker_dict.items()]

marker_layer = gmaps.marker_layer(markers, hover_text=marker_hover, info_box_content=marker_info)

figure_layout = {
    'width': '800px',
    'height': '800px',
    'border': '1px solid black',
    'padding': '1px'
}

fig = gmaps.figure(layout=figure_layout, center=location_coords, zoom_level=14)
fig.add_layer(marker_layer)
fig


Figure(layout=FigureLayout(border='1px solid black', height='800px', padding='1px', width='800px'))

In [26]:
folium_markers = [(a[0], a[1], b) for a, b in zip(markers, marker_info)]

venues_map = folium.Map(location=[*location_coords], zoom_start=14)
for lat, lng, label in folium_markers:
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        color='blue',
        tooltip=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.5
    ).add_to(venues_map)
venues_map


In [43]:
merge_df = cluster_df \
    .merge(venues_df.loc[venues_df['source']=='0_gmaps'][['cluster','rating']], on='cluster', how='outer') \
    .rename(columns={'rating': 'gmaps_rating'})
merge_df['gmaps_rating_std'] = StandardScaler().fit_transform(merge_df[['gmaps_rating']])

merge_df = merge_df \
    .merge(venues_df.loc[venues_df['source']=='1_yelp'][['cluster','rating']], on='cluster', how='outer') \
    .rename(columns={'rating': 'yelp_rating'})
merge_df['yelp_rating_std'] = StandardScaler().fit_transform(merge_df[['yelp_rating']])

merge_df = merge_df \
    .merge(venues_df.loc[venues_df['source']=='2_foursquare'][['cluster','rating']], on='cluster', how='outer') \
    .rename(columns={'rating': 'foursquare_rating'})
merge_df['foursquare_rating_std'] = StandardScaler().fit_transform(merge_df[['foursquare_rating']])

merge_df 

Unnamed: 0,cluster,name,address,lat,lng,source,gmaps_rating,gmaps_rating_std,yelp_rating,yelp_rating_std,foursquare_rating,foursquare_rating_std
0,0,Juliana's,19 Old Fulton St,40.7027467,-73.9934349,0_gmaps,4.6,1.228375,4.5,1.559457,8.9,1.789323
1,1,Sottocasa,298 Atlantic Ave,40.6883065,-73.9889778,0_gmaps,4.6,1.228375,4.5,1.559457,,
2,2,Dellarocco's,214 Hicks St,40.6950091,-73.9961082,0_gmaps,4.6,1.228375,4.0,0.436648,,
3,3,The House of Pizza & Calzone,132 Union St,40.6839972,-74.00228129999999,0_gmaps,4.6,1.228375,4.0,0.436648,7.3,-0.193306
4,4,Table 87,87 Atlantic Ave,40.6912825,-73.9973211,0_gmaps,4.5,0.862623,4.0,0.436648,,
5,5,Pizza Town,85 5th Ave,40.68,-73.9777778,0_gmaps,4.5,0.862623,,,7.5,0.054522
6,6,Luzzo's,145 Atlantic Ave,40.690696,-73.99522499999999,0_gmaps,4.5,0.862623,4.0,0.436648,,
7,7,Brado,155 Atlantic Ave,40.69064520000001,-73.9949106,0_gmaps,4.5,0.862623,4.0,0.436648,,
8,8,Pizza Secret,72 5th Ave,40.6808089,-73.9777158,0_gmaps,4.5,0.862623,,,7.9,0.55018
9,9,Adrienne's Pizzabar,54 Stone St,40.7042941,-74.0100646,0_gmaps,4.4,0.496871,,,8.6,1.41758


In [91]:
# simple average score
merge_df['meanrating'] = np.nanmean(merge_df[['gmaps_rating_std', 'yelp_rating_std', 'foursquare_rating_std']], axis=1)
merge_df.sort_values('meanrating', ascending=False)[['name', 'address', 'gmaps_rating', 'yelp_rating', 'foursquare_rating']]


Unnamed: 0,name,address,gmaps_rating,yelp_rating,foursquare_rating
67,Prince Street Pizza,27 Prince St,,,9.2
29,L'Arte Della Pizza Brooklyn,172 5th Ave,4.8,,
57,Piz-zetta,90 Livingston St,,4.5,
56,Lucali,575 Henry St,,4.5,
0,Juliana's,19 Old Fulton St,4.6,4.5,8.9
1,Sottocasa,298 Atlantic Ave,4.6,4.5,
31,&pizza - Wall Street,63 Wall St,4.6,,
30,Joe’s Pizza,124 Fulton St,4.6,,
13,Forcella Fried Pizza,445 Albee Square W,4.4,4.5,
9,Adrienne's Pizzabar,54 Stone St,4.4,,8.6


In [92]:
# bayes score
rating_cols = ['gmaps_rating_std', 'yelp_rating_std', 'foursquare_rating_std']
merge_df['nratings'] = merge_df[rating_cols].count(axis=1)
nratings_mean = np.mean(merge_df['nratings'])
rating_avg = np.nanmean(merge_df[rating_cols])
merge_df['w'] = merge_df['nratings']/(merge_df['nratings'] + nratings_mean)
merge_df['R'] = np.mean(merge_df[rating_cols], axis=1)
print('mean number of ratings', nratings_mean)
print('average rating', rating_avg)
merge_df['bayes_score'] = merge_df['w'] * merge_df['R'] + (1 - merge_df['w']) * rating_avg
merge_df.sort_values('bayes_score', ascending=False)[['name', 'address', 'gmaps_rating', 'yelp_rating', 'foursquare_rating', 'nratings', 'bayes_score']]




mean number of ratings 1.4615384615384615
average rating -7.47939721852737e-16


Unnamed: 0,name,address,gmaps_rating,yelp_rating,foursquare_rating,nratings,bayes_score
0,Juliana's,19 Old Fulton St,4.6,4.5,8.9,3,1.025914
67,Prince Street Pizza,27 Prince St,,,9.2,1,0.877933
1,Sottocasa,298 Atlantic Ave,4.6,4.5,,2,0.805374
29,L'Arte Della Pizza Brooklyn,172 5th Ave,4.8,,,1,0.796201
57,Piz-zetta,90 Livingston St,,4.5,,1,0.633529
56,Lucali,575 Henry St,,4.5,,1,0.633529
13,Forcella Fried Pizza,445 Albee Square W,4.4,4.5,,2,0.59405
9,Adrienne's Pizzabar,54 Stone St,4.4,,8.6,2,0.553063
30,Joe’s Pizza,124 Fulton St,4.6,,,1,0.499027
31,&pizza - Wall Street,63 Wall St,4.6,,,1,0.499027


In [88]:
np.nanmean(merge_df[rating_cols])

-7.47939721852737e-16

In [71]:
df

Unnamed: 0,Bar,User1,User2,User3,n_reviews,nreviews,w,R,S
0,Snickers,0.01,,0.7,2,2,0.5,0.355,0.3905
1,Mars Bars,0.25,0.4,0.1,3,3,0.6,0.25,0.3204
2,Milky Way,0.9,1.0,,2,2,0.5,0.95,0.688
3,Almond Joy,,,,0,0,0.0,,
4,Babe Ruth,0.5,0.1,0.3,3,3,0.6,0.3,0.3504
