Google places api: https://developers.google.com/places/web-service/  

In [10]:
#__date__ = 'Oct 2, 2017'
import pandas as pd
import googlemaps
import json
import numpy as np
import requests

import warnings
warnings.filterwarnings("ignore")

In [12]:
# get inspected restaurant data
inspected = pd.read_csv('uniq_biz_yelp_geo.csv', usecols=[1,2,4,11,12])
inspected.head()

Unnamed: 0,CAMIS,DBA,loc,goog_lat,goog_lng
0,50063325,PARIS BAGUETTE,"575 8TH AVE, MANHATTAN, 10018",40.755052,-73.991831
1,50063312,TAPA,"102 LEXINGTON AVE, MANHATTAN, 10016",40.742174,-73.982807
2,50063305,LUANNE'S WILD GINGER,"676 FRANKLIN AVE, BROOKLYN, 11238",40.675388,-73.956502
3,50063296,MARCO'S,"1071 BROADWAY, BROOKLYN, 11221",40.694596,-73.931075
4,50063282,BOND 45 ITALIAN KITCHEN AND BAR,"221 W 46TH ST, MANHATTAN, 10036",40.759238,-73.98624


In [13]:
# take a sample from inspected dataset
inspect_cut = inspected.iloc[540:545,:]
inspect_cut.head()

Unnamed: 0,CAMIS,DBA,loc,goog_lat,goog_lng
540,50060117,KBBQ,"672 LEXINGTON AVE, MANHATTAN, 10022",40.76013,-73.970011
541,50060116,PALACE CHICKEN & GRILL,"4445 21ST ST, QUEENS, 11101",40.748278,-73.947301
542,50060115,EL LOBITO LOCO,"330 HEBERTON AVE, STATEN ISLAND, 10302",40.632998,-74.134152
543,50060112,MANILA TO NEW YORK STEAK HOUSE,"8906-8908 QUEENS BLVD, QUEENS, 11373",40.734394,-73.874628
544,50060105,BIRDS OF A FEATHER,"191 GRAND ST, BROOKLYN, 11211",40.714213,-73.960609


In [14]:
# api credentials. Use your own 'secrets.json' file
with open('secrets.json') as f:    
    creds = json.load(f)
    google_client = googlemaps.Client(key=creds["googlemaps"]['1'])

In [15]:
def get_keywords(name):
    keywords = name.split(' ')
    # lowercase
    keywords = map(lambda word: word.lower(), keywords)
    # ignore puncuation ect.
    keywords = map(lambda word: word.replace("'","")\
                                    .replace("-","")\
                                    .replace(".","")\
                                    .replace(",",""), keywords)
    # only keeps words of length 3 or more
    keywords = filter(lambda word: len(word) >= 3, keywords)
    # but throw out a common set of 3 letter words
    keywords = filter(lambda word: word not in ['and', 'the', 'for'], keywords)
    return keywords

In [16]:
# sample
get_keywords(inspected.loc[200,'DBA'])

['panino', 'rustico']

In [17]:
def find_google(df):
    import tqdm
    for row, insp in tqdm.tqdm(df.iterrows()):
        df['google_categories'] = np.empty((len(df), 1))
        df['google_categories'][:] = np.nan
        df['google_categories'] = df['google_categories'].astype(object)
        
        name = insp[1]
        true_keywords = get_keywords(name)
        try:
            params = {
                'location': {'lat' : insp[3], 'lng' : insp[4]},
                'keyword': " ".join(true_keywords),
                'radius': 100,
            }

            google_results = google_client.places_nearby(**params)
            # finding the best match among candidates
            best_match = False
            for i, biz in enumerate(google_results['results']):
                candidate_keywords = get_keywords(biz['name'])
                union = len(set(true_keywords) & set(candidate_keywords))
                if union > 0:
                    if best_match == False or union > best_match[1]:
                        best_match = (i, union)
            # append restaurant attributes to the inspections dataframe
            if best_match == False:
                pass # dataframe attributes already default to 'None'
            else:
                best_match = google_results['results'][best_match[0]]
                df.loc[row, 'google_price_level'] = best_match.get('price_level', np.nan)
                types = [best_match['types']]
                df.set_value(row, 'google_categories', types )
                df.loc[row, 'google_rating'] = best_match.get('rating', np.nan)
                
        except (IndexError, requests.Timeout, googlemaps.exceptions.HTTPError):
            df.set_value(row, 'google_categories', None )
            df.set_value(row, 'google_price_level', None )
            df.set_value(row, 'google_rating', None )
    return df

In [18]:
# testing on sample dataset
wowgg = find_google(inspect_cut)
wowgg

5it [00:01,  2.97it/s]


Unnamed: 0,CAMIS,DBA,loc,goog_lat,goog_lng,google_categories,google_price_level,google_rating
540,50060117,KBBQ,"672 LEXINGTON AVE, MANHATTAN, 10022",40.76013,-73.970011,,,4.3
541,50060116,PALACE CHICKEN & GRILL,"4445 21ST ST, QUEENS, 11101",40.748278,-73.947301,,1.0,3.8
542,50060115,EL LOBITO LOCO,"330 HEBERTON AVE, STATEN ISLAND, 10302",40.632998,-74.134152,,,5.0
543,50060112,MANILA TO NEW YORK STEAK HOUSE,"8906-8908 QUEENS BLVD, QUEENS, 11373",40.734394,-73.874628,,,2.3
544,50060105,BIRDS OF A FEATHER,"191 GRAND ST, BROOKLYN, 11211",40.714213,-73.960609,"[[restaurant, food, point_of_interest, establi...",2.0,4.1
