In [1]:
from tqdm import tqdm

from yelp.client import Client
from yelp.oauth1_authenticator import Oauth1Authenticator

import googlemaps

import json
import pandas as pd
import math

In [2]:
# Build the Yelp and Google Maps API clients from the local credentials file.
with open('secrets.json') as secrets_file:
    creds = json.load(secrets_file)

# Yelp: the "yelp" entry is unpacked as keyword arguments for the OAuth1 authenticator.
auth = Oauth1Authenticator(**creds["yelp"])
yelp_client = Client(auth)

# Google Maps: only the API key stored under "googlemaps" is used.
google_client = googlemaps.Client(key=creds["googlemaps"]["key"])

In [3]:
# Load the inspected-restaurant table; the first CSV column becomes the index.
inspected = pd.read_csv('uniq_biz.csv', index_col=0)

# Add empty placeholder columns that find_yelp fills in per row.
for new_col in ('yelp_rating', 'yelp_categories', 'yelp_review_count',
                'goog_lat', 'goog_lng'):
    inspected.loc[:, new_col] = None

inspected.head()

Unnamed: 0_level_0,CAMIS,DBA,PHONE,loc,INSPECTION DATE,GRADE,new_grade,yelp_rating,yelp_categories,yelp_review_count,goog_lat,goog_lng
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,50063325,PARIS BAGUETTE,2016865943,"575 8TH AVE, MANHATTAN, 10018",1900-01-01,,,,,,,
1,50063312,TAPA,7187374237,"102 LEXINGTON AVE, MANHATTAN, 10016",1900-01-01,,,,,,,
2,50063305,LUANNE'S WILD GINGER,7186268663,"676 FRANKLIN AVE, BROOKLYN, 11238",1900-01-01,,,,,,,
3,50063296,MARCO'S,2159324545,"1071 BROADWAY, BROOKLYN, 11221",1900-01-01,,,,,,,
4,50063282,BOND 45 ITALIAN KITCHEN AND BAR,2123999547,"221 W 46TH ST, MANHATTAN, 10036",1900-01-01,,,,,,,


In [4]:
# Silence pandas' chained-assignment (SettingWithCopy) check for this session.
pd.set_option('mode.chained_assignment', None)

In [5]:
def get_keywords(name):
    """Reduce a business name to a list of lowercase matching keywords.

    The name is lowercased, hyphens are treated as word separators, the
    possessive "'s", apostrophes, periods and commas are stripped, and the
    result is split on whitespace. Tokens shorter than three characters and
    the stop words 'and', 'the', 'for' are discarded.

    Parameters
    ----------
    name : str
        Business name, e.g. "LUANNE'S WILD GINGER".

    Returns
    -------
    list of str
        Keywords in their original order, e.g. ['luanne', 'wild', 'ginger'].
        A real list is returned (not a lazy map/filter object), so callers
        can iterate it more than once on Python 3 as well as Python 2.
    """
    stop_words = ('and', 'the', 'for')
    # Hyphens become spaces *before* splitting so "HOTEL-ASIATE" yields two
    # keywords instead of the single token "hotel asiate".
    cleaned = name.lower().replace("-", " ")
    # Order matters: strip the possessive "'s" before bare apostrophes.
    for junk in ("'s", "'", ".", ","):
        cleaned = cleaned.replace(junk, "")
    return [word for word in cleaned.split()
            if len(word) >= 3 and word not in stop_words]

In [6]:
def _best_match_index(true_keywords, businesses):
    """Return the index of the business sharing the most keywords, or None.

    A candidate qualifies only if its name shares at least one keyword with
    `true_keywords`; ties keep the earliest candidate in `businesses`.
    """
    wanted = set(true_keywords)
    best_index, best_overlap = None, 0
    for i, biz in enumerate(businesses):
        overlap = len(wanted & set(get_keywords(biz.name)))
        # Strict '>' means zero-overlap candidates never win and the first
        # of several equally good candidates is kept.
        if overlap > best_overlap:
            best_index, best_overlap = i, overlap
    return best_index


def find_yelp(df):
    """Annotate inspected restaurants with Google coordinates and Yelp data.

    For every row the address is geocoded with the module-level
    `google_client`. When geocoding returns a result, its coordinates are
    stored in 'goog_lat' / 'goog_lng' and Yelp is searched around that point
    through the module-level `yelp_client`; otherwise Yelp is searched across
    'New York, NY' with a larger candidate limit. The candidate whose name
    shares the most keywords (see `get_keywords`) with the inspected name
    supplies 'yelp_review_count', 'yelp_categories' and 'yelp_rating'.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to annotate. The business name is read from column position 1
        and the address from column position 3 ('DBA' and 'loc' in the
        `inspected` frame displayed earlier in this notebook).

    Returns
    -------
    pandas.DataFrame
        An annotated copy of `df`; the frame passed in is not modified.

    Notes
    -----
    A row whose lookup raises an `Exception` is reported with `tqdm.write`
    and otherwise left as it was, so one bad record cannot abort a long run.
    `KeyboardInterrupt` is deliberately not caught, so the run can be stopped.
    """
    # Work on a copy: callers pass `.iloc` slices, and writing into a slice
    # may or may not write through to the parent frame.
    df = df.copy()

    for row, insp in tqdm(df.iterrows(), total=len(df)):
        try:
            # Explicit positional access; plain insp[1] on a label-indexed
            # Series relies on an ambiguous integer fallback.
            name = insp.iloc[1]
            addr = insp.iloc[3]
            # Materialise once: the keywords are used for the search term
            # and again for candidate matching.
            true_keywords = list(get_keywords(name))
            term = " ".join(true_keywords)

            # get google lat/long for address
            geocode = google_client.geocode(addr)

            if len(geocode) > 0:
                # geocoding worked: record coordinates and search nearby
                loc = geocode[0]["geometry"]["location"]
                df.loc[row, 'goog_lat'] = loc['lat']
                df.loc[row, 'goog_lng'] = loc['lng']

                # NOTE(review): 'sort': 0 and 'radius_filter': 500 are
                # presumably "best match" ordering and a radius in meters in
                # the Yelp API - confirm against the API docs.
                params = {
                    'term': term,
                    'sort': 0,
                    'radius_filter': 500,
                    'limit': 5
                }
                yelp_results = yelp_client.search_by_coordinates(
                    loc['lat'], loc['lng'], **params)
            else:
                # geocoding failed: search without lat/lng and check a few
                # more candidates
                params = {
                    'term': term,
                    'sort': 0,
                    'limit': 10,
                    'location': 'New York, NY'
                }
                yelp_results = yelp_client.search(**params)

            best_index = _best_match_index(true_keywords, yelp_results.businesses)

            # With no match the output columns keep their existing values.
            if best_index is not None:
                match = yelp_results.businesses[best_index]
                df.loc[row, 'yelp_review_count'] = match.review_count
                # Build a real list first so the stored text is the list of
                # aliases on Python 3 too, not a map-object repr.
                df.loc[row, 'yelp_categories'] = str(
                    [cat.alias for cat in match.categories])
                df.loc[row, 'yelp_rating'] = match.rating

        except Exception as exc:
            # Best effort: report the failure and move on to the next record.
            tqdm.write('find_yelp: skipping row {!r}: {!r}'.format(row, exc))

    return df

In [18]:
# Smoke test: run find_yelp on five rows (positions 10005-10009) before
# starting the full, multi-hour batched run.
df2 = find_yelp(inspected.iloc[10005:10010,:])

5it [00:02,  2.07it/s][00:00,  2.81it/s]


In [19]:
# Display the smoke-test result to check the Yelp / Google columns.
df2

Unnamed: 0_level_0,CAMIS,DBA,PHONE,loc,INSPECTION DATE,GRADE,new_grade,yelp_rating,yelp_categories,yelp_review_count,goog_lat,goog_lng
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10005,50007883,PRIMORSKIY CORP.,2125420747,"1526 NEPTUNE AVE, BROOKLYN, 11224",2016-05-17,A,A,,,,40.5791,-73.9839
10006,50007882,THE FINANCIER PATISSERIE,2127863220,"3-4 World Financial Center, MANHATTAN, 10281",2016-11-04,A,A,3.5,"[u'coffee', u'desserts']",73.0,40.7135,-74.0151
10007,50007880,MANDARIN ORIENTAL HOTEL-ASIATE,2128058368,"80 COLUMBUS CIRCLE at 60th street, MANHATTAN, ...",2016-04-21,A,A,,,,40.7691,-73.983
10008,50007879,MANDARIN ORIENTAL HOTEL-CAFETERIA,2128058800,"80 COLUMBUS CIRCLE, MANHATTAN, 10023",2016-04-23,A,A,,,,40.7691,-73.983
10009,50007878,MANDARIN ORIENTAL HOTEL-BANQUET,2128058888,"80 COLUMBUS CIRCLE at 60th street, MANHATTAN, ...",2016-04-27,A,A,,,,40.7691,-73.983


### Run this when you have some time (each ~5,000-row batch takes roughly two hours)

In [7]:
# Number of rows in `inspected`, i.e. how many records the batched runs must cover.
len(inspected)

25997

In [24]:
# Batch 1: positions 0-4999 (the iloc stop bound is exclusive).
yelp_inspected_1 = find_yelp(inspected.iloc[:5000, :])

5000it [1:54:54,  1.26s/it]1,  1.57s/it]


In [26]:
# Checkpoint batch 1 to disk; it is reloaded as `y1` before the final concat.
yelp_inspected_1.to_csv('yelped_1.csv')

In [27]:
# Batch 2: positions 5001-9999.
# NOTE(review): position 5000 is covered neither here nor by batch 1 (which
# stops at 4999). It is back-filled by the later find_yelp call on
# iloc[[5000, 10000, 20000]], so do not change this bound without also
# changing that call, or the row will be duplicated in the final concat.
yelp_inspected_2 = find_yelp(inspected.iloc[5001:10000, :])

4999it [1:49:38,  1.44s/it]2,  2.03s/it]


In [30]:
# Checkpoint batch 2 to disk; it is reloaded as `y2` before the final concat.
yelp_inspected_2.to_csv('yelped_2.csv')

In [31]:
# Batch 3: positions 10001-14999.
# NOTE(review): position 10000 is skipped (batch 2 stops at 9999). It is
# back-filled by the later find_yelp call on iloc[[5000, 10000, 20000]].
yelp_inspected_3 = find_yelp(inspected.iloc[10001:15000, :])

4999it [1:48:42,  1.13it/s]1,  1.93s/it]


In [32]:
# Checkpoint batch 3 to disk; it is reloaded as `y3` before the final concat.
yelp_inspected_3.to_csv('yelped_3.csv')

In [8]:
# Batch 4: positions 15000-19999 (contiguous with batch 3, which stops at 14999).
yelp_inspected_4 = find_yelp(inspected.iloc[15000:20000, :])

5000it [2:21:36,  1.86s/it]1,  1.49s/it]


In [9]:
# Checkpoint batch 4 to disk; it is reloaded as `y4` before the final concat.
yelp_inspected_4.to_csv('yelped_4.csv')

In [10]:
# Batch 5: positions 20001 through the end.
# NOTE(review): position 20000 is skipped (batch 4 stops at 19999). It is
# back-filled by the later find_yelp call on iloc[[5000, 10000, 20000]].
yelp_inspected_5 = find_yelp(inspected.iloc[20001:, :])

5996it [2:16:00,  1.15s/it]2,  2.26s/it]


In [11]:
# Checkpoint batch 5 to disk; it is reloaded as `y5` before the final concat.
yelp_inspected_5.to_csv('yelped_5.csv')

In [25]:
# Patch batch: positions 5000, 10000 and 20000, which the slices used for
# batches 2, 3 and 5 (starting at 5001, 10001 and 20001) left out.
# Unlike the other batches this result is not written to its own CSV; the
# final concat uses the in-memory frame directly.
yelp_inspected_6 = find_yelp(inspected.iloc[[5000, 10000, 20000], :])

3it [00:05,  1.89s/it][00:02,  2.43s/it]


In [16]:
# Reload the five checkpointed batches; the first CSV column is the index.
batch_files = [
    'yelped_1.csv',
    'yelped_2.csv',
    'yelped_3.csv',
    'yelped_4.csv',
    'yelped_5.csv',
]
y1, y2, y3, y4, y5 = [pd.read_csv(path, index_col=0) for path in batch_files]

In [27]:
# Stack the five reloaded batches and the in-memory patch batch row-wise
# (pd.concat stacks along axis 0 by default). The patch rows end up last.
all_batches = [y1, y2, y3, y4, y5, yelp_inspected_6]
y = pd.concat(all_batches)

In [30]:
# Write the combined, annotated table (all batches plus the patch rows) to disk.
y.to_csv('uniq_biz_yelp_geo.csv')