##### Imports and instantiating the API

In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Additional Imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [2]:
# Load the Yelp credentials from the current user's home directory rather than
# from a machine-specific absolute path, so the notebook runs for any user.
# Expected location: ~/.secret/yelp_api.json   (adjust if yours lives elsewhere)
credentials_path = os.path.join(os.path.expanduser('~'), '.secret', 'yelp_api.json')
with open(credentials_path) as f:
    login = json.load(f)

# Instantiate YelpAPI Variable (the key comes from the file's 'api-key' entry)
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

##### Define Search

In [3]:
# Search parameters passed to every Yelp search request in this notebook
TERM = 'Chicken'    # keyword to search for
LOCATION = 'PA'     # area to search in

##### Create a results-in-progress JSON file

In [4]:
# Build the results-in-progress filename (can include a folder) from the
# search constants, so changing LOCATION or TERM can never silently reuse a
# file that holds results from a different search.
# With LOCATION='PA' and TERM='Chicken' this evaluates to
# 'Data/results_in_progress_PA_chicken.json'.
JSON_FILE = f"Data/results_in_progress_{LOCATION}_{TERM.lower().replace(' ', '_')}.json"
JSON_FILE

'Data/results_in_progress_PA_chicken.json'

In [5]:
# Make sure the results-in-progress file is on disk before anything reads it
file_exists = os.path.isfile(JSON_FILE)

if file_exists:
    # Already there: leave its contents alone and just tell the user
    print(f'[i] {JSON_FILE} already exists')
else:
    # Create the parent folder first when the path contains one
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    # Tell the user, then start the file off as an empty JSON list
    print(f'[i] {JSON_FILE} not found. Saving empty list to file')
    with open(JSON_FILE, 'w') as out_file:
        json.dump([], out_file)

[i] Data/results_in_progress_PA_chicken.json not found. Saving empty list to file


##### Determine how many results are already in the file

In [6]:
# Read back everything that has been saved to the file so far
with open(JSON_FILE, 'r') as saved_file:
    previous_results = json.load(saved_file)

# The count of saved results doubles as the offset for the next request
n_results = len(previous_results)
print(f' - {n_results} previous results found.')

 - 0 previous results found.


##### Figure out how many pages of results we will need

In [7]:
# Perform our API call with yelp_api's search_query method, skipping the
# results that are already saved (offset=n_results)
search_kwargs = dict(location=LOCATION, term=TERM, offset=n_results)
results = yelp_api.search_query(**search_kwargs)
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [8]:
# How many results total?
# This is the 'total' field of the response: the number of matches reported
# for the whole search, not the number of businesses included in this page.
total_results = results['total']
total_results

10900

In [9]:
# How many did we get the details for?
# The length of the 'businesses' list in this response is used as the size
# of one page of results in the page-count calculation.
results_per_page = len(results['businesses'])
results_per_page

20

In [10]:
# `math` is already imported in the first cell, so it is not re-imported here.
# Results still missing from the file, clamped at 0 in case the file already
# holds at least as many results as the API reports.
remaining_results = max(results['total'] - n_results, 0)
# Use math.ceil to round up for the total number of pages of results.
# When the first page came back empty (results_per_page == 0) there is nothing
# to page through, and dividing would raise ZeroDivisionError.
n_pages = math.ceil(remaining_results / results_per_page) if results_per_page else 0
n_pages

545

##### Add this page of results to .json file

In [11]:
# Add this page's businesses to the saved list (in place), then rewrite the file
previous_results += results['businesses']
with open(JSON_FILE, 'w') as out_file:
    json.dump(previous_results, out_file)

##### Set up a progress bar in our for loop

In [12]:
# Progress-bar demonstration: one tick per page of results.
# `tqdm_notebook` and `time` are imported in the first cell of the notebook,
# so they are not imported again here.
for _ in tqdm_notebook(range(n_pages)):
    # adds 200 ms pause per iteration
    time.sleep(.2)

  0%|          | 0/545 [00:00<?, ?it/s]

In [13]:
def create_json_file(JSON_FILE,  delete_if_exists=False):
    """Make sure JSON_FILE exists, creating it as an empty JSON list if needed.

    Parameters
    ----------
    JSON_FILE : str
        Path of the JSON file. It may include a folder, which is created
        when it does not exist yet.
    delete_if_exists : bool, default False
        When True, an existing file is removed and replaced by a new file
        holding an empty list. When False, an existing file is left untouched.

    Returns
    -------
    None
        A status message is printed for every action taken.
    """
    if os.path.isfile(JSON_FILE):
        # Only a value equal to True triggers deletion; a merely truthy value
        # (e.g. a non-empty string) does not.
        if delete_if_exists == True:
            print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
            os.remove(JSON_FILE)
            # fall through to the creation steps below
        else:
            print(f"[i] {JSON_FILE} already exists.")
            return

    # Reaching this point means there is no file at JSON_FILE
    print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")

    # Create the parent folder when the path includes one
    parent_dir = os.path.dirname(JSON_FILE)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Start the json file off with an empty list
    with open(JSON_FILE, 'w') as new_file:
        json.dump([], new_file)

In [14]:
## Create a new empty json file (deleting the previous one if it exists)
create_json_file(JSON_FILE, delete_if_exists=True)
## Load previous results and use len of results for offset
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)

## set offset based on previous results
n_results = len(previous_results)
print(f'- {n_results} previous results found.')
# use our yelp_api variable's search_query method to perform our API call
results = yelp_api.search_query(location=LOCATION,
                                term=TERM,
                                offset=n_results)
## How many results total?
total_results = results['total']
## How many did we get the details for?
results_per_page = len(results['businesses'])
# Use math.ceil to round up for the total number of pages of results.
# When the first page came back empty (results_per_page == 0) there is nothing
# to page through, and dividing would raise ZeroDivisionError. The remaining
# count is clamped at 0 so the page count can never be negative.
remaining_results = max(total_results - n_results, 0)
n_pages = math.ceil(remaining_results / results_per_page) if results_per_page else 0
n_pages

[!] Data/results_in_progress_PA_chicken.json already exists. Deleting previous file...
[i] Data/results_in_progress_PA_chicken.json not found. Saving empty list to new file.
- 0 previous results found.


545

In [15]:
for i in tqdm_notebook( range(1,n_pages+1)):

    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## the number of saved results is used as the offset of the next request
    n_results = len(previous_results)

    ## Stop before requesting a page that would take us past 1000 results.
    ## The condition counts saved results, not requests, even though the
    ## message below speaks of "api calls".
    ## NOTE(review): presumably this mirrors a cap on the Yelp search
    ## endpoint - confirm against the API documentation.
    if (n_results + results_per_page) > 1000:
        print('Exceeded 1000 api calls. Stopping loop.')
        break

    ## use n_results as the OFFSET
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)
    new_businesses = results['businesses']

    ## An empty page adds nothing to the file, so n_results would not change
    ## and every remaining iteration would send the very same request again.
    if not new_businesses:
        print('No more results returned. Stopping loop.')
        break

    ## append new results and save to file after every page, so an interrupted
    ## run keeps everything fetched so far
    previous_results.extend(new_businesses)
    with open(JSON_FILE,'w') as f:
        json.dump(previous_results,f)

    ## 200 ms pause between requests
    time.sleep(.2)

  0%|          | 0/545 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.


In [16]:
# convert .json to dataframe
# pd.read_json parses the saved list of business records straight from the
# file; column dtypes are whatever read_json infers.
final_df = pd.read_json(JSON_FILE)
# Show the first and the last rows as two separate previews
display(final_df.head(), final_df.tail())

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price
0,PNRz9ZAfFXTJDHM1Jkc2rA,brazas-bbq-chicken-philadelphia,Brazas BBQ Chicken,https://s3-media4.fl.yelpcdn.com/bphoto/NTIgw6...,False,https://www.yelp.com/biz/brazas-bbq-chicken-ph...,25,"[{'alias': 'chickenshop', 'title': 'Chicken Sh...",4.5,"{'latitude': 39.94139968793364, 'longitude': -...","[delivery, pickup]","{'address1': '326 South St', 'address2': None,...",12675198551,(267) 519-8551,2378.665121,
1,GWqPmrWu0kXB_-gB1H-j6A,love-and-honey-fried-chicken-philadelphia-2,Love & Honey Fried Chicken,https://s3-media3.fl.yelpcdn.com/bphoto/_5aapS...,False,https://www.yelp.com/biz/love-and-honey-fried-...,469,"[{'alias': 'chickenshop', 'title': 'Chicken Sh...",4.5,"{'latitude': 39.967461, 'longitude': -75.136992}",[delivery],"{'address1': '1100 N Front St', 'address2': No...",12157897878,(215) 789-7878,3292.403557,$$
2,3j08Cje2YWUuRxV60BiVFw,hatch-and-coop-philadelphia-2,Hatch & Coop,https://s3-media2.fl.yelpcdn.com/bphoto/eUb768...,False,https://www.yelp.com/biz/hatch-and-coop-philad...,157,"[{'alias': 'newamerican', 'title': 'American (...",4.0,"{'latitude': 39.949466, 'longitude': -75.1605349}","[pickup, delivery]","{'address1': '122 S 12th St', 'address2': '', ...",12159220102,(215) 922-0102,1029.00042,$
3,hGb-yA-llJ7hnw9NyzE52A,crunchikn-philadelphia,Crunchik'n,https://s3-media1.fl.yelpcdn.com/bphoto/r2FgSh...,False,https://www.yelp.com/biz/crunchikn-philadelphi...,224,"[{'alias': 'korean', 'title': 'Korean'}, {'ali...",4.5,"{'latitude': 39.9482037845251, 'longitude': -7...","[delivery, pickup]","{'address1': '212 S 11th St', 'address2': '', ...",12678869373,(267) 886-9373,1188.668839,$$
4,z5YxP_jyIU7dRKUA7V_H0w,cily-chicken-rice-philadelphia-2,Cily Chicken Rice,https://s3-media2.fl.yelpcdn.com/bphoto/fGRalw...,False,https://www.yelp.com/biz/cily-chicken-rice-phi...,129,"[{'alias': 'thai', 'title': 'Thai'}]",4.5,"{'latitude': 39.955545, 'longitude': -75.15547...","[delivery, pickup]","{'address1': '933 Race St', 'address2': None, ...",12159821113,(215) 982-1113,1355.087363,$$


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price
995,Cn7EPZB7acLQb5fzTwFAcw,chick-fil-a-audubon,Chick-fil-A,https://s3-media4.fl.yelpcdn.com/bphoto/6INZjx...,False,https://www.yelp.com/biz/chick-fil-a-audubon?a...,38,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",3.5,"{'latitude': 39.891206, 'longitude': -75.0894783}",[delivery],"{'address1': '110 Black Horse Pike', 'address2...",18565470815.0,(856) 547-0815,9848.166376,$
996,YNl2JK7qqYyHqKc4bbKivg,la-dominique-philadelphia,La Dominique,https://s3-media3.fl.yelpcdn.com/bphoto/rbl7BN...,False,https://www.yelp.com/biz/la-dominique-philadel...,44,"[{'alias': 'creperies', 'title': 'Creperies'},...",4.5,"{'latitude': 39.9555876106024, 'longitude': -7...",[delivery],"{'address1': '3300 Market St', 'address2': '',...",,,1585.485187,$
997,8Xk9nyQkaLZyWJcZrwX2uA,kung-fu-tea-philadelphia-15,Kung Fu Tea,https://s3-media4.fl.yelpcdn.com/bphoto/HFOgwq...,False,https://www.yelp.com/biz/kung-fu-tea-philadelp...,21,"[{'alias': 'bubbletea', 'title': 'Bubble Tea'}...",4.5,"{'latitude': 40.00282, 'longitude': -75.22171}","[pickup, delivery]","{'address1': '4500 City Ave', 'address2': None...",12159219082.0,(215) 921-9082,6966.706042,
998,_Uvb_q66j8SDPG9RfRvNvg,blk-shp-swedesboro,Blk Shp,https://s3-media2.fl.yelpcdn.com/bphoto/m9-H2D...,False,https://www.yelp.com/biz/blk-shp-swedesboro?ad...,43,"[{'alias': 'tapas', 'title': 'Tapas Bars'}, {'...",4.0,"{'latitude': 39.749312, 'longitude': -75.308646}",[],"{'address1': '1301 Kings Hwy', 'address2': '',...",18564670200.0,(856) 467-0200,25582.758051,
999,EgI6S-JxwpxICPsqmaggiw,sofitel-philadelphia-philadelphia,Sofitel Philadelphia,https://s3-media4.fl.yelpcdn.com/bphoto/21UjLm...,False,https://www.yelp.com/biz/sofitel-philadelphia-...,409,"[{'alias': 'hotels', 'title': 'Hotels'}]",4.0,"{'latitude': 39.9507904, 'longitude': -75.1690...",[],"{'address1': '120 S 17th St', 'address2': '', ...",12155698300.0,(215) 569-8300,345.520198,$$


##### Check for duplicates

In [17]:
# Count the rows whose business id already appeared in an earlier row
final_df['id'].duplicated().sum()

68

In [18]:
# Keep only the first row seen for each business id ...
final_df = final_df.drop_duplicates(subset=['id'])
# ... and confirm that no repeated ids are left (expected: 0)
final_df['id'].duplicated().sum()

0

##### Save the final DataFrame to a .csv (or .csv.gz if it's too big)


In [19]:
# save the final results to a gzip-compressed csv
# index=False keeps the DataFrame's row index out of the file
final_df.to_csv('Data/final_results_PA_chicken.csv.gz', compression='gzip', index=False)