In [1]:
# install
!pip install yelpapi --quiet

In [2]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Additional Imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [3]:
# file path creation
relative_path = os.path.join('.secret', 'yelp_api.json')

In [4]:
# Load Yelp API credentials and instantiate YelpAPI object
with open('.secret/yelp_api.json') as file:
    yelp_credentials = json.load(file)
    
yelp_api = YelpAPI(yelp_credentials['api-key'], timeout_s=5.0)

In [5]:
# Define API call parameters and output file path
LOCATION = 'Greenville, SC'
TERM = 'Sushi'
JSON_FILE = '/Data/results_SC_Sushi.json'

# Display the file path where data will be saved
print(f'Data will be saved to: {JSON_FILE}')

Data will be saved to: /Data/results_SC_Sushi.json


In [6]:
# Check if JSON_FILE exists and create it if it doesn't
if not os.path.isfile(JSON_FILE):
    
    # Create the directory if it doesn't exist
    os.makedirs(os.path.dirname(JSON_FILE), exist_ok=True)
    
    # Inform user and save an empty list to file
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')
    with open(JSON_FILE, 'w') as file:
        json.dump([], file)
else:
    # Inform user if the file already exists
    print(f'[i] {JSON_FILE} already exists.')

[i] /Data/results_SC_Sushi.json already exists.


In [7]:
# Load previous results and set offset based on the number of results
with open(JSON_FILE, 'r') as file:
    previous_results = json.load(file)

n_results = len(previous_results)

print(f'- {n_results} previous results found.')

- 0 previous results found.


## Making the first API call to get the first page of data Step-by-step

In [8]:
# use our yelp_api variable's search_query method to perform our API call
results = yelp_api.search_query(location = LOCATION,
                                term = TERM,
                                offset = n_results)

results.keys()

dict_keys(['businesses', 'total', 'region'])

In [9]:
## How many results total?
total_results = results['total']

total_results

110

In [10]:
business_data = results['businesses']

# specify the filename where you want to save the data
json_file_path = JSON_FILE

# save the business data to a JSON file
with open(json_file_path, 'w') as file:
    json.dump(business_data, file, indent = 4)

In [11]:
## How many did we get the details for?
results_per_page = len(business_data)
print(f'number of results retrieved per page', results_per_page)

number of results retrieved per page 20


In [12]:
# Use math.ceil to round up for the total number of pages of results.
n_pages = math.ceil(total_results / results_per_page)

print(f'Total number of pages: {n_pages}')

Total number of pages: 6


## A more optimized solution

In [14]:
results = yelp_api.search_query(location=LOCATION, term=TERM, offset=n_results)

total_results = results['total']
business_data = results['businesses']

with open(JSON_FILE, 'w') as file:
    json.dump(business_data, file, indent=4)

results_per_page = len(business_data)

# Check if there are any results per page to avoid division by zero
if results_per_page > 0:
    n_pages = math.ceil(total_results / results_per_page)
else:
    n_pages = 0  # No pages if there are no results

print(f'Number of results retrieved per page: {results_per_page}')
print(f'Total number of pages: {n_pages}')

# Additional handling for when there are no business results
if n_pages == 0:
    print("No business data found for the given search parameters.")


Number of results retrieved per page: 20
Total number of pages: 6


In [16]:
# Assuming `results_per_call` and `total_iterations` are correctly calculated before this snippet.
for i in tqdm_notebook(range(1, total_results + 1)):
    try:
        time.sleep(0.2)  # Short delay to respect API rate limits
        
        # Load existing results to append new data
        with open(JSON_FILE, 'r') as file:
            previous_results = json.load(file)

        # Fetch new results using the current length of previous_results as the offset
        new_results = yelp_api.search_query(location=LOCATION, term=TERM, offset=len(previous_results))

        # Append and save the updated results
        updated_results = previous_results + new_results['businesses']
        with open(JSON_FILE, 'w') as file:
            json.dump(updated_results, file)

    except Exception as e:
        if 'Too Many Requests for url' in str(e):
            print('Rate limit exceeded. Stopping data collection.')
            break  # Exit loop if rate limit is exceeded
        else:
            print(f'An error occurred: {e}')
            continue  # Continue to next iteration in case of other errors

  0%|          | 0/110 [00:00<?, ?it/s]

In [18]:
# Load the final JSON file into a DataFrame
df = pd.read_json(JSON_FILE)

# Display the first and last few rows of the DataFrame
display(df.head(), df.tail())

# Check for duplicate entries based on the 'id' column
duplicate_count = df.duplicated(subset='id').sum()

print('\n')
print(f'Number of duplicate IDs: {duplicate_count}')

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price
0,2jXS4oZkMhAONtd2j7L5Yg,chef-21-sushi-burger-and-korean-bbq-greenville-3,Chef 21 Sushi Burger & Korean BBQ,https://s3-media4.fl.yelpcdn.com/bphoto/TgP5gY...,False,https://www.yelp.com/biz/chef-21-sushi-burger-...,36,"[{'alias': 'korean', 'title': 'Korean'}, {'ali...",4.5,"{'latitude': 34.847671, 'longitude': -82.394229}","[delivery, pickup]","{'address1': '500 E McBee Ave', 'address2': 'S...",18642633018,(864) 263-3018,3341.861901,
1,RGRk1ioORwm_FIX8PM732Q,konnichiwa-greenville,Konnichiwa,https://s3-media3.fl.yelpcdn.com/bphoto/p47H0_...,False,https://www.yelp.com/biz/konnichiwa-greenville...,68,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",4.1,"{'latitude': 34.845952342825115, 'longitude': ...",[],"{'address1': '101 Falls Park Dr', 'address2': ...",18642524436,(864) 252-4436,4184.255183,
2,zG_XOAFi9Y560WJ1RvghBw,sushi-masa-japanese-restaurant-greenville,Sushi-Masa Japanese Restaurant,https://s3-media1.fl.yelpcdn.com/bphoto/zsRavZ...,False,https://www.yelp.com/biz/sushi-masa-japanese-r...,161,"[{'alias': 'sushi', 'title': 'Sushi Bars'}]",4.4,"{'latitude': 34.8512725830078, 'longitude': -8...",[delivery],"{'address1': '8590 Pelham Rd', 'address2': 'St...",18642882227,(864) 288-2227,11481.830881,$$
3,7cJxOV-ANX1qLThK3yV96w,otto-izakaya-greenville-4,Otto Izakaya,https://s3-media1.fl.yelpcdn.com/bphoto/TdPhFy...,False,https://www.yelp.com/biz/otto-izakaya-greenvil...,448,"[{'alias': 'japanese', 'title': 'Japanese'}, {...",4.2,"{'latitude': 34.8228218820722, 'longitude': -8...",[delivery],"{'address1': '15 Market Point Dr', 'address2':...",18645688009,(864) 568-8009,5933.485357,$$
4,Kx1x7Kf6C2gtogQErWSu0A,o-ku-greenville,O-Ku,https://s3-media2.fl.yelpcdn.com/bphoto/7dR0xy...,False,https://www.yelp.com/biz/o-ku-greenville?adjus...,38,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...",3.9,"{'latitude': 34.847954222223294, 'longitude': ...",[],"{'address1': '30 W Broad St', 'address2': None...",18643264812,(864) 326-4812,3931.009612,


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,location,phone,display_phone,distance,price
105,xQZIvcjkH2R14yaHr2qQYQ,the-cheesecake-factory-greenville-2,The Cheesecake Factory,https://s3-media3.fl.yelpcdn.com/bphoto/Wk5Aul...,False,https://www.yelp.com/biz/the-cheesecake-factor...,470,"[{'alias': 'desserts', 'title': 'Desserts'}, {...",3.1,"{'latitude': 34.8499166, 'longitude': -82.3335...",[delivery],"{'address1': '700 Haywood Mall', 'address2': '...",18642884444,(864) 288-4444,2209.333296,$$
106,xb9QSdbk63Ani2-S5MrIHQ,harris-teeter-greenville-6,Harris Teeter,https://s3-media3.fl.yelpcdn.com/bphoto/ZelRSg...,False,https://www.yelp.com/biz/harris-teeter-greenvi...,27,"[{'alias': 'grocery', 'title': 'Grocery'}, {'a...",3.6,"{'latitude': 34.8279736, 'longitude': -82.3987...",[],"{'address1': '1720 Augusta St', 'address2': ''...",18649778041,(864) 977-8041,4335.688854,$$
107,zTTrMt6nvB_bw0j3RUEsLA,new-china-greer,New China,https://s3-media2.fl.yelpcdn.com/bphoto/IGpt7F...,False,https://www.yelp.com/biz/new-china-greer?adjus...,18,"[{'alias': 'chinese', 'title': 'Chinese'}]",2.4,"{'latitude': 34.9476509, 'longitude': -82.2240...",[delivery],"{'address1': '614 N Main St', 'address2': '', ...",18648778885,(864) 877-8885,16504.99922,$
108,GDPBZJ1tDjmHC3v4uxVQzw,publix-super-market-greer-greer,Publix Super Market - Greer,https://s3-media1.fl.yelpcdn.com/bphoto/BzPvjL...,False,https://www.yelp.com/biz/publix-super-market-g...,17,"[{'alias': 'grocery', 'title': 'Grocery'}]",4.1,"{'latitude': 34.8715143081717, 'longitude': -8...",[],"{'address1': '411 The Pkwy', 'address2': '', '...",18648487820,(864) 848-7820,9662.818662,$$
109,_BLlWxSpx1mRGW9eFutYdQ,dairy-queen-grill-and-chill-mauldin-2,Dairy Queen Grill & Chill,https://s3-media1.fl.yelpcdn.com/bphoto/9AJb5X...,False,https://www.yelp.com/biz/dairy-queen-grill-and...,22,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",3.1,"{'latitude': 34.780454197801, 'longitude': -82...","[pickup, delivery]","{'address1': '112 N Main St', 'address2': None...",18643739896,(864) 373-9896,8623.43224,$




Number of duplicate IDs: 0


In [19]:
# Specify directory and base filename
directory = 'Data'
filename = 'final_results_SC_Sushi.csv.gz'  # Include .csv.gz extension here
path = os.path.join(directory, filename)

# Ensure that the 'Data' directory exists
os.makedirs(directory, exist_ok=True)

# Save DataFrame as a compressed CSV file (to save space)
df.to_csv(path, compression='gzip', index=False)

In [24]:
# Specify the correct JSON file name
json_file = 'Data/final_results_SC_Sushi.json'

# Save the DataFrame as JSON with optimal orientation for line-delimited JSON
df.to_json(json_file, orient='records', lines=True)

In [25]:
# Convert and Save as .CSV.GZ by replacing the file extension
csv_gz_file = json_file.replace('.json', '.csv.gz')

# Save the DataFrame as a compressed CSV without the index
df.to_csv(csv_gz_file, compression='gzip', index=False)

In [23]:
# Compare File Sizes to demonstrate the efficiency of compression
if os.path.exists(json_file) and os.path.exists(csv_gz_file):
    size_json = os.path.getsize(json_file)
    size_csv_gz = os.path.getsize(csv_gz_file)

    print(f'JSON FILE: {size_json:,} Bytes')
    print(f'CSV.GZ FILE: {size_csv_gz:,} Bytes')

    # Calculate and display the compression ratio if the .csv.gz file is not empty
    if size_csv_gz > 0:
        compression_ratio = size_json / size_csv_gz
        print(f'The csv.gz file is {compression_ratio:.2f} times smaller than the JSON file.')
    else:
        print("CSV.GZ file size is 0, cannot compare sizes.")
else:
    print("One or both files do not exist, check file paths.")

JSON FILE: 102,201 Bytes
CSV.GZ FILE: 16,014 Bytes
The csv.gz file is 6.38 times smaller than the JSON file.
