<h1 style="color:green; font-size:30px; font-weight:700; text-align:center">
    <u><b>Scrape Yelp Data using Yelp API</b></u>  
</h1> 

<p></p> 
<strong> Author:</strong> Gurtej Bains <br> 
<strong> Data Source:</strong>   <br> 
<strong> Last Updated: </strong>  <br>  
<strong> Problem Statement:</strong> <br>

<h1 style="color:green"><b>Importing Modules</b> </h1>

## Import Libraries

In [1]:
# import libraries  

# Numpy
import numpy as np

# Pandas
import pandas as pd

# MatPlotLib
import matplotlib.pyplot as plt

# Seaborn
import seaborn as sns

# MissingNo
import missingno as msno

# SQL related 
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine

from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Additional Imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

## Yelp API

In [25]:
## API Key 
with open('/Users/gurte/.secret/yelp_api.json') as f: #change the path to match YOUR path!!
    login = json.load(f)
login.keys()

dict_keys(['client-id', 'api-key'])

In [26]:
## API Login 
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)
yelp_api

<yelpapi.yelpapi.YelpAPI at 0x2662ce458e0>

<h1 style="color:green"><b>Scrape Data</b> </h1>

## Define function

In [19]:
def create_json_file(JSON_FILE,  delete_if_exists=False):
    
    ## Check if JSON_FILE exists
    file_exists = os.path.isfile(JSON_FILE)
    
    ## If it DOES exist:
    if file_exists == True:
        
        ## Check if user wants to delete if exists
        if delete_if_exists==True:
            print(f"[!] {JSON_FILE} already exists. Deleting previous file...")

            ## delete file and confirm it no longer exits.
            os.remove(JSON_FILE)

            ## Recursive call to function after old file deleted
            create_json_file(JSON_FILE,delete_if_exists=False)
        else:
            print(f"[i] {JSON_FILE} already exists.")            
            
            
    ## If it does NOT exist:
    else:
        
        ## INFORM USER AND SAVE EMPTY LIST
        print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")
        
        ## CREATE ANY NEEDED FOLDERS 
        # Get the Folder Name only
        folder = os.path.dirname(JSON_FILE)
        
        ## If JSON_FILE included a folder:
        if len(folder)>0:
            
            # create the folder
            os.makedirs(folder,exist_ok=True)
        
        ## Save empty list to start the json file
        with open(JSON_FILE,'w') as f:
            json.dump([],f)  

## Define location and Term

In [20]:
# set our API call parameters 
LOCATION = 'Atlanta, GA'
TERM = 'Pizza'

## File path to save the results 

In [21]:
# Specifying JSON_FILE filename (can include a folder)
# include the search terms in the filename
JSON_FILE = "Data/results_in_progress_GA_pizza.json"
JSON_FILE    

'Data/results_in_progress_GA_pizza.json'

## Setting some terms 

In [None]:
## Create a new empty json file (exist the previous if it exists)
create_json_file(JSON_FILE, delete_if_exists=True)

## Load previous results and use len of results for offset
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)
    
## set offset based on previous results
n_results = len(previous_results)
print(f'- {n_results} previous results found.')
# use our yelp_api variable's search_query method to perform our API call
results = yelp_api.search_query(location=LOCATION,
                                term=TERM,
                               offset=n_results)
## How many results total?
total_results = results['total']

## How many did we get the details for?
results_per_page = len(results['businesses'])

# Use math.ceil to round up for the total number of pages of results.
n_pages = math.ceil((results['total']-n_results)/ results_per_page)
n_pages

## Fetching data 

In [35]:
for i in tqdm_notebook( range(1,n_pages+1)):
    
    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results for to use as offset
    n_results = len(previous_results)
    
    if (n_results + results_per_page) > 1000:
        print('Exceeded 1000 api calls. Stopping loop.')
        break
    
    ## use n_results as the OFFSET 
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM, 
                                    offset=n_results)
    
    ## append new results and save to file
    previous_results.extend(results['businesses'])
    
    # display(previous_results)
    with open(JSON_FILE,'w') as f:
        json.dump(previous_results,f)
    
    time.sleep(.2)

  0%|          | 0/125 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.


## Convert data to Data Frame

In [38]:
# load final results
final_df = pd.read_json(JSON_FILE)
display(final_df.head(), final_df.tail())

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,CxQ1m2iY4wQpXC64tSfWgQ,antico-pizza-atlanta,Antico Pizza,https://s3-media3.fl.yelpcdn.com/bphoto/4QMeSq...,False,https://www.yelp.com/biz/antico-pizza-atlanta?...,3552,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.5,"{'latitude': 33.7846416243576, 'longitude': -8...","[delivery, pickup]",$$,"{'address1': '1093 Hemphill Ave NW', 'address2...",14047242333,(404) 724-2333,4341.712391
1,dm9Aq2dtziNTGBFCh0yhYA,fellinis-pizza-buckhead-atlanta,Fellini's Pizza Buckhead,https://s3-media2.fl.yelpcdn.com/bphoto/nMRzKk...,False,https://www.yelp.com/biz/fellinis-pizza-buckhe...,383,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 33.831573011169, 'longitude': -84...",[delivery],$,"{'address1': '2809 Peachtree Rd NE', 'address2...",14042660082,(404) 266-0082,2287.400935
2,U1S7bSPbyPU2vYVBSZ4POA,e-ke-pizza-atlanta,E Ke Pizza,https://s3-media3.fl.yelpcdn.com/bphoto/2X5aWX...,False,https://www.yelp.com/biz/e-ke-pizza-atlanta?ad...,26,"[{'alias': 'pizza', 'title': 'Pizza'}]",5.0,"{'latitude': 33.865184, 'longitude': -84.469759}","[pickup, delivery]",,"{'address1': '2810 Paces Ferry Rd', 'address2'...",17704058798,(770) 405-8798,10584.516757
3,454LWMBW-Ps723Ljkj3CeQ,donatos-pizza-atlanta-5,Donatos Pizza,https://s3-media4.fl.yelpcdn.com/bphoto/5S3OeL...,False,https://www.yelp.com/biz/donatos-pizza-atlanta...,16,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.0,"{'latitude': 33.817867, 'longitude': -84.310288}",[],,"{'address1': '2884 N Druid Hills Rd NE', 'addr...",14043825211,(404) 382-5211,5881.64017
4,ZMEZgMF9FkgR9yl_RJkWfQ,fellinis-pizza-atlanta,Fellini's Pizza,https://s3-media2.fl.yelpcdn.com/bphoto/Oz7xW-...,False,https://www.yelp.com/biz/fellinis-pizza-atlant...,598,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 33.773584, 'longitude': -84.357644}",[delivery],$,"{'address1': '909 Ponce De Leon Ave NE', 'addr...",14048733088,(404) 873-3088,4647.476166


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
995,1L5f1hHFwfS-f-CF8Bx2Cg,auburn-pizza-lounge-atlanta,Auburn Pizza Lounge,,False,https://www.yelp.com/biz/auburn-pizza-lounge-a...,6,"[{'alias': 'pizza', 'title': 'Pizza'}]",1.0,"{'latitude': 33.755787, 'longitude': -84.381622}","[delivery, pickup]",$,"{'address1': '171 Auburn Ave', 'address2': '',...",,,6428.095581
996,GVeJd7TWnpI1lpfd61DhTg,my-pizza-n-wings-duluth,My Pizza N Wings,https://s3-media2.fl.yelpcdn.com/bphoto/DfQhQX...,False,https://www.yelp.com/biz/my-pizza-n-wings-dulu...,7,"[{'alias': 'chicken_wings', 'title': 'Chicken ...",2.0,"{'latitude': 33.94231, 'longitude': -84.12757}",[],$,"{'address1': '3725 Club Dr', 'address2': 'Ste ...",16783801774.0,(678) 380-1774,26860.228967
997,eqFA5BJ5NDjZ4TaUXJazfg,canoe-atlanta-2,Canoe,https://s3-media3.fl.yelpcdn.com/bphoto/b0HcV_...,False,https://www.yelp.com/biz/canoe-atlanta-2?adjus...,2205,"[{'alias': 'newamerican', 'title': 'American (...",4.5,"{'latitude': 33.859863, 'longitude': -84.455464}",[delivery],$$$,"{'address1': '4199 Paces Ferry Rd SE', 'addres...",17704322663.0,(770) 432-2663,9167.343267
998,-jr8C7k7r0Xfcac7tR88ug,papa-johns-pizza-cumming-2,Papa Johns Pizza,https://s3-media3.fl.yelpcdn.com/bphoto/AMZdRT...,False,https://www.yelp.com/biz/papa-johns-pizza-cumm...,32,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",2.0,"{'latitude': 34.1227112, 'longitude': -84.1685...","[delivery, pickup]",$,"{'address1': '1595 Peachtree Pkwy', 'address2'...",17708882425.0,(770) 888-2425,39268.57932
999,MaIMxWff_Qn8b2PalLdkYw,kroger-atlanta-63,Kroger,https://s3-media1.fl.yelpcdn.com/bphoto/hlb6XY...,False,https://www.yelp.com/biz/kroger-atlanta-63?adj...,95,"[{'alias': 'grocery', 'title': 'Grocery'}]",3.5,"{'latitude': 33.8225260223259, 'longitude': -8...",[],$$,"{'address1': '2452 Morosgo Way', 'address2': '...",14049460438.0,(404) 946-0438,1251.706504


<h1 style="color:green"><b>Clean Data</b> </h1>

## Check duplicates 

In [39]:
# check for duplicate ID's 
final_df.duplicated(subset='id').sum()

0

## Save Data to CSV

In [40]:
# save the final results to a compressed csv
final_df.to_csv('Data/final_results_ATL_pizza.csv.gz', compression='gzip',index=False)