In [1]:
import csv
import json
import os

import pandas as pd
import requests

### found here: https://www.yelp.com/developers/v3/manage_app?
# The key is read from the YELP_API_KEY environment variable so the secret is
# not saved inside the notebook; 'TODO' stays as the placeholder when unset.
api_key = os.environ.get('YELP_API_KEY', 'TODO')

In [14]:
def parse_yelp_business_data(businesses, params):
    """Flatten business records into rows suitable for ``csv.writer``.

    Each entry of ``params`` contributes a fixed number of columns to a row:

    * ``'coordinates'`` -> 2 columns: latitude, longitude
    * ``'location'``    -> 5 columns: address1, city, zip_code, state, country
    * ``'categories'``  -> 1 column: the category aliases joined with ``','``
    * anything else     -> 1 column: ``business[param]``

    Any value that cannot be read is replaced by the string ``'MISSING'``.
    The placeholder is applied per column, so a record without
    ``coordinates`` or ``location`` still yields 2 or 5 columns and every row
    keeps the same width.

    Args:
        businesses: iterable of mappings, one per business.
        params: iterable of keys to extract, in output column order.

    Returns:
        A list of rows (lists). A business produces no row when ``params``
        is empty.
    """
    missing = 'MISSING'
    coordinate_keys = ('latitude', 'longitude')
    location_keys = ('address1', 'city', 'zip_code', 'state', 'country')

    def lookup(container, key):
        # TypeError covers a container that is None or otherwise not
        # subscriptable by ``key`` (including the placeholder string itself).
        try:
            return container[key]
        except (LookupError, TypeError):
            return missing

    def expand(business, param, keys):
        # One column per key, even when the nested mapping is absent.
        nested = lookup(business, param)
        return [lookup(nested, key) for key in keys]

    def category_aliases(business, param):
        # A single malformed category makes the whole column 'MISSING'.
        try:
            return ','.join(str(category['alias']) for category in business[param])
        except (LookupError, TypeError):
            return missing

    csvData = []
    for business in businesses:
        row = []
        for param in params:
            if param == 'coordinates':
                row.extend(expand(business, param, coordinate_keys))
            elif param == 'location':
                row.extend(expand(business, param, location_keys))
            elif param == 'categories':
                row.append(category_aliases(business, param))
            else:
                row.append(lookup(business, param))
        if row:
            csvData.append(row)
    return csvData

In [15]:
def get_parsed_json_yelp_data(api_key, term, borough, limit, offset, timeout=30):
    """Run one business search by location text and return the decoded JSON.

    Args:
        api_key: token sent as ``Authorization: Bearer <api_key>``.
        term: value of the ``term`` query parameter.
        borough: value of the ``location`` query parameter.
        limit: value of the ``limit`` query parameter.
        offset: value of the ``offset`` query parameter.
        timeout: seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The response body parsed with ``json.loads``. The HTTP status is not
        checked, so whatever JSON the server sent is returned as-is.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        json.JSONDecodeError: if the response body is not valid JSON.
    """
    headers = {'Authorization': 'Bearer %s' % api_key}
    url = 'https://api.yelp.com/v3/businesses/search'
    params = {'term': term, 'location': borough, 'limit': limit, 'offset': offset}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    req = requests.get(url, params=params, headers=headers, timeout=timeout)
    return json.loads(req.text)

def get_parsed_json_yelp_data_lat_long(api_key, term, limit, offset, latlong, radius, timeout=30):
    """Run one business search around a coordinate and return the decoded JSON.

    Args:
        api_key: token sent as ``Authorization: Bearer <api_key>``.
        term: value of the ``term`` query parameter.
        limit: value of the ``limit`` query parameter.
        offset: value of the ``offset`` query parameter.
        latlong: sequence whose item 0 is sent as ``latitude`` and item 1 as
            ``longitude``.
        radius: value of the ``radius`` query parameter.
        timeout: seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The response body parsed with ``json.loads``. The HTTP status is not
        checked, so whatever JSON the server sent is returned as-is.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        json.JSONDecodeError: if the response body is not valid JSON.
    """
    headers = {'Authorization': 'Bearer %s' % api_key}
    url = 'https://api.yelp.com/v3/businesses/search'
    params = {
        'term': term,
        'latitude': latlong[0],
        'longitude': latlong[1],
        'radius': radius,
        'limit': limit,
        'offset': offset,
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    req = requests.get(url, params=params, headers=headers, timeout=timeout)
    return json.loads(req.text)

In [16]:
def create_empty_csv_file(name, header):
    """Create (or truncate) ``name + '.csv'`` and write ``header`` as its only row.

    The ``.csv`` suffix is always added, exactly as ``append_to_csv_file``
    does, so that the header and the appended rows land in the same file.

    Args:
        name: output path without the ``.csv`` extension.
        header: sequence of column names written as the first row.
    """
    file_type = '.csv'
    # The with-block closes the file; no explicit close() is needed.
    with open(name + file_type, 'w', newline='', encoding='utf-8') as csvFile:
        csv.writer(csvFile).writerow(header)
    
def append_to_csv_file(name, csvData):
    """Append the rows in ``csvData`` to the file ``name + '.csv'``.

    The file is opened in append mode, so existing content is kept. If
    writing raises, the rows and the exception are printed and the function
    returns normally; the error is not propagated.

    Args:
        name: output path without the ``.csv`` extension.
        csvData: iterable of rows, each an iterable of cell values.
    """
    target = name + '.csv'
    with open(target, mode='a', newline='', encoding='utf-8') as handle:
        row_writer = csv.writer(handle)
        try:
            row_writer.writerows(csvData)
        except Exception as error:
            # Best-effort write: report the failing batch and carry on.
            print(csvData)
            print(error)

In [21]:
## Manhattan - 500 meter radius circles that cover it
# Each entry is a [latitude, longitude] pair. The scraping loop passes one
# pair at a time to get_parsed_json_yelp_data_lat_long, which sends item 0 as
# the `latitude` and item 1 as the `longitude` query parameter.
latlongs = [
    [40.8702861,-73.9158498],[40.8664998,-73.9264616],[40.8630596,-73.9193376],[40.8576068,-73.9269766],
    [40.8553996,-73.9365037],[40.8494268,-73.9309248],[40.8476774,-73.9417123],[40.8419636,-73.9356183],
    [40.8390491,-73.944373],[40.8336592,-73.9390515],[40.8313213,-73.9487504],[40.8262098,-73.9397938],
    [40.823417,-73.9500076],[40.8197867,-73.9395633],[40.8174483,-73.9575019],[40.8158244,-73.9479747],
    [40.812966,-73.9388768],[40.8095232,-73.9556137],[40.7869768,-73.9435973],[40.807509,-73.946172],
    [40.8000371,-73.9341564],[40.8117967,-73.9649689],[40.803936,-73.9633383],[40.8015321,-73.9537251],
    [40.8051704,-73.9378466],[40.799323,-73.945228],[40.79367,-73.9370742],[40.7958143,-73.9613644],[40.79302,-73.948232],
    [40.798729,-73.9705936],[40.7911915,-73.969907],[40.7922312,-73.9787475],[40.7840431,-73.9738552],
    [40.7856677,-73.9831249],[40.7774441,-73.97849],[40.779069,-73.9872448],[40.7863254,-73.9535711],
    [40.7789815,-73.9582059],[40.7797614,-73.9479063],[40.7718318,-73.9593769],[40.7669565,-73.9549138],
    [40.7735868,-73.9490772],[40.7752659,-73.9688835],[40.7670757,-73.9668236],[40.7613549,-73.9613304],
    [40.770958,-73.9844276],[40.773493,-73.9937831],[40.7660176,-73.993354],[40.7642707,-73.9814235],
    [40.755819,-73.9652873],[40.76155,-73.9730121],[40.7606398,-73.9885474],[40.7559588,-73.9803935],
    [40.753098,-73.9735271],[40.7475664,-73.9672738],[40.7593349,-73.9989454],[40.7538085,-73.9907915],
    [40.7486068,-73.9829809],[40.7450955,-73.9753419],[40.7532234,-74.0023786],[40.74826,-73.9957696],
    [40.7445535,-73.9891607],[40.7474797,-74.0055543],[40.7399364,-74.0056738],[40.7414322,-73.9982066],
    [40.7403917,-73.9822421],[40.7369449,-73.9742598],[40.7366848,-73.9905676],[40.733628,-73.999494],
    [40.7323272,-74.0092787],[40.7264734,-74.0018115],[40.7289451,-73.9914259],[40.7321971,-73.9818129],
    [40.7283597,-73.9735732],[40.7251075,-73.9824137],[40.724522,-74.0113811],[40.7223477,-73.9929199],
    [40.7191879,-74.0029697],[40.7171062,-74.0121536],[40.7115113,-74.0039997],[40.7149317,-73.9946365],
    [40.7177568,-73.9852027],[40.7097547,-74.0143852],[40.7047245,-74.0061454],[40.7025698,-74.0160084],
    [40.7088886,-73.9971332],[40.7112959,-73.9871769],[40.7136712,-73.9784622],[40.7208272,-73.976059]
]

In [22]:
# Keys extracted from every business, in output column order.
# 'coordinates' expands to 2 columns and 'location' to 5 (see `header`).
params = ['id','alias','name','is_closed','review_count','rating','price',
#          'transactions',
          'categories',         
          'coordinates',
          'location']

# Not referenced by the scraping loop below.
params_lat_long = ['id','alias','name','is_closed','review_count','rating','price',
#          'transactions',
          'categories',         
          'coordinates',
          'latitude', 'longitude','radius']

# Column names for the CSV; must line up with what `params` expands to.
header = ['id','alias','name','is_closed','review_count','rating','price',
#          'transactions',
          'categories',
          'latitude','longitude',
          'address','city','zipcode','state','country']

file_name = 'restaurant_data2'
term = 'Restaurants'
borough = 'Brooklyn, NY'  # not used by the latitude/longitude search below
offset = 0
total = 0
max_limit = 50 
limit = max_limit
radius = 500

create_empty_csv_file(file_name, header)

for latlong in latlongs:    
    print("Fetching for Latitude: {}, Longitude: {}".format(latlong[0], latlong[1]))
    # Reset paging state for every coordinate so a total left over from the
    # previous coordinate never influences this one.
    offset = 0
    total = 0
    # Always fetch the first page, then continue only while results remain.
    # `offset < total` (not `<=`) avoids one extra, empty request whenever
    # total is an exact multiple of limit.
    while offset == 0 or offset < total:
        # Cleared before each request so the error message below never shows
        # the previous page's payload (or an unbound name on the first call).
        parsed = None
        try: 
            parsed = get_parsed_json_yelp_data_lat_long(api_key, term, limit, offset, latlong, radius)
            total = parsed['total']
            businesses = parsed['businesses']
            csvData = parse_yelp_business_data(businesses, params)
            print("Offset: {}, Total: {}, Businesses: {}".format(offset, total, len(businesses)))
            append_to_csv_file(file_name,csvData)
            offset += limit
        except Exception:
            print("Issue while appending parsed data: {}".format(parsed))
            # Bare raise keeps the original traceback intact.
            raise

print("Finished Scraping Here")

Fetching for Latitude: 40.8702861, Longitude: -73.9158498
Offset: 0, Total: 37, Businesses: 37
Fetching for Latitude: 40.8664998, Longitude: -73.9264616
Offset: 0, Total: 60, Businesses: 50
Offset: 50, Total: 60, Businesses: 10
Fetching for Latitude: 40.8630596, Longitude: -73.9193376
Offset: 0, Total: 69, Businesses: 50
Offset: 50, Total: 69, Businesses: 19
Fetching for Latitude: 40.8576068, Longitude: -73.9269766
Offset: 0, Total: 51, Businesses: 50
Offset: 50, Total: 51, Businesses: 1
Fetching for Latitude: 40.8553996, Longitude: -73.9365037
Offset: 0, Total: 58, Businesses: 50
Offset: 50, Total: 58, Businesses: 8
Fetching for Latitude: 40.8494268, Longitude: -73.9309248
Offset: 0, Total: 59, Businesses: 50
Offset: 50, Total: 59, Businesses: 9
Fetching for Latitude: 40.8476774, Longitude: -73.9417123
Offset: 0, Total: 60, Businesses: 50
Offset: 50, Total: 60, Businesses: 10
Fetching for Latitude: 40.8419636, Longitude: -73.9356183
Offset: 0, Total: 65, Businesses: 50
Offset: 50, Tot

Offset: 0, Total: 566, Businesses: 50
Offset: 50, Total: 566, Businesses: 50
Offset: 100, Total: 566, Businesses: 50
Offset: 150, Total: 566, Businesses: 50
Offset: 200, Total: 566, Businesses: 50
Offset: 250, Total: 566, Businesses: 50
Offset: 300, Total: 566, Businesses: 50
Offset: 350, Total: 566, Businesses: 50
Offset: 400, Total: 566, Businesses: 50
Offset: 450, Total: 566, Businesses: 50
Offset: 500, Total: 566, Businesses: 50
Offset: 550, Total: 566, Businesses: 16
Fetching for Latitude: 40.7559588, Longitude: -73.9803935
Offset: 0, Total: 600, Businesses: 50
Offset: 50, Total: 600, Businesses: 50
Offset: 100, Total: 600, Businesses: 50
Offset: 150, Total: 600, Businesses: 50
Offset: 200, Total: 600, Businesses: 50
Offset: 250, Total: 600, Businesses: 50
Offset: 300, Total: 600, Businesses: 50
Offset: 350, Total: 600, Businesses: 50
Offset: 400, Total: 600, Businesses: 50
Offset: 450, Total: 600, Businesses: 50
Offset: 500, Total: 600, Businesses: 50
Offset: 550, Total: 600, Bus

Offset: 50, Total: 469, Businesses: 50
Offset: 100, Total: 469, Businesses: 50
Offset: 150, Total: 469, Businesses: 50
Offset: 200, Total: 469, Businesses: 50
Offset: 250, Total: 469, Businesses: 50
Offset: 300, Total: 469, Businesses: 50
Offset: 350, Total: 469, Businesses: 50
Offset: 400, Total: 469, Businesses: 50
Offset: 450, Total: 469, Businesses: 19
Fetching for Latitude: 40.7191879, Longitude: -74.0029697
Offset: 0, Total: 284, Businesses: 50
Offset: 50, Total: 284, Businesses: 50
Offset: 100, Total: 284, Businesses: 50
Offset: 150, Total: 284, Businesses: 50
Offset: 200, Total: 284, Businesses: 50
Offset: 250, Total: 284, Businesses: 34
Fetching for Latitude: 40.7171062, Longitude: -74.0121536
Offset: 0, Total: 199, Businesses: 50
Offset: 50, Total: 199, Businesses: 50
Offset: 100, Total: 199, Businesses: 50
Offset: 150, Total: 199, Businesses: 49
Fetching for Latitude: 40.7115113, Longitude: -74.0039997
Offset: 0, Total: 356, Businesses: 50
Offset: 50, Total: 356, Businesses:

In [24]:
#This process has too many duplicates, let's remove them
file_name_output = file_name + '_sans_dupes' + '.csv'

# Read every column as text and disable NA detection so the round trip does
# not rewrite values: zip codes keep leading zeros and never become floats,
# and cells such as "NA" or "" are preserved instead of turning into NaN.
df = pd.read_csv(file_name + '.csv', dtype=str, keep_default_na=False)
# Keep the first row seen for each business id.
df = df.drop_duplicates(subset=['id'])
df.to_csv(file_name_output, index=False)