In [1]:
# imports
import requests
import json
import pandas as pd
import os
import time

# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice.

In [2]:
# Load the bike-station table; later cells read its 'latitude' and 'longitude' columns.
data = pd.read_csv('df_city_bike.csv')

In [3]:
# Keep only stations that have both coordinates. A new frame is built instead
# of mutating `data` in place, so re-running this cell gives the same result.
data = data.dropna(subset=['latitude', 'longitude'])

# Combined "lat,lon" string for every station (used as the `ll` value in the
# Foursquare request URL).
data = data.assign(ll=data['latitude'].astype(str) + ',' + data['longitude'].astype(str))

# Drop placeholder rows whose coordinates are exactly 0.0,0.0.
data = data[data['ll'] != '0.0,0.0']

# Unique coordinate strings in first-seen order.
# list(set(...)) returned them in an arbitrary order that can differ between
# kernel sessions (string hashing is randomized per process), which made any
# position-based reference to this list non-reproducible.
bike_ll = data['ll'].drop_duplicates().tolist()

In [5]:
# Swap the comma in each "lat,lon" string for its percent-encoded form (%2C)
# so the value can be appended directly to the request URL.
bike_stop_ll = ['%2C'.join(coord.split(',')) for coord in bike_ll]

In [6]:
# API usage was checked against the Foursquare documentation:
#   place search     https://location.foursquare.com/developer/reference/place-search
#   response fields  https://location.foursquare.com/developer/reference/response-fields
#   categories       https://location.foursquare.com/places/docs/categories

# Read the key from the environment (raises KeyError if FOURSQUARE_KEY is unset)
api_key = os.environ["FOURSQUARE_KEY"]

# Request headers: ask for JSON and send the key as the Authorization value
headers = {
    "Accept": "application/json",
    "Authorization": api_key,
}

In [7]:
# One parsed JSON response per successfully queried station.
foursquare_list_rich_20 = []

# Place-search URL (radius 1000, limit 20, explicit field list); the encoded
# "lat%2Clon" of a station is appended as the `ll` value.
FSQ_SEARCH_URL = "https://api.foursquare.com/v3/places/search?radius=1000&fields=categories%2Crating%2Cgeocodes%2Ccategories%2Clocation%2Cname%2Cfsq_id%2Cstats%2Cprice&limit=20&ll="
FSQ_QUOTA_WAIT_SECONDS = 3600  # pause after an HTTP 429 before retrying
FSQ_MAX_ATTEMPTS = 3           # stop retrying a station after this many tries
FSQ_TIMEOUT_SECONDS = 30       # do not hang forever on a stalled connection

# Getting data for places around bike stops
for station in bike_stop_ll:
    url = FSQ_SEARCH_URL + station
    for attempt in range(1, FSQ_MAX_ATTEMPTS + 1):
        try:
            response = requests.get(url, headers=headers, timeout=FSQ_TIMEOUT_SECONDS)
        except requests.RequestException as e:
            # Network-level failure (connection error, timeout, ...): report
            # it and move on to the next station.
            print(e)
            break

        if response.status_code == 200:
            try:
                foursquare_list_rich_20.append(response.json())
            except ValueError as e:
                # Body was not valid JSON; nothing is appended for this station.
                print(e)
            break

        # requests does not raise on 4xx/5xx responses, so a quota problem
        # shows up here as a status code. The previous check compared the
        # exception object with the string "Quota exceeded", which is never
        # equal, so the wait never ran.
        if response.status_code == 429 and attempt < FSQ_MAX_ATTEMPTS:
            print("Exceeded quota: waiting for an hour")
            time.sleep(FSQ_QUOTA_WAIT_SECONDS)
            continue

        print('Error occurred during the API request (HTTP '
              + str(response.status_code) + ') for ll=' + station)
        break


KeyboardInterrupt: 

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [8]:
# Flatten the Foursquare responses into one row per returned place.
# `row_from_station` is the position of the response in
# `foursquare_list_rich_20`; every field missing from a place becomes None.
foursquare_columns = ['row_from_station', 'fsq_id', 'category_id', 'name', 'latitude', 'longitude', 'rating', 'total_ratings', 'total_photos', 'total_tips']

# Rows are collected as plain dicts and the frame is built once at the end.
# Growing the frame with pd.concat for every place re-copied all previous rows
# on each iteration (quadratic time). Column dtypes are now inferred from the
# values instead of being inherited from an empty frame.
foursquare_rows = []
for station, payload in enumerate(foursquare_list_rich_20):
    # A payload without 'results' contributes no rows instead of raising.
    for place in payload.get('results', []):
        categories = place.get('categories') or []
        main_geocode = (place.get('geocodes') or {}).get('main') or {}
        stats = place.get('stats') or {}

        foursquare_rows.append({
            'row_from_station': station,
            'fsq_id': place.get('fsq_id'),
            # Only the first listed category is kept.
            'category_id': categories[0].get('id') if categories else None,
            'name': place.get('name'),
            'latitude': main_geocode.get('latitude'),
            'longitude': main_geocode.get('longitude'),
            'rating': place.get('rating'),
            'total_ratings': stats.get('total_ratings'),
            'total_photos': stats.get('total_photos'),
            'total_tips': stats.get('total_tips'),
        })

df_foursquare_20 = pd.DataFrame(foursquare_rows, columns=foursquare_columns)


In [9]:
# foursquare_list (foursquare_basic.js) is a file that has only the basic information about each business

# foursquare_list_rich_50 (foursquare_50.js) is a file generated with the same code block as foursquare_list_rich_20, except that the limit on the number of businesses was 50
# df_foursquare_50.csv is the DataFrame for that same file

Put your parsed results into a DataFrame

In [10]:
# The DataFrame was already built while parsing the Foursquare JSON responses,
# so this cell only writes it to CSV (index column omitted).

df_foursquare_20.to_csv('df_foursquare_20.csv', index=False)

# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice.

In [11]:
# Read the Yelp key from the environment (raises KeyError if YELP_KEY is unset)
yelp_api_key = os.environ["YELP_KEY"]

# Request headers: JSON response, key sent as a Bearer token
headers = dict(
    accept="application/json",
    Authorization=f"Bearer {yelp_api_key}",
)


In [None]:
# One parsed JSON response per successfully queried station.
yelp_list = []

YELP_QUOTA_WAIT_SECONDS = 3600  # pause after an HTTP 429 before retrying
YELP_MAX_ATTEMPTS = 3           # stop retrying a station after this many tries
YELP_TIMEOUT_SECONDS = 30       # do not hang forever on a stalled connection

# Getting data for places around bike stops
for station in bike_stop_ll:
    # Take the coordinates from the same de-duplicated list the Foursquare
    # loop uses. Reading `data.iloc[index]` for index < len(bike_stop_ll)
    # queried the first rows of `data`, which is neither de-duplicated nor in
    # the same order as `bike_stop_ll`.
    latitude, longitude = station.split('%2C')
    yelp_url = "https://api.yelp.com/v3/businesses/search?latitude=" + latitude + "&longitude=" + longitude + "&term=park&radius=1000&categories=&sort_by=best_match&limit=20"

    for attempt in range(1, YELP_MAX_ATTEMPTS + 1):
        try:
            yelp_response = requests.get(yelp_url, headers=headers, timeout=YELP_TIMEOUT_SECONDS)
        except requests.RequestException as e:
            # Network-level failure (connection error, timeout, ...): report
            # it and move on to the next station.
            print(e)
            break

        if yelp_response.status_code == 200:
            try:
                yelp_list.append(yelp_response.json())
            except ValueError as e:
                # Body was not valid JSON; nothing is appended for this station.
                print(e)
            break

        # requests does not raise on 4xx/5xx responses, so a quota problem
        # shows up here as a status code. The previous check compared the
        # exception object with the string "Quota exceeded", which is never
        # equal, so the wait never ran.
        if yelp_response.status_code == 429 and attempt < YELP_MAX_ATTEMPTS:
            print("Exceeded quota: waiting for an hour")
            time.sleep(YELP_QUOTA_WAIT_SECONDS)
            continue

        print('Error occurred during the API request (HTTP '
              + str(yelp_response.status_code) + ') for station ' + station)
        break




Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [None]:
# Empty frame that fixes the column layout for the parsed Yelp businesses
df_yelp = pd.DataFrame(columns=['row_from_station', 'id', 'name', 'rating', 'review_count', 'latitude', 'longitude'])

In [23]:
# Parse `yelp_list` from the previous step into one row per business
# (id, name, rating, review_count, latitude and longitude).
# `row_from_station` is the position of the response in `yelp_list`.
yelp_columns = ['row_from_station', 'id', 'name', 'rating', 'review_count', 'latitude', 'longitude']

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. Rows are collected as dicts and the frame is built once, which
# also makes the cell safe to re-run (no duplicated rows).
yelp_rows = []
for station, payload in enumerate(yelp_list):
    # A payload without 'businesses' contributes no rows instead of raising.
    for business in payload.get('businesses', []):
        coordinates = business.get('coordinates') or {}
        yelp_rows.append({
            'row_from_station': station,
            'id': business.get('id'),
            'name': business.get('name'),
            'rating': business.get('rating'),
            'review_count': business.get('review_count'),
            'latitude': coordinates.get('latitude'),
            'longitude': coordinates.get('longitude'),
        })

df_yelp = pd.DataFrame(yelp_rows, columns=yelp_columns)

In [24]:
# Quick look at the parsed Yelp data: (rows, columns) first, then the first five rows
print(df_yelp.shape, df_yelp.head(), sep='\n')

(0, 7)
Empty DataFrame
Columns: [row_from_station, id, name, rating, review_count, latitude, longitude]
Index: []


In [25]:
# Save the parsed Yelp businesses to CSV (index column omitted)
df_yelp.to_csv('df_yelp.csv', index=False)

# Comparing Results

Which API provided you with more complete data? Provide an explanation.

Foursquare provided the more complete data. Its places have more ratings on average (107 for Foursquare compared to 61 reviews for Yelp), which makes the ratings more reliable, and each place includes a category ID, which simplifies filtering and makes the data more user-friendly and efficient to work with.

Get the top 10 restaurants according to their rating

In [26]:
# Collapse rows that share a place identifier, keeping the first occurrence
# (drop_duplicates keeps the first row by default).
df_foursquare_20_clean = df_foursquare_20.drop_duplicates(subset="fsq_id")
df_yelp_clean = df_yelp.drop_duplicates(subset="id")

In [27]:
# Keep only places with more than 10 ratings
has_enough_ratings = df_foursquare_20_clean['total_ratings'] > 10
df_foursquare_filter = df_foursquare_20_clean[has_enough_ratings]

mean_total_ratings = df_foursquare_filter['total_ratings'].mean()
print('Mean for total ratings is: ' + str(mean_total_ratings))

# Ten highest-rated places among them.
# NOTE(review): the prompt asks for restaurants, but no category filter is
# applied here, so parks and landmarks appear in the result - confirm intent.
df_foursquare_filter.sort_values(by="rating", ascending=False).head(10)

Mean for total ratings is: 199.8591873243036


Unnamed: 0,row_from_station,fsq_id,category_id,name,latitude,longitude,rating,total_ratings,total_photos,total_tips
9505,475,4ac518d2f964a52026a720e3,16032,Hyde Park,51.507274,-0.1636,9.6,12979,32853.0,1124
2379,118,4b7ffcd1f964a520e94830e3,16039,Kensington Gardens,51.503429,-0.175275,9.6,1756,4034.0,183
3474,173,4b2e246ef964a520f9dc24e3,16032,Primrose Hill Jazz,51.541909,-0.163357,9.6,1786,2283.0,234
11708,585,4ac518cef964a520f8a520e3,17002,Tower Bridge,51.499972,-0.076936,9.5,6092,15980.0,470
2304,115,59105400772fbc7bd0b0e2b1,17073,Mestizo Mexican Market,51.527515,-0.138937,9.5,394,5.0,2
1762,88,4ad2f913f964a520ece220e3,17000,M&S Simply Food,51.505494,-0.09802,9.5,80,6.0,0
1964,98,4ada57e0f964a520982121e3,17018,Daunt Books,51.520404,-0.152328,9.5,674,477.0,98
69,3,4ac518eff964a52064ad20e3,17069,Borough Market,51.505554,-0.090842,9.5,8822,8743.0,1068
6415,320,55f9c860498e9521ebb5d30f,13003,Gielgud Theatre,51.511833,-0.132991,9.5,264,,0
2025,101,4ac518cef964a52027a620e3,16032,Holland Park,51.502976,-0.203482,9.5,1289,1729.0,136


In [1]:
# Keep only businesses with more than 10 reviews.
# NOTE(review): this cell needs `df_yelp_clean` from the de-duplication cell;
# the recorded NameError comes from running it first in a fresh kernel.
df_yelp_clean_filter = df_yelp_clean.query('review_count > 10')

mean_review_count = df_yelp_clean_filter['review_count'].mean()
print("Mean for review_count is: " + str(mean_review_count))

# Ten highest-rated businesses among them (reuses the filtered frame)
df_yelp_clean_filter.sort_values(by="rating", ascending=False).head(10)

NameError: name 'df_yelp_clean' is not defined