# Yelp API

This file develops the interaction with the Yelp API in other to get the final dataset. This process includes:

1. Oauth2 token autentification
2. HTTP request
3. Collection of the JSON response into a DataFrame
4. Cleaning of the response and structuration of the variables for modeling.

In [2]:
import json
import urllib
import pandas as pd
import numpy as np
import requests
import json
import re
import matplotlib.pyplot as plt
import oauth2
import time
%matplotlib inline

## API Connection

This code creates a connection with the Yelp Fusion API based on your client ID and client secret. It after uses the Search API to query for businesses by a search term and location, and the Business API to query additional information about the top result from the search query.

In [3]:
import argparse
import json
import pprint
import requests
import sys
import urllib

from urllib2 import HTTPError
from urllib import quote
from urllib import urlencode


# OAuth credential placeholders that must be filled in by users.
# Finded on:
# https://www.yelp.com/developers/v3/manage_app
CLIENT_ID = "FQ0cqa62QmG2_jRfp7OJvw"
CLIENT_SECRET = "xfk0wI6RzvaCvpOSgcVtI7sgpSpiMZO8EQMHGgFKJww3UqZ0zXyxioMjWz6BPcy6"


# API constants, you shouldn't have to change these.
API_HOST = 'https://api.yelp.com'
SEARCH_PATH = '/v3/businesses/search'
BUSINESS_PATH = '/v3/businesses/'
TOKEN_PATH = '/oauth2/token'
GRANT_TYPE = 'client_credentials'


# Defaults search values:
DEFAULT_TERM = 'dinner'
DEFAULT_LOCATION = 'San Francisco, CA'
SEARCH_LIMIT = 50 #Tells the number of results per page. Cannot be set to a number greater then 50.
#For larger queries use:
REAL_SEARCH = 500 #This value makes iterations bringing results through the offset values to give higher order
#value of results. It can be set up to 950 results.

## Oauth2 token autentification, Bearer token obtaning:

In [34]:
def obtain_bearer_token(host, path):
    """Given a bearer token, send a GET request to the API.
    Args:
        host (str): The domain host of the API.
        path (str): The path of the API after the domain.
        url_params (dict): An optional set of query parameters in the request.
    Returns:
        str: OAuth bearer token, obtained using client_id and client_secret.
    """
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    assert CLIENT_ID, """supplied by user"""
    assert CLIENT_SECRET, """supplied by user"""
    data = urlencode({
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'grant_type': GRANT_TYPE,
    })
    headers = {
        'content-type': 'application/x-www-form-urlencoded',
    }
    response = requests.request('POST', url, data=data, headers=headers) #post your credentials
    bearer_token = response.json()['access_token'] #Get the acces token result back
    return bearer_token

Make conection, obtaining the Oauth bearer token:

In [6]:
bearer_token1 = obtain_bearer_token(API_HOST, TOKEN_PATH)

## HTTP request

Function for given a bearer token, send a GET request to the API.

In [30]:
def request(host, path, bearer_token, url_params=None):
    """Args:
        host (str): The domain host of the API.
        path (str): The path of the API after the domain.
        bearer_token (str): OAuth bearer token, obtained using client_id and client_secret.
        url_params (dict): An optional set of query parameters in the request.
    Returns:
        dict: The JSON response from the request.
    """
    url_params = url_params or {}
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    headers = {
        'Authorization': 'Bearer %s' % bearer_token,
    }

    print(u'Querying {0}, Query Offset: {1}'.format(url_params['location'],url_params['offset'],url))

    response = requests.request('GET', url, headers=headers, params=url_params)

    return response.json()

Function for querying the Search API by a search term and location.

In [35]:
def search(bearer_token, term, location, nresults = REAL_SEARCH):
    """Args:
        term (str): The search term passed to the API.
        location (str): The search location passed to the API.
    Returns:
        dict: The JSON response from the request.
    """
    join_df = pd.DataFrame()
    
    #Loop through the number of offsets needed to obtain the total number of results requested    
    for i in range(nresults/SEARCH_LIMIT):
        origin_results = (i+1)*SEARCH_LIMIT
        url_params = {
            'term': term.replace(' ', '+'),
            'location': location.replace(' ', '+'),
            'offset': origin_results,
            'limit': SEARCH_LIMIT
        }
        request_dict = request(API_HOST, SEARCH_PATH, bearer_token, url_params=url_params)
        #Set the differnt request results in dataframes that are latter put together 
        request_df = pd.DataFrame().from_dict(request_dict['businesses'])
        join_df = pd.concat([join_df,request_df], axis=0)
    join_df.reset_index(inplace=True, drop=True)
    return join_df

In [36]:
#Obtain the restaurant results
time1 = time.time()
london = search(bearer_token1, term='',location="London", nresults=500)
paris = search(bearer_token1, term='',location="Paris", nresults=500)
madrid = search(bearer_token1, term='',location="Madrid", nresults=500)
rome = search(bearer_token1, term='',location="Rome", nresults=500)
dublin = search(bearer_token1, term='',location="Dublin", nresults=500)
manchester = search(bearer_token1, term='',location="Manchester", nresults=500)
time2 = time.time()
time_in_s = (time2-time1)

print "The search took %0.3f seconds to run" %time_in_s

Querying London, Query Offset: 50...https://api.yelp.com/v3/businesses/search
Querying London, Query Offset: 100...https://api.yelp.com/v3/businesses/search
Querying London, Query Offset: 150...https://api.yelp.com/v3/businesses/search
Querying London, Query Offset: 200...https://api.yelp.com/v3/businesses/search
Querying London, Query Offset: 250...https://api.yelp.com/v3/businesses/search
Querying London, Query Offset: 300...https://api.yelp.com/v3/businesses/search
Querying London, Query Offset: 350...https://api.yelp.com/v3/businesses/search
Querying London, Query Offset: 400...https://api.yelp.com/v3/businesses/search
Querying London, Query Offset: 450...https://api.yelp.com/v3/businesses/search
Querying London, Query Offset: 500...https://api.yelp.com/v3/businesses/search
Querying Paris, Query Offset: 50...https://api.yelp.com/v3/businesses/search
Querying Paris, Query Offset: 100...https://api.yelp.com/v3/businesses/search
Querying Paris, Query Offset: 150...https://api.yelp.com

In [37]:
#Merge all results together in one Dataframe
result = pd.concat([london,paris,madrid,rome,dublin,manchester], axis = 0)

In [41]:
result.head(2)

Unnamed: 0,categories,coordinates,display_phone,distance,id,image_url,is_closed,location,name,phone,price,rating,review_count,transactions,url
0,"[{u'alias': u'cocktailbars', u'title': u'Cockt...","{u'latitude': 51.509767, u'longitude': -0.119833}",+44 20 7836 4343,1166.830885,american-bar-london-2,https://s3-media1.fl.yelpcdn.com/bphoto/SKAdDh...,False,"{u'city': u'London', u'display_address': [u'Th...",American Bar,442078364343,£££,4.5,44,[],https://www.yelp.com/biz/american-bar-london-2...
1,"[{u'alias': u'seafood', u'title': u'Seafood'},...","{u'latitude': 51.5135487, u'longitude': -0.132...",+44 20 7432 4800,261.663283,burger-and-lobster-london-2,https://s3-media2.fl.yelpcdn.com/bphoto/DLmidv...,False,"{u'city': u'London', u'display_address': [u'36...",Burger & Lobster,442074324800,££,4.0,228,[],https://www.yelp.com/biz/burger-and-lobster-lo...


## Cleaning and processing of the results

As we can see the result frame obtained needs processing. Most of the columns need some preprocessing and others have no use. In this next step we are going to clean and process the results obtained

In [38]:
def datacleaner(dfm):

    dfm.reset_index(inplace=True, drop=True)

    dfm['alias'] = dfm['categories'].apply(lambda x: x[0]['alias'])
    dfm['title'] = dfm['categories'].apply(lambda x: x[0]['title'])
    dfm['latitude'] = dfm['coordinates'].apply(lambda x: x['latitude'])
    dfm['longitude'] = dfm['coordinates'].apply(lambda x: x['longitude'])
    dfm['Phone1'] = dfm['display_phone'].apply(lambda x:  0 if x=='' else 1)
    dfm['Phone2'] = dfm['phone'].apply(lambda x:  0 if x=='' else 1)
    dfm['price'].str.encode('utf-8')
    dfm['price'] = dfm['price'].apply(lambda x: np.nan if type(x) == float else len(x))
    
    city_keys = dfm['location'][0].keys()
    for key in city_keys:
        dfm[key] = dfm['location'].apply(lambda x: x[key])
        
    dfm.drop(['distance','categories','coordinates',"display_phone", 'is_closed', 'phone', 'url','location','id'], axis=1, inplace=True)


    dfm.reset_index(inplace=True, drop=True)
    
    #Giving the dataset a order
    dfm = dfm[['name','price','review_count','alias','Phone1','city', 'country','state', 'address1', 'zip_code','latitude', 'longitude', 'image_url', 'rating']]

#     for topic in topics:
#         for index, list in enumerate(dfm['topics']):
#             if topic in list:
#                 dfm.ix[index, topic] = 1
#             else:
#                 dfm.ix[index, topic] = 0

    return dfm

In [17]:
finaldf = datacleaner(result)

Function takes around 0.056 seconds to run


In [18]:
finaldf.shape

(3000, 14)

In [19]:
photos = pd.DataFrame(finaldf.image_url)

In [20]:
pd.DataFrame.to_csv(photos, path_or_buf= 'Ad_image.csv', index=False, )

In [21]:
finaldf.head()

Unnamed: 0,name,price,review_count,alias,Phone1,city,country,state,address1,zip_code,latitude,longitude,image_url,rating
0,American Bar,3.0,44,cocktailbars,1,London,GB,XGL,The Savoy Hotel,WC2R 0EU,51.509767,-0.119833,https://s3-media1.fl.yelpcdn.com/bphoto/SKAdDh...,4.5
1,Burger & Lobster,2.0,228,seafood,1,London,GB,XGL,36 Dean Street,W1D 4PS,51.513549,-0.132232,https://s3-media2.fl.yelpcdn.com/bphoto/DLmidv...,4.0
2,Quilon Restaurant,3.0,103,indpak,1,London,GB,XGL,41 Buckingham Gate,SW1E 6AF,51.49872,-0.137499,https://s3-media4.fl.yelpcdn.com/bphoto/weedog...,4.5
3,Nightjar,3.0,149,lounges,1,London,GB,XGL,129 City Road,EC1V 1JB,51.526522,-0.08775,https://s3-media1.fl.yelpcdn.com/bphoto/yzyTg5...,4.5
4,Savoir Faire,2.0,153,french,1,London,GB,XGL,42 New Oxford Street,WC1A 1EP,51.517195,-0.126012,https://s3-media2.fl.yelpcdn.com/bphoto/tbGiSu...,4.5


In [22]:
finaldf.to_csv("./Yelp_results.csv", encoding= 'UTF8', index= False)