In [53]:
# setup library imports
import io, time, json
import requests
from pathlib import Path
from bs4 import BeautifulSoup
#from testing.testing import test
from urllib.parse import quote

In [6]:
def read_api_key(filepath="api_key.txt"):
    """
    Read the Yelp API Key from file.
    
    Args:
        filepath (string): File containing API Key
    Returns:
        api_key (string): The API Key
    """
    
    # Feel free to modify this function if you are storing the API Key differently
    return Path(filepath).read_text().strip()

In [7]:
api_key = read_api_key()

In [8]:
def retrieve_html(url):
    """
    Return the raw HTML at the specified URL.

    Args:
        url (string): 

    Returns:
        status_code (integer):
        raw_html (string): the raw HTML content of the response, properly encoded according to the HTTP headers.
    """
    r = requests.get(url, auth=('user', 'pass'))   
    return (r.status_code, r.text)

In [1]:
#retrieve_html('https://www.yelp.com/biz/man-vs-fries-seattle-2?osq=Tacos')

In [17]:
def yelp_search(api_key, term, location):
    """
    Make an authenticated request to the Yelp API.

    Args:
        query (string): Search term

    Returns:
        total (integer): total number of businesses on Yelp corresponding to the query
        businesses (list): list of dicts representing each business
    """
    
    search_url = "https://api.yelp.com/v3/businesses/search"
    
    header_string = 'Bearer' + ' ' + api_key
    
    headers = {'Authorization': header_string}
    
    params = {"location": location, "term": term}
    
    response = requests.get(search_url, params = params, headers = headers)
    
    data = json.loads(response.text)
    
    return (data["total"], data["businesses"])


In [18]:
yelp_search(api_key, "tacos", "Seattle")

(1900,
 [{'id': 'pzjQKP3PEkclhSD-mFn7jA',
   'alias': 'tacos-chukis-seattle',
   'name': 'Tacos Chukis',
   'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/AU2LRinK4hKGU7nnCTq73Q/o.jpg',
   'is_closed': False,
   'url': 'https://www.yelp.com/biz/tacos-chukis-seattle?adjust_creative=ZgQxRTZk4a1F2l5iDXhQgQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=ZgQxRTZk4a1F2l5iDXhQgQ',
   'review_count': 1724,
   'categories': [{'alias': 'mexican', 'title': 'Mexican'}],
   'rating': 4.5,
   'coordinates': {'latitude': 47.620570445175,
    'longitude': -122.321262359619},
   'transactions': ['delivery'],
   'price': '$',
   'location': {'address1': '219 Broadway E',
    'address2': '',
    'address3': '',
    'city': 'Seattle',
    'zip_code': '98102',
    'country': 'US',
    'state': 'WA',
    'display_address': ['219 Broadway E', 'Seattle, WA 98102']},
   'phone': '+12069058537',
   'display_phone': '(206) 905-8537',
   'distance': 1196.2087504042854},
  {'id': 'edz

In [21]:
def all_restaurants(api_key, term, location):
    """
    Retrieve ALL the restaurants on Yelp for a given query.

    Args:
        term: keyword
        location: location to search

    Returns:
        results (list): list of dicts representing each business
    """
 
    search_url = "https://api.yelp.com/v3/businesses/search"
    
    header_string = 'Bearer' + ' ' + api_key
    
    headers = {'Authorization': header_string}
    
    params1 = {"location": location, "term": term, "categories" : "restaurants"}
    response1 = requests.get(search_url, params = params1, headers = headers)   
    data1 = json.loads(response1.text)

    #print(data1)
    num_records = data1['total']
    
    n = num_records//20 + 1
    offset = 0
    result = []
    for i in range(n):
        
        #20 restaurants each request
        curr_offset = offset + i*20
        params = {"location": location, "offset": curr_offset, "categories" : "restaurants"}
        response = requests.get(search_url, params = params, headers = headers)
        
        data = json.loads(response.text)
        
        #print(len(data['businesses']))
        
        #for x in data['businesses']:
            #print(x['name'])
        
        result += data["businesses"]
        time.sleep(.300)
    
    #pause slightly between requests
    
    
    return result

#data = all_restaurants(read_api_key(), 'Polish Hill, Pittsburgh')
#print(len(data))
#print([x['name'] for x in data])

In [22]:
tacos = all_restaurants(api_key, "pizza", "Bainbridge Island")

In [44]:
pizza_bi = yelp_search(api_key, "pizza", "Bainbridge Island")

In [54]:
parse_api_response(pizza_bi)

TypeError: the JSON object must be str, bytes or bytearray, not tuple

In [49]:
def request(host, path, api_key, url_params=None):
    
    """Given your API_KEY, send a GET request to the API.
    Args:
        host (str): The domain host of the API.
        path (str): The path of the API after the domain.
        API_KEY (str): Your API Key.
        url_params (dict): An optional set of query parameters in the request.
    Returns:
        dict: The JSON response from the request.
    Raises:
        HTTPError: An error occurs from the HTTP request.
    """
    url_params = url_params or {}
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }

    print(u'Querying {0} ...'.format(url))

    response = requests.request('GET', url, headers=headers, params=url_params)

    return response.json()
    

def test_search(api_key, term, location):
    
    API_HOST = 'https://api.yelp.com'
    SEARCH_PATH = '/v3/businesses/search'
    BUSINESS_PATH = '/v3/businesses/'  # Business ID will come after slash.
    
    url_params = {
        'term' : term.replace(' ', '+'),
        'location' : location.replace(' ', '+'),
    }
    return request(API_HOST, SEARCH_PATH, api_key, url_params = url_params)

In [56]:
test_data = test_search(api_key, "tacos", "Seattle")

Querying https://api.yelp.com/v3/businesses/search ...


In [58]:
type(test_data)

dict

In [43]:
tacos_json = json.dumps(tacos)
data_dict = json.loads(tacos_json)
data_dict[0]

{'id': '6I28wDuMBR5WLMqfKxaoeg',
 'alias': 'pike-place-chowder-seattle',
 'name': 'Pike Place Chowder',
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/ZyQjV-wJQ2GHyX7l3jfbyg/o.jpg',
 'is_closed': False,
 'url': 'https://www.yelp.com/biz/pike-place-chowder-seattle?adjust_creative=ZgQxRTZk4a1F2l5iDXhQgQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=ZgQxRTZk4a1F2l5iDXhQgQ',
 'review_count': 7459,
 'categories': [{'alias': 'seafood', 'title': 'Seafood'},
  {'alias': 'soup', 'title': 'Soup'}],
 'rating': 4.5,
 'coordinates': {'latitude': 47.60939, 'longitude': -122.34112},
 'transactions': ['pickup', 'delivery'],
 'price': '$$',
 'location': {'address1': '1530 Post Aly',
  'address2': 'Ste 11',
  'address3': '',
  'city': 'Seattle',
  'zip_code': '98101',
  'country': 'US',
  'state': 'WA',
  'display_address': ['1530 Post Aly', 'Ste 11', 'Seattle, WA 98101']},
 'phone': '+12062672537',
 'display_phone': '(206) 267-2537',
 'distance': 14183.878140449346}

In [27]:
def parse_api_response(data):
    """
    Parse Yelp API results to extract restaurant URLs.
    
    Args:
        data (string): String of properly formatted JSON.

    Returns:
        (list): list of URLs as strings from the input JSON.
    """
    
    data_dict = json.loads(data)
    businesses = data_dict["businesses"]
    result = []
    for i in range(len(businesses)):
        business = businesses[i]
        result.append(business["url"])
    return result

In [29]:
parse_api_response(all_restaurants(api_key, "tacos", "Seattle"))

KeyError: 'businesses'

In [10]:
def parse_page(html):
    """
    Parse the reviews on a single page of a restaurant.
    
    Args:
        html (string): String of HTML corresponding to a Yelp restaurant

    Returns:
        tuple(list, string): a tuple of two elements
            first element: list of dictionaries corresponding to the extracted review information
            second element: URL for the next page of reviews (or None if it is the last page)
    """
    
    soup = BeautifulSoup(html, 'html.parser')
    
    #print(soup.prettify()) 
    
    review_soups = soup.find_all(itemprop = "review")
    
    count1 = soup.find(itemprop = "reviewCount")
    count = int(count1.get_text().strip())
    num_pages = count//20 + 1
    #print(num_pages)
    
    review_list = []
    for each in review_soups:
        #print(each)
        
        review_info = dict()
        text = each.get_text().strip()
        date = each.find(itemprop = "datePublished").get("content")
        rating = each.find(itemprop = "ratingValue").get("content")
        author = each.find(itemprop = "author").get("content")
        
        review_info['author'] = author
        review_info['rating'] = float(rating)
        review_info['date'] = date
        review_info['description'] = text
        review_list.append(review_info)
        #print(review_info)
        
        
    return (review_list, num_pages)

In [11]:
def extract_reviews(url):
    """
    Retrieve ALL of the reviews for a single restaurant on Yelp.

    Parameters:
        url (string): Yelp URL corresponding to the restaurant of interest.

    Returns:
        reviews (list): list of dictionaries containing extracted review information
    """
    
    (review_list,num_pages) = parse_page(retrieve_html(url)[1])
    #print(num_pages)
    
    result = review_list
    for i in range(1, num_pages):
        curr_offset = i*20
        add_string = '?start=%d'%curr_offset
        curr_url = url + add_string
        (curr_review_list,num_pages) = parse_page(retrieve_html(curr_url)[1])
        result += curr_review_list
        

        #https://www.yelp.com/biz/larry-and-carols-pizza-pittsburgh?start=20
    
    return result


In [12]:
extract_reviews('https://www.yelp.com/biz/man-vs-fries-seattle-2?osq=Tacos')

AttributeError: 'NoneType' object has no attribute 'get_text'