In [1]:
import os
import re
import urllib

from bs4 import BeautifulSoup
from yelp.client import Client
from yelp.oauth1_authenticator import Oauth1Authenticator

### Define functions

In [2]:
def connect_API(consumer_key=None, consumer_secret=None, token=None, token_secret=None):
    """ Build a Yelp API client from OAuth 1 credentials
    :param consumer_key: consumer key handed to the authenticator
    :param consumer_secret: consumer secret handed to the authenticator
    :param token: access token handed to the authenticator
    :param token_secret: access token secret handed to the authenticator
    :return: a Client wrapping the Oauth1Authenticator built from the arguments
    """
    # The authenticator takes the four values positionally, in this order.
    credentials = (consumer_key, consumer_secret, token, token_secret)
    return Client(Oauth1Authenticator(*credentials))

def get_businesses(client, keywords, location='San Francisco'):
    """
    get Yelp business objects
    :param client: Yelp API client object (anything exposing ``search(location, **params)``)
    :param keywords: name of a dish, sent as the ``term`` search parameter
    :param location: where to search; defaults to 'San Francisco', the value
        that used to be hard-coded
    :return: the ``businesses`` attribute of the search response
    """
    params = {
        'term': keywords
    }
    return client.search(location, **params).businesses

def _fetch_photo_page(url):
    """Download ``url`` and parse it with BeautifulSoup, always closing the response."""
    response = urllib.urlopen(url)
    try:
        return BeautifulSoup(response.read(), "lxml")
    finally:
        # The response object is not closed automatically; release the socket.
        response.close()


def _count_photo_pages(yelppage):
    """Return the page count announced by the pager div, or 1 if it is missing or unreadable."""
    pagers = yelppage.find_all('div', {'class': "page-of-pages arrange_unit arrange_unit--fill"})
    if not pagers:
        return 1
    # '+' instead of '*': an empty capture would make int() fail.
    found = re.findall('of ([0-9]+)', pagers[0].get_text())
    return int(found[0]) if found else 1


def get_data(business_id, keywords, max_pages=5):
    """
    pull reviews and photos
    :param business_id: a Yelp business id
    :param keywords: name of a dish
    :param max_pages: upper bound on the number of photo pages fetched
        (default 5, because fetching every page is very slow); the first page
        is always fetched
    :return: a list of reviews and a list of photos relative to the keywords;
        both are empty, or partial, if a UnicodeError interrupts the fetch
    """
    reviews = []
    photos = []
    photos_per_page = 30  # the `start` offset grows by this much per page
    base_url = 'https://www.yelp.com/biz_photos/%s' % business_id
    try:
        # data in 1st page
        yelppage = _fetch_photo_page(base_url + '?tab=food')
        total_pages = min(_count_photo_pages(yelppage), max_pages)
        reviews, photos = get_data_single_page(yelppage, keywords)
        # data in other pages
        for page in range(1, total_pages):
            url = '%s?start=%s&tab=food' % (base_url, str(photos_per_page * page))
            r, p = get_data_single_page(_fetch_photo_page(url), keywords)
            reviews += r
            photos += p
    except UnicodeError:
        # Deliberate best-effort: keep whatever was collected so far.
        # NOTE(review): presumably raised when the business id contains
        # non-ASCII characters that urlopen cannot encode — confirm.
        pass
    return reviews, photos

def get_data_single_page(yelppage, keywords):
    """
    helper method for get_data(business_id, keywords)
    :param yelppage: BeautifulSoup object (anything exposing ``find_all(name, attrs)``)
    :param keywords: name of a dish; matched literally and case-insensitively,
        and it must be surrounded by spaces inside the image ``alt`` text
    :return: a list of reviews and a list of photos relative to the keywords;
        entry i of each list comes from the same ``img`` tag
    """
    # keywords is a dish name, not a regex: escape it so characters such as
    # "(", "+" or "." are matched literally instead of breaking the pattern.
    pattern = re.compile(" " + re.escape(keywords) + " ", re.IGNORECASE)
    reviews = []
    photos = []
    for image in yelppage.find_all('img', {'alt': pattern}):
        # Assumes the alt text is "<photo header>. <caption>" (as in the sample
        # outputs) — TODO confirm. Split on the first period only, so captions
        # that contain periods themselves ("$16.50", "...") are kept whole.
        parts = image.get("alt").split(".", 1)
        if len(parts) > 1:
            reviews.append(parts[1])
            photos.append(image.get("src"))
    return reviews, photos

def get_info_dict(businesses_list, keywords=None):
    """
    get restaurant information from searching results

    :param businesses_list: a list of Yelp business object
    :param keywords: name of a dish; when omitted, the notebook-level
        ``keywords`` variable is used, which is what this function always
        relied on implicitly
    :return: a dictionary of result (key - business_id, value - dict with
        "reviews", "photos" and "business_obj"); businesses without any
        matching review are left out
    :raises NameError: if ``keywords`` is omitted and no notebook-level
        ``keywords`` variable exists
    """
    if keywords is None:
        # Backward compatibility for callers that pass only the business list.
        try:
            keywords = globals()["keywords"]
        except KeyError:
            raise NameError("name 'keywords' is not defined; pass keywords explicitly")
    info_dict = {}
    for business in businesses_list:
        business_id = business.id
        # Progress indicator: scraping is slow, so show which business is running.
        print(business_id)
        reviews, photos = get_data(business_id, keywords)
        if reviews:
            info_dict[business_id] = {
                "reviews": reviews,
                "photos": photos,
                "business_obj": business,
            }
    return info_dict


### Connect to the Yelp API and get search responses

In [3]:
# main
# Credentials come from the environment so that secrets never live in the
# notebook or its history. Set YELP_CONSUMER_KEY, YELP_CONSUMER_SECRET,
# YELP_TOKEN and YELP_TOKEN_SECRET before starting the kernel; a missing
# variable fails immediately with a KeyError naming it.
# NOTE: any key that was ever committed in this cell should be rotated.
consumer_key = os.environ["YELP_CONSUMER_KEY"]
consumer_secret = os.environ["YELP_CONSUMER_SECRET"]
token = os.environ["YELP_TOKEN"]
token_secret = os.environ["YELP_TOKEN_SECRET"]

# Dish to look for; also used later when scraping reviews and photos.
keywords = "Pad See Ew"

# connect yelp api
client = connect_API(consumer_key, consumer_secret, token, token_secret)
# get responses from api
businesses_list = get_businesses(client, keywords)

### Create a restaurant information dictionary from the responses

#### This takes a while, depending on the size of the response, so each restaurant id is printed to show that the kernel has not died
#### TODO: think about how to improve the efficiency later

In [4]:
# Scrape reviews and photos for every business returned by the search.
# get_info_dict prints each business id as it goes and searches for the
# notebook-level `keywords` defined in the cell above.
info_dict = get_info_dict(businesses_list)

thai-time-restaurant-san-francisco
lers-ros-san-francisco-2
champa-garden-san-francisco-2
house-of-thai-san-francisco-2
ben-thai-cafe-san-francisco
marnee-thai-san-francisco
kings-thai-cuisine-1-san-francisco-2
marnee-thai-san-francisco-2
basil-thai-restaurant-and-bar-san-francisco
jitlada-thai-cuisine-san-francisco
chabaa-thai-cuisine-san-francisco-2
zabb-thai-cuisine-san-francisco
ploy-ii-san-francisco
sri-thai-san-francisco
chim-san-francisco
pad-thai-restaurant-san-francisco
rins-thai-restaurant-san-francisco
thai-cottage-restaurant-san-francisco
siam-lotus-thai-cuisine-san-francisco
manivanh-thai-restaurant-san-francisco


### Getting values from the dictionary

In [6]:
# Show, for every business that had matching photos, its name followed by the
# captions and the photo URLs. items() and single-argument print(...) behave
# the same on Python 2 and also run on Python 3, unlike iteritems() and the
# print statement.
for key, value in info_dict.items():
    print("-----------------------" + value['business_obj'].name + "-----------------------")
    print(value['reviews'])
    print(value['photos'])

-----------------------Zabb Thai Cuisine-----------------------
[' Pad see ew with pork', ' Pad See Ew with beef - beef, rice noodle stir fried with broccoli, carrot and egg']
['https://s3-media1.fl.yelpcdn.com/bphoto/GCRSPkto2LZGebi-0r96GQ/258s.jpg', 'https://s3-media1.fl.yelpcdn.com/bphoto/_SV2ls9_PtbuvzstfvDrLA/258s.jpg']
-----------------------Marnee Thai-----------------------
[' Pad see ew with chicken', ' Pad see ew with pork']
['https://s3-media1.fl.yelpcdn.com/bphoto/bj64mQNIR8mEuNqEc5m0PQ/258s.jpg', 'https://s3-media1.fl.yelpcdn.com/bphoto/2OWlshVJMSao_8_R_1578w/258s.jpg']
-----------------------Ben Thai Cafe-----------------------
[' Pad see ew with prawns - $16']
['https://s3-media3.fl.yelpcdn.com/bphoto/hufCYlw_L_cpbruczHpXnA/258s.jpg']
-----------------------Thai Time Restaurant-----------------------
[' Pad See Ew with Beef', ' Pork pad see ew (with no carrots) $7']
['https://s3-media2.fl.yelpcdn.com/bphoto/HKlaek9LOGzYcsyMRbjumA/258s.jpg', 'https://s3-media1.fl.yelpcdn.