In [1]:
import pandas as pd
import requests
from splinter import Browser
from bs4 import BeautifulSoup
import time

In [2]:
# Yelp url for restaurants in Morris County, NJ to be scraped
url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Morris+County%2C+NJ&ns=1'
response = requests.get(url)

In [3]:
# intialize variables and create empty lists
nextpage_link = True
restaurant_name = []
yelp_url = []
restaurant_rating = []
price_cuisine = []
 
while (nextpage_link):
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # class for restaurant details
    containers = soup.find_all('div', class_='scrollablePhotos__09f24__1PpB8')[1:]
    
    # scroll through each container
    for item in containers:
        #print(item)
        # find and add restaurant name to list
        rest_name = item.find('a',class_='link__09f24__1kwXV')
        restaurant_name.append(rest_name.text)
        
        # find and add restaurant url to list
        yelp_url.append("https://www.yelp.com" + rest_name['href'])
        
        # find and add rating (if exists) to list
        rating = item.find('div',class_='i-stars__09f24__1T6rz')
        if rating:
            restaurant_rating.append(rating['aria-label'].split(' ')[0])
        else:
            restaurant_rating.append("")
        
        # find and add cuisine to list
        price_cuisine.append(item.find('div',class_='priceCategory__09f24__2IbAM').text)

    # find the next page url
    next_page = soup.find('a', class_='next-link')
    
    # if there is a another page, assign the url, otherwise exit
    if next_page:        
        nextpage_url = next_page['href']
        response = requests.get(nextpage_url)
        #time.sleep(2)
    else:
        nextpage_link = False

# Create Dataframe
yelp_df = pd.DataFrame({'restaurant_name': restaurant_name,
                       'yelp_url': yelp_url,
                       'restaurant_rating': restaurant_rating,
                       'price_cuisine': price_cuisine})

yelp_df    

Unnamed: 0,restaurant_name,yelp_url,restaurant_rating,price_cuisine
0,Rosie’s Trattoria,https://www.yelp.com/biz/rosie-s-trattoria-ran...,4.5,"$$$Italian, Bars, Venues & Event Spaces"
1,SubUrban Bar & Kitchen,https://www.yelp.com/biz/suburban-bar-and-kitc...,4,"$$American (New), Bars, Pizza"
2,4 Seasons Mediterranean Restaurant,https://www.yelp.com/biz/4-seasons-mediterrane...,4.5,"$$Wine Bars, French, Italian"
3,Quiet Man Pub,https://www.yelp.com/biz/quiet-man-pub-dover-2...,4.5,"$$Irish Pub, Seafood, Tapas/Small Plates"
4,Verona Restaurant,https://www.yelp.com/biz/verona-restaurant-ran...,4.5,$$Italian
...,...,...,...,...
233,Big D’s Hot Dogs,https://www.yelp.com/biz/big-ds-hot-dogs-dover...,5,"$Food Trucks, American (Traditional), Hot Dogs"
234,Luigis Pizza,https://www.yelp.com/biz/luigis-pizza-succasun...,4,$Pizza
235,Naranja Juice Bar,https://www.yelp.com/biz/naranja-juice-bar-dov...,5,"$Juice Bars & Smoothies, Soup, Sandwiches"
236,Clean Juice,https://www.yelp.com/biz/clean-juice-morristow...,4.5,"Juice Bars & Smoothies, Acai Bowls, Wraps"


In [4]:
# create empty lists to hold address
address1 = []
address2 = []
restaurant_website = []

executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
browser =  Browser("chrome", **executable_path, headless=False)

# iterate through all the rows in the yelp dataframe
for index, row in yelp_df.iterrows():
    
    # visit each restaurant's url
    browser.visit(row['yelp_url'])

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    
    # find the address tag
    address = soup.find('address').find_all('span', class_='raw__373c0__3rcx7')
    
    # find address1 and add to list
    street_address = address[0].text
    for i in range(1, len(address)-1):
        street_address = street_address + address[i].text + " "
    address1.append(street_address)
    
    # find the rest of the address
    address2.append(address[len(address)-1].text)
    
    # find the restaurant's website if it exists
    rest_website = soup.find('div', class_='stickySidebar__373c0__3PY1o').find('a', class_='lemon--a__373c0__IEZFH').text
    if rest_website == "Get Directions":
        restaurant_website.append("")
    else:
        restaurant_website.append(rest_website)
    
browser.quit()

# add address columns to yelp dataframe
yelp_df['address1'] = address1
yelp_df['address2'] = address2
yelp_df['restaurant_website'] = restaurant_website
                  
yelp_df

Unnamed: 0,restaurant_name,yelp_url,restaurant_rating,price_cuisine,address1,address2,restaurant_website
0,Rosie’s Trattoria,https://www.yelp.com/biz/rosie-s-trattoria-ran...,4.5,"$$$Italian, Bars, Venues & Event Spaces",1181 Sussex Tpke,"Randolph, NJ 07869",Find a Table
1,SubUrban Bar & Kitchen,https://www.yelp.com/biz/suburban-bar-and-kitc...,4,"$$American (New), Bars, Pizza",500 NJ-10,"Randolph, NJ 07869",sbknj.com
2,4 Seasons Mediterranean Restaurant,https://www.yelp.com/biz/4-seasons-mediterrane...,4.5,"$$Wine Bars, French, Italian",322 S Main St,"Wharton, NJ 07885",4seasonswharton.com
3,Quiet Man Pub,https://www.yelp.com/biz/quiet-man-pub-dover-2...,4.5,"$$Irish Pub, Seafood, Tapas/Small Plates",64 E Mcfarlan St,"Dover, NJ 07801",quietmanpub.com
4,Verona Restaurant,https://www.yelp.com/biz/verona-restaurant-ran...,4.5,$$Italian,1171 Sussex Tpke,"Randolph, NJ 07869",veronarestaurant.com
...,...,...,...,...,...,...,...
233,Big D’s Hot Dogs,https://www.yelp.com/biz/big-ds-hot-dogs-dover...,5,"$Food Trucks, American (Traditional), Hot Dogs",Dover Train Station,"Dover, NJ 07801",
234,Luigis Pizza,https://www.yelp.com/biz/luigis-pizza-succasun...,4,$Pizza,275 State Rt 10 E,"Succasunna, NJ 07876",
235,Naranja Juice Bar,https://www.yelp.com/biz/naranja-juice-bar-dov...,5,"$Juice Bars & Smoothies, Soup, Sandwiches",6B S Warren St,"Dover, NJ 07801",
236,Clean Juice,https://www.yelp.com/biz/clean-juice-morristow...,4.5,"Juice Bars & Smoothies, Acai Bowls, Wraps",68 South St,"Morristown, NJ 07960",cleanjuice.com/locations/morri…


In [5]:
# save to csv file
yelp_df.to_csv('../Resources/yelp_data.csv', index=False)