In [7]:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import re
import time

In [115]:
page_num = 1
page = f'https://www.opentable.com/s?dateTime=2021-07-18T17%3A00%3A00&covers=1&latitude=40.751637&longitude=-73.986409&page={page_num}'

In [119]:
def get_restaurants(url):
    driver=webdriver.Chrome()
    driver.get(url)

    # scroll down page incrementally to load restaurant elements
    y = 500
    for timer in range(0,70):
        driver.execute_script("window.scrollTo(0, "+str(y)+")")
        y += 500
        time.sleep(0.05)
    results_html = driver.page_source
    time.sleep(0.1)
    driver.close()
    
    
    results = soup(results_html, 'html.parser')
    restaurants = results.find_all('div', attrs = {"class" : "_3uVfVbI1iLfMbszbU6KoOL"})
    print(f'{len(restaurants)} restaurants on results page {url} were found')
    
    rest_list = []
    
    for restaurant in restaurants:
        keys = ['name', 'url', 'promoted', 'price_tier', 'review_count', 'overall', 'food', 'service', 'ambience', 'value',
               'noise', 'pct_recommended', 'neighborhood', 'cuisines', 'dining_style', 'dress_code', 'chef', 'tags',
               'primary_cuisine', 'sanitizing', 'distancing', 'ppe', 'screening']
        
        # initialize all keys to None
        curr_rest_dict = dict(zip(keys, [None]*len(keys)))
        
        # get restaurant name
        restaurant_child = restaurant.find('a', attrs = {"class":"_1e9PcCDb012hY4BcGfraQB"})
        curr_rest_dict['name'] = restaurant_child.get('aria-label')

        # get restaurant url
        url = restaurant_child.get('href')
        curr_rest_dict['url'] = url

        # get promoted status
        curr_rest_dict['promoted'] = 1 if restaurant.get('data-promoted') == 'true' else 0

        # get price tier
        curr_rest_dict['price_tier'] = restaurant.find('span', attrs = {"class":"_3sSkv7iJ6Tl1VxRjYAjp13"}).get_text()
        
        print(curr_rest_dict['name'])
        get_restaurant_info(curr_rest_dict['url'], curr_rest_dict)

        rest_list.append(curr_rest_dict)
        
    return rest_list

In [48]:
def get_restaurant_info(url, curr_rest_dict):
    """
    get_restaurant_info: extracts information from restaurant pages whose urls were found by get_nyc_restaurants()
    
    args:
        url: url of the restaurant page
        curr_rest_dict: the dict of information on each restaurant generated by get_nyc_restaurants()
    
    output:
        no return, mutating function
        navigates the restaurant page and adds the following keys and values to curr_rest_dict:
            
            review_count: int, total # of reviews received
            ----------------------------------------------------------------------------
            
            RATINGS- all rating values are floats, out of 5.0 maximum rating
            -----------------------------------------------------------------------------
            overall: overall average rating by reviewers
            food: average food rating by reviewers
            service: average service rating by reviewers
            ambience: average ambience rating by reviewers
            value: average value rating by reviewers
            -----------------------------------------------------------------------------
            
            noise: string, noise level. Quiet, Moderate, or Energetic.
            pct_recommended: int, percent of reviewers who would recommend restaurant to a friend
            -----------------------------------------------------------------------------
            
            DETAILS- all details values are strings, with None if key not found on page
            -----------------------------------------------------------------------------
            neighborhood: neighborhood in which restaurant is located
            hours: hours of operation
            cuisines: cuisine styles served
            dining_style: dining style (fine dining, casual, etc)
            dress_code: dress code
            chef: chef's name
            tags: additional tags
            primary_cuisine: first item in cuisines, as listed on results page
            -----------------------------------------------------------------------------
            
            COVID-19 MEASURES- all values are bool, 1 if safety measure is implemented, 0 if not
            -----------------------------------------------------------------------------
            sanitizing: sanitization or enhanced cleaning practices
            distancing: physical distancing, barriers between tables, etc
            ppe: (personal protective equipment) mask-wearing by staff, requiring customers to do so
            screening: customer temperature checking, contact tracing
        
    """
    subdriver=webdriver.Chrome()
    subdriver.get(url)
    subdriver.maximize_window() # maximize to make sure page sidebar is loaded
    rest_html = subdriver.page_source
    subdriver.close()

    curr_rest = soup(rest_html, 'html.parser')
    
    # get number of reviews
    divs = curr_rest.find_all('div', attrs = {"class" : "c3981cf8 _965a91d5"})
    for div in divs:
        spans = div.find_all('span')
        for span in spans:
            if "Reviews" in span.string:
                curr_rest_dict['review_count'] = span.string
    has_reviews = 1
    if curr_rest_dict['review_count'] == "No Reviews":
        has_reviews = 0
    
    # get overall rating and subratings
    if has_reviews:
        curr_rest_dict['overall'] = float(curr_rest.find('div', attrs = {"class" : "oc-reviews-491257d8"}).span.string)
        subreviews = curr_rest.find_all('div', attrs = {"class" : "oc-reviews-15d38b07"})
        curr_rest_dict['food'] = float(subreviews[0].string)
        curr_rest_dict['service'] = float(subreviews[1].string)
        curr_rest_dict['ambience'] = float(subreviews[2].string)
        curr_rest_dict['value'] = float(subreviews[3].string)

    # get noise level
    noise_level = curr_rest.find('span', attrs = {"class" : "oc-reviews-624ebf8b"})
    if noise_level is not None:
        curr_rest_dict['noise'] = noise_level.string
    

    # get percent of reviewers who would recommended to a friend
    if has_reviews:
        has_recs = curr_rest.find_all('div', attrs = {"class" : "oc-reviews-dfc07aec"})[1]
        if has_recs is not None:
            rec_string = re.search('\d+%', has_recs.get_text()).group(0)
            curr_rest_dict['pct_recommended'] = int(re.search('\d+', rec_string).group(0))

    details_tags = ['neighborhood', 'hours', 'cuisines', 'dining_style', 'dress_code', 'chef', 'tags']

    # zip (field, value) tuples of restaurant page sidebar information into details_list
    sidebar = curr_rest.find('div', attrs = {"class":"_1e466fbf"})
    details = sidebar.find_all('div', attrs = {"class":"df8add00"})
    details_list = [zip(item.find_all('div', attrs = {"class":"c3981cf8 _965a91d5"}), 
                        item.find_all('div', attrs = {"class":"e7ff71b6 b2f6d1a4"})) for item in details] 
    for i in range(len(details_list)):
        for x, y in details_list[i]:
            details_list[i] = (x.string, y.string)

    # filter details_list for desired information, add info to dict under placeholder keys
    desired_details = zip(['Neighborhood', 'Hours of operation', 'Cuisines', 'Dining Style', 'Dress code', '(?i)(.*chef.*)', 'Additional'], details_tags)

    for x, y in desired_details:
        for a, b in details_list:
            if re.search(x, a):
                curr_rest_dict[y] = b
    
    # gets first tag in cuisines as primary cuisine
    curr_rest_dict['primary_cuisine'] = curr_rest_dict['cuisines'].split(',')[0]


    
    
    safety_categories = ['Cleaning & Sanitizing', 'Physical Distancing', 'Protective Equipment','Screening']
    safety_tags = ['sanitizing', 'distancing', 'ppe', 'screening']
    
    # check whether restaurant has safety information element at all
    if not (curr_rest.find('div', attrs = {"id" : "safety-precautions"}) is None):
    
        # set keys for safety information to 0
        for item in safety_tags:
            curr_rest_dict[item] = 0


        # get COVID-19 safety information
        safety_html = curr_rest.find_all('div', attrs = {"class" : "_77b505d0 _965a91d5"})
        safety_features = [item.find('span').string for item in safety_html]

        for i in range(len(safety_categories)):
            for j in range(len(safety_features)):
                if safety_categories[i] == safety_features[j]:
                    curr_rest_dict[safety_tags[i]] = 1

In [121]:
list1 = get_restaurants(page1)

100 restaurants on results page https://www.opentable.com/s?dateTime=2021-07-18T17%3A00%3A00&covers=1&latitude=40.751637&longitude=-73.986409&page=1 were found
Il Gattopardo restaurant
STK - NYC - Midtown restaurant
The Liberty NYC restaurant
Fogo de Chao - New York restaurant
Koi - New York restaurant
Tony's Di Napoli - Midtown restaurant
L'adresse restaurant
Quality Bistro restaurant
The Smith - Nomad restaurant
ilili restaurant
Benjamin Steakhouse Prime restaurant
L'Amico restaurant
Estiatorio Milos – Midtown New York restaurant
Del Frisco's Double Eagle Steakhouse - New York City restaurant
Spyglass restaurant
Stella 34 Trattoria restaurant
Becco restaurant
Newly added Primal Cut Grille restaurant
Serra by Birreria restaurant
Gallaghers Steakhouse - Manhattan restaurant
Pergola restaurant
La Grande Boucherie restaurant
Avra Estiatorio on 48th restaurant
Castell Rooftop Lounge restaurant
Mastro's Steakhouse - New York City restaurant
Wolfgang's Steak House - Times Square restaurant


In [122]:
list1

[{'name': 'Il Gattopardo restaurant',
  'url': 'https://www.opentable.com/r/il-gattopardo-new-york?corrid=6c317664-e21e-4479-a836-d1602a6a3150&avt=eyJ2IjoyLCJtIjoxLCJwIjoxLCJzIjowLCJuIjowfQ&p=1&sd=2021-07-18T17%3A00%3A00',
  'promoted': 1,
  'price_tier': '$$$$',
  'review_count': '1857 Reviews',
  'food': 4.7,
  'service': 4.8,
  'ambience': 4.7,
  'value': 4.3,
  'overall': 4.8,
  'noise': 'Moderate',
  'pct_recommended': 98,
  'neighborhood': 'Midtown West',
  'hours': None,
  'cuisines': 'Italian',
  'dining_style': 'Fine Dining',
  'dress_code': 'Business Casual',
  'chef': 'Vito Gnazzo',
  'tags': 'Banquet, Bar/Lounge, Beer, Cocktails, Counter Seating, Delivery, Entertainment, Full Bar, Gluten-free Menu, Late Night, Non-Smoking, Outdoor dining, Patio/Outdoor Dining, Private Room, Takeout, Weekend Brunch, Wine',
  'primary_cuisine': 'Italian',
  'sanitizing': 0,
  'distancing': 0,
  'ppe': 0,
  'screening': 0},
 {'name': 'STK - NYC - Midtown restaurant',
  'url': 'https://www.open

In [112]:
start = time.time()
dict1 = {}
get_restaurant_info(a['url'], dict1)
print(f'{time.time()-start} seconds per restaurant') # this means about 10 minutes per results page
# 38 results pages means 10 hours to crawl all 3750 restaurants in manhattan

9.278658390045166 seconds per restaurant


In [50]:
url = 'https://www.opentable.com/r/il-gattopardo-new-york?corrid=c743ae41-4bd6-402e-be5f-4e6748fffc49&avt=eyJ2IjoyLCJtIjoxLCJwIjoxLCJzIjowLCJuIjowfQ&p=1&sd=2021-07-18T17%3A00%3A00'
dict2 = {}
get_restaurant_info(url, dict2)
dict2

<class 'NoneType'>


{'review_count': '1858 Reviews',
 'overall': 4.8,
 'food': 4.7,
 'service': 4.8,
 'ambience': 4.7,
 'value': 4.3,
 'noise': 'Moderate',
 'pct_recommended': 98,
 'neighborhood': 'Midtown West',
 'hours': None,
 'cuisines': 'Italian',
 'dining_style': 'Fine Dining',
 'dress_code': 'Business Casual',
 'chef': 'Vito Gnazzo',
 'tags': 'Banquet, Bar/Lounge, Beer, Cocktails, Counter Seating, Delivery, Entertainment, Full Bar, Gluten-free Menu, Late Night, Non-Smoking, Outdoor dining, Patio/Outdoor Dining, Private Room, Takeout, Weekend Brunch, Wine',
 'primary_cuisine': 'Italian',
 'sanitizing': None,
 'distancing': None,
 'ppe': None,
 'screening': None}

In [51]:
url2 = 'https://www.opentable.com/restaurant/profile/1143472?corrid=6d5d5f9e-c8de-45e1-8d3c-682560ac92ec&avt=eyJ2IjoyLCJtIjowLCJwIjowLCJzIjowLCJuIjowfQ&p=1&sd=2021-07-17T23%3A00%3A00'
dict3 = {}
get_restaurant_info(url2, dict3)
dict3

<class 'bs4.element.Tag'>


{'review_count': 'No Reviews',
 'neighborhood': 'Upper West Side',
 'hours': None,
 'cuisines': 'Indian, Vegetarian / Vegan, Pakistani',
 'dining_style': 'Casual Dining',
 'dress_code': 'Casual Dress',
 'tags': 'Non-Smoking, Wheelchair Access',
 'primary_cuisine': 'Indian',
 'sanitizing': 1,
 'distancing': 1,
 'ppe': 1,
 'screening': 1}

In [21]:
type(b)

NoneType

In [96]:
dict1

{'review_count': '7784 Reviews',
 'overall': 4.7,
 'food': 4.6,
 'service': 4.6,
 'ambience': 4.5,
 'value': 4.3,
 'noise': 'Moderate',
 'pct_recommended': 96,
 'neighborhood': 'Lincoln Square',
 'hours': None,
 'cuisines': 'Italian, Vegetarian / Vegan',
 'dining_style': 'Casual Elegant',
 'dress_code': 'Smart Casual',
 'chef': None,
 'tags': 'Banquet, Beer, BYO Wine, Cafe, Cocktails, Corkage Fee, Counter Seating, Delivery, Full Bar, Late Night, Non-Smoking, Outdoor dining, Patio/Outdoor Dining, Takeout, View, Weekend Brunch, Wheelchair Access, Wine',
 'primary_cuisine': 'Italian',
 'sanitizing': 1,
 'distancing': 1,
 'ppe': 1,
 'screening': 1}

In [3]:
keys = ['name', 'url', 'promoted', 'price_tier', 'review_count', 'overall', 'food', 'service', 'ambience', 'value',
               'noise', 'pct_recommended', 'neighborhood', 'cuisines', 'dining_style', 'dress_code', 'chef', 'tags',
               'primary_cuisine', 'sanitizing', 'distancing', 'ppe', 'screening']

len(keys)
na=[None]*23
len(na)

23