# Extracting Restaurant Info

## 1. Basic Setup

### 1.1 Loading of Libaries

In [1]:
from selenium import webdriver      # conda install -c conda-forge selenium
from time import sleep
from selenium.webdriver.chrome.options import Options

import pandas as pd
import numpy as np
import re

import warnings
warnings.filterwarnings("ignore")

### 1.2 Initialising Chromedriver

In [2]:
# from webdriver_manager.chrome import ChromeDriverManager
# driver = webdriver.Chrome(ChromeDriverManager().install())

In [3]:
# configuration tool that can be passed into webdriver.Chrome
options = Options()

# windows
# https://www.youtube.com/watch?v=Xjv1sY630Uc 
# PATH = "C:\Program Files\chromedriver.exe"
# driver = webdriver.Chrome(PATH)

# macOS
# https://www.edureka.co/community/52315/how-to-setup-chrome-driver-with-selenium-on-macos
# need to put chromedriver.exe into /usr/local/bin
driver = webdriver.Chrome()
driver.maximize_window()

# alternatively
# from webdriver_manager.chrome import ChromeDriverManager
# driver = webdriver.Chrome(ChromeDriverManager().install())

## 2. Read in Dataset with the Restaurant Links

In [4]:
df_links = pd.read_excel("data/tripadvisor_restaurant_links.xlsx")  # from 01_scrape_restaurant_links notebook

## 3. Scraping of Restaurant Information

In [5]:
# to store scrapped data
final_df = pd.DataFrame()

In [6]:
index_start = 0
index_end = 20

In [7]:
for i in df_links.loc[index_start:index_end, "links"]:
    driver.get(i)
    sleep(2)
    
    # ----------------------------- Restaurant Name ----------------------------- #
    restaurant_name = driver.find_element_by_xpath('//h1[@data-test-target="top-info-header"]').text

    # ----------------------------- Address ----------------------------- #
    address = driver.find_element_by_class_name("brMTW").text

    # ----------------------------- Restaurant Rating ----------------------------- #
    try:
        restaurant_rating_div = driver.find_element_by_xpath("/html/body/div[2]/div[1]/div/div[3]/div/div/div[2]/span[1]/a")
        restaurant_rating = restaurant_rating_div.find_element_by_tag_name("svg").get_attribute("title")
        no_of_reviews = restaurant_rating_div.text
    except:
        restaurant_rating = driver.find_element_by_class_name("RWYkj").get_attribute("title")
        no_of_reviews = driver.find_element_by_class_name("eBTWs").text

    # ----------------------------- Price Tag / Other Tags ----------------------------- #
    try:
        first_tag = driver.find_element_by_class_name("VRlVV").find_elements_by_class_name("drUyy")[0].text
        if "$" in first_tag:
            price_tag = first_tag
        else:
            price_tag = np.nan
    except:
        price_tag = np.nan

    # ----------------------------- Operating hours ----------------------------- #
    try:
        operating_hours_modal_element = driver.find_element_by_xpath("//div[@class='dauAM']")
        operating_hours_modal_element.click()
        restaurant_operating_hours = {
            element.text.split("\n")[0] : element.text.split("\n")[1]
            for element in driver.find_elements_by_class_name("ferBE")}
    except:
        restaurant_operating_hours = {}

    if "Sun" in restaurant_operating_hours:
        sun_operating_hours = restaurant_operating_hours["Sun"]
    else:
        sun_operating_hours = np.nan

    if "Mon" in restaurant_operating_hours:
        mon_operating_hours = restaurant_operating_hours["Mon"]
    else:
        mon_operating_hours = np.nan

    if "Tue" in restaurant_operating_hours:
        tue_operating_hours = restaurant_operating_hours["Tue"]
    else:
        tue_operating_hours = np.nan

    if "Wed" in restaurant_operating_hours:
        wed_operating_hours = restaurant_operating_hours["Wed"]
    else:
        wed_operating_hours = np.nan

    if "Thu" in restaurant_operating_hours:
        thu_operating_hours = restaurant_operating_hours["Thu"]
    else:
        thu_operating_hours = np.nan

    if "Fri" in restaurant_operating_hours:
        fri_operating_hours = restaurant_operating_hours["Fri"]
    else:
        fri_operating_hours = np.nan

    if "Sat" in restaurant_operating_hours:
        sat_operating_hours = restaurant_operating_hours["Sat"]
    else:
        sat_operating_hours = np.nan
    
    # ----------------------------- Detailed Ratings ----------------------------- #
    try:
        restaurant_detailed_ratings = {
            element.text: element.find_element_by_class_name("ui_bubble_rating").get_attribute("class") 
            for element in driver.find_elements_by_class_name("cGQpb")}
    except:
        restaurant_detailed_ratings = {}

    if "Food" in restaurant_detailed_ratings:
        food_rating = restaurant_detailed_ratings["Food"]
    else:
        food_rating = np.nan

    if "Service" in restaurant_detailed_ratings:
        service_rating = restaurant_detailed_ratings["Service"]
    else:
        service_rating = np.nan

    if "Value" in restaurant_detailed_ratings:
        value_rating = restaurant_detailed_ratings["Value"]
    else:
        value_rating = np.nan        

    if "Atmosphere" in restaurant_detailed_ratings:
        atmosphere_rating = restaurant_detailed_ratings["Atmosphere"]
    else:
        atmosphere_rating = np.nan
    
    # ---------- Whether Restaurant is given a "Travellor's Choice" or Michellin Star ------------- #
    try:
        driver.find_element_by_class_name("dsZol")
        is_travellors_choice = True
    except:
        is_travellors_choice = False

    try:
        driver.find_element_by_class_name("bTuKi")
        is_michellin = True
    except:
        is_michellin = False

    # ----------------------------- List of Restaurant Information ----------------------------- #

    try:
        # see more button (with hidden restaurant information)
        see_more_button = driver.find_element_by_class_name("ZlyLX")
        see_more_button.click()
    except:
        pass
    try:
        # alternative see more button (with hidden restaurant information)
        see_more_button = driver.find_element_by_xpath("//a[@class='bQVEd']")
        see_more_button.click()
    except:
        pass
    
    # extracting the key-value information of hidden information
    list_of_features = [j.text for j in driver.find_elements_by_class_name("dMshX.b")]
    list_of_items = [k.text for k in driver.find_elements_by_class_name("cfvAV")]

    if len(list_of_items) == 0 and len(list_of_features) == 0:
        list_of_features = [j.text for j in driver.find_elements_by_class_name("csKes.Wf.b")]
        list_of_items = [k.text for k in driver.find_elements_by_class_name("bYIkW")]

    feature_tags = {
        "PRICE RANGE":[],
        "SPECIAL DIETS":[],
        "MEALS":[],
        "CUISINES":[],
        "FEATURES":[],
    }
    
    # storing restaurant information in dictionary
    for x in range(0, len(list_of_features)):
        feature_tags[list_of_features[x]] = list_of_items[x]

    # ----------------------------- About (Restaurant Description) ----------------------------- #
    try:
        about = driver.find_element_by_class_name("OMpFN").text
    except:
        about = np.nan

    if not about:
        try:
            about = driver.find_element_by_class_name("epsEZ").text
        except:
            pass

    # ----------------------------- Location Details ----------------------------- #
    try:
        location_info = driver.find_element_by_class_name("fnrgn").text

        if "\n" in location_info:
            district = location_info.split("\n")[0].strip()
            second_location_info = location_info.split("\n")[1].strip()

            if "from" in second_location_info:
                distance_from_nearby_attraction = second_location_info.split("from")[0].strip()
                nearby_attraction = second_location_info.split("from")[1].strip()
            else:
                distance_from_nearby_attraction = np.nan
                nearby_attraction = second_location_info
        else:
            if "from" in location_info:
                distance_from_nearby_attraction = location_info.split("from")[0].strip()
                nearby_attraction = location_info.split("from")[1].strip()
                district = np.nan
            else:
                district = location_info
                distance_from_nearby_attraction = np.nan
                nearby_attraction = np.nan        
    except:
        district = np.nan
        distance_from_nearby_attraction = np.nan
        nearby_attraction = np.nan 

    # ----------------------------- Lat/Lng ----------------------------- #
    try:
        lat_lng_string = re.search('(?<=@)(.*),(.*)' , driver.find_element_by_xpath('//a[contains(@href, "https://maps")]').get_attribute("href")).group()
        latitude = float(lat_lng_string.split(",")[0].strip())
        longitude = float(lat_lng_string.split(",")[1].strip())
    except:
        latitude = np.nan
        longitude = np.nan

    df = pd.DataFrame({
        'name': [restaurant_name],
        'address': [address],
        'num_of_reviews': [no_of_reviews],
        'overall_rating': [restaurant_rating],
        'price_category': [price_tag],

        'mon_operating_hours': [mon_operating_hours],
        'tue_operating_hours': [tue_operating_hours],
        'wed_operating_hours': [wed_operating_hours],
        'thu_operating_hours': [thu_operating_hours],
        'fri_operating_hours': [fri_operating_hours],
        'sat_operating_hours': [sat_operating_hours],
        'sun_operating_hours': [sun_operating_hours],

        'food_rating': [food_rating],
        'service_rating': [service_rating],
        'value_rating': [value_rating],
        'atmosphere_rating': [atmosphere_rating],

        'price_range': [feature_tags["PRICE RANGE"]],
        'list_of_meals': [feature_tags["MEALS"]],
        'list_of_cuisines': [feature_tags["CUISINES"]],
        'list_of_features': [feature_tags["FEATURES"]],
        'list_of_special_diets': [feature_tags["SPECIAL DIETS"]],
        
        'region': [district],
        'distance_from_attraction': [distance_from_nearby_attraction],
        'nearby_attraction': [nearby_attraction],
        
        'latitude': [latitude],
        'longitude': [longitude],
        'url': [i]
    })

    final_df = final_df.append(df)

In [8]:
final_df.reset_index(drop=True, inplace=True)

# 4. Data Processing

In [9]:
def extract_integer(x):
    if not pd.isna(x):
        int_match = re.search("\d+\,*\d*", x)
        if int_match:
            match = int_match.group()
            if "," in match:
                output = match.split(",")[0] + match.split(",")[1]
                return int(output)
            else:
                return int(match)
    return np.nan

def extract_float(x):
    if not pd.isna(x):
        float_match = re.search("\-*\d+\.*\d", x)
        if float_match:
            return float(float_match.group())
    return np.nan

def extract_distance_in_km(x):
    if not pd.isna(x):
        if "km" in x:
            float_match = re.search("\d+\.*\d", x)
            if float_match:
                return float(float_match.group())
    return np.nan

dct_price_category = {
    "$": "cheap eats",
    "$$ - $$$": "mid range",
    "$$$$": "fine dining"
}

In [10]:
final_df['num_of_reviews'] = final_df['num_of_reviews'].apply(extract_integer)
final_df['overall_rating'] = final_df['overall_rating'].apply(extract_float)
final_df['food_rating'] = final_df['food_rating'].apply(lambda x: extract_float(x) / 10 if not pd.isna(x) else np.nan)
final_df['service_rating'] = final_df['service_rating'].apply(lambda x: extract_float(x) / 10 if not pd.isna(x) else np.nan)
final_df['value_rating'] = final_df['value_rating'].apply(lambda x: extract_float(x) / 10 if not pd.isna(x) else np.nan)
final_df['atmosphere_rating'] = final_df['atmosphere_rating'].apply(lambda x: extract_float(x) / 10 if not pd.isna(x) else np.nan)

final_df['price_category'] = final_df['price_category'].map(dct_price_category)
final_df['distance_from_attraction'] = final_df['distance_from_attraction'].apply(extract_distance_in_km)
final_df.drop(columns=["distance_from_attraction"], inplace=True)

In [12]:
final_df

Unnamed: 0,name,address,num_of_reviews,overall_rating,price_category,mon_operating_hours,tue_operating_hours,wed_operating_hours,thu_operating_hours,fri_operating_hours,...,price_range,list_of_meals,list_of_cuisines,list_of_features,list_of_special_diets,region,nearby_attraction,latitude,longitude,url
0,Alt Pizza,"60 Robertson Quay The Quayside 01-05, Singapor...",16,4.0,mid range,11:00 AM - 10:00 PM,11:00 AM - 10:00 PM,11:00 AM - 10:00 PM,11:00 AM - 10:00 PM,11:00 AM - 10:00 PM,...,[],"Lunch, Dinner, Drinks","Pizza, Italian","Delivery, Takeout, Reservations, Outdoor Seati...",[],Robertson Quay,Chinatown,1.29026,103.83919,https://www.tripadvisor.com.sg/Restaurant_Revi...
1,Entre-Nous creperie,"27 Seah Street # 01-01, Singapore 188383 Singa...",1498,5.0,mid range,,12:00 PM - 2:30 PM,12:00 PM - 2:30 PM,12:00 PM - 2:30 PM,12:00 PM - 2:30 PM,...,S$13 - S$27,"Lunch, Dinner, Drinks","French, European, Healthy","Takeout, Reservations, Outdoor Seating, Seatin...","Vegetarian Friendly, Vegan Options, Gluten Fre...",Central Area/City Area,City Hall Building,1.296215,103.85462,https://www.tripadvisor.com.sg/Restaurant_Revi...
2,The Courtyard,1 Fullerton Square Fullerton Hotel The Fullert...,1422,4.5,fine dining,09:00 AM - 6:00 PM,09:00 AM - 6:00 PM,09:00 AM - 6:00 PM,09:00 AM - 6:00 PM,09:00 AM - 6:00 PM,...,S$40 - S$67,"Drinks, Breakfast, Lunch, Brunch, Dinner, Afte...","Japanese, Indian, International","Valet Parking, Full Bar, Reservations, Buffet,...","Vegetarian Friendly, Gluten Free Options, Vega...",Central Area/City Area,Merlion Park,1.286365,103.85306,https://www.tripadvisor.com.sg/Restaurant_Revi...
3,Portman's Bar,6 Raffles Boulevard Marina Square Level 4 At P...,250,5.0,fine dining,10:30 AM - 10:30 PM,10:30 AM - 10:30 PM,10:30 AM - 10:30 PM,10:30 AM - 10:30 PM,10:30 AM - 10:30 PM,...,S$10 - S$200,"Breakfast, Lunch, Dinner, Brunch, After-hours,...","Bar, Cafe, International, Singaporean, Wine Bar","Reservations, Seating, Serves Alcohol, Full Ba...",Vegan Options,Marina Centre,Merlion Park,1.291641,103.85701,https://www.tripadvisor.com.sg/Restaurant_Revi...
4,Grand Shanghai Restaurant,"390 Havelock Road King's Centre, Singapore 169...",630,5.0,mid range,,11:30 AM - 2:30 PM,11:30 AM - 2:30 PM,11:30 AM - 2:30 PM,11:30 AM - 2:30 PM,...,[],"Lunch, Dinner, Brunch","Chinese, Asian","Delivery, Reservations, Private Dining, Seatin...","Vegetarian Friendly, Vegan Options, Gluten Fre...",Robertson Quay,Chinatown,1.289593,103.83567,https://www.tripadvisor.com.sg/Restaurant_Revi...
5,Jade,1 Fullerton Square Fullerton Hotel The Fullert...,1033,4.5,fine dining,11:30 AM - 3:00 PM,09:00 AM - 10:30 PM,11:30 AM - 3:00 PM,11:30 AM - 3:00 PM,11:30 AM - 3:00 PM,...,S$34 - S$134,"Lunch, Dinner, Brunch, Drinks","Chinese, Asian, Singaporean","Takeout, Highchairs Available, Wheelchair Acce...","Vegetarian Friendly, Vegan Options, Gluten Fre...",Central Area/City Area,Merlion Park,1.286447,103.85332,https://www.tripadvisor.com.sg/Restaurant_Revi...
6,Derwish-Turkish,"60 Bussorah Street, Singapore 199476 Singapore",227,3.5,mid range,11:00 AM - 10:00 PM,11:00 AM - 10:00 PM,11:00 AM - 10:00 PM,11:00 AM - 10:00 PM,11:00 AM - 11:30 PM,...,[],"Lunch, Dinner, After-hours","Mediterranean, Turkish, Middle Eastern","Takeout, Reservations, Outdoor Seating, Seatin...","Halal, Vegetarian Friendly, Vegan Options",Rochor,Little India,1.301037,103.85984,https://www.tripadvisor.com.sg/Restaurant_Revi...
7,Paulaner Bräuhaus Singapore,9 Raffles Boulevard Millenia Walk #01-01 Mille...,998,4.5,mid range,12:15 PM - 11:30 PM,12:15 PM - 11:30 PM,12:15 PM - 11:30 PM,12:15 PM - 11:30 PM,12:15 PM - 11:30 PM,...,S$13 - S$47,[],[],[],[],Marina Centre,Merlion Park,1.293155,103.85819,https://www.tripadvisor.com.sg/Restaurant_Revi...
8,Positano Risto,"66 Bussorah Street, Singapore 199479 Singapore",1675,5.0,mid range,11:30 AM - 9:30 PM,11:30 AM - 9:30 PM,11:30 AM - 9:30 PM,11:30 AM - 9:30 PM,11:30 AM - 10:30 PM,...,S$10 - S$40,"Lunch, Dinner, Brunch","Italian, Pizza, European, Sicilian, Southern-I...","Takeout, Reservations, Outdoor Seating, Seatin...","Vegetarian Friendly, Vegan Options, Halal, Glu...",Rochor,Little India,1.301108,103.85991,https://www.tripadvisor.com.sg/Restaurant_Revi...
9,Bar-Roque Grill,"165 Tanjong Pagar Road #01-00 Amara Singapore,...",669,4.5,mid range,12:00 PM - 2:30 PM,12:00 PM - 2:30 PM,12:00 PM - 2:30 PM,12:00 PM - 2:30 PM,12:00 PM - 2:30 PM,...,[],"Lunch, Dinner, After-hours","French, Steakhouse, Bar, European, Grill","Takeout, Reservations, Outdoor Seating, Seatin...","Vegetarian Friendly, Vegan Options, Gluten Fre...",Central Area/City Area,Chinatown,1.275679,103.84353,https://www.tripadvisor.com.sg/Restaurant_Revi...


In [13]:
# final_df.to_csv("data/restaurant_rating_data.csv", index=False, encoding="utf-8-sig")

In [11]:
driver.quit()