# Extract Restaurant Review
Notebook used to extract the TripAdvisor Restaurant Review data

## 1. Basic Setup

### 1.1 Loading of Libraries

In [1]:
import os
import re
import warnings
from time import sleep

import numpy as np
import pandas as pd
from selenium import webdriver      # conda install -c conda-forge selenium
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

warnings.filterwarnings("ignore")

## 2. Read in Dataset with Restaurant Information
- To generate the "Review Pages" to scrape

In [2]:
# Load the restaurant table, then keep a copy of the two columns used to build review page URLs
df_restaurant = pd.read_csv("data/full_restaurant_rating_data.csv")
df_restaurant_subset = df_restaurant.loc[:, ["url", "num_of_reviews"]].copy()

In [3]:
df_restaurant_subset

Unnamed: 0,url,num_of_reviews
0,https://www.tripadvisor.com.sg/Restaurant_Revi...,16
1,https://www.tripadvisor.com.sg/Restaurant_Revi...,1488
2,https://www.tripadvisor.com.sg/Restaurant_Revi...,1420
3,https://www.tripadvisor.com.sg/Restaurant_Revi...,249
4,https://www.tripadvisor.com.sg/Restaurant_Revi...,626
...,...,...
11156,https://www.tripadvisor.com.sg/Restaurant_Revi...,0
11157,https://www.tripadvisor.com.sg/Restaurant_Revi...,0
11158,https://www.tripadvisor.com.sg/Restaurant_Revi...,0
11159,https://www.tripadvisor.com.sg/Restaurant_Revi...,0


In [4]:
# Function that generates the list of "review pages" of a restaurant (given its restaurant page URL on TripAdvisor)

def get_list_of_reviews_url(url, num_of_reviews):
    """Build the URLs of all review pages of one restaurant.

    Every page after the first is formed by inserting an ``-or<offset>-`` token
    between the ``-Reviews`` marker of ``url`` and the text that follows it,
    with offsets 10, 20, ... (one page per 10 reviews).

    Parameters
    ----------
    url : str
        Restaurant page URL; it is always the first element of the result.
    num_of_reviews : int or float
        Number of reviews of the restaurant. Missing (NaN), non-numeric, zero
        or negative values are treated as "no extra pages".

    Returns
    -------
    list of str
        ``url`` followed by the URLs of the additional review pages. Only
        ``[url]`` is returned when ``url`` has no ``-Reviews-`` marker.
    """
    page_urls = [url]

    # Raw strings: "\-" inside a normal literal is an invalid escape sequence.
    front_matches = re.findall(r".*-Reviews", url)
    back_matches = re.findall(r"(?<=Reviews-).*", url)
    if not front_matches or not back_matches:
        # Not a paginated review URL; there is nothing to generate.
        return page_urls
    url_front, url_back = front_matches[0], back_matches[0]

    try:
        review_count = float(num_of_reviews)
    except (TypeError, ValueError):
        return page_urls
    if not np.isfinite(review_count) or review_count <= 0:
        # int(np.ceil(nan)) would raise; a missing count means first page only.
        return page_urls

    # Offset one past the last page, rounded up to a multiple of 10.
    max_review_page = int(np.ceil(review_count / 10) * 10)
    for offset in range(10, max_review_page, 10):
        page_urls.append(f"{url_front}-or{offset}-{url_back}")

    return page_urls

In [5]:
# Inclusive range of df_restaurant row labels whose review pages are collected
index_start, index_end = 2000, 2003

# Browser session used to visit the pages
driver = webdriver.Chrome()

# key: df_restaurant row label, value: list of review page URLs of that restaurant
dct_review_links = dict()

In [6]:
# For every selected restaurant, work out which of the generated review pages exist.
# NOTE(review): the check assumes TripAdvisor redirects a non-existent "-or<N>-" page
# to the first review page, so that driver.current_url equals list_of_urls[0] — confirm.
for idx, row in df_restaurant.loc[index_start:index_end, :].iterrows():
    list_of_urls = get_list_of_reviews_url(row["url"], row["num_of_reviews"])

    # Number of leading entries of list_of_urls that are kept.
    counter = len(list_of_urls)

    # Probe from the last page backwards and stop at the first page that exists.
    # Index 0 is the restaurant page itself: it is always kept, so it is never probed
    # (probing it would compare the URL with itself and drop the count to zero).
    for i in range(len(list_of_urls) - 1, 0, -1):
        driver.get(list_of_urls[i])
        if driver.current_url == list_of_urls[0]:
            counter -= 1
        else:
            break

    # Store the existing pages; storing only [row["url"]] would throw away the probe result.
    dct_review_links[idx] = list_of_urls[:counter]
    print(f"Done {idx} of {index_end}")

Done 2000 of 2003
Done 2001 of 2003
Done 2002 of 2003
Done 2003 of 2003


In [7]:
# Flatten dct_review_links into one row per (restaurant, review page) pair.
# The records are collected first and the frame is built once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
link_records = [
    {"restaurant_url": df_restaurant.loc[key, "url"], "review_url": review_url}
    for key, values in dct_review_links.items()
    for review_url in values
]

# Passing the columns explicitly keeps both columns present even when nothing was collected;
# the resulting index is already 0..n-1, so no reset_index is needed.
df_links = pd.DataFrame(link_records, columns=["restaurant_url", "review_url"])

In [8]:
df_links

Unnamed: 0,restaurant_url,review_url
0,https://www.tripadvisor.com.sg/Restaurant_Revi...,https://www.tripadvisor.com.sg/Restaurant_Revi...
1,https://www.tripadvisor.com.sg/Restaurant_Revi...,https://www.tripadvisor.com.sg/Restaurant_Revi...
2,https://www.tripadvisor.com.sg/Restaurant_Revi...,https://www.tripadvisor.com.sg/Restaurant_Revi...
3,https://www.tripadvisor.com.sg/Restaurant_Revi...,https://www.tripadvisor.com.sg/Restaurant_Revi...


## 3. Scraping of Restaurant Reviews

In [9]:
# Function to scrape review information from a page
def scrape_page_review(df, restaurant_url):
    """Scrape the reviews on the page currently loaded in the global ``driver``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews collected so far. It is not modified.
    restaurant_url : str
        Value written to the ``url`` column of every scraped row.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per scraped review, with the columns
        ``rating``, ``date``, ``title``, ``description``, ``date_of_visit``,
        ``img_src`` and ``url``. The index is renumbered from 0. ``df`` itself
        is returned when no review was scraped.

    Notes
    -----
    Scraping stops at the first review container without a rating element;
    the rows gathered before it are still returned. Errors from the mandatory
    lookups (title, date, description, date of visit) propagate to the caller.
    """
    review_container = driver.find_elements(By.CLASS_NAME, "review-container")

    # Visit dates are read for the whole page before any "More" link is clicked.
    # NOTE(review): presumably because the click can invalidate element references — confirm.
    date_of_visit_list = [
        e.find_element(By.CLASS_NAME, "prw_reviews_stay_date_hsx").text
        for e in review_container
    ]

    scraped_rows = []
    for i, container in enumerate(review_container):
        # ----------------------------- Review Title ----------------------------- #
        review_title = container.find_element(By.CLASS_NAME, "quote").text

        # ----------------------------- Review Date ----------------------------- #
        review_date = container.find_element(By.CLASS_NAME, "ratingDate").get_attribute("title")

        # ----------------------------- Review Rating (out of 5) ----------------------------- #
        try:
            review_rating = container.find_element(By.CLASS_NAME, "ui_bubble_rating").get_attribute("class")
        except Exception:
            # A container without a rating ends the scrape of this page.
            break

        # ----------------------------- Review Description ----------------------------- #
        review_description = (
            container.find_element(By.CLASS_NAME, "partial_entry")
            .get_attribute("textContent")
            .replace("...More", " ")
        )

        # ----------------------------- More Review Description ----------------------------- #
        # The old expression `review_description.replace + more_description` added a bound
        # method to a string, so it always raised TypeError and the snippet was never appended.
        # The snippet is read as textContent (like the description) so no markup is injected.
        try:
            more_description = (
                driver.find_elements(By.CLASS_NAME, "review-container")[i]
                .find_element(By.CLASS_NAME, "postSnippet")
                .get_attribute("textContent")
            )
        except Exception:
            more_description = ""
        # Skip the append when the description already contains the snippet text,
        # which happens if the snippet element is nested inside "partial_entry".
        if more_description and more_description not in review_description:
            review_description += more_description

        # Best effort: click the "ulBlueLinks" element of this review if there is one.
        try:
            driver.find_elements(By.CLASS_NAME, "review-container")[i].find_element(By.CLASS_NAME, "ulBlueLinks").click()
        except Exception:
            pass

        # ----------------------------- Date of visit to restaurant ----------------------------- #
        review_date_of_visit = date_of_visit_list[i]

        # ----------------------------- Image uploaded ----------------------------- #
        try:
            review_img_src = [
                element.find_element(By.TAG_NAME, "img").get_attribute("src")
                for element in container.find_elements(By.CLASS_NAME, "imgWrap")
            ]
        except Exception:
            review_img_src = []

        scraped_rows.append({
            "rating": review_rating,
            "date": review_date,
            "title": review_title,
            "description": review_description,
            "date_of_visit": review_date_of_visit,
            "img_src": review_img_src,
            "url": restaurant_url,
        })

    if not scraped_rows:
        return df

    # Build the new rows once and concatenate once (DataFrame.append was removed in pandas 2.0).
    new_rows = pd.DataFrame(
        scraped_rows,
        columns=["rating", "date", "title", "description", "date_of_visit", "img_src", "url"],
    )
    return pd.concat([df, new_rows], ignore_index=True)

In [10]:
# Empty DataFrame that will hold the scraped reviews
df_scraped = pd.DataFrame()

In [11]:
# Row label of df_links at which scraping begins
start = 0

# Upper bound of the batch loop below, which iterates range(1, loop):
# it runs loop - 1 batches of 10 review pages each (a single batch for loop = 2)
loop = 2

In [12]:
# Scrape df_links in batches of 10 review pages. Every batch uses its own browser
# session and is written to its own CSV file in the "review" directory.
os.makedirs("review", exist_ok=True)    # to_csv does not create a missing directory

for i in range(1, loop):
    # Separate names keep the crawl range in index_start / index_end untouched.
    batch_start = start + ((i - 1) * 10)
    batch_end = start + (i * 10) - 1

    batch_links = df_links.loc[batch_start:batch_end, :]
    if batch_links.empty:
        # Nothing to scrape: do not open a browser or write a file whose name
        # would rely on an undefined (or stale) idx.
        continue

    # scrape_page_review reads this global, so the name must stay `driver`.
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        df_scraped = pd.DataFrame()
        for idx, row in batch_links.iterrows():
            driver.get(row["review_url"])
            sleep(10)        # shorter sleep time faster but easier to break from StaleElementReferenceException
            attempts = 0
            done = False
            while (attempts < 5) and (not done):
                try:
                    df_scraped = scrape_page_review(df_scraped, row["restaurant_url"])
                    print(f"Done {idx} of {batch_end}")
                    done = True
                except Exception:
                    # `Exception` rather than a bare except, so Ctrl-C still interrupts the run.
                    attempts += 1
            if not done:
                # Make a skipped page visible instead of dropping it silently.
                print(f"Failed {idx} of {batch_end} after {attempts} attempts")

        # scraped file goes into review directory
        df_scraped.to_csv(f"review/reviews_from_{batch_start}_to_{idx}.csv", encoding="utf-8-sig", index=False)
    finally:
        # Close the browser even when the batch fails part-way.
        driver.quit()

Done 0 of 9
Done 1 of 9
Done 2 of 9
Done 3 of 9


In [14]:
# Renumber the rows from 0 and display the scraped reviews
df_scraped = df_scraped.reset_index(drop=True)
df_scraped

Unnamed: 0,rating,date,title,description,date_of_visit,img_src,url
0,ui_bubble_rating bubble_50,16 February 2022,Best tempura experience in Singapore,Awesome place to enjoy other type of Japanese ...,Date of visit: February 2022,"[data:image/gif;base64,R0lGODlhAQABAAAAACH5BAE...",https://www.tripadvisor.com.sg/Restaurant_Revi...
1,ui_bubble_rating bubble_50,15 January 2022,Best tempura in Singapore,"The batter is thin and crispy, so we can enjoy...",Date of visit: December 2021,"[data:image/gif;base64,R0lGODlhAQABAAAAACH5BAE...",https://www.tripadvisor.com.sg/Restaurant_Revi...
2,ui_bubble_rating bubble_50,23 September 2021,Repeat customer,"Chloe's service is nothing short of excellent,...",Date of visit: September 2021,[],https://www.tripadvisor.com.sg/Restaurant_Revi...
3,ui_bubble_rating bubble_30,20 December 2020,Room for improvement,Had a nice counter seat. Ordered SGD 85 set of...,Date of visit: December 2020,[https://media-cdn.tripadvisor.com/media/photo...,https://www.tripadvisor.com.sg/Restaurant_Revi...
4,ui_bubble_rating bubble_40,3 November 2020,Tempura overload,My hubby and I went there for dinner one Sat n...,Date of visit: October 2020,[https://media-cdn.tripadvisor.com/media/photo...,https://www.tripadvisor.com.sg/Restaurant_Revi...
5,ui_bubble_rating bubble_10,30 October 2020,rubbish food,honestly might get a sore throat--tempura over...,Date of visit: October 2020,[],https://www.tripadvisor.com.sg/Restaurant_Revi...
6,ui_bubble_rating bubble_10,22 August 2020,Bad value,We didn't get to sit at the counter and the ou...,Date of visit: August 2020,[],https://www.tripadvisor.com.sg/Restaurant_Revi...
7,ui_bubble_rating bubble_50,24 June 2020,We had the tsubaki set and,We had the tsubaki set and the chef was Execut...,Date of visit: June 2020,[],https://www.tripadvisor.com.sg/Restaurant_Revi...
8,ui_bubble_rating bubble_50,20 February 2020,Fantastic tempura by the master himself,We were invited to lunch here today by some fr...,Date of visit: February 2020,"[data:image/gif;base64,R0lGODlhAQABAAAAACH5BAE...",https://www.tripadvisor.com.sg/Restaurant_Revi...
9,ui_bubble_rating bubble_50,2 December 2019,"Great food, attentive service",Had the set menu. Sit down dinner with friends...,Date of visit: November 2019,[],https://www.tripadvisor.com.sg/Restaurant_Revi...
