# Extracting Links of Restaurants to Scrape

## 1. Basic Setup

### 1.1 Loading of Libraries

In [1]:
import warnings
from time import sleep

import pandas as pd
from selenium import webdriver      # conda install -c conda-forge selenium
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

warnings.filterwarnings("ignore")

### 1.2 Initialising Chromedriver

In [2]:
# Browser configuration passed into webdriver.Chrome below. Add flags here
# (e.g. options.add_argument("--headless")) and they will take effect,
# because the object is actually handed to the driver.
options = Options()

# windows
# https://www.youtube.com/watch?v=Xjv1sY630Uc 
# PATH = r"C:\Program Files\chromedriver.exe"
# driver = webdriver.Chrome(PATH, options=options)

# macOS
# https://www.edureka.co/community/52315/how-to-setup-chrome-driver-with-selenium-on-macos
# need to put the `chromedriver` binary (no .exe on macOS) into /usr/local/bin
driver = webdriver.Chrome(options=options)

## 2. Scraping of Links

### 2.1 Accessing website to scrape

In [3]:
# Full listing of restaurants in Singapore on TripAdvisor; this is the page
# whose pagination is walked in the following cells.
RESTAURANT_LISTING_URL = "https://www.tripadvisor.com.sg/Restaurants-g294265-Singapore.html"

# open the listing page in the browser session
driver.get(RESTAURANT_LISTING_URL)

### 2.2 Extracting the page links of all the restaurants listed

#### 2.2.1 Extracting Links from Page 1

In [4]:
# 'a' tag elements that redirect to the specific page of a restaurant.
# Uses find_elements(By.CLASS_NAME, ...) because the older
# find_elements_by_class_name helper was removed in Selenium 4.3; this form
# works on both Selenium 3 and Selenium 4.
tags = driver.find_elements(By.CLASS_NAME, 'bHGqj')

# extract the href value of all these 'a' tag elements
links = [element.get_attribute("href") for element in tags]

#### 2.2.2 Navigating to subsequent pages

In [5]:
# seconds to wait after each click so the next page's items can finish loading
PAGE_LOAD_WAIT_SECONDS = 5

# 'next' button element of pagination that leads to the next page
# (find_element(By.CLASS_NAME, ...) replaces the find_element_by_class_name
# helper that was removed in Selenium 4.3)
next_page_button = driver.find_element(By.CLASS_NAME, "next")

while next_page_button.is_displayed():
    try:
        next_page_button.click()    # click on 'next' button
        sleep(PAGE_LOAD_WAIT_SECONDS)

        # --- extracting links --- #
        # Build the full list first so a failure part-way through a page
        # does not leave a partial page appended to `links`.
        new_tags = driver.find_elements(By.CLASS_NAME, 'bHGqj')
        new_links = [element.get_attribute("href") for element in new_tags]
        links.extend(new_links)

        # get 'next' button of current page
        next_page_button = driver.find_element(By.CLASS_NAME, "next")
    except WebDriverException as error:
        # Any Selenium failure (button missing, not clickable, stale element,
        # ...) ends pagination, as before. Unlike a bare `except:`, this does
        # not swallow KeyboardInterrupt, and the reason for stopping is shown.
        print(f"Stopped paginating: {type(error).__name__}")
        break

### 2.3 Exporting the Links into an Excel file

In [7]:
# single-column table holding every restaurant URL collected above
df_output = pd.DataFrame(links, columns=['links'])

In [11]:
# drop repeated URLs; rebinding the name keeps this cell safe to re-run
df_output = df_output.drop_duplicates()

In [18]:
# number of unique restaurant links remaining (row count)
df_output.shape[0]

11161

In [17]:
# Write the de-duplicated links to an Excel workbook (the data/ folder must exist).
# The `encoding` keyword is omitted: to_excel never used it, pandas 1.5
# deprecated it, and pandas 2.0 removed it (passing it raises TypeError).
df_output.to_excel("data/tripadvisor_restaurant_links.xlsx", index=False)