<h1> <b> Scripts for Extracting Restaurant Data from Source </b> </h1>
<i> Source: <a href="https://www.google.com/maps">https://www.google.com/maps</a> </i>

In [1]:
from selenium.webdriver import ActionChains
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import numpy as np
import os
import sys
import pandas as pd

from time import sleep


<br>
<h2> <b> Function and Class Definitions </b> </h2>

In [80]:
class Category:
    """Pairs a category's button object with the category's name."""

    def __init__(self, button, name):
        # plain value holder: both arguments are stored unchanged
        self.button, self.name = button, name

class Review:
    """One review read from a review card element.

    The constructor reads three values from ``review_card`` and stores them
    as ``reviewee``, ``rating`` and ``comment``. ``review_card`` must expose
    Selenium's ``get_attribute`` and ``find_element``.
    """

    def __init__(self, review_card):
        # NOTE(review): the card's aria-label is presumably the name of the
        # person who wrote the review -- confirm against the page markup.
        self.reviewee = self.get_reviewee_name(review_card)
        self.rating = self.get_rating(review_card)
        self.comment = self.get_comment(review_card)

    def get_reviewee_name(self, review_card):
        """Return the ``aria-label`` attribute of the review card."""
        return review_card.get_attribute('aria-label')

    def get_rating(self, review_card):
        """Return the ``aria-label`` of the card's star element (raw text, not a number)."""
        selector = ('div.section-review-line div.section-'
                    + 'review-metadata span.section-review-stars')
        stars = review_card.find_element(By.CSS_SELECTOR, selector)
        return stars.get_attribute('aria-label')

    def get_comment(self, review_card):
        """Return the visible text of the card's review-text element."""
        selector = ('div.section-review-content div.section-review-line '
                    + 'div.section-review-review-content '
                    + 'span.section-review-text')
        return review_card.find_element(By.CSS_SELECTOR, selector).text

    def id(self):
        """Return the review as a plain dict (keys: reviewee name, rating, comment)."""
        return {
            'reviewee name': self.reviewee,
            'rating': self.rating,
            'comment': self.comment
        }

class Location:
    """Address of a place split into ``block``, ``city`` and ``province``.

    The address text is read from the element matched by ``selector`` and is
    split on ``', '``. The last piece is treated as the province (optionally
    prefixed by a zip code), the one before it as the city, and everything
    earlier as the block, e.g.
    ``'Timog Ave, Diliman, Quezon City, 1103 Metro Manila'``.
    """

    def __init__(self, driver, selector):
        # `driver` only needs `find_element`; Restaurant passes a container
        # element here rather than the browser driver itself.
        self.block = None  # street-level part of the address
        self.city = None
        self.province = None
        self.get_location(driver, selector)

    def get_location(self, driver, selector) -> None:
        """Read the address text and fill ``block``, ``city`` and ``province``.

        Raises whatever ``driver.find_element`` raises when nothing matches
        ``selector``.
        """
        location_str = driver.find_element(By.CSS_SELECTOR, selector).text
        location_list = location_str.split(', ')
        self.set_location(self.process_location_list(location_list))

    def process_location_list(self, location_list: list) -> list:
        """Turn the split address into ``[block, city, province]``.

        ``city`` is ``None`` when fewer than two pieces are available and
        ``province`` is ``None`` when the list is empty.
        """
        # province and city sit at indexes -1 and -2; the rest is the block
        block = self.join_blocks(location_list[:-2])
        city = location_list[-2] if len(location_list) >= 2 else None
        province = self.get_Province(location_list[-1]) if location_list else None

        return [block, city, province]

    def set_location(self, processed_location_list: list):
        """Store a ``[block, city, province]`` list on the instance."""
        self.block, self.city, self.province = processed_location_list[:3]

    def get_Province(self, zip_code_province: str) -> 'str | None':
        """Return the province name from ``'<province>'`` or ``'<zip> <province>'``.

        Text that already starts with a letter is returned unchanged.
        Otherwise only letters and spaces are kept and the surrounding
        whitespace is stripped, so ``'1605 Metro Manila'`` gives
        ``'Metro Manila'``. Empty or non-string input gives ``None``.
        """
        if not isinstance(zip_code_province, str) or not zip_code_province:
            return None

        if zip_code_province[0].isalpha():
            # no zip-code prefix: return as is
            return zip_code_province

        kept = ''.join(ch for ch in zip_code_province
                       if ch.isalpha() or ch == ' ')
        # strip instead of dropping the first character blindly, so a zip
        # code glued to the name ('6000Cebu') does not lose its first letter
        return kept.strip()

    def join_blocks(self, blocks_list: list) -> str:
        """Join block pieces with ``', '``: ``['2F', 'Uptown Parade']`` -> ``'2F, Uptown Parade'``."""
        return ', '.join(blocks_list)

class Restaurant:
    """Details of one restaurant read from its detail panel.

    ``container`` must expose Selenium's ``find_element`` and
    ``find_elements``. Each ``set_*`` method reads one piece of information
    from it and stores it on the instance: ``name``, ``review_count``,
    ``rating``, ``price_range``, ``specialty``, ``block``, ``city``,
    ``province`` and ``services``.
    """

    def __init__(self, container):
        self.set_Name(container)
        self.set_ReviewCount(container)
        self.set_Rating(container)
        self.set_PriceRange(container)
        self.set_Specialty(container)
        self.set_Location(container)
        self.set_Services(container)

    def set_Name(self, container):
        """Store the text of the panel's title element in ``self.name``."""
        self.name = container.find_element(
            By.CSS_SELECTOR, '.fYOrjf .Hhmu2e .Ftghae .SPZz6b h2.qrShPb span').text

    def set_ReviewCount(self, container):
        """Store the review count as an ``int`` in ``self.review_count``.

        The source text looks like ``'203 reviews'``; only the decimal digits
        are kept. Raises ``ValueError`` when the text contains no digit.
        """
        review_count_str = container.find_element(
            By.CSS_SELECTOR, 'span.hqzQac span a span').text

        # isdecimal() keeps exactly the characters int() can parse
        digits = ''.join(ch for ch in review_count_str if ch.isdecimal())
        self.review_count = int(digits)

    def set_Rating(self, container):
        """Store the rating as a ``float`` in ``self.rating``."""
        self.rating = float(
            container.find_element(By.CSS_SELECTOR, 'span.Aq14fc').text)

    def set_PriceRange(self, container):
        """Store the price range text, or ``None`` when it is not shown.

        The same ``span.YhemCb`` elements carry price and specialty. The
        first one is taken as the price only when at least two are present.
        """
        tags = container.find_elements(By.CSS_SELECTOR, 'span.YhemCb')
        self.price_range = tags[0].text if len(tags) >= 2 else None

    def set_Specialty(self, container):
        """Store the specialty text, or ``None`` when no tag is present.

        With two or more ``span.YhemCb`` tags the second one is the
        specialty. A single tag is taken as the specialty, mirroring
        ``set_PriceRange``, which reports no price in that case.
        """
        tags = container.find_elements(By.CSS_SELECTOR, 'span.YhemCb')

        if len(tags) >= 2:
            specialty = tags[1].text
        elif tags:
            # the lone tag is the specialty; store its text, not the element
            specialty = tags[0].text
        else:
            specialty = None

        self.specialty = specialty

    def set_Location(self, container):
        """Parse the address element and store ``block``, ``city`` and ``province``."""
        location = Location(container, 'span.LrzXr')

        self.block = location.block
        self.city = location.city
        self.province = location.province

    def set_Services(self, container):
        """Store the ``aria-label`` of every service entry, or ``['N/A']`` if none."""
        service_elements = container.find_elements(
            By.CSS_SELECTOR, 'div.ElGe3c ul.REGfue li.A4D4f span[role="text"]')

        if service_elements:
            services = [service.get_attribute('aria-label')
                        for service in service_elements]
        else:
            # keep an explicit marker rather than an empty list
            services = ['N/A']

        self.services = services

        
    

def init_base(path=r'chromedriver\chromedriver.exe', timeout=10):
    """Start a Chrome session and build the explicit-wait helper for it.

    Args:
        path: location of the chromedriver executable. The default is the
            same relative Windows path as before, now written as a raw
            string so the backslash is not read as an escape sequence.
        timeout: seconds the returned ``WebDriverWait`` waits before giving up.

    Returns:
        ``(driver, wait)``: the ``Chrome`` instance and its ``WebDriverWait``.
    """
    # NOTE(review): passing the driver path positionally is the older
    # Selenium calling convention; newer releases expect a Service object.
    # Confirm against the installed Selenium version.
    driver = Chrome(path)
    wait = WebDriverWait(driver, timeout)

    return driver, wait

def search(site_name, driver):
    """Type ``site_name`` into the search box and click the first suggestion.

    Args:
        site_name: text to type into the element with id ``searchboxinput``.
        driver: object exposing Selenium's ``find_element``.
    """
    driver.find_element(By.ID, 'searchboxinput').send_keys(site_name)

    # fixed pause before looking for the suggestion list
    # (presumably to let the suggestions load)
    sleep(3)

    first_suggestion_selector = 'div.sbdd_b div.suggestions div.sbsb_b .sbsb_c'
    driver.find_element(By.CSS_SELECTOR, first_suggestion_selector).click()

    
def get_SearchedData(driver, index=0, increment=5):
    """Collect up to 15 reviews for one batch of restaurants.

    Restaurant names are taken from
    ``restaurants_data['Restaurant Name'][index:index + increment]``. Each
    name is searched for, its review list is opened and read, and the search
    box is cleared again. Results are committed to
    ``restaurants_data['Reviews']`` only after the whole batch finished, so a
    failure mid-batch leaves the stored data untouched.

    Relies on the module-level ``restaurants_data`` dict and ``wait`` object.

    Args:
        driver: Selenium driver positioned on the Google Maps tab.
        index: position of the first restaurant of the batch. Previously the
            global ``i`` was read instead and this argument was ignored.
        increment: number of restaurants in the batch.
    """
    reviews_collection = []
    batch_names = restaurants_data['Restaurant Name'][index:index + increment]

    for name in batch_names:
        search(name, driver)

        try:
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.mapsConsumerUiSubviewSectionGm2Listitem__primary-text')))
        except Exception:
            # the place panel never showed up: record "no reviews" and move
            # on (Exception, not a bare except, so Ctrl-C still interrupts)
            reviews = None
        else:
            # open the full review list
            driver.find_element(By.CSS_SELECTOR, 'button[jsaction = "pane.rating.moreReviews"]').click()
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.section-layout')))
            sleep(3)

            # NOTE(review): the fifth '.section-layout' element is taken as
            # the review list -- confirm against the page layout.
            reviews_container = driver.find_elements(By.CSS_SELECTOR, '.section-layout')[4]
            review_cards = reviews_container.find_elements(By.CSS_SELECTOR, '.section-review')[:15]
            reviews = [Review(review_card).id() for review_card in review_cards]

            back_button = driver.find_element(By.CSS_SELECTOR, 'button.mdc-icon-button')
            back_button.click()

            # wait until x button is visible
            wait.until(EC.visibility_of_element_located((By.ID, 'sb_cb50')))

        # click x button, this will clear the search bar
        x_button = driver.find_element(By.ID, 'sb_cb50')
        x_button.click()

        # stage the result; it is committed only once the whole batch is done
        reviews_collection.append(reviews)

    # add the data
    restaurants_data['Reviews'].extend(reviews_collection)

def extract_MainData(driver):
    """Read the first five result cards and add them to ``restaurants_data``.

    Every card is clicked, the detail panel that opens is wrapped in a
    ``Restaurant``, and the values are staged locally. The module-level
    ``restaurants_data`` dict is extended only after all cards were read, so
    a failure on any card adds nothing.
    """
    results_root = driver.find_element(By.ID, 'search')
    result_links = results_root.find_elements(
        By.CSS_SELECTOR, 'div div div.uMdZh div.VkpGBb a.C8TUKc ')[:5]

    scraped = []
    for link in result_links:
        link.click()
        sleep(2)
        panel = driver.find_element(By.CLASS_NAME, 'immersive-container')
        scraped.append(Restaurant(panel))

    # (column in restaurants_data, attribute on Restaurant), in commit order
    column_to_attribute = (
        ('Restaurant Name', 'name'),
        ('Review Count', 'review_count'),
        ('Rating', 'rating'),
        ('Price Range', 'price_range'),
        ('Specialty', 'specialty'),
        ('Block', 'block'),
        ('City', 'city'),
        ('Province', 'province'),
        ('Services', 'services'),
    )

    # build every column first, then commit, keeping the all-or-nothing update
    staged = [
        (column, [getattr(restaurant, attribute) for restaurant in scraped])
        for column, attribute in column_to_attribute
    ]
    for column, values in staged:
        restaurants_data[column].extend(values)

<br>
<h2> <b> Data Extraction </b> </h2>

In [33]:
# Search-result URLs keyed by area name; the batch cells below look them up by key.
# Every URL carries the query `restaurant+philippines` and differs only in the
# `mv:[[...],[...],null,[...],10]` part of the fragment, which appears to hold the
# map bounds of the area (assumption -- confirm), so each one lists the restaurants
# found within that area.
url = {
        'Metro Manila': ('https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=ALeKk02YqNfecn9rz1E733Bx04XaAtB4Dw:1619154167029&q'
                          + '=restaurant+philippines&rflfq=1&num=10&sa=X&ved=2ahUKEwjng9mny5PwAhUbPXAKHcHMBqUQjGp6BAgCEFc&biw=1920&bih=979#r'
                          + 'lfi=hd:;si:;mv:[[15.112006154864638,122.34085035956056],[14.016796308805139,120.28091383612306],null,[14.565081'
                          + '16626156,121.31088209784181],10]'),
    
        'Northern Philippines': ('https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=ALeKk02YqNfecn9rz1E733Bx04XaAtB4Dw:1619154167029&q=r'
                                + 'estaurant+philippines&rflfq=1&num=10&sa=X&ved=2ahUKEwjng9mny5PwAhUbPXAKHcHMBqUQjGp6BAgCEFc&biw=1920&bih=979#rlfi'
                                + '=hd:;si:;mv:[[18.535119569365957,121.88405795806774],[17.458912768825044,119.82412143463024],null,[17.9978370778'
                                + '60638,120.85408969634899],10]'),
        'Bicol': ('https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=ALeKk02YqNfecn9rz1E733Bx04XaAtB4Dw:1619154167029&q=r'
                  + 'estaurant+philippines&rflfq=1&num=10&sa=X&ved=2ahUKEwjng9mny5PwAhUbPXAKHcHMBqUQjGp6BAgCEFc&biw=1920&bih=979#rlfi'
                  + '=hd:;si:;mv:[[14.091679844838067,124.93425390369251],[12.991565071892175,122.87431738025501],null,[13.5422584073'
                  + '20224,123.90428564197376],10]'),
        'Metro Cebu': ('https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=ALeKk02YqNfecn9rz1E733Bx04XaAtB4Dw:1619154167029&q=r'
                       + 'estaurant+philippines&rflfq=1&num=10&sa=X&ved=2ahUKEwjng9mny5PwAhUbPXAKHcHMBqUQjGp6BAgCEFc&biw=1920&bih=979#rlfi'
                       + '=hd:;si:;mv:[[10.8276251696008,125.06000392446218],[9.71418500248894,123.00006740102468],null,[10.27139521663097'
                       + ',124.03003566274343],10]'),
        'Metro Davao': ('https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=ALeKk02YqNfecn9rz1E733Bx04XaAtB4Dw:1619154167029&q=r'
                        + 'estaurant+philippines&rflfq=1&num=10&sa=X&ved=2ahUKEwjng9mny5PwAhUbPXAKHcHMBqUQjGp6BAgCEFc&biw=1920&bih=979#rlfi'
                        + '=hd:;si:;mv:[[7.700092642732062,126.74669230938842],[6.577291200198587,124.68675578595092],null,[7.1390364001018'
                        + '26,125.71672404766967],10]')
    }

In [135]:
# Column-oriented store for the scraped data: one independent, initially empty
# list per column, filled by extract_MainData and get_SearchedData.
restaurants_data = {
    column: []
    for column in (
        'Restaurant Name',
        'Block',
        'City',
        'Province',
        'Review Count',
        'Rating',
        'Price Range',
        'Specialty',
        'Services',
        'Reviews',
    )
}

In [136]:
# Start the browser session. `wait` must keep this exact name:
# get_SearchedData reads it as a global instead of taking it as an argument.
driver, wait = init_base()

In [137]:
# Load a page in the initial window; the batch cells below open their own windows.
driver.get('https://www.google.com/')

<br>
<p>Run the extraction in batches.</p>
<p>The extraction is done in batches so that a failed batch can be re-run on its own instead of restarting the whole extraction process. Keeping each batch small also helps because Selenium has limits on the total amount of data it can handle in one go.</p>

In [138]:
# batch 1: Metro Manila
# window.open only creates the new window; the driver has to be switched to it
# explicitly. Taking the most recently listed handle (instead of a hard-coded
# position) keeps the cell working when it is re-run or when extra windows exist.
driver.execute_script("window.open('{0}')".format(url['Metro Manila']))
driver.switch_to.window(driver.window_handles[-1])
extract_MainData(driver)

In [140]:
# batch 2: Northern Philippines
# window.open only creates the new window; the driver has to be switched to it
# explicitly. Taking the most recently listed handle (instead of a hard-coded
# position) keeps the cell working when it is re-run or when extra windows exist.
driver.execute_script("window.open('{0}')".format(url['Northern Philippines']))
driver.switch_to.window(driver.window_handles[-1])
extract_MainData(driver)

In [141]:
# batch 3: Bicol
# window.open only creates the new window; the driver has to be switched to it
# explicitly. Taking the most recently listed handle (instead of a hard-coded
# position) keeps the cell working when it is re-run or when extra windows exist.
driver.execute_script("window.open('{0}')".format(url['Bicol']))
driver.switch_to.window(driver.window_handles[-1])
extract_MainData(driver)

In [142]:
# batch 4: Metro Cebu
# window.open only creates the new window; the driver has to be switched to it
# explicitly. Taking the most recently listed handle (instead of a hard-coded
# position) keeps the cell working when it is re-run or when extra windows exist.
driver.execute_script("window.open('{0}')".format(url['Metro Cebu']))
driver.switch_to.window(driver.window_handles[-1])
extract_MainData(driver)

In [143]:
# batch 5: Metro Davao
# window.open only creates the new window; the driver has to be switched to it
# explicitly. Taking the most recently listed handle (instead of a hard-coded
# position) keeps the cell working when it is re-run or when extra windows exist.
driver.execute_script("window.open('{0}')".format(url['Metro Davao']))
driver.switch_to.window(driver.window_handles[-1])
extract_MainData(driver)

In [160]:
# Manual correction: overwrite the scraped name of the restaurant at position 5.
# NOTE(review): the position is hard-coded, so this only hits the intended row
# when the batches above ran once and in order -- presumably the new name is the
# one the review search below needs to find the place; confirm.
restaurants_data['Restaurant Name'][5] = "Gigiligan's Ayala"

<br>
<p>Extract the reviews by searching for each restaurant by name.</p>

In [146]:
# Open Google Maps in a new window and move the driver to it.
# On a clean run there are 7 windows at this point (the initial one, five batch
# windows and this one), i.e. positions 0-6, so the previous hard-coded
# position 7 only worked with a leftover window from an earlier run. The most
# recently listed handle is used instead.
driver.execute_script("window.open('https://www.google.com/maps/@11.6978351,122.6217542,6z')")
driver.switch_to.window(driver.window_handles[-1])

In [169]:
# `i` is the position of the first restaurant name handled by the next review batch
i = 0

# batch 1: names at positions i .. i+4 (get_SearchedData's default increment is 5)
get_SearchedData(driver, i)
i+=5  # move on to the next five names

In [170]:
# batch 2: the next five names, starting at the `i` left by the previous cell
# (the review cells must be run once each, in order)
get_SearchedData(driver, i)
i+=5

In [171]:
# batch 3: the next five names, starting at the `i` left by the previous cell
# (the review cells must be run once each, in order)
get_SearchedData(driver, i)
i+=5

In [172]:
# batch 4: the next five names, starting at the `i` left by the previous cell
# (the review cells must be run once each, in order)
get_SearchedData(driver, i)
i+=5

In [174]:
# batch 5: the last five names; `i` is not advanced afterwards
get_SearchedData(driver, i)

In [178]:
# Build the table from the column lists. Every list in restaurants_data must have
# the same length here, i.e. all main-data and review batches must have completed.
restaurants_data_df = pd.DataFrame(restaurants_data)

In [181]:
# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs('data', exist_ok=True)
restaurants_data_df.to_csv('data/Restaurant.csv', index=False)

In [182]:
# Sanity check: reload the exported file and preview the first rows.
# Note: after the CSV round trip the 'Services' and 'Reviews' cells are plain
# strings (the text form of the original lists), not Python lists.
df = pd.read_csv('data/Restaurant.csv')
df.head()

Unnamed: 0,Restaurant Name,Block,City,Province,Review Count,Rating,Price Range,Specialty,Services,Reviews
0,Cafe Juanita,19 W Capitol Dr,Pasig,Metro Manila,383,4.5,₱₱,Filipino restaurant,"['Serves dine-in', 'Offers takeaway', 'Offers ...","[{'reviewee name': 'May Sajonas Martin', 'rati..."
1,Manam Comfort Filipino,"Central Bus. Dist. Unit 105 G/F Greenbelt 2, E...",Makati,Metro Manila,701,4.4,₱₱,Filipino restaurant,"['Serves dine-in', 'Offers takeaway']","[{'reviewee name': 'Mike Carmona', 'rating': '..."
2,Toyo Eatery,2316 Chino Roces Ave,Makati,Metro Manila,320,4.5,,,"['Serves dine-in', 'Offers takeaway', 'No deli...","[{'reviewee name': 'Catherina Barcel', 'rating..."
3,"SALU, the Filipino Restaurant","UAP Bldg, 53 Sct. Rallos St, Diliman",Quezon City,Metro Manila,171,4.2,₱₱,Filipino restaurant,"['Serves dine-in', 'Offers takeaway', 'No deli...","[{'reviewee name': 'May Sajonas Martin', 'rati..."
4,Gallery By Chele,"5/F Clipp Center, 11th Avenue corner, 39th St",Taguig,Metro Manila,125,4.8,,,"['Serves dine-in', 'Offers curbside pickup', '...","[{'reviewee name': 'S Chan', 'rating': ' 5 sta..."
