In [20]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time

In [23]:
class Tareviews:
    
    def __init__(self, headless=False, max_ranking=30):
        
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--incognito')
        options.add_argument('--start-maximized')
        
        if headless:
            options.add_argument('--headless')
        
        self.MAX_RANKING = max_ranking
            
        self.attractions = []

        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)
        
    def scrape_top_attraction(self, attr_item):
        
        attraction = defaultdict()
        
        # try to find attraction ranking (make it integer)
        try:
            pos = int(attr_item.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__item_position--")]').text.strip())
        except:
            return (None, None)
                
        info = attr_item.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__info--")]')

        for tag in info.find_elements_by_xpath('.//span[contains(@class, "attractions-commerce-CategoryTag__category_tag--")]'):
            if tag.text.strip():
                attraction['category'] = tag.text.lower().strip()
          
        try:
            a_with_name = info.find_element_by_xpath('.//a[contains(@class, "attractions-attraction-overview-main-TopPOIs__name--")]')
            attraction['name'] = a_with_name.text.strip().lower()
            attraction['review_url'] = a_with_name.get_attribute('href')
            attraction['id'] = re.search(r'd\d+', attraction['review_url']).group(0)
        except:
            pass
          
        try:
            rating_div = info.find_element_by_xpath('.//div[@class="ui_poi_review_rating"]')
            review_counts = rating_div.text.strip().lower()
            attraction['review_counts'] = int(re.search(r'\d+\,*\d*', review_counts).group(0).replace(',',''))
            rating_span = rating_div.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]')
            attraction['bubble_rating'] = int(re.search(r'(?<=bubble_)\d+', rating_span.get_attribute('class')).group(0))/10
        except:
            pass
        
        return (pos, attraction)
    
    def scrape_normal_attraction(self, attr_item):
        
        attraction = defaultdict()
                
        tag = attr_item.find_element_by_xpath('.//div[@class="tag_line"]')

        attraction['category'] = tag.text.lower().strip()
          
        try:
            a_with_name = attr_item.find_element_by_xpath('.//div[@class="listing_title"]/a')
            attraction['name'] = a_with_name.text.strip().lower()
            attraction['review_url'] = a_with_name.get_attribute('href')
            attraction['id'] = re.search(r'd\d+', attraction['review_url']).group(0)
        except:
            pass
          
        try:
            rating_div = attr_item.find_element_by_xpath('.//div[@class="listing_rating"]')
            review_counts = rating_div.text.strip().lower()
            attraction['review_counts'] = int(re.search(r'\d+\,*\d*', review_counts).group(0).replace(',',''))
            rating_span = rating_div.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]')
            attraction['bubble_rating'] = int(re.search(r'(?<=bubble_)\d+', rating_span.get_attribute('class')).group(0))/10
        except:
            pass
        
        return attraction
    
    def check_pagination_buttons(self, pagination_wrapper):
        
        previous_button = selected_button = last_page = next_button = None
        
        # previous button
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__disabled--")]'):
            _text = _.text.lower().strip()
            if _text == 'previous':
                previous_button = _ 
                break
                
        # selected button
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__selected--")]'):
            _text = _.text.lower().strip()
            if _text.isdigit():
                selected_button = int(_text)
                break
                
        # last page button
        visible_page_numbers = []
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__link--")]'):
            _text = _.text.lower().strip()
            if _text.isdigit():
                visible_page_numbers.append(int(_text))

        last_page = max(visible_page_numbers)
        
        # next button
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__button--")]'):
            _a = _.find_element_by_xpath('.//a')
            if _a and _a.text.strip().lower() == 'next':
                next_button = _
                break
                
        return (previous_button, selected_button, next_button, last_page)
          
    def get(self, destination):
        
        self.driver.get(destination)
        
        # wait for the top attractions block
        WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                                          'div.attractions-attraction-overview-main-TopPOIs__wrapper--2ZcCL')))
         
        
        
        keep_going = True
        
        
        poss = []
        
        see_more_clicked = False
        top_attractions = True
        
        while keep_going:
            
            filtered_list = self.driver.find_element_by_xpath('//div[@id="FILTERED_LIST"]')
            
            if top_attractions:
                
                if (not see_more_clicked):
                    lst = filtered_list.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__initial_set--")]')
                else:
                    # we'll browse the additional block that appeared after See More was clicked
                    lst = filtered_list.find_element_by_xpath('.//div/div[contains(@class, "attractions-attraction-overview-main-TopPOIs__wrapper--")]')
                
            
                for i in lst.find_elements_by_xpath(f'.//li[contains(@class, "attractions-attraction-overview-main-TopPOIs__item--")]'):
                
                    pos, attraction = self.scrape_top_attraction(i)
                    
                    if pos:
                        poss.append(pos)
                        print(f'top attractions: {len(poss)}')
                        self.attractions.append(attraction)
                    else:
                        top_attractions = False
            
                # got through the attractions on the initial list. now what? click on See More
                if not see_more_clicked:
                
                    self.driver.find_element_by_xpath('//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__see_more--")]').click()
                    see_more_clicked = True
                    print('clicked See More')
                    continue
                
                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH, 
                                                                      '//div[contains(@class, "attractions-attraction-overview-main-Pagination__wrapper--")]')))
            
            
                previous_button, selected_button, next_button, last_page = self.check_pagination_buttons(pagination_wrapper)
                    
                # click once after See More to start looking at the beyong top-30 attractions
                next_button.click()
                top_attractions = False
                print('clicked Next')
                
            else:
                
                for d in filtered_list.find_elements_by_xpath('.//div[@class="attraction_element_tall"]'):
                    attraction = self.scrape_normal_attraction(d)
                    self.attractions.append(attraction)
                
                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH, 
                                                                      '//div[@class="pagination"]')))
            
                previous_button = pagination_wrapper.find_element_by_xpath('.//div/a[contains(@class, "previous")]')
                
                # find next button; there's no next button on the last page
                try:
                    next_button = WebDriverWait(self.driver, 15).until(EC.element_to_be_clickable((By.XPATH, 
                                                                      '//div[contains(@class, "pagination")]/a[contains(@class, "next")]')))
                except:
                    next_button = None
                    
                selected_button = int(pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]/span[contains(@class, "current")]').text.strip().lower())

                visible_page_numbers = []
                
                pn = pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]')
                for a_ in pn.find_elements_by_xpath('.//a'):
                    text_ = a_.text.strip().lower()
                    if text_.isdigit():
                        visible_page_numbers.append(int(text_))
                        
                last_page = max(visible_page_numbers)
                
                print(f'page {selected_button}/{last_page}...')
                
                if (selected_button < last_page) and next_button:
                    next_button.click()
                    time.sleep(6)
                else:
                    keep_going = False
                    print('this is the last page')

        
        self.driver.quit()
        
        return self
    
    def save(self, file):
        
        if not os.path.exists('data'):
            os.mkdir('data')
        
        json.dump(self.attractions, open(os.path.join('data', file), 'w'))
        

In [24]:
if __name__ == '__main__':
    
    ta = Tareviews() \
        .get(destination='https://www.tripadvisor.com.au/Attractions-g255100-Activities-Melbourne_Victoria.html') \
        .save('attractions.json')

top attractions: 1
top attractions: 2
top attractions: 3
top attractions: 4
top attractions: 5
top attractions: 6
top attractions: 7
top attractions: 8
top attractions: 9
top attractions: 10
clicked See More
top attractions: 11
top attractions: 12
top attractions: 13
top attractions: 14
top attractions: 15
top attractions: 16
top attractions: 17
top attractions: 18
top attractions: 19
top attractions: 20
top attractions: 21
top attractions: 22
top attractions: 23
top attractions: 24
top attractions: 25
top attractions: 26
top attractions: 27
top attractions: 28
top attractions: 29
top attractions: 30
clicked Next
page 2/16...


WebDriverException: Message: unknown error: Element <a data-page-number="3" data-offset="60" href="/Attractions-g255100-Activities-oa60-Melbourne_Victoria.html#FILTERED_LIST" class="nav next rndBtn ui_button primary taLnk" onclick="      ta.setEvtCookie('STANDARD_PAGINATION', 'next', '3', 0, this.href);
  ">...</a> is not clickable at point (1154, 1153). Other element would receive the click: <div style="position: absolute; top: 0px; left: 0px; width: 282px; height: 214px; overflow: hidden; display: block;">...</div>
  (Session info: chrome=71.0.3578.98)
  (Driver info: chromedriver=2.45.615355 (d5698f682d8b2742017df6c81e0bd8e6a3063189),platform=Mac OS X 10.14.3 x86_64)
