In [103]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os

In [136]:
class Tareviews:
    
    def __init__(self, headless=False, max_ranking=30):
        
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--incognito')
        options.add_argument('--start-maximized')
        
        if headless:
            options.add_argument('--headless')
        
        self.MAX_RANKING = max_ranking
            
        self.attractions = []

        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)
        
    def scrape_top_attraction(self, attr_item):
        
        attraction = defaultdict()
        
        # try to find attraction ranking (make it integer)
        try:
            pos = int(attr_item.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__item_position--")]').text.strip())
        except:
            pos = None
                
        info = attr_item.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__info--")]')

        for tag in info.find_elements_by_xpath('.//span[contains(@class, "attractions-commerce-CategoryTag__category_tag--")]'):
            if tag.text.strip():
                attraction['category'] = tag.text.lower().strip()
          
        try:
            a_with_name = info.find_element_by_xpath('.//a[contains(@class, "attractions-attraction-overview-main-TopPOIs__name--")]')
            attraction['name'] = a_with_name.text.strip().lower()
            attraction['review_url'] = a_with_name.get_attribute('href')
            attraction['id'] = re.search(r'd\d+', attraction['review_url']).group(0)
        except:
            pass
          
        try:
            rating_div = info.find_element_by_xpath('.//div[@class="ui_poi_review_rating"]')
            review_counts = rating_div.text.strip().lower()
            attraction['review_counts'] = int(re.search(r'\d+\,*\d*', review_counts).group(0).replace(',',''))
            rating_span = rating_div.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]')
            attraction['bubble_rating'] = int(re.search(r'(?<=bubble_)\d+', rating_span.get_attribute('class')).group(0))/10
        except:
            pass
        
        return (pos, attraction)
          
    def get(self, destination):
        
        self.driver.get(destination)
        
        # wait for the top attractions block
        WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                                          'div.attractions-attraction-overview-main-TopPOIs__wrapper--2ZcCL')))
         
        see_more_clicks = 0
        
        keep_going = True
        top_attractions = True
        
        poss = [0]
        
        while keep_going:
            
            filtered_list = self.driver.find_element_by_xpath('//div[@id="FILTERED_LIST"]')
            
            if see_more_clicks == 0:
                lst = filtered_list.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__initial_set--")]')
            
            elif see_more_clicks == 1:
                # we'll browse the additional block that appeared after See More was pressed
                lst = filtered_list.find_element_by_xpath('.//div/div[contains(@class, "attractions-attraction-overview-main-TopPOIs__wrapper--")]')
                
            
            if top_attractions:   
                
                for i in lst.find_elements_by_xpath(f'.//li[contains(@class, "attractions-attraction-overview-main-TopPOIs__item--")]'):
                
                    pos, attraction = self.scrape_top_attraction(i)
                    
                    if pos:
                        poss.append(pos)
                    
                    if (max(poss) >= self.MAX_RANKING) or (not pos) :
                        top_attractions = False

                    self.attractions.append(attraction)
                    
            
            # got through the attractions on the initial list. now what? click on See More
            if see_more_clicks == 0:
                
                see_more = self.driver.find_element_by_xpath('//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__see_more--")]').click()
                see_more_clicks += 1
                print('clicked See More')
                
            # wait for the pagination wrapper
            pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH, 
                                                                      '//div[contains(@class, "attractions-attraction-overview-main-Pagination__wrapper--")]')))
            
            # previous button
            previous_button = None
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__disabled--")]'):
                _text = _.text.lower().strip()
                if _text == 'previous':
                    previous_button = _   
                    
            # selected button
            selected_button = None
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__selected--")]'):
                _text = _.text.lower().strip()
                if _text.isdigit():
                    selected_button = _
                    break
                    
            # last page button
            last_page = None
            visible_page_numbers = []
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__link--")]'):
                _text = _.text.lower().strip()
                if _text.isdigit():
                    visible_page_numbers.append(int(_text))

            last_page = max(visible_page_numbers)
            
            # next button
            next_button = None
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__button--")]'):
                _a = _.find_element_by_xpath('.//a')
                if _a and _a.text.strip().lower() == 'next':
                    next_button = _
                    break
            
                    
            # click once after See More to start looking at the beyong top-30 attractions
            if not top_attractions:
                
                next_button.click()
                print('clicked Next')
                
                filtered_list = self.driver.find_element_by_xpath('//div[@id="FILTERED_LIST"]')
                
                for d in filtered_list.find_elements_by_xpath('.//div[@class="attraction_element_tall"]'):
                    print(d.text)
                
                

            
#             keep_going = False
        
#         self.driver.quit()
        
        return self
    
    def save(self, file):
        
        if not os.path.exists('data'):
            os.mkdir('data')
        
        json.dump(self.attractions, open(os.path.join('data', file), 'w'))
        

In [137]:
if __name__ == '__main__':
    
    ta = Tareviews() \
        .get(destination='https://www.tripadvisor.com.au/Attractions-g255100-Activities-Melbourne_Victoria.html') \
        .save('attractions.json')

clicked See More
clicked Next
SPECIALITY MUSEUMS
National Sports Museum
678 reviews
Experiences from AU$25
See 4 Experiences
THEATRES
Her Majesty's Theatre
457 reviews
LIBRARIES
Melbourne Athenaeum Library
2 reviews
CASINOS
Crown Casino
2,004 reviews
Experiences from AU$25
See 3 Experiences
ARCHITECTURAL BUILDINGS
Royal Exhibition Building
602 reviews
Experiences from AU$72
See 6 Experiences
SPORTING EVENTS
Australian Open
197 reviews
SPECIALITY MUSEUMS
The Ian Potter Centre: NGV Australia
676 reviews
Experiences from AU$99
See 1 Experience
FARMS
Myuna Farm
144 reviews
POINTS OF INTEREST & LANDMARKS
Hardware Lane
415 reviews
Experiences from AU$59
See 2 Experiences
FLEA & STREET MARKETS
Prahran Market
347 reviews
VISITOR CENTRES
Melbourne Visitor Centre
1,161 reviews
Experiences from AU$99
See 1 Experience
FACTORY OUTLETS
DFO South Wharf
851 reviews
Experiences from AU$40
See 2 Experiences
POINTS OF INTEREST & LANDMARKS
Lygon Street
1,067 reviews
Experiences from AU$72
See 2 Experience

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//div/div[contains(@class, "attractions-attraction-overview-main-TopPOIs__wrapper--")]"}
  (Session info: chrome=71.0.3578.98)
  (Driver info: chromedriver=2.45.615355 (d5698f682d8b2742017df6c81e0bd8e6a3063189),platform=Mac OS X 10.14.3 x86_64)
