In [103]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os

In [108]:
class Tareviews:
    """Collect the "top attractions" listed on an attractions overview page.

    The scraper drives a Chrome browser through Selenium, reads every list
    item found under the ``FILTERED_LIST`` container and stores one plain
    dict per attraction in ``self.attractions``. Depending on what could be
    parsed, a dict may contain the keys ``category``, ``name``,
    ``review_url``, ``id``, ``review_counts`` and ``bubble_rating``.

    Methods return ``self`` so that calls can be chained.
    """

    def __init__(self, headless=False, driver_path='webdriver/chromedriver'):
        """Start a Chrome session.

        Args:
            headless: when true, Chrome is started with ``--headless``.
            driver_path: location of the chromedriver executable; passed as
                the first positional argument to ``webdriver.Chrome``.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--incognito')
        options.add_argument('--start-maximized')

        if headless:
            options.add_argument('--headless')

        # One dict per scraped attraction, in the order they were encountered.
        self.attractions = []

        self.driver = webdriver.Chrome(driver_path, options=options)

    @staticmethod
    def _parse_review_count(text):
        """Return the first integer in ``text``, ignoring thousands separators.

        Raises AttributeError when ``text`` contains no digits.
        """
        # `\d[\d,]*` keeps every comma-separated group ("1,234,567"), whereas a
        # single optional group would stop after the second one.
        return int(re.search(r'\d[\d,]*', text).group(0).replace(',', ''))

    @staticmethod
    def _parse_bubble_rating(class_attr):
        """Turn a ``bubble_NN`` CSS class into a rating, e.g. ``bubble_45`` -> 4.5.

        Raises AttributeError when no ``bubble_`` class is present.
        """
        return int(re.search(r'(?<=bubble_)\d+', class_attr).group(0)) / 10

    def _parse_attraction(self, item):
        """Build the attraction dict for one list item element.

        The info container must exist (its absence raises); name/url/id and
        rating details are best-effort and simply left out when unavailable.
        """
        attraction = {}

        info = item.find_element(By.XPATH, './/div[contains(@class, "attractions-attraction-overview-main-TopPOIs__info--")]')

        # When several category tags carry text, the last one wins.
        for tag in info.find_elements(By.XPATH, './/span[contains(@class, "attractions-commerce-CategoryTag__category_tag--")]'):
            if tag.text.strip():
                attraction['category'] = tag.text.lower().strip()

        try:
            a_with_name = info.find_element(By.XPATH, './/a[contains(@class, "attractions-attraction-overview-main-TopPOIs__name--")]')
            attraction['name'] = a_with_name.text.strip().lower()
            attraction['review_url'] = a_with_name.get_attribute('href')
            attraction['id'] = re.search(r'd\d+', attraction['review_url']).group(0)
        except Exception:
            # Best-effort: keep whatever was filled in before the failure.
            # `Exception` (not a bare except) so Ctrl-C still interrupts.
            pass

        try:
            rating_div = info.find_element(By.XPATH, './/div[@class="ui_poi_review_rating"]')
            attraction['review_counts'] = self._parse_review_count(rating_div.text.strip().lower())
            rating_span = rating_div.find_element(By.XPATH, './/span[contains(@class, "ui_bubble_rating")]')
            attraction['bubble_rating'] = self._parse_bubble_rating(rating_span.get_attribute('class'))
        except Exception:
            # Best-effort: an unrated attraction is still recorded.
            pass

        return attraction

    def get(self, destination):
        """Load ``destination`` and collect every attraction listed on it.

        The initial list is read first; the "See More" control is then
        clicked and the expanded list is read, skipping positions that were
        already collected. The loop ends as soon as "See More" (or the
        pagination block that follows it) cannot be found.

        The browser is always closed when this method finishes, including
        when it fails, so an instance can only be used for one ``get`` call.

        Args:
            destination: URL of the attractions overview page.

        Returns:
            self, to allow chaining.
        """
        try:
            self.driver.get(destination)

            # wait for the top attractions block
            # NOTE(review): this selector pins the generated class suffix
            # ("--2ZcCL"), unlike the XPath lookups below which match on the
            # class prefix only — confirm it is still valid for the live page.
            WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR,
                                                                              'div.attractions-attraction-overview-main-TopPOIs__wrapper--2ZcCL')))

            seen_positions = set()
            clicks = 0

            keep_going = True

            while keep_going:

                filtered_list = self.driver.find_element(By.XPATH, '//div[@id="FILTERED_LIST"]')

                # Before the first click only the initial set exists; afterwards
                # the whole wrapper is scanned and duplicates are skipped below.
                if clicks == 0:
                    lst = filtered_list.find_element(By.XPATH, './/div[contains(@class, "attractions-attraction-overview-main-TopPOIs__initial_set--")]')
                else:
                    lst = filtered_list.find_element(By.XPATH, './/div/div[contains(@class, "attractions-attraction-overview-main-TopPOIs__wrapper--")]')

                for item in lst.find_elements(By.XPATH, './/li[contains(@class, "attractions-attraction-overview-main-TopPOIs__item--")]'):

                    pos = int(item.find_element(By.XPATH, './/div[contains(@class, "attractions-attraction-overview-main-TopPOIs__item_position--")]').text.strip())

                    if pos in seen_positions:
                        continue
                    seen_positions.add(pos)

                    self.attractions.append(self._parse_attraction(item))

                # got through the attractions on the current list; click on See More
                try:
                    self.driver.find_element(By.XPATH, '//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__see_more--")]').click()
                    clicks += 1

                    # wait for the pagination wrapper
                    pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH,
                                                                              '//div[contains(@class, "attractions-attraction-overview-main-Pagination__wrapper--")]')))
                    # The next-page link is only looked up, never followed; a
                    # missing link ends the loop just like a missing See More.
                    pagination_wrapper.find_element(By.XPATH, './/div[contains(@class, "attractions-attraction-overview-main-Pagination__button--")]').find_element(By.XPATH, './/a')

                except Exception:

                    print('looks like no more attractions left')
                    keep_going = False
        finally:
            # Always release the browser, even if scraping failed half-way.
            self.driver.quit()

        return self

    def save(self, file):
        """Write the collected attractions as JSON to ``data/<file>``.

        The ``data`` directory is created in the current working directory
        when it does not exist yet.

        Args:
            file: name of the output file inside ``data``.

        Returns:
            self, to allow chaining.
        """
        os.makedirs('data', exist_ok=True)

        # Context manager guarantees the file is flushed and closed.
        with open(os.path.join('data', file), 'w') as out:
            json.dump(self.attractions, out)

        return self
        

In [109]:
if __name__ == '__main__':

    # Keep the scraper itself in `ta`: assigning the result of a chain that
    # ends in save() would bind `ta` to save()'s return value instead.
    ta = Tareviews()

    # Scrape the attractions overview page (get() also closes the browser),
    # then write the collected attractions to data/attractions.json.
    ta.get(destination='https://www.tripadvisor.com.au/Attractions-g255100-Activities-Melbourne_Victoria.html')
    ta.save('attractions.json')

looks like no more attractions left
