In [46]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time

In [53]:
class Tareviews:
    
    def __init__(self, headless=False, max_ranking=30):
        
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--incognito')
        options.add_argument('--start-maximized')
        
        if headless:
            options.add_argument('--headless')
        
        self.MAX_RANKING = max_ranking
        
        self.attraction_ids = set()
        self.attractions = []

        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)
        
    def scrape_top_attraction(self, attr_item):
        
        attraction = defaultdict()
        
        # try to find attraction ranking (make it integer)
        try:
            pos = int(attr_item.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__item_position--")]').text.strip())
        except:
            return (None, None)
                
        info = attr_item.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__info--")]')

        for tag in info.find_elements_by_xpath('.//span[contains(@class, "attractions-commerce-CategoryTag__category_tag--")]'):
            if tag.text.strip():
                attraction['category'] = tag.text.lower().strip()
          
        try:
            a_with_name = info.find_element_by_xpath('.//a[contains(@class, "attractions-attraction-overview-main-TopPOIs__name--")]')
            attraction['name'] = a_with_name.text.strip().lower()
            attraction['review_url'] = a_with_name.get_attribute('href')
            attraction['id'] = re.search(r'd\d+', attraction['review_url']).group(0)
            self.attraction_ids.add(attraction['id'])
        except:
            pass
          
        try:
            rating_div = info.find_element_by_xpath('.//div[@class="ui_poi_review_rating"]')
            review_counts = rating_div.text.strip().lower()
            attraction['review_counts'] = int(re.search(r'\d+\,*\d*', review_counts).group(0).replace(',',''))
            rating_span = rating_div.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]')
            attraction['bubble_rating'] = int(re.search(r'(?<=bubble_)\d+', rating_span.get_attribute('class')).group(0))/10
        except:
            pass
        
        return (pos, attraction)
    
    def scrape_normal_attraction(self, attr_item):
        
        attraction = defaultdict()
                
        tag = attr_item.find_element_by_xpath('.//div[@class="tag_line"]')

        attraction['category'] = tag.text.lower().strip()
          
        try:
            title_block = attr_item.find_element_by_css_selector('div.listing_title')
        except:
            title_block = None
            print('no title block!')
        
        if title_block:
            a_with_name = title_block.find_element_by_xpath('.//a[@href]')
            attraction['name'] = a_with_name.text.strip().lower()
            attraction['review_url'] = a_with_name.get_attribute('href')
            attraction['id'] = re.search(r'd\d+', attraction['review_url']).group(0)
            self.attraction_ids.add(attraction['id'])
        try:
            rating_div = attr_item.find_element_by_xpath('.//div[@class="listing_rating"]')
            review_counts = rating_div.text.strip().lower()
            attraction['review_counts'] = int(re.search(r'\d+\,*\d*', review_counts).group(0).replace(',',''))
            rating_span = rating_div.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]')
            attraction['bubble_rating'] = int(re.search(r'(?<=bubble_)\d+', rating_span.get_attribute('class')).group(0))/10
        except:
            pass
        
        return attraction
    
    def check_for_popup(self):
        
        try:
            pp = self.driver.find_element_by_id('BODY_BLOCK_JQUERY_REFLOW')
            self.driver.switch_to.default_content()
            return True
        except:
            return False
            
    
    def check_pagination_buttons(self, pagination_wrapper):
        
        previous_button = selected_button = last_page = next_button = None
        
        # previous button
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__disabled--")]'):
            _text = _.text.lower().strip()
            if _text == 'previous':
                previous_button = _ 
                break
                
        # selected button
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__selected--")]'):
            _text = _.text.lower().strip()
            if _text.isdigit():
                selected_button = int(_text)
                break
                
        # last page button
        visible_page_numbers = []
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__link--")]'):
            _text = _.text.lower().strip()
            if _text.isdigit():
                visible_page_numbers.append(int(_text))

        last_page = max(visible_page_numbers)
        
        # next button
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__button--")]'):
            _a = _.find_element_by_xpath('.//a')
            if _a and _a.text.strip().lower() == 'next':
                next_button = _a
                break
                
        return (previous_button, selected_button, next_button, last_page)
          
    def get(self, destination):
        
        self.driver.get(destination)
        
        # wait for the top attractions block
        WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                                          'div.attractions-attraction-overview-main-TopPOIs__wrapper--2ZcCL')))
         
        
        
        keep_going = True
        
        
        poss = []
        
        see_more_clicked = False
        top_attractions = True
        
        while keep_going:
            
            filtered_list = self.driver.find_element_by_xpath('//div[@id="FILTERED_LIST"]')
            
            if top_attractions:
                
                if (not see_more_clicked):
                    lst = filtered_list.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__initial_set--")]')
                else:
                    # we'll browse the additional block that appeared after See More was clicked
                    lst = filtered_list.find_element_by_xpath('.//div/div[contains(@class, "attractions-attraction-overview-main-TopPOIs__wrapper--")]')
                
            
                for i in lst.find_elements_by_xpath(f'.//li[contains(@class, "attractions-attraction-overview-main-TopPOIs__item--")]'):
                
                    pos, attraction = self.scrape_top_attraction(i)
                    
                    if pos:
                        poss.append(pos)
                        print(f'top attractions: {len(poss)}')
                        self.attractions.append(attraction)
                    else:
                        top_attractions = False
            
                # got through the attractions on the initial list. now what? click on See More
                if not see_more_clicked:
                
                    self.driver.find_element_by_xpath('//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__see_more--")]').click()
                    see_more_clicked = True
                    print('clicked See More')
                    continue
                
                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH, 
                                                                      '//div[contains(@class, "attractions-attraction-overview-main-Pagination__wrapper--")]')))
            
            
                previous_button, selected_button, next_button, last_page = self.check_pagination_buttons(pagination_wrapper)
                    
                # click once after See More to start looking at the beyong top-30 attractions
                next_button.click()
                top_attractions = False
                print('clicked Next')
                
            else:
                
                for d in filtered_list.find_elements_by_xpath('.//div[@class="attraction_element_tall"]'):
                    attraction = self.scrape_normal_attraction(d)
                    self.attractions.append(attraction)
                
                
                # find next button; there's no next button on the last page
                
                ispopup = self.check_for_popup()
                
                if ispopup:
                    print('there\'s a popup')
                    self.driver.switch_to.default_content()
                    
                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH, 
                                                                      '//div[@class="pagination"]')))
            
                previous_button = pagination_wrapper.find_element_by_xpath('.//div/a[contains(@class, "previous")]')
                
                try:
                    next_button = WebDriverWait(self.driver, 15).until(EC.element_to_be_clickable((By.XPATH, 
                                                                      '//div[contains(@class, "pagination")]/a[contains(@class, "next")]')))
                except:
                    print('no next button!')
                    next_button = None
                    
                selected_button = int(pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]/span[contains(@class, "current")]').text.strip().lower())
                
                pn = pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]')
                visible_page_numbers = [int(_) for _ in pn.text.strip().lower().split() if _.isdigit()]
                        
                last_page = max(visible_page_numbers)
                
                print(f'page {selected_button}/{last_page}...')
                
                if (selected_button < last_page) and next_button:
                    next_button.send_keys(Keys.ENTER)
                    print('clicked next')
                    time.sleep(6)
                else:
                    keep_going = False
                    print('this is the last page!')
                    print('selected button is ', selected_button, ' last page is ', last_page)

        
        self.driver.quit()
        
        return self
    
    def get_attraction_info(self, attr_url):
        
        """
        open attr_url and collect all info about this attraction
        """
        
        attraction = defaultdict()
        
        self.driver.get(attr_url)
        
        reviews_block = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.ID, 
                                                                      'REVIEWS')))
        number_reviews = int(''.join([_ for _ in reviews_block.find_element_by_class_name('reviews_header_count').text if _.isdigit()]))
        
        try:
            address = self.driver.find_element_by_class_name('detail').text.lower().strip()
        except:
            address = None
            
        try:
            self.driver.find_element_by_xpath('.//span[contains(@class, "attractions-attraction-detail-about-card-Description__readMore--")]').click()
            about = self.driver.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-detail-about-card-Description__modalText--")]').text
            self.driver.find_element_by_xpath('.//div[contains(@class, "overlays-pieces-CloseX__close--")]').click()
        except:
            about = self.driver.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-detail-about-card-AttractionDetailAboutCard__section--")]').text
        
        attraction['address'] = address
        attraction['about'] = about
        
        print(attraction)
        
        while True:
            
            for review in self.driver.find_elements_by_xpath('//div[@class="review-container"]'):
                
                for _ in review.find_elements_by_xpath('.//span[contains(@class, "ulBlueLinks")]'):
                    if 'more' in _.text:
                        _.click()
                        time.sleep(3)
                        break
                
                id = review.get_attribute('data-reviewid')
                
                print('review id=', id)
                
                member_info = review.find_element_by_xpath('.//div[@class="member_info"]')
                info_text = member_info.find_element_by_xpath('.//div[@class="info_text"]')
                
                user_name = info_text.find_element_by_xpath('.//div').text
                
                print('user name=', user_name)
                
                try:
                    user_location = info_text.find_element_by_xpath('.//div[@class="userLoc"]')
                
                    print('user_location=', user_location)
                except:
                    pass
    
    def save(self, file):
        
        if not os.path.exists('data'):
            os.mkdir('data')
        
        json.dump(self.attractions, open(os.path.join('data', file), 'w'))
        
        return self
        

In [54]:
if __name__ == '__main__':
    
    ta = Tareviews()
    
    ta.get_attraction_info('https://www.tripadvisor.com.au/Attraction_Review-g255100-d522360-Reviews-Shrine_of_Remembrance-Melbourne_Victoria.html')
#     \
#         .get(destination='https://www.tripadvisor.com.au/Attractions-g255100-Activities-Melbourne_Victoria.html') \
#         .save('attractions.json')
    
#     print(len(ta.attraction_ids))

defaultdict(None, {'address': 'sights & landmarks, museums, more', 'about': 'Visit the Shrine of Remembrance, Melbourne\'s most iconic landmark, where Victorians have been coming since 1934 to honour the service and sacrifice of Australian men and women in war and peacekeeping. Enter the Shrine and experience the quiet solitude of the Sanctuary where hundreds of thousands pay their respects each year. All visitors are invited to participate in a Remembrance Ceremony featuring the Ray of Light re-enactment. Surrounded by the Shrine Reserve, the Shrine is elevated, overlooking the city of Melbourne within 13 hectares of beautiful parkland. Enjoy stunning views of the Reserve and many of Melbourne\'s landmarks from the balcony of the Shrine. Explore the monuments and memorials throughout the Reserve, including the Gallipoli Memorial, as well as the many remembrance trees dedicated to Victorian service units from the Boer War onwards. The Eternal Flame on the Second World War Forecourt sym

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//div[@class="userLoc"]"}
  (Session info: chrome=71.0.3578.98)
  (Driver info: chromedriver=2.45.615355 (d5698f682d8b2742017df6c81e0bd8e6a3063189),platform=Mac OS X 10.14.3 x86_64)
