In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time
import arrow

In [8]:
class Attraction:
    """
    container for what was scraped about a single attraction

    every field starts as None and is only overwritten when a non-blank string
    is assigned; any other value (None, numbers, blank strings) is ignored, so a
    failed lookup never wipes out a value collected earlier
    """

    def __init__(self):

        self._name = None
        self._about = None
        self._address = None
        self._reviews = None

    @property
    def name(self):
        """attraction name, stored stripped and lower-cased"""
        return self._name

    @name.setter
    def name(self, s):
        if isinstance(s, str) and s.strip():
            self._name = s.strip().lower()

    @property
    def about(self):
        """attraction description, stored exactly as assigned"""
        return self._about

    @about.setter
    def about(self, s):
        if isinstance(s, str) and s.strip():
            self._about = s

    @property
    def address(self):
        """attraction address, stored stripped and lower-cased"""
        return self._address

    @address.setter
    def address(self, s):
        if isinstance(s, str) and s.strip():
            self._address = s.strip().lower()

    @property
    def reviews(self):
        """reviews text, stored exactly as assigned"""
        return self._reviews

    # the setter has to be derived from the `reviews` property itself: deriving it
    # from `about` would produce a property that reads _about but writes _reviews
    @reviews.setter
    def reviews(self, s):
        if isinstance(s, str) and s.strip():
            self._reviews = s
        

class Tareviews:
    
    def __init__(self, headless=False, max_ranking=30):
        """
        start a Chrome session for scraping

        headless: when true, Chrome is started with the --headless flag
        max_ranking: stored as self.MAX_RANKING
        """

        chrome_flags = [
            '--ignore-certificate-errors',
            '--ignore-ssl-errors',
            '--incognito',
            '--start-maximized',
        ]

        if headless:
            chrome_flags.append('--headless')

        options = webdriver.ChromeOptions()
        for flag in chrome_flags:
            options.add_argument(flag)

        self.MAX_RANKING = max_ranking

        # ids of the attractions seen so far and the scraped attraction records
        self.attraction_ids = set()
        self.attractions = []

        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)
        
    def scrape_top_attraction(self, attr_item):
        
        attraction = defaultdict()
        
        # try to find attraction ranking (make it integer)
        try:
            pos = int(attr_item.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__item_position--")]').text.strip())
        except:
            return (None, None)
                
        info = attr_item.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__info--")]')

        for tag in info.find_elements_by_xpath('.//span[contains(@class, "attractions-commerce-CategoryTag__category_tag--")]'):
            if tag.text.strip():
                attraction['category'] = tag.text.lower().strip()
          
        try:
            a_with_name = info.find_element_by_xpath('.//a[contains(@class, "attractions-attraction-overview-main-TopPOIs__name--")]')
            attraction['name'] = a_with_name.text.strip().lower()
            attraction['review_url'] = a_with_name.get_attribute('href')
            attraction['id'] = re.search(r'd\d+', attraction['review_url']).group(0)
            self.attraction_ids.add(attraction['id'])
        except:
            pass
          
        try:
            rating_div = info.find_element_by_xpath('.//div[@class="ui_poi_review_rating"]')
            review_counts = rating_div.text.strip().lower()
            attraction['review_counts'] = int(re.search(r'\d+\,*\d*', review_counts).group(0).replace(',',''))
            rating_span = rating_div.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]')
            attraction['bubble_rating'] = int(re.search(r'(?<=bubble_)\d+', rating_span.get_attribute('class')).group(0))/10
        except:
            pass
        
        return (pos, attraction)
    
    def scrape_normal_attraction(self, attr_item):
        
        attraction = defaultdict()
                
        tag = attr_item.find_element_by_xpath('.//div[@class="tag_line"]')

        attraction['category'] = tag.text.lower().strip()
          
        try:
            title_block = attr_item.find_element_by_css_selector('div.listing_title')
        except:
            title_block = None
            print('no title block!')
        
        if title_block:
            a_with_name = title_block.find_element_by_xpath('.//a[@href]')
            attraction['name'] = a_with_name.text.strip().lower()
            attraction['review_url'] = a_with_name.get_attribute('href')
            attraction['id'] = re.search(r'd\d+', attraction['review_url']).group(0)
            self.attraction_ids.add(attraction['id'])
        try:
            rating_div = attr_item.find_element_by_xpath('.//div[@class="listing_rating"]')
            review_counts = rating_div.text.strip().lower()
            attraction['review_counts'] = int(re.search(r'\d+\,*\d*', review_counts).group(0).replace(',',''))
            rating_span = rating_div.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]')
            attraction['bubble_rating'] = int(re.search(r'(?<=bubble_)\d+', rating_span.get_attribute('class')).group(0))/10
        except:
            pass
        
        return attraction
    
    def check_for_popup(self):
        
        try:
            pp = self.driver.find_element_by_id('BODY_BLOCK_JQUERY_REFLOW')
            self.driver.switch_to.default_content()
            return True
        except:
            return False
            
    
    def check_pagination_buttons(self, pagination_wrapper):
        
        previous_button = selected_button = last_page = next_button = None
        
        # previous button
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__disabled--")]'):
            _text = _.text.lower().strip()
            if _text == 'previous':
                previous_button = _ 
                break
                
        # selected button
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__selected--")]'):
            _text = _.text.lower().strip()
            if _text.isdigit():
                selected_button = int(_text)
                break
                
        # last page button
        visible_page_numbers = []
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__link--")]'):
            _text = _.text.lower().strip()
            if _text.isdigit():
                visible_page_numbers.append(int(_text))

        last_page = max(visible_page_numbers)
        
        # next button
        for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__button--")]'):
            _a = _.find_element_by_xpath('.//a')
            if _a and _a.text.strip().lower() == 'next':
                next_button = _a
                break
                
        return (previous_button, selected_button, next_button, last_page)
    
    def check_pagination_reviews(self, pagination_wrapper):
        
        previous_button = selected_button = last_page = next_button = None
        
        # previous button
        
        try:
            for _ in pagination_wrapper.find_elements_by_xpath('.//a[contains(@class, "previous")]'):
                _text = _.text.lower().strip()
                if _text == 'previous':
                    previous_button = _ 
                    break
        except:
            pass
                
        # selected button
        
        try:
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[@class="pageNumbers"]/a[contains(@class, "current")]'):
                _text = _.text.lower().strip()
                if _text.isdigit():
                    selected_button = int(_text)
                    break
        except:
            pass
                
        # last page button
        
        try:
            last_page = max([int(t.text) for t in pagination_wrapper.find_elements_by_xpath('.//div[@class="pageNumbers"]/a') if t.text.isdigit()])
        except:
            pass
        
        # next button
        
        try:
            for _a in pagination_wrapper.find_elements_by_xpath('.//a[contains(@class, "next")]'):
                if _a and _a.text.strip().lower() == 'next':
                    next_button = _a
                    break
        except:
            pass
                
        return (previous_button, selected_button, next_button, last_page)
          
    def collect_attractions(self, destination):
        """
        open the attractions page at `destination` and collect every attraction listed

        works in two phases: first the "top attractions" block (the initial set, then
        the block shown after clicking See More), then, after one click on Next, the
        regular listing pages, which are followed until the selected page is the last
        visible page number or no clickable Next link is found

        every scraped attraction is appended to self.attractions; the driver is quit
        once the loop finishes normally (there is no try/finally, so an exception
        leaves the browser open)

        returns self
        """
        
        self.driver.get(destination)
        
        # wait for the top attractions block
        WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.XPATH, 
                                                                          '//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__wrapper--")]')))
        # only used below to report how many ranked attractions were collected
        poss = [] # attraction ranks already collected
        
        see_more_clicked = False
        top_attractions = True
                                             
        keep_going = True
                                             
        pref = 'attractions-attraction-overview-main-TopPOIs__'
                                             
        while keep_going:
            
            # looked up again on every pass of the loop
            filtered_list = self.driver.find_element_by_xpath('//div[@id="FILTERED_LIST"]')
            
            if top_attractions:
                
                if (not see_more_clicked):
                    lst = filtered_list.find_element_by_xpath(f'.//div[contains(@class, "{pref}initial_set--")]')
                else:
                    # we'll browse the additional block that appeared after See More was clicked
                    # NOTE(review): if this wrapper also contains the initial set, those items
                    # are appended a second time - confirm against the page structure
                    lst = filtered_list.find_element_by_xpath(f'.//div/div[contains(@class, "{pref}wrapper--")]')
                
            
                for i in lst.find_elements_by_xpath(f'.//li[contains(@class, "attractions-attraction-overview-main-TopPOIs__item--")]'):
                
                    pos, attraction = self.scrape_top_attraction(i)
                    
                    if pos:
                        poss.append(pos)
                        print(f'top attractions: {len(poss)}')
                        self.attractions.append(attraction)
                    else:
                        # an item without a ranking switches the phase flag, but the loop
                        # still goes through the remaining items
                        top_attractions = False
            
                # got through the attractions on the initial list. now what? click on See More
                if not see_more_clicked:
                
                    self.driver.find_element_by_xpath('//div[contains(@class, "attractions-attraction-overview-main-TopPOIs__see_more--")]').click()
                    see_more_clicked = True
                    print('clicked See More')
                    # restart the loop to scrape the block revealed by See More
                    continue
                
                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH, 
                                                                      '//div[contains(@class, "attractions-attraction-overview-main-Pagination__wrapper--")]')))
            
            
                # only next_button is used from this tuple
                previous_button, selected_button, next_button, last_page = self.check_pagination_buttons(pagination_wrapper)
                    
                # click once after See More to start looking at the beyond top-30 attractions
                # NOTE(review): next_button is None when no Next link was found, in which
                # case this click raises AttributeError
                next_button.click()
                top_attractions = False
                print('clicked Next')
                
            else:
                
                for d in filtered_list.find_elements_by_xpath('.//div[@class="attraction_element_tall"]'):
                    attraction = self.scrape_normal_attraction(d)
                    self.attractions.append(attraction)
                
                
                # find next button; there's no next button on the last page
                
                ispopup = self.check_for_popup()
                
                if ispopup:
                    print('there\'s a popup')
                    # check_for_popup already switched to the default content; this repeats it
                    self.driver.switch_to.default_content()
                    
                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH, 
                                                                      '//div[@class="pagination"]')))
            
                # not used afterwards, but the lookup itself raises if no such link exists
                previous_button = pagination_wrapper.find_element_by_xpath('.//div/a[contains(@class, "previous")]')
                
                try:
                    next_button = WebDriverWait(self.driver, 15).until(EC.element_to_be_clickable((By.XPATH, 
                                                                      '//div[contains(@class, "pagination")]/a[contains(@class, "next")]')))
                except:
                    print('no next button!')
                    next_button = None
                    
                # number of the page currently shown
                selected_button = int(pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]/span[contains(@class, "current")]').text.strip().lower())
                
                # the largest number visible in the page-number strip is taken as the last page
                pn = pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]')
                visible_page_numbers = [int(_) for _ in pn.text.strip().lower().split() if _.isdigit()]
                        
                last_page = max(visible_page_numbers)
                
                print(f'page {selected_button}/{last_page}...')
                
                if (selected_button < last_page) and next_button:
                    # the link is activated with the ENTER key rather than a mouse click
                    next_button.send_keys(Keys.ENTER)
                    print('clicked next')
                    # fixed pause before the next pass; presumably to let the next page load
                    time.sleep(6)
                else:
                    keep_going = False
                    print('this is the last page!')
                    print('selected button is ', selected_button, ' last page is ', last_page)

        
        self.driver.quit()
        
        return self
    
    def about_attraction(self):
        
        """
        once on the attraction page, get attraction description;
        returns the description as text (as is)
        """
        
        about = None
        
        try:
            # if theres an option to extend description via clicking More, do it
            self.driver.find_element_by_xpath('.//span[contains(@class, "attractions-attraction-detail-about-card-Description__readMore--")]').click()
            about = self.driver.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-detail-about-card-Description__modalText--")]').text
            # close the window with full description
            self.driver.find_element_by_xpath('.//div[contains(@class, "overlays-pieces-CloseX__close--")]').click()
        except:
            # if description is short, just pick it up
            about = self.driver.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-detail-about-card-AttractionDetailAboutCard__section--")]').text
        
        return about
    
    def get_user_details(self, review_container):
        
        """
        find and return user name and location
        """
        
        user_name = user_location = None
        
        try:
            info_text = review_container.find_element_by_css_selector('div.info_text')
        except:
            return (user_name, user_location)
        
        try:
            user_name = info_text.find_element_by_xpath('.//div').text
        except:
            pass
                
        try:
            user_location = info_text.find_element_by_xpath('.//div[@class="userLoc"]').text
        except:
            pass
        
        return (user_name, user_location)
    
    def get_review_rating(self, review_container):
        
        rating = None
        
        try:
            rating = int(re.search(r'(?<=bubble_)\d+', review_container.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]').get_attribute('class')).group(0))/10
        except:
            pass
        
        return rating
    
    def get_review_rating_date(self, review_container):
        
        rating_date = None
        
        try:
            rating_date = review_container.find_element_by_xpath('.//span[@class="ratingDate"]').get_attribute('title')
        except:
            pass
        
        return rating_date
    
    def get_review_title(self, review_container):
        
        review_title = None
        
        try:
            review_title = review_container.find_element_by_xpath('.//a[contains(@class, "title")]').text
        except:
            pass 
        
        return review_title
    
    def get_date_of_experience(self, review_container):
        
        exp_date = None
        
        try:
            exp_date = review_container.find_element_by_xpath('.//div[contains(@class, "prw_reviews_stay_date_hsx")]').text.split(':')[-1].strip()
        except:
            pass 
        
        # 31 January 2019
        return arrow.get(exp_date, 'MMMM YYYY').format('MM/YYYY')
    
    def get_review_text(self, review_container):
        
        review_text = None
        
        for _ in review_container.find_elements_by_xpath('.//span[contains(@class, "ulBlueLinks")]'):
            if 'more' in _.text.lower():
                _.click()
                time.sleep(2)
                break
        
        try:
            review_text = review_container.find_element_by_xpath('.//p[@class="partial_entry"]').text
        except:
            pass
        
        return review_text
    
    def get_attraction_info(self, attr_url):
        
        """
        open attr_url and print what can be collected about this attraction:
        address, description, the state of the review pagination and the details
        of every review on the page that is shown; further review pages are not
        followed and nothing is returned
        """
        
        self.driver.get(attr_url)
        
        # wait for the reviews block, then keep only the digits of its header count
        reviews_block = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.ID, 'REVIEWS')))
        header_text = reviews_block.find_element_by_class_name('reviews_header_count').text
        number_reviews = int(''.join(ch for ch in header_text if ch.isdigit()))
        
        try:
            address_block = self.driver.find_element_by_css_selector('div.detail_section.address')
            address = address_block.text.lower().strip()
        except:
            address = None
        
        # the same details go into a plain mapping and into an Attraction object
        attraction = defaultdict()
        a = Attraction()
        
        attraction['address'] = address
        a.address = address
        
        description = self.about_attraction()
        attraction['about'] = description
        a.about = description
        
        print(attraction)
        
        print(a)
        
        pagination = self.driver.find_element_by_xpath('//div[contains(@class, "ui_pagination")]')
        
        previous_button, selected_button, next_button, last_page = self.check_pagination_reviews(pagination)
        
        print('previous_button=', previous_button)
        print('next_button=', next_button)
        
        print('selected_button=', selected_button)
        print('last_page=', last_page)
        
        # each detail is printed as soon as it has been read
        for review in self.driver.find_elements_by_xpath('//div[@class="review-container"]'):
            
            review_id = review.get_attribute('data-reviewid')
            rating = self.get_review_rating(review)
            
            print('review id=', review_id)
            print('review rating=', rating)
            
            rating_date = self.get_review_rating_date(review)
            title = self.get_review_title(review)
            
            print('rating date=', rating_date)
            print('review_title=', title)
            
            exp_date = self.get_date_of_experience(review)
            print('date of experience: ', exp_date)
            
            txt = self.get_review_text(review)
            print('text: ', txt)
            
            user_name, user_location = self.get_user_details(review)
            print(f'user_name: {user_name}, user_location: {user_location}')
                
                
    
    def save(self, file):
        
        if not os.path.exists('data'):
            os.mkdir('data')
        
        json.dump(self.attractions, open(os.path.join('data', file), 'w'))
        
        return self
        

In [9]:
if __name__ == '__main__':

    # open a (non-headless) browser session
    ta = Tareviews()

    # print what can be scraped from a single attraction page
    ta.get_attraction_info(
        'https://www.tripadvisor.com.au/Attraction_Review-g255100-d522360-Reviews-Shrine_of_Remembrance-Melbourne_Victoria.html'
    )

defaultdict(None, {'address': 'st. kilda road, melbourne, victoria 3001, australia', 'about': 'Visit the Shrine of Remembrance, Melbourne\'s most iconic landmark, where Victorians have been coming since 1934 to honour the service and sacrifice of Australian men and women in war and peacekeeping. Enter the Shrine and experience the quiet solitude of the Sanctuary where hundreds of thousands pay their respects each year. All visitors are invited to participate in a Remembrance Ceremony featuring the Ray of Light re-enactment. Surrounded by the Shrine Reserve, the Shrine is elevated, overlooking the city of Melbourne within 13 hectares of beautiful parkland. Enjoy stunning views of the Reserve and many of Melbourne\'s landmarks from the balcony of the Shrine. Explore the monuments and memorials throughout the Reserve, including the Gallipoli Memorial, as well as the many remembrance trees dedicated to Victorian service units from the Boer War onwards. The Eternal Flame on the Second World