In [14]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time
import arrow

In [38]:
class Attraction:
    """
    Plain record describing one attraction.

    Every field starts out as None. Most setters validate what they are given
    and silently keep the previous value when the new one is rejected.
    """

    def __init__(self):
        self.name_ = self.about_ = self.attr_url_ = None
        self.rank_ = self.address_ = self.reviews_ = None
        self.rating_ = self.cat_ = self.id_ = None

    @property
    def name(self):
        return self.name_

    @name.setter
    def name(self, s):
        # only non-blank strings are accepted; stored trimmed and lower-cased
        if not isinstance(s, str):
            return
        trimmed = s.strip()
        if trimmed:
            self.name_ = trimmed.lower()

    @property
    def about(self):
        return self.about_

    @about.setter
    def about(self, s):
        # only non-blank strings are accepted; stored exactly as given
        if not isinstance(s, str):
            return
        if s.strip():
            self.about_ = s

    @property
    def attr_id(self):
        return self.id_

    @attr_id.setter
    def attr_id(self, s):
        # only non-blank strings are accepted; stored exactly as given
        if not isinstance(s, str):
            return
        if s.strip():
            self.id_ = s

    @property
    def attr_url(self):
        return self.attr_url_

    @attr_url.setter
    def attr_url(self, s):
        # only non-blank strings are accepted; stored exactly as given
        if not isinstance(s, str):
            return
        if s.strip():
            self.attr_url_ = s

    @property
    def rank(self):
        return self.rank_

    @rank.setter
    def rank(self, r):
        # no validation: any value (including None) replaces the current rank
        self.rank_ = r

    @property
    def address(self):
        return self.address_

    @address.setter
    def address(self, s):
        # only non-blank strings are accepted; stored trimmed and lower-cased
        if not isinstance(s, str):
            return
        trimmed = s.strip()
        if trimmed:
            self.address_ = trimmed.lower()

    @property
    def cat(self):
        return self.cat_

    @cat.setter
    def cat(self, s):
        # any string is accepted, the empty string included
        if not isinstance(s, str):
            return
        self.cat_ = s

    @property
    def reviews(self):
        return self.reviews_

    @reviews.setter
    def reviews(self, s):
        # only integers are accepted
        if not isinstance(s, int):
            return
        self.reviews_ = s

    @property
    def rating(self):
        return self.rating_

    @rating.setter
    def rating(self, s):
        # no validation: any value replaces the current rating
        self.rating_ = s

    def to_dict(self):
        """Return the stored fields as a dict keyed by their public names."""
        keys = ('name', 'about', 'attr_url', 'rank', 'address',
                'reviews', 'rating', 'cat', 'id')
        # each public key maps onto the backing attribute "<key>_"
        return {key: getattr(self, key + '_') for key in keys}
        
class Review:
    """
    Plain record describing one review.

    All fields start as None. The string-valued setters ignore anything that
    is not a str; the rating setter stores whatever it is given.
    """

    def __init__(self):
        self.id_ = self.title_ = self.text_ = None
        self.rating_ = self.date_of_experience_ = None

    @property
    def review_id(self):
        return self.id_

    @review_id.setter
    def review_id(self, s):
        if not isinstance(s, str):
            return
        self.id_ = s

    @property
    def title(self):
        return self.title_

    @title.setter
    def title(self, s):
        if not isinstance(s, str):
            return
        self.title_ = s

    @property
    def text(self):
        return self.text_

    @text.setter
    def text(self, s):
        if not isinstance(s, str):
            return
        self.text_ = s

    @property
    def rating(self):
        return self.rating_

    @rating.setter
    def rating(self, s):
        # no validation: any value replaces the current rating
        self.rating_ = s

    @property
    def date_of_experience(self):
        return self.date_of_experience_

    @date_of_experience.setter
    def date_of_experience(self, s):
        if not isinstance(s, str):
            return
        self.date_of_experience_ = s
            
class User:
    """
    Plain record describing one reviewer.

    All fields start as None. Setters ignore non-string values; the name is
    additionally required to be non-blank and is stored trimmed.
    """

    def __init__(self):
        self.name_ = self.age_ = None
        self.gender_ = self.loc_ = None

    @property
    def name(self):
        return self.name_

    @name.setter
    def name(self, s):
        # blank or non-string names are ignored; case is preserved
        if not isinstance(s, str):
            return
        trimmed = s.strip()
        if trimmed:
            self.name_ = trimmed

    @property
    def age(self):
        return self.age_

    @age.setter
    def age(self, s):
        if not isinstance(s, str):
            return
        self.age_ = s

    @property
    def gender(self):
        return self.gender_

    @gender.setter
    def gender(self, s):
        if not isinstance(s, str):
            return
        self.gender_ = s

    @property
    def loc(self):
        return self.loc_

    @loc.setter
    def loc(self, s):
        if not isinstance(s, str):
            return
        self.loc_ = s
    
            
class Tareviews:
    
    def __init__(self, headless=False, max_ranking=30):

        """
        configure Chrome, load the already known user nicknames and start the browser

        headless: when true, Chrome is started with the --headless flag
        max_ranking: stored as self.MAX_RANKING (not referenced by the methods visible here)
        """

        options = webdriver.ChromeOptions()
        for flag in ('--ignore-certificate-errors',
                     '--ignore-ssl-errors',
                     '--incognito',
                     '--start-maximized'):
            options.add_argument(flag)

        if headless:
            options.add_argument('--headless')

        self.MAX_RANKING = max_ranking

        self.attraction_ids = set()
        self.attractions = []

        # a missing, unreadable or malformed users file simply means "no known nicknames yet";
        # the file is opened in a with-block so the handle is always closed
        try:
            with open(os.path.join('data', 'users.json')) as users_file:
                self.user_nicknames = {u['name'] for u in json.load(users_file)}
        except (OSError, ValueError, KeyError, TypeError):
            self.user_nicknames = set()

        print(f'available user nicknames: {len(self.user_nicknames)}')

        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)
        
    def get_attraction_basic_info(self, attr_item):

        """
        collects basic attraction information from the attraction list (NOT on individual attraction pages!)
        handles both the top ranked and normal attractions
        returns an instance of the Attraction class

        an item is treated as a top attraction when a ranking position can be read from it.
        every lookup except the top attraction info block is best-effort: a field that
        cannot be found or parsed is left at None. a missing info block on a ranked item raises.
        """

        attraction = Attraction()

        pref = 'attractions-attraction-overview-main-TopPOIs__'

        def _fill_name_and_link(anchor):
            # name, url and the "d<digits>" id taken from the url all come from the same <a>
            attraction.name = anchor.text.strip().lower()
            attraction.attr_url = anchor.get_attribute('href')
            attraction.attr_id = re.search(r'd\d+', attraction.attr_url).group(0)

        def _fill_reviews_and_rating(rating_div):
            review_counts = rating_div.text.strip().lower()
            # accept any number of thousands separators, e.g. "1,234,567 reviews"
            attraction.reviews = int(re.search(r'\d+(?:,\d+)*', review_counts).group(0).replace(',', ''))
            rating_span = rating_div.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]')
            # class "bubble_45" stands for a 4.5 rating
            attraction.rating = int(re.search(r'(?<=bubble_)\d+', rating_span.get_attribute('class')).group(0))/10

        # try to find attraction ranking; if successful, it's one of the top attractions, otherwise it's a normal attraction
        try:
            attraction.rank = int(attr_item.find_element_by_xpath(f'.//div[contains(@class, "{pref}item_position--")]').text.strip())
        except Exception:
            pass

        if attraction.rank:

            info = attr_item.find_element_by_xpath(f'.//div[contains(@class, "{pref}info--")]')

            # when several category tags carry text, the last one wins
            for tag in info.find_elements_by_xpath('.//span[contains(@class, "attractions-commerce-CategoryTag__category_tag--")]'):
                if tag.text.strip():
                    attraction.cat = tag.text.lower().strip()

            try:
                _fill_name_and_link(info.find_element_by_xpath(f'.//a[contains(@class, "{pref}name--")]'))
            except Exception:
                pass

            try:
                _fill_reviews_and_rating(info.find_element_by_xpath('.//div[@class="ui_poi_review_rating"]'))
            except Exception:
                pass

        else:

            try:
                attraction.cat = attr_item.find_element_by_xpath('.//div[@class="tag_line"]').text.lower().strip()
            except Exception:
                pass

            try:
                title_block = attr_item.find_element_by_css_selector('div.listing_title')
                _fill_name_and_link(title_block.find_element_by_xpath('.//a[@href]'))
            except Exception:
                pass

            try:
                _fill_reviews_and_rating(attr_item.find_element_by_xpath('.//div[@class="listing_rating"]'))
            except Exception:
                pass

        return attraction
    
    def check_for_popup(self):
        
        try:
            pp = self.driver.find_element_by_id('BODY_BLOCK_JQUERY_REFLOW')
            self.driver.switch_to.default_content()
            return True
        except:
            return False
            
    
    def pagination_on_attraction_list_pages(self, pagination_wrapper, is_top):
        
        """
        
        check status of relevant pagination buttons on theattraction list pages (both top and normal attractions)
        
        """
        
        previous_button = selected_button = last_page = next_button = None
        
        if is_top:
            
            # previous button
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__disabled--")]'):
                _text = _.text.lower().strip()
                if _text == 'previous':
                    previous_button = _ 
                    break
        else:
            
            try:
                previous_button = pagination_wrapper.find_element_by_xpath('.//div/a[contains(@class, "previous")]')
            except:
                pass
            
        if is_top:   
            # selected button
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__selected--")]'):
                _text = _.text.lower().strip()
                if _text.isdigit():
                    selected_button = int(_text)
                    break
        else:
            
            try:
                selected_button = int(pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]/span[contains(@class, "current")]').text.strip().lower())
            except:
                pass
            
        if is_top:
                
            # last page button
            visible_page_numbers = []
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__link--")]'):
                _text = _.text.lower().strip()
                if _text.isdigit():
                    visible_page_numbers.append(int(_text))
            last_page = max(visible_page_numbers)
       
        else:
            
            try:
                visible_page_numbers = [int(_) for _ in pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]') \
                                        .text.strip().lower().split() if _.isdigit()]
                        
                last_page = max(visible_page_numbers)
                
            except:
                pass
            
        if is_top:
            
            # next button
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__button--")]'):
                _a = _.find_element_by_xpath('.//a')
                if _a and _a.text.strip().lower() == 'next':
                    next_button = _a
                    break
        else:
            
            try:
                
                next_button = WebDriverWait(self.driver, 15).until(EC.element_to_be_clickable((By.XPATH, 
                                                                      '//div[contains(@class, "pagination")]/a[contains(@class, "next")]')))
            except:
                pass
            
                
        return (previous_button, selected_button, next_button, last_page)
    
    def check_pagination_reviews(self, pagination_wrapper):
        
        previous_button = selected_button = last_page = next_button = None
        
        # previous button
        
        try:
            for _ in pagination_wrapper.find_elements_by_xpath('.//a[contains(@class, "previous")]'):
                _text = _.text.lower().strip()
                if _text == 'previous':
                    previous_button = _ 
                    break
        except:
            pass
                
        # selected button
        
        try:
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[@class="pageNumbers"]/a[contains(@class, "current")]'):
                _text = _.text.lower().strip()
                if _text.isdigit():
                    selected_button = int(_text)
                    break
        except:
            pass
                
        # last page button
        
        try:
            last_page = max([int(t.text) for t in pagination_wrapper.find_elements_by_xpath('.//div[@class="pageNumbers"]/a') if t.text.isdigit()])
        except:
            pass
        
        # next button
        
        try:
            for _a in pagination_wrapper.find_elements_by_xpath('.//a[contains(@class, "next")]'):
                if _a and _a.text.strip().lower() == 'next':
                    next_button = _a
                    break
        except:
            pass
                
        return (previous_button, selected_button, next_button, last_page)
          
    def get_attractions_basic_info(self, url):

        """
        open url and walk through the attraction list pages, collecting basic info for every attraction

        if the page shows the top attractions block, the initial set is read first, then See More is
        clicked and the extended block is read, then Next is clicked once and the remaining pages are
        processed as normal list pages until the last page is reached.

        every attraction found is appended to self.attractions; its id (when available) is added to
        self.attraction_ids. the scan stops instead of failing when the pagination state cannot be read.

        returns self so that calls can be chained
        """

        self.driver.get(url)

        pref = 'attractions-attraction-overview-main-TopPOIs__'

        # is this the top attraction overview page?
        try:
            top_attractions_title = self.driver.find_element_by_xpath(f'//div[contains(@class, "{pref}title--")]')
        except Exception:
            top_attractions_title = None

        if top_attractions_title:
            print(top_attractions_title.text.strip())
            is_top = True
            # wait for the top attractions block
            WebDriverWait(self.driver, 15) \
                .until(EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "{pref}wrapper--")]')))
        else:
            is_top = False

        see_more_clicked = False
        keep_going = True

        while keep_going:

            filtered_list = self.driver.find_element_by_xpath('//div[@id="FILTERED_LIST"]')

            if is_top:

                if (not see_more_clicked):
                    lst = filtered_list.find_element_by_xpath(f'.//div[contains(@class, "{pref}initial_set--")]')
                else:
                    # we'll browse the additional block that appeared after See More was clicked
                    lst = filtered_list.find_element_by_xpath(f'.//div/div[contains(@class, "{pref}wrapper--")]')

                for i in lst.find_elements_by_xpath(f'.//li[contains(@class, "{pref}item--")]'):

                    attraction = self.get_attraction_basic_info(i)

                    if attraction.attr_id:
                        self.attraction_ids.add(attraction.attr_id)

                    self.attractions.append(attraction)

                # got through the attractions on the initial list. now what? click on See More
                if not see_more_clicked:

                    self.driver.find_element_by_xpath(f'//div[contains(@class, "{pref}see_more--")]').click()
                    see_more_clicked = True
                    continue

                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH,
                                                                      '//div[contains(@class, "attractions-attraction-overview-main-Pagination__wrapper--")]')))

                previous_button, selected_button, next_button, last_page = self.pagination_on_attraction_list_pages(pagination_wrapper, is_top=is_top)

                # without a Next button there is nothing beyond the top attractions
                if next_button is None:
                    print('no next button after the top attractions, stopping here')
                    break

                # click once after See More to start looking at the beyond top-30 attractions
                next_button.click()
                is_top = False

            else:

                for d in filtered_list.find_elements_by_xpath('.//div[@class="attraction_element_tall"]'):

                    attraction = self.get_attraction_basic_info(d)

                    if attraction.attr_id:
                        self.attraction_ids.add(attraction.attr_id)

                    self.attractions.append(attraction)

                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.XPATH,
                                                                      '//div[@class="pagination"]')))

                previous_button, selected_button, next_button, last_page = self.pagination_on_attraction_list_pages(pagination_wrapper, is_top=is_top)

                print(f'page {selected_button}/{last_page}...')

                # find next button; there's no next button on the last page.
                # page numbers that could not be read (None) also end the scan instead of raising a TypeError
                more_pages = (selected_button is not None
                              and last_page is not None
                              and selected_button < last_page
                              and next_button)

                if more_pages:
                    next_button.send_keys(Keys.ENTER)
                    # fixed pause after moving to the next page (presumably to let it load)
                    time.sleep(4)
                else:
                    keep_going = False
                    print('selected button is ', selected_button, ' last page is ', last_page)
                    print('this is the last page!')

        # the driver is deliberately left open: get_attraction_info keeps using self.driver

        return self
    
    def get_attractions_extras(self):
        
        attractions_ = []
        
        for i, a in enumerate(self.attractions, 1):
            a = self.get_attraction_info(a)
            attractions_.append(a)
            
            print(a.to_dict())
            
            if i == 10:
                break
            
        self.attractions = attractions_
        
        return self
    
    
    def get_user_details(self, review_container):
        
        """
        find and return user name and location
        """
        
        user_name = user_location = None
        
        try:
            info_text = review_container.find_element_by_css_selector('div.info_text')
        except:
            return (user_name, user_location)
        
        try:
            user_name = info_text.find_element_by_xpath('.//div').text
        except:
            pass
                
        try:
            user_location = info_text.find_element_by_xpath('.//div[@class="userLoc"]').text
        except:
            pass
        
        info_text.click()
        
        time.sleep(2)
        
        try:
            t_ = self.driver.find_element_by_css_selector('ul.memberdescriptionReviewEnhancements').text
            print(t_)
        except:
            print('no member description text')
            t_ = None
            
        if t_:
            
            try:
                age_bracket = re.search(r'\d+\-\d+', t_).group(0)
            except:
                print('no age bracket!')
            
            gender = 'm' if 'man' in t_ else 'f' if 'woman' in t_ else None
            
        try:
            self.driver.find_element_by_css_selector('body > span > div.ui_close_x').click()
        except:
            pass
        
        return (user_name, user_location)
    
    def get_review_rating(self, review_container):
        
        rating = None
        
        try:
            rating = int(re.search(r'(?<=bubble_)\d+', review_container.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]').get_attribute('class')).group(0))/10
        except:
            pass
        
        return rating
    
    def get_review_rating_date(self, review_container):
        
        rating_date = None
        
        try:
            rating_date = review_container.find_element_by_xpath('.//span[@class="ratingDate"]').get_attribute('title')
        except:
            pass
        
        return rating_date
    
    def get_review_title(self, review_container):
        
        review_title = None
        
        try:
            review_title = review_container.find_element_by_xpath('.//a[contains(@class, "title")]').text
        except:
            pass 
        
        return review_title
    
    def get_date_of_experience(self, review_container):
        
        exp_date = None
        
        try:
            exp_date = review_container.find_element_by_xpath('.//div[contains(@class, "prw_reviews_stay_date_hsx")]').text.split(':')[-1].strip()
        except:
            pass 
        
        # 31 January 2019
        return arrow.get(exp_date, 'MMMM YYYY').format('MM/YYYY')
    
    def get_review_text(self, review_container):
        
        review_text = None
        
        for _ in review_container.find_elements_by_xpath('.//span[contains(@class, "ulBlueLinks")]'):
            if 'more' in _.text.lower():
                _.click()
                time.sleep(2)
                break
        
        try:
            review_text = review_container.find_element_by_xpath('.//p[@class="partial_entry"]').text
        except:
            pass
        
        return review_text
    
    def get_attraction_info(self, attraction):

        """
        go to the attraction page and get all useful info;
        - some attractions have NO REVIEWS

        updates attraction.reviews (0 when the count cannot be read), attraction.address and
        attraction.about and returns the same attraction object.
        if the attraction url cannot be opened, the attraction is returned unchanged.
        raises if the REVIEWS block does not become visible within 15 seconds
        """

        try:
            self.driver.get(attraction.attr_url)
        except Exception:
            print(f"can't get attraction url {attraction.attr_url}")
            return attraction

        reviews_block = WebDriverWait(self.driver, 15).until(EC.visibility_of_element_located((By.ID, 'REVIEWS')))

        try:
            header_text = reviews_block.find_element_by_class_name('reviews_header_count').text
            number_reviews = int(''.join(ch for ch in header_text if ch.isdigit()))
        except Exception:
            print(f"can't find the number of reviews for attraction id {attraction.attr_id}!")
            number_reviews = 0
        else:
            # compare only when an earlier count exists; comparing with None would raise
            # and a correctly parsed count would wrongly be reset to 0
            if attraction.reviews is not None:
                if number_reviews > attraction.reviews:
                    print(f'warning: number of reviews for attraction {attraction.attr_id} increased to {number_reviews} (was {attraction.reviews})!')
                elif number_reviews < attraction.reviews:
                    print(f'warning: number of reviews for attraction {attraction.attr_id} decreased to {number_reviews} (was {attraction.reviews})!')

        attraction.reviews = number_reviews

        try:
            attraction.address = self.driver.find_element_by_css_selector('div.detail_section.address').text.lower().strip()
        except Exception:
            print(f"can't find address for attraction id {attraction.attr_id}!")

        about = None

        try:
            # if theres an option to extend description via clicking More, do it
            self.driver.find_element_by_xpath('.//span[contains(@class, "attractions-attraction-detail-about-card-Description__readMore--")]').click()
            about = self.driver.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-detail-about-card-Description__modalText--")]').text
            # close the window with full description
            self.driver.find_element_by_xpath('.//div[contains(@class, "overlays-pieces-CloseX__close--")]').click()
        except Exception:
            try:
                # if description is short, just pick it up
                about = self.driver.find_element_by_xpath('.//div[contains(@class, "attractions-attraction-detail-about-card-AttractionDetailAboutCard__section--") and not(contains(@class, "title"))]').text
            except Exception:
                # no description section at all; keep whatever was read so far (possibly nothing)
                print(f"can't find description for attraction id {attraction.attr_id}!")

        attraction.about = about

        return attraction
        
#         while True:
            
#             pagi = self.driver.find_element_by_xpath('//div[contains(@class, "ui_pagination")]')

#             previous_button, selected_button, next_button, last_page = self.check_pagination_reviews(pagi)
            
#             for review in self.driver.find_elements_by_xpath('//div[@class="review-container"]'):
 
#                 id = review.get_attribute('data-reviewid')
                
#                 rating = self.get_review_rating(review)
                
#                 print('review id=', id)
#                 print('review rating=', rating)
                
#                 rating_date = self.get_review_rating_date(review)
                
#                 title = self.get_review_title(review)
                
#                 print('rating date=', rating_date)
#                 print('review_title=', title)
                
#                 exp_date = self.get_date_of_experience(review)
                
#                 print('date of experience: ', exp_date)
                
#                 txt = self.get_review_text(review)
                
#                 print('text: ', txt)
                
#                 user_name, user_location = self.get_user_details(review)
                
#                 print(f'user_name: {user_name}, user_location: {user_location}')
                
#             break
                
                
    
    def save(self, file):
        
        if not os.path.exists('data'):
            os.mkdir('data')
        
        json.dump([a.to_dict() for a in self.attractions], open(os.path.join('data', file), 'w'))
        
        return self
        

In [39]:
if __name__ == '__main__':

    # scrape the Melbourne attraction list, enrich the attractions from their own pages
    # and store the result in data/attractions.json
    ta = Tareviews()

    try:
        ta.get_attractions_basic_info('https://www.tripadvisor.com.au/Attractions-g255100-Activities-Melbourne_Victoria.html') \
        .get_attractions_extras() \
        .save('attractions.json')
    finally:
        # always shut the browser down, even when scraping fails part-way
        ta.driver.quit()

Top Attractions in Melbourne
page 2/16...
page 3/16...
page 4/16...
page 5/16...
page 6/16...
page 7/16...
page 8/16...
page 9/16...
page 10/16...
page 11/16...
page 12/16...
page 13/16...
page 14/16...
page 15/16...
page 16/16...
selected button is  16  last page is  16
this is the last page!
{'name': 'melbourne cricket ground (mcg)', 'about': "The Melbourne Cricket Ground (MCG) is Australia's largest, oldest and most popular sporting venue. The MCG has hosted plenty of international cricket, including the first-ever Test and the 1992 World Cup final, countless VFL/AFL Grand Finals, the 1956 Olympic Games and 2006 Commonwealth Games. It also hosted the final of ICC Cricket World Cup 2015. Other sporting spectacles include FIFA World Cup soccer qualifiers, rugby league home and away matches and State of Origin and international rugby union clashes. Apart from its sporting events, the MCG has also witnessed many blockbuster music concerts, and even Pope John Paul II held a mass there wh