In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time
import arrow
import random

from attraction import Attraction
from review import Review
from user import User

In [2]:
class Tareviews:
    
    def __init__(self, headless=False, save_every=20, min_reviews=20, min_year=2012):
        
        """
        note: because there doesn't seem to be a working way to get rid of the annoying "allow location" notifications
        in Chrome at the moment, we don't implement the tart when you search for a location forst and then go to the things-to-do 
        page. Instead, as a temporary solution, we hardcode a number of the things-to-do page urls to choose from. 
        """
        
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--incognito')
        options.add_argument('--start-maximized')
        prefs = {"profile.default_content_setting_values.notifications" : 2}
        options.add_experimental_option("prefs",prefs)
        
        if headless:
            options.add_argument('--headless')
        
        self.locations = {'sydney': 'https://www.tripadvisor.com.au/Attractions-g255060-Activities-Sydney_New_South_Wales.html',
                         'melbourne': 'https://www.tripadvisor.com.au/Attractions-g255100-Activities-Melbourne_Victoria.html',
                         'perth': 'https://www.tripadvisor.com.au/Attractions-g255103-Activities-Perth_Greater_Perth_Western_Australia.html',
                         'brisbane': 'https://www.tripadvisor.com.au/Attractions-g255068-Activities-Brisbane_Brisbane_Region_Queensland.html',
                         'adelaide': 'https://www.tripadvisor.com.au/Attractions-g255093-Activities-Adelaide_Greater_Adelaide_South_Australia.html',
                         'hobart': 'https://www.tripadvisor.com.au/Attractions-g255097-Activities-Hobart_Greater_Hobart_Tasmania.html',
                         'darwin': 'https://www.tripadvisor.com.au/Attractions-g255066-Activities-Darwin_Top_End_Northern_Territory.html',
                         'canberra': 'https://www.tripadvisor.com.au/Attractions-g255057-Activities-Canberra_Australian_Capital_Territory.html'}
        
        self.attractions = []
        self.reviews = []
        self.users = []
        
        self.attraction_ids = set()
        self.review_ids = set()

        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)
        self.WAIT_SEC = 20
        self.WAIT_SEC_SHORT = 8
        
        self.min_reviews = min_reviews
        self.min_year = min_year
        
        self.save_every = save_every
        
        self.location = 'hobart'
    
    def click_and_wait(self, el, sec):
        
        clicked = False
        
        t = 0
        
        while t < 3:
            
            try:
                el.click()
                clicked = True
                break
            except:
                try:
                    el.send_keys(Key.RETURN)
                    clicked = True
                    break
                except:
                    self.driver.execute_script("window.scrollBy(0,50);")
                    try:
                        el.click()
                        clicked = True
                        break
                    except:
                        self.driver.execute_script("window.scrollBy(0,-50);")
            t += 1
                    
            
        if clicked:
            
            time.sleep(random.choice(range(sec,sec+3)))
            
        return clicked
            
        
    def get_attr_info(self, attr_item):
        
        """
        collects basic attraction information for A SINGLE ATTRACTION from the attraction list (NOT on individual attraction pages!)
        handles both the top ranked and normal attractions
        returns an instance of the Attraction class 
        
        note: the number of reviews on the top attraction list is not always the same as on the individual attraction pages (for whatever reason)!
               - we trust the number on the attraction page more
               - so, no need to collect the number of reviews on the top attraction list
        """
        
        attraction = Attraction()
        
        pref = 'attractions-attraction-overview-main-TopPOIs__'
            
        # try to find attraction ranking; if successfull, it's one of the top attractions, otherwise it's a normal attraction
        try:
            attraction.rank = int(attr_item.find_element_by_xpath(f'.//div[contains(@class, "{pref}item_position--")]').text.strip())
        except:
            pass
            
        if attraction.rank:
     
            info = attr_item.find_element_by_xpath(f'.//div[contains(@class, "{pref}info--")]')
          
            try:
                a_with_name = info.find_element_by_xpath(f'.//a[contains(@class, "{pref}name--")]')
                attraction.name = a_with_name.text.strip().lower()
                attraction.attr_url = a_with_name.get_attribute('href')
                attraction.attr_id = re.search(r'd\d+', attraction.attr_url).group(0)
            except:
                pass
            
        else:
            
            try:
                title_block = attr_item.find_element_by_css_selector('div.listing_title')
                a_with_name = title_block.find_element_by_xpath('.//a[@href]')
                attraction.name = a_with_name.text.strip().lower()
                attraction.attr_url = a_with_name.get_attribute('href')
                attraction.attr_id = re.search(r'd\d+', attraction.attr_url).group(0)
            except:
                pass
            
        return attraction
    
    def get_attrs_info(self, location, use_local=False):
        
        """
        collect basic attraction information FOR ALL ATTRACTIONS from the attraction list
        """
        
        self.location = location.lower().strip()
        
        if not self.location in self.locations:
            raise Exception(f'your location ({location}) is not supported!')
            
        self.ATR_FILE = os.path.join('data', f'attractions_{self.location}.json')
        self.USR_FILE = os.path.join('data', f'users_{self.location}.json')
        self.REV_FILE = os.path.join('data', f'reviews_{self.location}.json')
        
        if use_local:
            
            # check for a local attraction file
            try:
                self.attractions = [Attraction().from_dict(a) for a in json.load(open(self.ATR_FILE))] 
                self.attraction_ids = {r.attr_id for r in self.attractions}
                print(f'found {len(self.attraction_ids)} attractions stored locally')
            except:
                print('no locally stored attractions!')
                
            # check for a local review file; any reviews with ids found in there are comsidered complete and
            # we are not updating these
            try:
                self.reviews = [Review().from_dict(r) for r in json.load(open(self.REV_FILE))]
                self.review_ids = {r.review_id for r in self.reviews if r.review_id}
                print(f'found {len(self.review_ids)} reviews stored locally')
            except:
                print('no locally stored reviews!')
                
            # check for a local user file
            try:
                self.users = [User().from_dict(r) for r in json.load(open(self.USR_FILE))]
                self.user_ids = {r.name for r in self.users}
                print(f'found {len(self.user_ids)} user names stored locally')
            except:
                print('no locally stored users!')
            
            if len(self.attractions) > 0:
                return self
            
        print(f'attractions: {len(self.attractions)}, reviews: {len(self.reviews)}, users: {len(self.users)}')
            
        print(f'browsing the attraction list for {self.location.upper()}...')
        
        self.driver.get(self.locations[self.location])
        
        pref = 'attractions-attraction-overview-main-TopPOIs__'
        
        # is this the top attraction overview page?  
        try:
            top_attractions_title = self.driver.find_element_by_xpath(f'//div[contains(@class, "{pref}title--")]')
        except:
            top_attractions_title = None
            
        if top_attractions_title:
            is_top = True
            # wait for the top attractions block
            WebDriverWait(self.driver, self.WAIT_SEC) \
                .until(EC.presence_of_element_located((By.XPATH, f'//div[contains(@class, "{pref}wrapper--")]')))
        else:
            is_top = False
            
        
        see_more_clicked = False                          
        keep_going = True
                                             
        while keep_going:
            
            filtered_list = self.driver.find_element_by_xpath('//div[@id="FILTERED_LIST"]')
            
            if is_top:
                
                if (not see_more_clicked):
                    lst = filtered_list.find_element_by_xpath(f'.//div[contains(@class, "{pref}initial_set--")]')
                else:
                    # we'll browse the additional block that appeared after See More was clicked
                    lst = filtered_list.find_element_by_xpath(f'.//div/div[contains(@class, "{pref}wrapper--")]')
                
                for j, i in enumerate(lst.find_elements_by_xpath(f'.//li[contains(@class, "{pref}item--")]'), 1):
                
                    attraction = self.get_attr_info(i)
                    
                    if attraction.attr_id:
                        self.attraction_ids.add(attraction.attr_id)
                    
                    self.attractions.append(attraction)
 
                # got through the attractions on the initial list. now what? click on See More
                if not see_more_clicked:
                
                    self.driver.find_element_by_xpath(f'//div[contains(@class, "{pref}see_more--")]').click()
                    
                    time.sleep(random.choice(range(2,5)))
                    see_more_clicked = True
                    continue
                
                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, self.WAIT_SEC) \
                                    .until(EC.visibility_of_element_located((By.XPATH, 
                                    '//div[contains(@class, "attractions-attraction-overview-main-Pagination__wrapper--")]')))
            
            
                previous_button, selected_button, next_button, last_page = \
                            self.pagination_on_attraction_list_pages(pagination_wrapper, is_top=is_top)
                    
                self.click_and_wait(next_button, 3)
                
                is_top = False
                
            else:
                
                for d in filtered_list.find_elements_by_xpath('.//div[@class="attraction_element_tall"]'):
                    
                    attraction = self.get_attr_info(d)
                    
                    if attraction.attr_id:
                        self.attraction_ids.add(attraction.attr_id)
                        
                    self.attractions.append(attraction)
                    
                # wait for the pagination wrapper
                pagination_wrapper = WebDriverWait(self.driver, self.WAIT_SEC) \
                                .until(EC.visibility_of_element_located((By.XPATH, '//div[@class="pagination"]')))

                previous_button, selected_button, next_button, last_page = \
                self.pagination_on_attraction_list_pages(pagination_wrapper, is_top=is_top)
                
                
                
                if (selected_button < last_page) and next_button:
                    
                    next_clicked = False
                    
                    while not next_clicked:
                        
                        status = self.click_and_wait(next_button, 2)
                    
                        if not status:
                            
                            print('warning: couldn\'t click the NEXT button! retrying..')
                            
                            try:
                                self.driver.find_element_by_css_selector('#BODY_BLOCK_JQUERY_REFLOW > div.QSISlider.SI_aWPlGTVzhBjDiCh_SliderContainer > div:nth-child(8) > div').click()
                                print('clicked the ad!')
                            except:
                                self.driver.execute_script("window.scrollBy(0,50);")
#                                 self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                                print('scrolled..')
                        else:
                            next_clicked = True         
                    
                else:
                    
                    keep_going = False
        
        print(f'done. found {len(self.attractions)} attractions')
        
        total_attrs = len(self.attractions)
        
        print(f'looking for additional attraction information...')
        
        attractions_ = []
        
        t0 = time.time()
        
        for i, a in enumerate(self.attractions, 1):
                
            a = self.get_attr_about_and_address(a)
            attractions_.append(a)
            
            m, s = divmod(time.time() - t0, 60)
            
            print(f'{i}/{total_attrs} ({100*i/total_attrs:03.1f}%) done. elapsed time: {m:02.0f} min {s:02.0f} sec')
            
        self.attractions = attractions_
        
        self.save(what=['attractions'])
        
        return self
            
    
    def pagination_on_attraction_list_pages(self, pagination_wrapper, is_top):
        
        """
        
        check status of relevant pagination buttons on the attraction list pages (both top and normal attractions)
        
        """
        
        previous_button = selected_button = last_page = next_button = None
        
        if is_top:
            
            # previous button
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__disabled--")]'):
                _text = _.text.lower().strip()
                if _text == 'previous':
                    previous_button = _ 
                    break
        else:
            
            try:
                previous_button = pagination_wrapper.find_element_by_xpath('.//div/a[contains(@class, "previous")]')
            except:
                pass
            
        if is_top:   
            # selected button
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__selected--")]'):
                _text = _.text.lower().strip()
                if _text.isdigit():
                    selected_button = int(_text)
                    break
        else:
            
            try:
                selected_button = int(pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]/span[contains(@class, "current")]').text.strip().lower())
            except:
                pass
            
        if is_top:
                
            # last page button
            visible_page_numbers = []
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__link--")]'):
                _text = _.text.lower().strip()
                if _text.isdigit():
                    visible_page_numbers.append(int(_text))
            last_page = max(visible_page_numbers)
       
        else:
            
            try:
                visible_page_numbers = [int(_) for _ in pagination_wrapper.find_element_by_xpath('.//div[@class="pageNumbers"]') \
                                        .text.strip().lower().split() if _.isdigit()]
                        
                last_page = max(visible_page_numbers)
                
            except:
                pass
            
        if is_top:
            
            # next button
            for _ in pagination_wrapper.find_elements_by_xpath('.//div[contains(@class, "attractions-attraction-overview-main-Pagination__button--")]'):
                _a = _.find_element_by_xpath('.//a')
                if _a and _a.text.strip().lower() == 'next':
                    next_button = _a
                    break
        else:
            
            try:
                
                next_button = WebDriverWait(self.driver, self.WAIT_SEC) \
                            .until(EC.element_to_be_clickable((By.XPATH, '//div[contains(@class, "pagination")]/a[contains(@class, "next")]')))
            except:
                pass
            
                
        return (previous_button, selected_button, next_button, last_page)
    
    def check_pagination_reviews(self):
        
        previous_button = selected_button = last_page = next_button = None
        
        # run until all buttons get some value
        try:
            previous_button = self.driver.find_element_by_css_selector('div.unified.ui_pagination>a.nav.previous.ui_button.secondary')
        except:
            print('review pagination: no PREVIOUS button found!')
                    
        # selected button, element
        try:
            selected_button = self.driver.find_element_by_css_selector('div.pageNumbers>a.pageNum.current')
        except:
            try:
                selected_button = self.driver.find_element_by_css_selector('div.pageNumbers>a.pageNum.last.current')
            except:
                print('review pagination: no SELECTED button found')
        
        try:
            page_numbers = self.driver.find_elements_by_css_selector('div.mobile-more>div>div.unified.ui_pagination>div.pageNumbers>a[data-page-number]')
        except:
            page_numbers = None
            
        if page_numbers:
            # last page; integer number
            ns = [t.get_attribute('data-page-number') for t in page_numbers]
            last_page = max([int(s) for s in ns])
        else:
            print('review pagination: NO PAGE NUMBERS found')
            last_page = None
        
        # next button; element  
        # can become nav next ui_button primary disabled
        try:
            next_button = self.driver.find_element_by_css_selector('div.unified.ui_pagination>a.nav.next')
        except:
            print('review pagination: no NEXT button found!')
        
        return (previous_button, selected_button, next_button, last_page)
    
    
    def get_user_details(self, review_container):
        
        """
        find and return user name and location
        """
        
        user = User()
        
        try:
            info_text = review_container.find_element_by_css_selector('div.memberOverlayLink.clickable>div.info_text')
        except:
            info_text = None
            print(f'no infotext..')
        
        if info_text:
            
            try:
                user.name = info_text.find_element_by_xpath('.//div').text
            except:
                print('can\'t find user nickname!')
            
            try:
                user.loc = info_text.find_element_by_xpath('.//div[@class="userLoc"]').text
            except:
                pass
            
            try:
                info_text.click()
                time.sleep(random.choice(range(3,6)))
            except:
                print('didnt click infotext!')
            
            try:
                t_ = WebDriverWait(self.driver, self.WAIT_SEC) \
                    .until(EC.presence_of_element_located((By.CSS_SELECTOR, 'ul.memberdescriptionReviewEnhancements'))).text
                user.age = re.search(r'\d+\-\d+', t_).group(0)
                user.gender ='f' if 'woman' in t_ else 'm' if 'man' in t_ else None
            except:
                pass
            
            try:
                user.tags = [t.text.strip().lower() for t in self.driver.find_elements_by_css_selector('a.memberTagReviewEnhancements')]
            except:
                pass
            
            popop_closed = False
            attempts = 0
            
            while (not popop_closed) or (attempts <= 3):
                
                status = self.click_and_wait(WebDriverWait(self.driver, self.WAIT_SEC) \
                        .until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'span.ui_overlay.ui_popover>div.ui_close_x'))), 2)
                if status:
                    popop_closed = True
                else:
                    attempts += 1
                    
            if not popop_closed:
                print(f'didn\'t close the customer info pop-up after {attempts} attempts!')
        
        return user
    
    def get_review(self, review_container):
        
        """
        get reviews
        """
        
        rev = Review()
        
        try:
            rev.review_id = review_container.get_attribute('data-reviewid')
        except:
            print('can\'t find review id!')
            
        try:
            rev.rating = int(re.search(r'(?<=bubble_)\d+', review_container.find_element_by_xpath('.//span[contains(@class, "ui_bubble_rating")]').get_attribute('class')).group(0))/10
        except:
            print('can\'t find review rating!') 
            
        try:
            rev.title = review_container.find_element_by_xpath('.//a[contains(@class, "title")]').text
        except:
            print('can\'t find review title!')
            
        try:
            rev.date_of_experience = arrow.get(review_container \
                                               .find_element_by_xpath('.//div[@data-prwidget-name="reviews_stay_date_hsx"]') \
                                               .text.split(':')[-1].strip(), 'MMMM YYYY') \
                                                .format('MM/YYYY')
        except:
            print('can\'t find review date of experience!')
            
        try:
            rev.date_of_writing = arrow.get(review_container.find_element_by_xpath('.//span[@class="ratingDate"]').get_attribute('title'), 'D MMMM YYYY').format('DD/MM/YYYY')
        except:
            print('can\'t find review date of writing!')
        
        for _ in review_container.find_elements_by_xpath('.//span[contains(@class, "ulBlueLinks")]'):
            if 'more' in _.text.lower():
                _.click()
                time.sleep(random.choice(range(1,5)))
                break
        
        try:
            rev.text = review_container.find_element_by_xpath('.//p[@class="partial_entry"]').text
        except:
            print('can\'t find review text!')
            
        return rev
        
    
    def get_attr_about_and_address(self, attraction):
        
        """
        go to the attraction page and get all useful info;
        - some attractions have NO REVIEWS
        """
        
        print(f'attraction: {attraction.name}...')
        
        try:
            self.driver.get(attraction.attr_url)
        except:
            print(f'can\'t get attraction url {attraction.attr_url}')
            return attraction

        try:
            reviews_block = WebDriverWait(self.driver, self.WAIT_SEC) \
                    .until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#REVIEWS')))
        except:
            print('no reviews block! reloading..')
            self.driver.get(attraction.attr_url)
        
        try:
            self.driver.find_element_by_css_selector('span>div>span.viewMore').click()
            time.sleep(random.choice(range(1,4))) 
        except:
            pass
        
        category_span = None
        
        try:
            category_span = WebDriverWait(self.driver, self.WAIT_SEC) \
                                .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                                                         'span.is-hidden-mobile.header_detail.attractionCategories>div.detail')))

        except:
            print(f'no category span found for {attraction.name}!')
            
        if category_span:
            
            try:
                attraction.cat = [c.lower().strip() for c in category_span.text.split(',')]
            except:
                print(f'can\'t extract attraction categories from {category_span.text}!')  
        
        attraction.reviews = 0
        
        review_count_span = None
        
        try:
            review_count_span = WebDriverWait(self.driver, self.WAIT_SEC_SHORT) \
                                    .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                                                             'div.headerInfoWrapper>div.ratingContainer>a>span.reviewCount')))
        except:
            print(f'can\'t find the review count span for {attraction.name}')
        
        if review_count_span:
            try:
                # text is like 7,260 Reviews or 220 Reviews; 377 Reviews
                attraction.reviews = int(re.search(r'\d+(?=\s+Review)', review_count_span.text.replace(',','')).group(0))
            except:
                print(f'problem with extracting review count from {review_count_span.text.upper()} for {attraction.name}!')
        
        # bubble rating
        try:
            rating_ = self.driver.find_element_by_css_selector('div.section.rating>span.overallRating')
        except:
            print('cannot find bubble rating!')
            rating_ = None
            
        if rating_:
            attraction.rating = float(rating_.text)
        else:
            if attraction.reviews:
                print(f'can\'t extract rating for {attraction.name} although it has {attraction.reviews} reviews!')                      
        try:
            attraction.address = WebDriverWait(self.driver, self.WAIT_SEC_SHORT) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.contactInfo>div.detail_section.address'))).text
        except:                   
            print(f'can\'t find the address section for {attraction.name}')
        
        try:
            read_more = self.driver.find_element_by_xpath('.//span[contains(@class, "attractions-attraction-detail-about-card-Description__readMore--")]')
        except:
            read_more = None
        
        if read_more:
            
            self.click_and_wait(read_more, 1)
            
            try:
                
                detailed_about = WebDriverWait(self.driver, self.WAIT_SEC_SHORT) \
                    .until(EC.visibility_of_element_located((By.XPATH, './/div[contains(@class, "attractions-attraction-detail-about-card-Description__modalText--")]')))
                
                attraction.about = detailed_about.text
                
                self.click_and_wait(self.driver.find_element_by_xpath('.//div[contains(@class, "overlays-pieces-CloseX__close--")]'), 1)
                
            except:
                pass
            
        else:
        
            about = ''
            
            # if description is short, just pick it up
            try:
                about = self.driver.find_element_by_xpath('//div[contains(@class, "attractions-attraction-detail-about-card-AttractionDetailAboutCard__section--") and not(contains(@class, "title"))]').text
            except:
                pass
            
            try:
                about = self.driver.find_element_by_xpath('//div[contains(@class, "attractions-supplier-profile-SupplierAbout__about--")]').text
            except:
                pass
            
            attraction.about = about
        
        # popularity
        try:
            attraction.popularity = int(self.driver.find_element_by_css_selector('div.popIndexContainer>div>span.header_popularity.popIndexValidation>b').text.replace(',','').replace('#',''))
        except:
            print('cant find attraction popularity!')
        
        return attraction
    
    def get_attrs_about_and_address(self):
        
        """
        get additional information about the attractions from the attraction pages;
        spacifically, we are after the "about" section and location address
        """
        
        total_attrs = len(self.attractions)
        
#         digs = len(str(total_attrs))
        
        print(f'looking for additional attraction information...')
        
        attractions_ = []
        
        t0 = time.time()
        
        for i, a in enumerate(self.attractions, 1):
                
            a = self.get_attr_about_and_address(a)
            attractions_.append(a)
            
            time.sleep
            
            m, s = divmod(time.time() - t0, 60)
            
            print(f'{i}/{total_attrs} ({100*i/total_attrs:03.1f}%) done. elapsed time: {m:02.0f} min {s:02.0f} sec')
            
        self.attractions = attractions_
        
        self.save(what=['attractions'])
        
        return self
    
    def select_eng_reviews(self):
        
        found = None
        clicked = False
        
        try:
            found = WebDriverWait(self.driver, self.WAIT_SEC_SHORT) \
                            .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                'div.ui_radio.item[data-value="en"][data-tracker="English"]')))
        except:
            print('can\'t find the English radio button!')
            return clicked
        
        rep = 0
        
        while rep <= 3:
                
            clicked = self.click_and_wait(found, 2)
            
            if not clicked:
                rep += 1
            else:
                break
            
        if not clicked:
            print('found the English radio button but couldn\'t click it (tried 3 times)!')
            return clicked
        
        # since we got to here, there at least some english reviews   
        try:
            # note: radio button text is like "English (3,122)"
            n_eng_reviews = int(re.search(r'\d+', found.text.replace(',','')).group(0))
        except:
            n_eng_reviews = None
            raise Exception('can\'t extract the number of English reviews!')
        
        # check the review block header; text here is supposed ot be like "1 - 10 of 3,122 reviews"
        
        try:
            hrd_review_txt = self.driver.find_element_by_css_selector('div[data-contextchoice="DETAIL"]>div.pagination-details').text.replace(',','')
        except:
            print('can\'t find the header text with review counts!')
            hrd_review_txt = None
        
        if hrd_review_txt:
            try:
                n_reviews_hdr = int(re.search(r'(?<=of)\s+\d+', hrd_review_txt).group(0).strip())
            except:
                n_reviews_hdr = None
                print('can\'t extract the review counts from header!')
                return clicked
        
            if n_eng_reviews != n_reviews_hdr:
                print(f'warning: {n_eng_reviews} English reviews available there are {n_reviews_hdr} in the header! Click on English didn\'t work?')
        
        return (clicked, n_eng_reviews)
        
    
    def get_review_author_info(self, review_id):
        
        """
        giver a review id, collects this review's author's information
        returns a User instance
        """
        
        try:
            user_name = WebDriverWait(self.driver, self.WAIT_SEC) \
                                      .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                        f'div[data-reviewid="{review_id}"]>div>div>div[data-prwidget-name="reviews_member_info_resp"]>div.member_info>div>div.info_text>div'))).text
        except:
            user_name = None

        try:
            user_location = WebDriverWait(self.driver, self.WAIT_SEC) \
                                        .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                        f'div[data-reviewid="{review_id}"]>div>div>div[data-prwidget-name="reviews_member_info_resp"]>div.member_info>div>div.info_text>div.userLoc'))).text
        except:
            user_location = None

        try:
            clickable_user_area = WebDriverWait(self.driver, self.WAIT_SEC) \
                                                .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                f'div[data-reviewid="{review_id}"]>div>div>div[data-prwidget-name="reviews_member_info_resp"]>div.member_info>div>div.info_text')))
        except:
            print('can\'t find clickable user area!')
            clickable_user_area = None

        if clickable_user_area:

            status = self.click_and_wait(clickable_user_area, 2)

            if status:
                
                print('getting data from user pop-up..')
                
                try:
                    available_tags = self.driver.find_elements_by_css_selector('a.memberTagReviewEnhancements')
                except:
                    available_tags = None

                if available_tags:
                    traveller_types = [tag.text.strip().lower() for tag in available_tags]
                else:
                    traveller_types = []
                
                try:
                    age_and_gender = self.driver.find_element_by_css_selector('ul.memberdescriptionReviewEnhancements')
                except:
                    age_and_gender = None

                if age_and_gender:

                    try:
                        age = re.search(r'\d+\-\d+', age_and_gender.text).group(0)
                    except:
                        age = None

                    gender ='f' if 'woman' in age_and_gender.text else 'm' if 'man' in age_and_gender.text else None
                
                try:
                    real_name = self.driver.find_element_by_css_selector('h3.username.reviewsEnhancements').text.strip()
                except:
                    print('warning: can\'t get customer\'s real name from pop-up!')
                    real_name = None
                
                print('now close the pop-up')

                try:
                    close_x = WebDriverWait(self.driver, 2) \
                                     .until(EC.element_to_be_clickable((By.CSS_SELECTOR, 
                                                               'span.ui_overlay.ui_popover>div.ui_close_x')))
                except:
                    print('cannot find X..')
                    close_x = None

                if close_x:
                    res = self.click_and_wait(close_x, 2)
                    if res:
                        print('clicked on X')
            else:
                if real_name == 'A TripAdvisor Member':
                    print('anonymous user!')
                else:
                    print('warning: no user pop-up, need to investigate!')
                
        
        user = User()
        
        user.name = user_name
        user.loc = user_location
        
        if clickable_user_area:
            
            user.real_name = real_name
            user.age = age
            user.gender = gender
            user.tags = traveller_types
        
        return user
    
    def get_review_info(self, review_id):
        
        """
        giver a review id, collects this review's details
        returns a Review instance
        """
        
        try:
            bubble_rating = WebDriverWait(self.driver, self.WAIT_SEC) \
                                        .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                            f'div[data-reviewid="{review_id}"]>div>div>span.ui_bubble_rating')))
        except:
            print('didn\'t find bubble rating!')
            bubble_rating = None

        if bubble_rating:
            rating = int(re.search(r'(?<=bubble_)\d+', bubble_rating.get_attribute('class')).group(0))/10
        else:
            rating = None

        try:
            tit = WebDriverWait(self.driver, self.WAIT_SEC) \
                        .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                            f'div[data-reviewid="{review_id}"]>div>div>div.quote')))
        except:
            tit = None

        if tit:
            title = tit.text
        else:
            title = None

        try:
            date_exp = WebDriverWait(self.driver, self.WAIT_SEC) \
                        .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                            f'div[data-reviewid="{review_id}"]>div>div>div.prw_rup.prw_reviews_stay_date_hsx')))
        except:
            print('no date of experience found!')
            date_exp = None

        if date_exp:
            date_of_experience = arrow.get(date_exp.text.split(':')[-1].strip(), 'MMMM YYYY') \
                            .format('MM/YYYY')
        else:
            date_of_experience = None

        try:
            date_wri = WebDriverWait(self.driver, self.WAIT_SEC) \
                        .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                f'div[data-reviewid="{review_id}"]>div>div>span.ratingDate')))
        except:
            print('no date of writing found!')
            date_wri = None

        if date_wri:
            date_of_writing = arrow.get(date_wri.get_attribute('title'),'D MMMM YYYY').format('DD/MM/YYYY')
        else:
            date_of_writing = None

        # first try to click on "more" if it's there
        try:
            more_txt = self.driver.find_element_by_css_selector(f'div[data-reviewid="{review_id}"]>div>div>div>div.entry>p.partial_entry>span[onclick]')
        except:
            more_txt = None

        if more_txt:
            sta = self.click_and_wait(more_txt, 2)

        try:
            review_text = self.driver.find_element_by_css_selector(f'div[data-reviewid="{review_id}"]>div>div>div>div.entry>p.partial_entry').text
        except:
            print('can\'t find review text!')
            review_text = None
            
        review = Review()

        review.review_id = review_id
        review.rating = rating
        review.title = title
        review.text = review_text
        review.date_of_experience = date_of_experience
        review.date_of_writing = date_of_writing
        
        return review

    
    def get_users_and_reviews(self):
        
        """
        for all available attractions, visit attraction page and collect all reviews and user information
        """
        
        t0 = time.time()
        
        random_attractions = random.choices(self.attractions, k=12)
        
        collected_review_ids = set()
        
        for i, a in enumerate(random_attractions, 1):
            
            m, s = divmod(time.time() - t0, 60)
            h, m = divmod(m, 60)
            
            print(f'#{i}/{len(random_attractions)}: {a.name.upper()} (id:{a.attr_id})... elapsed time: {h:02.0f} h {m:02.0f} m {s:02.0f} s')
            
            # if no reviews are available, move on to next attraction
            if not a.reviews:
                continue
            
            self.driver.get(a.attr_url)    
            
            # select only eng reviews 
            clicked, n_eng_reviews = self.select_eng_reviews()
            
            if n_eng_reviews and (n_eng_reviews < self.min_reviews):
                print(f'not enough English reviews ({n_eng_reviews})! skipping..')
                continue
            
            if not n_eng_reviews:
                print(f'warning: number of English reviews is unavailable!')
                continue
            
            p = 0  # pages processed (for this attraction)
                  
            while 1:
                 
                c_url = self.driver.current_url
                
                print('this page:', p)
                print('url: ', c_url)
                
                review_ids_on_this_page = {_.get_attribute('data-reviewid') for _ in WebDriverWait(self.driver, self.WAIT_SEC) \
                                            .until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'div[data-reviewid]')))}
            
                tot_reviews_on_page = len(review_ids_on_this_page)
                print('reviews on this page: ', tot_reviews_on_page)
                
                collected_review_ids_on_this_page = set()

                while not (len(collected_review_ids_on_this_page) == tot_reviews_on_page):

                    for review_id in review_ids_on_this_page:

                        user = self.get_review_author_info(review_id)
                        review = self.get_review_info(review_id)
                        
                        # add some extras to review info
                        review.attr_id = a.attr_id
                        review.by_user = user.name
                        
                        # add review id to collected for this page
                        if review.review_id not in collected_review_ids_on_this_page:
                            collected_review_ids_on_this_page.add(review.review_id)
                        # .. and to collected globally        
                        if review.review_id not in collected_review_ids:
                            collected_review_ids.add(review.review_id)
                        # .. and the actual review to the review list
                        self.reviews.append(review)

                        if len(collected_review_ids)%self.save_every == 0:
                            print(f'{len(collected_review_ids)}/{n_eng_reviews} ({100*len(collected_review_ids)/n_eng_reviews:.1f}%) reviews...')
                            self.save(what=['users', 'reviews'])
                            print('saved users and reviews')

                # now time to click next.. 
                 
                previous_button, selected_button, next_button, last_page = self.check_pagination_reviews()
                
                # note: on the very last page the next button is disabled (not ckickable)
                if next_button and ('disabled' not in next_button.get_attribute('class')):
                    res  = self.click_and_wait(next_button, 3)
                    if res:
                        p += 1
                    else:
                        try:
                            offset_ = re.search(r'(?<=or)\d+(?=\-)', c_url).group(0)
                        except:
                            offset_ = None
                            print('no offset!')

                        if offset_:
                            self.driver.get(current_url.replace(f'-or{offset_}-', f'-or{str(int(offset_) + 10)}-'))

                else:
                    print(f'last page. collected {len(collected_review_ids)}/{n_eng_reviews} reviews so far')
                    self.save(what=['users', 'reviews'])
                    print('saved users and reviews')
                    break      

        self.driver.quit()
        
        return self       
    
    def save(self, what):
        
        
        if not os.path.exists('data'):
            os.mkdir('data')
        
        if ('attractions' in what) and self.attractions:
            json.dump([a.to_dict() for a in self.attractions], open(os.path.join('data', f'attractions_{self.location}.json'), 'w'))
        if ('users' in what) and self.users:
            json.dump([u.to_dict() for u in self.users], open(os.path.join('data', f'users_{self.location}.json'), 'w'))
        if ('reviews' in what) and self.reviews:
            json.dump([r.to_dict() for r in self.reviews], open(os.path.join('data', f'reviews_{self.location}.json'), 'w'))
        
        return self 

In [3]:
if __name__ == '__main__':
    
    ta = Tareviews(save_every=30) \
        .get_attrs_info(location='Melbourne', use_local=False) \
        .get_users_and_reviews()

attractions: 0, reviews: 0, users: 0
browsing the attraction list for MELBOURNE...
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
scrolled..
done. found 477 attractions
looking for additional attraction information...
attraction: melbourne cricket ground (mcg)...
1/477 (0.2%) done. elapsed time: 00 min 10 sec
attraction: eureka skydeck 88...
2/477 (0.4%) done. elapsed time: 00 min 16 sec
attraction: shrine of remembrance...
3/477 (0.6%) done. elapsed time: 00 min 27 sec
attraction: national gallery of victoria...
4/477 (0.8%) done. elapsed time: 00 min 34 sec
attraction: artvo...
5/477 (1.0%) done. elapsed time: 00 min 36 sec
attraction: city circle tram...
6/477 (1.3%) done. elapsed time: 00 min 44 sec
attraction: melbourne zoo...
7/477 (1.5%) done. elapsed time: 00 min 51 sec
attraction: old melbourne gaol...
8/477 (1.7%) done. elapsed time: 00 min 58 sec
attraction: 

UnboundLocalError: local variable 'real_name' referenced before assignment