## TripAdvisor Scraper

This is the second iteration of a scraper that collects reviews from TripAdvisor. What each method does exactly is hopefully clear from the code. 

### Getting Started

Make sure the *chromedriver* executable is in the *webdriver* directory. 

File **tradvisor_location_ids.json** contains a dictionary mapping some Australian locations to their TripAdvisor IDs. For example, 

```
"act": {
    "canberra": "g255057"
  }
```

where *ACT* is a state, *Canberra* is a location in this state and *g255057* is the TripAdvisor ID for Canberra. Although there is a method **find_location_id** that finds a location ID for you (by location name and country), it may be more straightforward to simply add the locations of interest and their IDs to that file manually. This is because Chrome currently struggles to enforce settings related to notifications, geolocation access requests and the like (more details in the method descriptions).

In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from collections import defaultdict
import re
import json
import os
import time
import arrow

import pandas as pd

from attraction import Attraction
from review import Review
from user import User

import scattertext as st

from unidecode import unidecode

Let's make a useful timer decorator

In [None]:
def timer(func):
    
    """
    decorator: call func, print how long the call took as [time: minutes:seconds] 
    and return whatever func returned; nothing is printed if func raises
    """
    
    # imported here so that the decorator does not rely on a module level import
    import functools
    
    # wraps keeps the name and docstring of the decorated function
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        
        # args becomes a tuple, kwargs becomes a dictionary
        
        # perf_counter is monotonic, so clock adjustments cannot skew the result
        t0 = time.perf_counter()
        res = func(*args, **kwargs)
        t1 = time.perf_counter()
        
        m, s = divmod(t1 - t0, 60)
        
        print(f'[time: {m:.0f}:{s:.1f}]')
        
        return res
    
    return wrapper

In [None]:
 class CountryDetector:
        
    """
    This class is trying to figure out what country is mentioned in the location description string.
    
    All lookups rely on five JSON files read from data_dir when an instance is created:
    countries.json, country_abbrs.json, states.json, state_abbrs.json and cities.json
    """
        
    def __init__(self, data_dir='data'):
        
        """
        load the lookup files from data_dir; raises Exception if any of them cannot be read
        """
        
        self.DATA_DIR = data_dir
        
        self.COUNTRIES = self._load_json('countries.json', 'country')
        self.COUNTRY_ABBRS = self._load_json('country_abbrs.json', 'country abbreviation')
        self.STATES = self._load_json('states.json', 'state')
        self.STATE_ABBRS = self._load_json('state_abbrs.json', 'state abbreviation')
        self.CITIES = self._load_json('cities.json', 'city')
        
    def _load_json(self, file_name, what):
        
        """
        read and parse file_name from the data directory; what is the word used in the error message
        """
        
        try:
            # the with block closes the file even if parsing fails
            with open(os.path.join(self.DATA_DIR, file_name), encoding='utf-8') as fh:
                return json.load(fh)
        except Exception as err:
            # same exception type as before, but the real cause stays attached
            raise Exception(f'error! no {what} file in {self.DATA_DIR}!') from err
            
    def _normalize(self, st: str) -> str:
        
        """
        return normalized string st: transliterated by unidecode, listed punctuation removed, 
        lower case, surrounding white spaces stripped and inner white spaces collapsed
        """
        
        return re.sub(r'\s+', ' ', re.sub(r'[,\-\'\"\_\!\@\?]', '', unidecode(st)).lower().strip())
            
    def get_country(self, st) -> str:
        
        """
        figure out a country from a string st; return None if no country found or a string with all candidate
        country names
        
        the comma separated parts of st are matched, in this order, against country names, 
        country abbreviations, states, state abbreviations and finally cities
        """
        
        if not st:
            return None
        
        locations = [part.lower().strip() for part in st.split(',') if part.strip()]
        
        if not locations:
            return None
        
        normalized = [self._normalize(l) for l in locations]
        
        # first try to find a country; COUNTRIES is indexed by the first character of the location,
        # and a character that is not in there simply means no match
        for l, norm in zip(locations, normalized):
            if norm in {self._normalize(c) for c in self.COUNTRIES.get(l[0], ())}:
                return norm
            
        # since we are here, no country has been found; try to find country by abbreviation
        for norm in normalized:
            if norm in self.COUNTRY_ABBRS:
                return self.COUNTRY_ABBRS[norm]
            
        # no country abbreviation has been found and we are looking for states
        for norm in normalized:
            if norm in self.STATES:
                return self.STATES[norm]
            
        # time to find state abbreviations
        for norm in normalized:
            if norm in self.STATE_ABBRS:
                return self.STATE_ABBRS[norm]
            
        # no states; now look for cities
        for l, norm in zip(locations, normalized):
            for city, countries in self.CITIES.get(l[0], {}).items():
                if self._normalize(city) == norm:
                    # sorted so that the same input always gives the same string
                    return ' | '.join(sorted(set(countries)))
                
        return None

In [21]:
 class Trip(CountryDetector):
        
    """
    This a the main scraped class
    """
    
    def __init__(self, filter):
        
        """
        set up the scraper: 
            - load the country lookup data (parent class)
            - remember the review filter, which is a dictionary like {'language': 'English', ...}; 
              filters with an empty value are not applied
            - make sure the data directories exist and load the known TripAdvisor location IDs
            - start a headless Chrome
            
        the browser is started last, so that a problem with the data files does not leave 
        a Chrome process behind
        """
        
        super().__init__()
        
        self.FILTREVS = defaultdict()
        self.REVIEWS = defaultdict()
        
        # counter to see how many annoying things get killed
        self.KILLED = defaultdict(int)
        
        self.FILTERS = filter
        
        print(f'\n[FILTER]: {" | ".join([k + ": " + str(v) for k, v in self.FILTERS.items() if v])}\n')
        
        self.DATA_DIR = 'data'
        self.COLLECT_DIR = 'data-collected'
        
        # exist_ok avoids failing if the directory appears between a check and the creation
        os.makedirs(self.DATA_DIR, exist_ok=True)
        os.makedirs(self.COLLECT_DIR, exist_ok=True)
        
        with open(os.path.join(self.DATA_DIR, 'tradvisor_location_ids.json')) as fh:
            self.LOCATION_IDS = json.load(fh)
        
        options = webdriver.ChromeOptions()
        
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--incognito')
        options.add_argument('--start-maximized')
        
        # here "2" is for "disable", "1" is "enable" and "0" is "default"
        prefs = {"profile.default_content_setting_values.notifications" : 2,
                 "profile.managed_default_content_settings.geolocation" : 1,
                 "profile.managed_default_content_settings.images": 2}
        
        options.add_experimental_option('prefs', prefs)

        options.add_argument('--headless')
        
        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)
        
    def attr_id_from_url(self, url_):
        
        """
        extract attraction ID from attraction URL; the ID is the letter d followed by digits 
        and surrounded by hyphens, for example d256718 in ...-g255057-d256718-Reviews-...
        
        returns None if url_ has no such part or is not a string
        """
        
        try:
            return re.search(r'(?<=-)d\d+(?=-)', url_).group(0)
        except (AttributeError, TypeError):
            # AttributeError: no match (search gave None); TypeError: url_ is not a string
            return None
            
    def do_click(self, e, max_=3) -> bool:
        
        """
        try to click on element e and return True if it worked or False otherwise
        
        up to max_ attempts are made; after every failed attempt whatever may be covering 
        the element gets closed (overlay or, if there is no overlay, sliders and info bars) 
        and counted in self.KILLED
        
        only Exception is caught, so Ctrl-C still interrupts the retries
        """
        
        decline_text = 'not right now, thanks.'

        for _attempt in range(max_):

            try:
                e.click()
                return True
            except Exception:
                try:
                    self.driver.find_element_by_css_selector('span.ui_overlay>div.ui_close_x').click()
                    self.KILLED['overlay'] += 1
                except Exception:
                    # no overlay to close; try to catch and close all sliders
                    els = list(self.driver.find_elements_by_css_selector('div[class^="QSISlider"]>div'))
                    for i, d in enumerate(els):
                        if d.text.strip().lower() == decline_text:
                            try:
                                # the element just before the decline text is clicked
                                # NOTE(review): for i == 0 this is the last element of the list - confirm intended
                                els[i-1].click()
                                self.KILLED['slide'] += 1
                                break
                            except Exception:
                                continue
                    for bar in self.driver.find_elements_by_css_selector('div.sbx_close[onclick]'):
                        try:
                            bar.click()
                            self.KILLED['infobar'] += 1
                        except Exception:
                            # best effort: an info bar that cannot be closed is left alone
                            pass

        return False
    
    def process_paginator(self, pg, off=30, pref='oa'):
        
        """
        given a paginator pg, find out what's the total number of pages, create page URLs
        
        pg:   paginator element; the last number found in its div elements is taken as the number of pages
        off:  how much the offset in the URL grows from one page to the next
        pref: what goes before the offset in the URL
        
        returns a tuple (total_pages, page_urls) where page_urls starts with the current URL; 
        raises Exception if there are no page numbers in the paginator
        """
        
        p_numbers = [int(s.text.strip()) for s in pg.find_elements_by_css_selector('div') if s.text.strip().isdigit()]
        
        if not p_numbers:
            raise Exception('no pages in paginator!')
        
        total_pages = p_numbers[-1]
        
        current_page_url = self.driver.current_url
        
        page_urls = [current_page_url]
        
        # starts from page 2 (page 1 has no -oa[number]- part)
        for i in range(1, total_pages):
            page_urls.append(re.sub('Activities-', 'Activities-' + pref + str(off*i) + '-', current_page_url))
                
        return total_pages, page_urls
        
    def get_attraction_pages(self, loc, max_retries=5):
        
        """
        go to the home page for location loc and collect all attractions; if loc is a state, collect attractions
        for every location in that state; 
        note that we only collect attraction ID and URL this time.
        
        loc:         a state or a location from the location ID file
        max_retries: how many times to retry a step that didn't work (clicking Things to do, clicking 
                     See More, reading attraction cards) before giving up on that step
        
        the result goes to self.collected_attractions, which looks like 
        {'tasmania': {'hobart': {attr_id: {url: 'https:/www...'}}}
        
        returns self; raises Exception if the Things to do icon can't be found or clicked
        """
        
        self.MAIN_LOCATION = loc
        
        # home_urls will be a list of tuples like 
        # [('tasmania', 'hobart', 'https://www.tripadvisor.com.au/Home-g255097'), 
        #  ('tasmania', 'launceston', 'https://www.tripadvisor.com.au/Home-g255344')...
        
        home_urls = []
        
        self.collected_attractions = defaultdict(lambda: defaultdict(lambda: defaultdict()))
        
        # if loc is state
        if loc in self.LOCATION_IDS:
            for city, city_id in self.LOCATION_IDS[loc].items():
                home_urls.append((loc, city, f'https://www.tripadvisor.com.au/Home-{city_id}'))
        else:
            for state_, cities in self.LOCATION_IDS.items():
                if loc in cities:
                    home_urls.append((state_, loc, f'https://www.tripadvisor.com.au/Home-{cities[loc]}'))
                    
        if not home_urls:
            print(f'no attractions to pick for {loc.upper()}!')
            return self
        
        for state_, city_, homeurl in home_urls:
            
            print(f'collecting attractions for {city_.upper()}, {state_.upper()}...')
            
            self.driver.get(homeurl)
            
            self._open_things_to_do(max_retries)
            
            page_urls = self._attraction_page_urls(max_retries)

            print('visiting attraction pages..')

            for i, attr_page_url in enumerate(page_urls, 1):
                
                print(f'page {i}/{len(page_urls)}..', end='')

                if attr_page_url != self.driver.current_url:
                    self.driver.get(attr_page_url)
                
                # the first page has a different layout compared to all other pages
                if i == 1:
                    card_css = 'div[class|="attractions-attraction-overview-pois-PoiGrid__wrapper"]' + \
                               '>li[class^="attractions-attraction-overview-pois-PoiCard__item"]' + \
                               '>div[class|="attractions-attraction-overview-pois-PoiCard__card_info"]'
                    link_css = 'div>a[class|="attractions-attraction-overview-pois-PoiInfo__name"]'
                else:
                    card_css = 'div.attraction_list>div' + \
                               '>div>div.listing>div.listing_details>div.listing_info'
                    link_css = 'div.tracking_attraction_title.listing_title>a'
                    
                self._collect_attraction_cards(state_, city_, card_css, link_css, max_retries)
            
                print('ok')
    
        return self
    
    def _open_things_to_do(self, max_retries):
        
        """
        find and click the Things to Do icon on the location home page; assume it MUST be there
        """
        
        for _attempt in range(max_retries):
            
            try:
                things_to_do_icon = WebDriverWait(self.driver, 10) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.ui_icon.attractions + span')))
            except Exception as err:
                raise Exception('failed to find Things to do icon!') from err
            
            if things_to_do_icon.text.strip().lower() != 'things to do':
                # the label is not there yet; wait a little instead of asking again straight away
                time.sleep(1)
                continue
                
            if self.do_click(things_to_do_icon):
                return
            
            print('failed to click Things to do icon! retrying..')
            
        # used to retry forever here, which could hang the whole scraper
        raise Exception('failed to click Things to do icon!')
        
    def _attraction_page_urls(self, max_retries):
        
        """
        return URLs of all pages with attractions for the location that is currently open
        """
        
        # the question is whether there are many 'top' things to do or just a few; 
        # the latter means there is no See More button
        
        try:
            c_mo = WebDriverWait(self.driver, 10) \
                    .until(EC.element_to_be_clickable((By.CSS_SELECTOR, 
                            'div[class|="attractions-attraction-overview-main-TopPOIs__see_more"]')))
        except Exception:
            # if there's no See More it's just 1 page with attractions
            return [self.driver.current_url]
        
        for _attempt in range(max_retries):
            
            if self.do_click(c_mo):
                # look like clicked See More..
                try:
                    WebDriverWait(self.driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                            'span.ui_icon.single-chevron-up')))
                    break
                except Exception:
                    print('no See Less, need to click See more again')
            else:
                # didn't click..
                try:
                    # there is this See More button but still folded, so apparently we didn't click properly
                    self.driver.find_element_by_css_selector('span.ui_icon.single-chevron-down')
                except Exception:
                    break
        
        # paginator only available if there are many attractions; however, there may be no paginator if See More
        # was clicked but there are still too few attractions
        
        try:
            pg = WebDriverWait(self.driver, 10) \
                    .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                            'div[class|="attractions-attraction-overview-main-Pagination__container"]')))
        except Exception:
            return [self.driver.current_url]
        
        _total_pages, page_urls = self.process_paginator(pg)
        
        return page_urls
    
    def _collect_attraction_cards(self, state_, city_, card_css, link_css, max_retries):
        
        """
        add ID and URL of every attraction card on the current page to self.collected_attractions
        """
        
        # the page is read again only if some card didn't give its URL, and no more than max_retries 
        # times; the old counter based loop never stopped once a single card had failed
        
        for attempt in range(max_retries):
            
            url_missing = False
            
            for attr_card in self.driver.find_elements_by_css_selector(card_css):

                try:
                    url_ = attr_card.find_element_by_css_selector(link_css).get_attribute('href')
                except Exception:
                    print('didn\'t get attraction url! moving on to next attraction card..')
                    url_missing = True
                    continue
                
                id_ = self.attr_id_from_url(url_)
                
                if not id_:
                    # same URL would fail again, so this is not a reason to read the page again
                    print('failed to extract attraction id!')
                    continue
                
                # here we add attraction URL to the dictionary
                self.collected_attractions[state_][city_][id_] = {'url': url_}
                
            if not url_missing:
                break
                
            if attempt < max_retries - 1:
                time.sleep(1)

    def select_filters(self) -> int:
        
        """
        apply filter on the attraction page and return the number of reviews available
        after the filter has been applied
        
        for every filter group, check boxes that are ticked but not wanted get unticked first, then 
        the wanted one (if any) gets ticked; a filter that is missing in self.FILTERS or has an empty 
        value means nothing is wanted in that group
        
        the number of reviews is read from the label of the selected language, and 0 is returned 
        if it can't be read; raises KeyError if a wanted value is not one of the known values
        
        NOTE(review): without a language filter the result is always 0 - confirm this is intended
        """
        
        # filter name: (data-name of the group on the page, {what we call the option: its data-value})
        groups = {'traveller_rating': ('ta_rating', {'Excellent': '5',
                                                     'Very good': '4',
                                                     'Average': '3',
                                                     'Poor': '2',
                                                     'Terrible': '1'}),
                  'traveller_type': ('traveler_filter', {'Families': '3',
                                                         'Couples': '2',
                                                         'Solo': '5',
                                                         'Business': '1',
                                                         'Friends': '4'}),
                  'time_of_year': ('season', {'Mar-May': '1',
                                              'Jun-Aug': '2',
                                              'Sep-Nov': '3',
                                              'Dec-Feb': '4'}),
                  'language': ('language', {'English': 'en',
                                            'Japanese': 'ja'})}
        
        def choice_css(dname, tr_pick):
            
            """
            css selector for the option tr_pick in the group dname
            """
            
            return f'div.choices[data-name="{dname}"]>div[data-value="{tr_pick}"]'

        def is_selected(css_selector_st):
            
            """
            is a check box in css_selector_st ticked? 
            """

            try:
                WebDriverWait(self.driver, 5) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                                   css_selector_st + '>input[checked="checked"]')))
                return True

            except Exception:
                return False

        def _click(css_selector_st, max_attempts=3):
            
            """
            click on css_selector_st until its check box changes state, but no more than 
            max_attempts times; return True if the state has changed
            """

            flag_before = is_selected(css_selector_st)
            flag_after = flag_before
            
            times_tried = 0

            # strictly less, or there would be one attempt more than asked for
            while (times_tried < max_attempts) and (flag_after == flag_before):

                times_tried += 1   

                try:
                    e = WebDriverWait(self.driver, 20) \
                            .until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector_st)))
                except Exception:
                    e = None
                    print(f'failed to find {css_selector_st}!')
                
                if e:
                    self.do_click(e)

                flag_after = is_selected(css_selector_st)    

            return (flag_after != flag_before)

        for filt, (dname, input_values) in groups.items():

            value = self.FILTERS.get(filt)
            
            # note that even if value is None, attempt to uncheck everything that might be checked
            for other_value, tr_pick in input_values.items():
                
                if other_value == value:
                    continue
                    
                css = choice_css(dname, tr_pick)

                if is_selected(css):
                    _click(css)

            if value:

                css = choice_css(dname, input_values[value])

                if not is_selected(css):
                    _click(css)
                    
        language = self.FILTERS.get('language')
        
        if not language:
            # the count sits next to the selected language, so there is nothing to read
            return 0

        try:
            lang_code = groups['language'][1][language]
            css_count = f'div.choices[data-name="language"]>div[data-value="{lang_code}"]>label.label>span.count'
            c_txt = WebDriverWait(self.driver, 6) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, css_count))).text.strip()
            return int(re.sub(r'[(,)]','',c_txt))
        except Exception:
            # if no review count it's because there are not reviews left after filtering
            return 0
    
    @timer
    def process_user_profile(self, user_profile_url, get_info=True, get_reviews=True, verbose=False):
        
        """
        returns a tuple of two dictionaries: reviewer information and all reviews written by this reviewer
        """
        
        self.driver.get(user_profile_url)
        
        user_info = defaultdict()
        
        if get_info:
            
            _st = 'social-member-common-MemberName'

            try:
                user_info['display_name'] = self.driver.find_element_by_css_selector(f'span[class^="{_st}__display_name"]').text.strip()
            except:
                print('found no display name!')

            try:
                user_info['user_name'] = self.driver.find_element_by_css_selector(f'span[class^="{_st}__user_name"]').text.strip()
            except:
                print('found no user name!')

            _st = 'social-member-MemberStats__stat_item'
            
            contrib_a = None
            
            try:
                for _ in self.driver.find_elements_by_css_selector(f'div[class^="{_st}"]'):

                    item_title = _.find_element_by_css_selector(f'span[class^="{_st}_title"]').text
                    
                    try:
                        item_count_a = _.find_element_by_css_selector(f'span[class^="{_st}_count"]>a')
                    except:
                        try:
                            item_count_a = _.find_element_by_css_selector(f'span[class^="{_st}_count"]')
                        except:
                            print('no item count found!')
                    
                    item_count = int(item_count_a.text.replace(',',''))
                
                    if item_title.strip().lower() == 'contributions':
                        
                        contrib_a = item_count_a
            except:
                print('failed to collect all reviewer stats!')
                        
            contrib_a.click()
                        
            time.sleep(2)
             
            popup = WebDriverWait(self.driver, 10) \
                     .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                                              'div#c_contributions')))
             
            for _a in self.driver.find_elements_by_css_selector('li>a'):

                try:
                    _href = _a.get_attribute('href')
                    if 'reviews' in _href:
                        user_info['reviews'] = int([w for w in _a.text.replace(',','').split() if w.isdigit()][0])
                        break
                except:
                    continue

            try:
                user_info['location'] = self.driver.find_element_by_css_selector('span[class^="social-member-common-MemberHometown__member_info"]').text
            except:
                print('failed to get reviewer location!')

            try:
                user_info['info'] = self.driver.find_element_by_css_selector('div[class^="social-member-MemberBio__member_info"]').text
            except:
                print('failed to get reviewer info!')
                
        if get_reviews and user_info.get('reviews', None):
            
            print(f'collecting {user_info["reviews"]:,} reviews by {user_info["user_name"]}...')
            
            self.driver.get(user_profile_url + '?tab=reviews')
            
            try:
                smore = WebDriverWait(self.driver, 10) \
                                .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                                         'span.ui_icon.single-chevron-down')))
            except:
                smore = None
            
            if smore:
                
                if verbose:
                    print('trying to click Show More in the list of reviews...', end='')
    
                while 1:
                    
                    res = self.do_click(smore)
                    time.sleep(5)
                    
                    if res:
                        try:
                            _ = self.driver.find_element_by_css_selector('span.ui_icon.single-chevron-down')
                        except:
                            if verbose:
                                print('clicked')
                                break
                
            # scroll down to see all reviews
            height = self.driver.execute_script("return document.body.scrollHeight")
            
            try:
                ratio_photos_reviews = len(self.driver.find_elements_by_css_selector('div[class^="social-sections-CardSection__card_section"]>div>div>div[class^="social-sections-FeedSectionPhotoCarousel"]'))/ \
                                        len(self.driver.find_elements_by_css_selector('div[class^="social-sections-CardSection__card_section"]'))
            except:
                ratio_photos_reviews = None
                
            SLEEPS = 6 if ratio_photos_reviews > 0.5 else 3
                
            print(f'photos to reviews is {ratio_photos_reviews}; sleep time {SLEEPS} sec')
            
            print('scrolling down...')
            
            review_urls = set()
            
            scrll = 0
            
            avail_review_tracker = []
            
            # new reviews added per review list update
            RPP = 20
            
            # ideally, keep going untill collected all reviews (as per user's contribution summary); however, it seems that
            # sometimes these expected review totals are incorrect - here's an example: 
            # https://www.tripadvisor.com.au/Profile/JVD-Netherlands 
            
            while len(review_urls) < user_info['reviews']:
                
                scrll += 1
                
                # self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
                
                self.driver.execute_script(f'window.scrollBy(0, {80 + (scrll - 1)*20})')
                    
                time.sleep(SLEEPS)
                
                for a in self.driver.find_elements_by_css_selector('div[class^="social-sections-ReviewSection__review_wrap"]>a[class][href]'):
                    
                    _h = a.get_attribute('href')
                    if _h not in review_urls:
                        review_urls.add(_h)
                        
                if len(review_urls) not in avail_review_tracker:
                    avail_review_tracker.append(len(review_urls))
                
                if len(review_urls) != avail_review_tracker[-1]:
                    print(f'got {len(review_urls)}/{user_info["reviews"]} review urls..')
                
                title_count = len(self.driver.find_elements_by_css_selector('div[class^="social-sections-ReviewSection__title"]'))
                rating_count = len(self.driver.find_elements_by_css_selector('div[class^="social-sections-ReviewSection__review"]>span[class^="ui_bubble_rating"]'))
                    
                if ((title_count == user_info['reviews']) or (rating_count == user_info['reviews'])) and (len(review_urls) < user_info['reviews']):
                    print(f'looks like there are some reviews without urls..')
                    print(f'got {len(review_urls)}/{user_info["reviews"]} review urls..')
                    break
                    
                # if we're missing less than RPP reviews there will probably be no more 
                # review list updates..
                
                if user_info['reviews'] - len(review_urls) < RPP:
                    print(f'enough scrolling because collected {len(review_urls)} (only {user_info["reviews"] - len(review_urls)} may be missing)')
                    break
            
            print('starting to collect reviews..')
            
            # store all reviews in here:
            rs = defaultdict(lambda: defaultdict())
            
            for i, url in enumerate(review_urls, 1):
                
                self.driver.get(url)
                
                time.sleep(3)
                
                att = 0
                
                while att < 3:
                    try:
                        rc = self.driver.find_element_by_css_selector('div.review-container')
                        break
                    except:
                        att += 1
                        continue
                        
                if att == 3:
                    print(f'no review container at {url}!')
                    continue
                
                review_id = rc.get_attribute('data-reviewid') 
                
                rs[review_id]['review_url'] = url
                    
                try:
                    rs[review_id]['review_date'] = arrow.get(rc.find_element_by_css_selector('span.ratingDate').get_attribute('title'), 'D MMMM YYYY').format('YYYY-MM-DD')
                except:
                    print('no review date')
                    
                try:
                    rs[review_id]['experience_date'] = arrow.get(rc.find_element_by_css_selector('div.prw_reviews_stay_date_hsx').text, 'MMMM YYYY').format('YYYY-MM')
                except:
                    print(f'no experience date for {url}')
                    
                try:
                    self.driver.find_element_by_css_selector('p.partial_entry>span.moreBtn').click()
                    time.sleep(1)
                except:
                    pass
                
                try:
                    rs[review_id]['review_title'] = self.driver.find_element_by_css_selector('div.reviewSelector>div>div>div.quote>h1#HEADING').text
                except:
                    try:
                        rs[review_id]['review_title'] = self.driver.find_element_by_css_selector('div.quote').text
                    except:
                        print('no review title')
                        break
                    
                try:
                    what_reviewed_url = self.driver.find_element_by_css_selector('div.altHeadInline>a[href]').get_attribute('href')
                except:
                    try:
                        what_reviewed_url = self.driver.find_element_by_css_selector('span.altHeadInline>a[href]').get_attribute('href')
                    except:
                        print('no url for what the review is for')
                        break
                        
                rs[review_id]['reviewed_url'] = what_reviewed_url
                    
                POSSIBLE_TYPES = 'hotel attraction restaurant airline tour cruise'.split()

                for t in POSSIBLE_TYPES:
                    if t + '_' in what_reviewed_url.lower():
                        rs[review_id]['review_type'] = t
      
                if not rs[review_id].get('review_type', None):
                    rs[review_id]['review_type'] = 'other' 
                
                try:
                    rs[review_id]['text'] = self.driver.find_element_by_css_selector('span[class^="fullText"]').text
                except:
                    try:
                        rs[review_id]['text'] = self.driver.find_element_by_css_selector('p.partial_entry').text
                    except:
                        print('no review text')
                        break
                
                try:
                    rating_line = self.driver.find_element_by_css_selector('div.reviewItemInline>span.ui_bubble_rating').get_attribute('class')
                except:
                    try:
                        rating_line = self.driver.find_element_by_css_selector('div.reviewSelector>div>div>span.ui_bubble_rating').get_attribute('class')
                    except:
                        print('no review rating')
                        break

                rs[review_id]['rating'] = re.search(r'\d{1}', rating_line).group(0)
                
                # get the country where the reviewed stuff is located
                if rs[review_id]['review_type'] in ['attraction', 'restaurant', 'hotel']:
                    
                    self.driver.get(rs[review_id]['reviewed_url'])
                
                    try:
                        rs[review_id]['reviewed_country'] = self.driver.find_element_by_css_selector('ul.breadcrumbs>li.breadcrumb:nth-of-type(2)').text.strip()
                    except:
                        print(f'failed to find reviewed country at {rs[review_id]["reviewed_url"]}')
                
                if (i%10==0) or (i==len(review_urls)):
                    print(f'done: {i}/{len(review_urls)} ({100*i/len(review_urls):05.1f}%)')
                
            return user_info, rs

    
    def get_reviews_this_attraction(self, attr_url, min_total_reviews, attr_loc=None):

        """
        collect the reviews left after self.select_filters() for the single attraction at attr_url

        attr_url: URL of the attraction page
        min_total_reviews: attractions with fewer reviews in total are skipped
        attr_loc: optional name of the location (city) of this attraction; stored with every review as attr_loc

        returns a dictionary {review id: review details}; returns None if the attraction name is not found
        (for example, the page belongs to a tour operator), if there are fewer than min_total_reviews reviews
        or if no reviews are left after filtering

        note: a page is given up on after a few passes that bring no new reviews, and paging stops on the
              last page, so fewer reviews than the filtered total may be returned
        """

        # css selectors used more than once
        review_block_css = 'div[class="reviewSelector"][id^="review"]'
        infotext_css = 'div.member_info>div>div.info_text'

        # how many passes (or pages) in a row without a single new review are tolerated
        max_stalls = 3

        def review_id_of(block):

            # normally the id sits in data-reviewid; otherwise fall back to the element id
            # which looks like review_489766616
            rid = block.get_attribute('data-reviewid')

            if not rid:
                rid = (block.get_attribute('id') or '').split('_')[-1]

            return rid or None

        def parse_review(c):

            # returns the details found in review block c or None if there's no user name;
            # a missing review text raises and is handled by the caller
            details = {'text': c.find_element_by_css_selector('div>div>div.entry>p.partial_entry').text.strip()}

            try:
                # this is where a username should sit
                username = c.find_element_by_css_selector(infotext_css).text.split('\n')[0]
            except Exception:
                # it turns out that sometimes there are legacy anonymous users called
                # A TripAdvisor Member; check if it's one of these - note a slightly different
                # structure
                try:
                    username = c.find_element_by_css_selector('div.member_info>div.info_text').text.split('\n')[0]
                    if 'member' in username.lower():
                        print(f'found an anonymous user called {username}')
                except Exception:
                    return None

            details['user'] = username
            details['user_profile_url'] = f'https://www.tripadvisor.com.au/Profile/{username}'

            try:
                userloc = c.find_element_by_css_selector(infotext_css + '>div.userLoc').text
                details['user_loc'] = userloc
                # which country?
                details['user_country'] = self.get_country(userloc)
            except Exception:
                details['user_loc'] = None

            try:
                details['review_date'] = arrow.get(c.find_element_by_css_selector('div>div>div>span.ratingDate')
                                                    .get_attribute('title'), 'D MMMM YYYY').format('YYYY-MM-DD')
            except Exception:
                details['review_date'] = None

            try:
                details['experience_date'] = arrow.get(c.find_element_by_css_selector('div>div>div>div.prw_reviews_stay_date_hsx').text, 'MMMM YYYY').format('YYYY-MM')
            except Exception:
                details['experience_date'] = None

            try:
                class_full = c.find_element_by_css_selector('div>div>div>span.ui_bubble_rating').get_attribute('class')
                details['rating'] = int(re.search(r'(?<=_)\d{1}', class_full).group(0))
            except Exception:
                details['rating'] = None

            return details

        # first go to the attraction page
        self.driver.get(attr_url)

        a_id = self.attr_id_from_url(attr_url)

        attr_name = None

        try:
            attr_name = self.driver.find_element_by_css_selector('div.attractionsHeader>h1#HEADING').text
        except Exception:
            try:
                # it's probably a tour provider
                tour_operator = self.driver.find_element_by_css_selector('h1[id="HEADING"][class|="attractions-supplier-profile-"]').text.strip()
                print(f'looks like a tour operator: {tour_operator}')
            except Exception:
                pass

        # if this is attraction without name, something is wrong and we are not collecting anything
        if not attr_name:
            return None

        # how many reviews does this attraction have?
        revs = self._review_count_this_attr()

        if revs < min_total_reviews:
            # too few reviews, doesn't make sense to proceed with this attraction
            print(f'only {revs:,} reviews, skipping..')
            return None

        # save total reviews for this attraction after filtering
        nreviews_filtered = self.select_filters()

        print(f'filtered reviews: {nreviews_filtered:,}')

        if not nreviews_filtered:
            print('skipping..')
            return None

        # reviews for THIS ATTRACTION
        reviews = defaultdict(lambda: defaultdict())
        # running totals: the last review number shown on every visited page
        reviews_on_page = [0]
        # pages in a row that gave no new reviews
        empty_pages = 0

        # keep going until total collected reviews reaches total filtered reviews
        while len(reviews) < nreviews_filtered:

            try:
                # pagination text presumably looks like 11-20 of 52 (TODO confirm); take the number after the dash
                reviews_this_page = int(WebDriverWait(self.driver, 10) \
                    .until(EC.visibility_of_element_located((By.CSS_SELECTOR,
                            'div[data-contextchoice="DETAIL"]>div.pagination-details'))).text.split('-')[1].strip().split()[0])
            except Exception:
                if nreviews_filtered <= 10:
                    reviews_this_page = nreviews_filtered
                else:
                    try:
                        # just count rating timestamps if any; added to the previous total because
                        # the pagination number above is a running total too
                        reviews_this_page = reviews_on_page[-1] + len(self.driver.find_elements_by_css_selector('span.ratingDate'))
                    except Exception as e:
                        raise Exception('failed to pick the number of reviews on this page!') from e

            reviews_on_page.append(reviews_this_page)

            to_pick = reviews_on_page[-1] - reviews_on_page[-2]

            picked_reviews = set()
            stalls = 0

            while (len(picked_reviews) < to_pick) and (stalls < max_stalls):

                picked_before = len(picked_reviews)

                # first unfold all reviews on the page
                for c in self.driver.find_elements_by_css_selector('div.entry>p.partial_entry>span[class~="ulBlueLinks"][onclick]'):
                    self.do_click(c)

                # which reviews are shown on this page?
                review_ids_this_page = set()

                for c in self.driver.find_elements_by_css_selector(review_block_css):
                    try:
                        _id = review_id_of(c)
                    except Exception:
                        # probably a stale element; it gets another chance on the next pass
                        continue
                    if _id:
                        review_ids_this_page.add(_id)

                # a set of review ids to collect on this page
                review_ids_to_collect_this_page = review_ids_this_page - set(reviews)

                if not review_ids_to_collect_this_page:
                    print('no new reviews on this page!')

                for c in self.driver.find_elements_by_css_selector('div.reviewSelector'):

                    try:
                        review_id = review_id_of(c)

                        if review_id not in review_ids_to_collect_this_page:
                            continue

                        details = parse_review(c)
                    except Exception:
                        continue

                    if details is None:
                        # no user name yet; don't store a half-collected review
                        continue

                    details['attr_name'] = attr_name
                    details['attr_id'] = a_id
                    details['attr_loc'] = attr_loc

                    reviews[review_id].update(details)

                    # remove this review id from the set of review ids to collect
                    review_ids_to_collect_this_page.discard(review_id)
                    # and put this id into the set of already collected ids
                    picked_reviews.add(review_id)

                if len(picked_reviews) == picked_before:
                    # nothing new on this pass; give the page a moment and try again a few times
                    stalls += 1
                    time.sleep(1)
                else:
                    stalls = 0

            empty_pages = 0 if picked_reviews else empty_pages + 1

            if empty_pages >= max_stalls:
                break

            # now move on to the next page if there is one
            current_url = self.driver.current_url

            if nreviews_filtered <= 10:
                last_page_url = current_url
            else:
                try:
                    last_page_url = self.driver.find_elements_by_css_selector('div.mobile-more>div>div.unified.ui_pagination>div.pageNumbers>a[href]')[-1].get_attribute('href')
                except Exception:
                    last_page_url = current_url

            if current_url == last_page_url:
                break

            # https://www.tripadvisor.com.au/Attraction_Review-g255097-d1063162-Reviews-or20-Mount_Wellington-Hobart_Greater_Hobart_Tasmania.html
            offset = re.search(r'(?<=-Reviews-or)\d+', current_url)

            if offset:
                next_page_url = current_url[:offset.start()] + str(int(offset.group(0)) + 10) + current_url[offset.end():]
            else:
                next_page_url = current_url.replace('Reviews-', 'Reviews-' + 'or' + str(10) + '-', 1)

            if next_page_url == current_url:
                break

            self.driver.get(next_page_url)

        if len(reviews) < nreviews_filtered:
            print(f'collected {len(reviews):,} out of {nreviews_filtered:,} filtered reviews')

        return reviews
    
    def _review_count_this_attr(self) -> int:

        """
        assuming you're on the attraction page, returns how many reviews there are in total for this attraction

        returns 0 if the page shows the "no reviews" rating bubbles or if the review count can't be
        found or parsed (a message is printed in the latter case)
        """

        # if there's no rating (so no reviews) simply return 0 reviews right away

        try:
            # if this one is present there are no reviews
            self.driver.find_element_by_css_selector('div.section.rating>a.ui_bubble_rating.noReviewsBubbles')
            return 0
        except Exception:
            # Exception rather than a bare except so that Ctrl-C still stops the scraper
            pass

        # otherwise, there have to be some reviews..

        try:
            # the piece of text that says how many reviews looks like this: 21,121 Reviews
            count_text = self.driver.find_element_by_css_selector('div.ratingContainer>a>span.reviewCount').text
            return int(count_text.replace(',', '').split()[0])
        except Exception:
            print('failed to find the total number of reviews!')

        return 0
        
            
    def get_reviews_from_attraction_pages(self, min_total_reviews=50, max_attr=1000):

        """
        assuming that we already have attraction URLs in self.collected_attractions, visit every URL and
        grab more details about the attraction as well as all reviews

        attractions without a name on the page, with fewer than min_total_reviews reviews or with no reviews
        left after self.select_filters() are skipped; for the others attr_name, category and total_reviews are
        written into self.collected_attractions, the filtered review count goes to self.FILTREVS[a_id] and
        the collected reviews are added to self.REVIEWS (by review id)

        note: max_attr is useful for testing - no more than max_attr attractions are visited

        note: a page is given up on after a few passes that bring no new reviews, and paging stops on the
              last page, so fewer reviews than the filtered total may be collected for an attraction

        the browser window is closed at the end; returns self
        """

        # css selectors used more than once
        review_block_css = 'div[class="reviewSelector"][id^="review"]'
        infotext_css = 'div.member_info>div>div.info_text'

        # how many passes (or pages) in a row without a single new review are tolerated
        max_stalls = 3

        def review_id_of(block):

            # normally the id sits in data-reviewid; otherwise fall back to the element id
            # which looks like review_489766616
            rid = block.get_attribute('data-reviewid')

            if not rid:
                rid = (block.get_attribute('id') or '').split('_')[-1]

            return rid or None

        def parse_review(c):

            # returns the details found in review block c or None if there's no user name;
            # a missing review text raises and is handled by the caller
            details = {'text': c.find_element_by_css_selector('div>div>div.entry>p.partial_entry').text.strip()}

            try:
                # this is where a username should sit
                username = c.find_element_by_css_selector(infotext_css).text.split('\n')[0]
            except Exception:
                # it turns out that sometimes there are legacy anonymous users called
                # A TripAdvisor Member; check if it's one of these - note a slightly different
                # structure
                try:
                    username = c.find_element_by_css_selector('div.member_info>div.info_text').text.split('\n')[0]
                    if 'member' in username.lower():
                        print(f'found an anonymous user called {username}')
                except Exception:
                    return None

            details['user'] = username
            details['user_profile_url'] = f'https://www.tripadvisor.com.au/Profile/{username}'

            try:
                userloc = c.find_element_by_css_selector(infotext_css + '>div.userLoc').text
                details['user_loc'] = userloc
                details['user_country'] = self.get_country(userloc)
            except Exception:
                details['user_loc'] = None

            try:
                details['review_date'] = arrow.get(c.find_element_by_css_selector('div>div>div>span.ratingDate')
                                                    .get_attribute('title'), 'D MMMM YYYY').format('YYYY-MM-DD')
            except Exception:
                details['review_date'] = None

            try:
                details['experience_date'] = arrow.get(c.find_element_by_css_selector('div>div>div>div.prw_reviews_stay_date_hsx').text, 'MMMM YYYY').format('YYYY-MM')
            except Exception:
                details['experience_date'] = None

            try:
                class_full = c.find_element_by_css_selector('div>div>div>span.ui_bubble_rating').get_attribute('class')
                details['rating'] = int(re.search(r'(?<=_)\d{1}', class_full).group(0))
            except Exception:
                details['rating'] = None

            try:
                details['votes'] = int(c.find_element_by_css_selector('span.ui_icon.thumbs-up-fill + span').text.strip())
            except Exception:
                # if there's no helpful notes there's no thumbs up icon
                details['votes'] = 0

            try:
                details['contributions'] = int(c.find_element_by_css_selector('span.ui_icon.pencil-paper + span').text.strip())
            except Exception:
                print('failed to get total contributions')
                details['contributions'] = 0

            return details

        # every (state, city, attraction id) to visit; a flat list so that reaching max_attr
        # stops the whole run and not just the current city
        targets = [(state_, city_, a_id)
                   for state_ in self.collected_attractions
                   for city_ in self.collected_attractions[state_]
                   for a_id in self.collected_attractions[state_][city_]]

        # total number of attractions to check out based on the total of unique attraction IDs
        natt = len({a_id for _, _, a_id in targets})

        # catt is the counter for processed attractions
        for catt, (state_, city_, a_id) in enumerate(targets, 1):

            if catt > max_attr:
                print(f'stopped because done {max_attr:,} attractions')
                break

            attraction = self.collected_attractions[state_][city_][a_id]

            self.driver.get(attraction['url'])

            try:
                attr_name = self.driver.find_element_by_css_selector('div.attractionsHeader>h1#HEADING').text
            except Exception:
                try:
                    # it's probably a tour provider
                    tour_operatior_name = self.driver.find_element_by_css_selector('h1[id="HEADING"][class|="attractions-supplier-profile-"]').text.strip()
                    print(f'looks like a tour operator: {tour_operatior_name}; skipping')
                except Exception:
                    print(f'failed to find name for attraction {a_id}, skipping..')
                continue

            print(f'attraction #{catt:,}/{natt:,}: [ID: {a_id}][{attr_name}]...')

            # how many reviews does this attraction have?
            revs = self._review_count_this_attr()

            if revs < min_total_reviews:
                # too few reviews, doesn't make sense to proceed with this attraction
                print(f'only {revs:,} reviews, skipping..')
                continue

            # save total reviews for this attraction after filtering
            self.FILTREVS[a_id] = self.select_filters()
            nfiltered = self.FILTREVS[a_id]

            print(f'filtered reviews: {nfiltered:,}')

            if not nfiltered:
                print('skipping..')
                continue

            # note that total reviews doesn't depend on filtering
            try:
                total_reviews = int(self.driver.find_element_by_css_selector('div.ratingContainer').text.split()[0].replace(',',''))
            except Exception:
                total_reviews = None

            try:
                category = self.driver.find_element_by_css_selector('span.attractionCategories').text.lower().strip()
            except Exception:
                category = None

            # add new attraction info
            attraction['attr_name'] = attr_name
            attraction['category'] = category
            attraction['total_reviews'] = total_reviews

            # reviews for THIS ATTRACTION
            reviews = defaultdict(lambda: defaultdict())
            # running totals: the last review number shown on every visited page
            reviews_on_page = [0]
            # pages in a row that gave no new reviews
            empty_pages = 0

            while len(reviews) < nfiltered:

                try:
                    # pagination text presumably looks like 11-20 of 52 (TODO confirm); take the number after the dash
                    reviews_this_page = int(WebDriverWait(self.driver, 10) \
                        .until(EC.visibility_of_element_located((By.CSS_SELECTOR,
                                'div[data-contextchoice="DETAIL"]>div.pagination-details'))).text.split('-')[1].strip().split()[0])
                except Exception:
                    if nfiltered <= 10:
                        reviews_this_page = nfiltered
                    else:
                        try:
                            # just count rating timestamps if any; added to the previous total because
                            # the pagination number above is a running total too
                            reviews_this_page = reviews_on_page[-1] + len(self.driver.find_elements_by_css_selector('span.ratingDate'))
                        except Exception as e:
                            raise Exception('failed to pick the number of reviews on this page!') from e

                reviews_on_page.append(reviews_this_page)

                to_pick = reviews_on_page[-1] - reviews_on_page[-2]

                picked_reviews = set()
                stalls = 0

                while (len(picked_reviews) < to_pick) and (stalls < max_stalls):

                    picked_before = len(picked_reviews)

                    # first unfold all reviews on the page
                    for c in self.driver.find_elements_by_css_selector('div.entry>p.partial_entry>span[class~="ulBlueLinks"][onclick]'):
                        self.do_click(c)

                    # which reviews are shown on this page?
                    review_ids_this_page = set()

                    for c in self.driver.find_elements_by_css_selector(review_block_css):
                        try:
                            _id = review_id_of(c)
                        except Exception:
                            # probably a stale element; it gets another chance on the next pass
                            continue
                        if _id:
                            review_ids_this_page.add(_id)

                    # a set of review ids to collect on this page
                    review_ids_to_collect_this_page = review_ids_this_page - set(reviews)

                    if not review_ids_to_collect_this_page:
                        print('no new reviews on this page!')

                    for c in self.driver.find_elements_by_css_selector('div.reviewSelector'):

                        try:
                            review_id = review_id_of(c)

                            if review_id not in review_ids_to_collect_this_page:
                                continue

                            details = parse_review(c)
                        except Exception:
                            continue

                        if details is None:
                            # no user name yet; don't store a half-collected review
                            continue

                        details['attr_name'] = attr_name
                        details['attr_id'] = a_id
                        details['attr_loc'] = city_

                        reviews[review_id].update(details)

                        # remove this review id from the set of review ids to collect
                        review_ids_to_collect_this_page.discard(review_id)
                        # and put this id into the set of already collected ids
                        picked_reviews.add(review_id)

                    if len(picked_reviews) == picked_before:
                        # nothing new on this pass; give the page a moment and try again a few times
                        stalls += 1
                        time.sleep(1)
                    else:
                        stalls = 0

                # add reviews for THIS attraction to the dictionary of ALL reviews
                self.REVIEWS.update(reviews)

                empty_pages = 0 if picked_reviews else empty_pages + 1

                if empty_pages >= max_stalls:
                    break

                # now try to go to the next page
                current_url = self.driver.current_url

                if nfiltered <= 10:
                    last_page_url = current_url
                else:
                    try:
                        last_page_url = self.driver.find_elements_by_css_selector('div.mobile-more>div>div.unified.ui_pagination>div.pageNumbers>a[href]')[-1].get_attribute('href')
                    except Exception:
                        last_page_url = current_url

                if current_url == last_page_url:
                    # nowhere else to go
                    break

                # https://www.tripadvisor.com.au/Attraction_Review-g255097-d1063162-Reviews-or20-Mount_Wellington-Hobart_Greater_Hobart_Tasmania.html
                offset = re.search(r'(?<=-Reviews-or)\d+', current_url)

                if offset:
                    # only the offset right after -Reviews-or is touched
                    next_page_url = current_url[:offset.start()] + str(int(offset.group(0)) + 10) + current_url[offset.end():]
                else:
                    next_page_url = current_url.replace('Reviews-', 'Reviews-' + 'or' + str(10) + '-', 1)

                if next_page_url == current_url:
                    break

                self.driver.get(next_page_url)

            if len(reviews) < nfiltered:
                print(f'collected {len(reviews):,} out of {nfiltered:,} filtered reviews')

        self.driver.close()

        return self
        
    def save_reviews(self):

        """
        save self.REVIEWS as a JSON file in self.COLLECT_DIR

        the file name is made of the word reviews, the upper-cased self.MAIN_LOCATION (spaces replaced with
        underscores) and the traveller_type, traveller_rating and time_of_year values from self.FILTERS,
        all joined with dashes

        nothing is written if there are no reviews; returns self
        """

        if not self.REVIEWS:
            print('no reviews so nothing to save...')
            return self

        file = '-'.join(['reviews', self.MAIN_LOCATION.replace(" ","_").upper(),
                         self.FILTERS['traveller_type'], self.FILTERS['traveller_rating'],
                         self.FILTERS['time_of_year']]) + '.json'

        # the with block makes sure the file is flushed and closed even if dumping fails
        with open(os.path.join(self.COLLECT_DIR, file), 'w') as f:
            json.dump(self.REVIEWS, f)

        print(f'saved {len(self.REVIEWS):,} reviews')

        return self
    
    def save_attractions(self):

        """
        dump the collected attractions (self.collected_attractions) to a JSON file in self.COLLECT_DIR

        the file is called attractions-<MAIN LOCATION>.json where the location name is upper-cased and
        has its spaces replaced by underscores

        returns self so that the call can be used in a chain, same as save_reviews
        """

        file = f'attractions-{self.MAIN_LOCATION.replace(" ","_").upper()}.json'

        # the context manager makes sure the file is flushed and closed even if serialisation fails
        with open(os.path.join(self.COLLECT_DIR, file), 'w') as f:
            json.dump(self.collected_attractions, f)

        return self
        
    def find_location_id(self, loc_name, loc_country):
        
        """
        find a TripAdvisor location ID by location name and country
        
        note: this method works if TripAdvisor has been given access to your location; otherwise, it's showing an annoying
              request to give permissions; there seems to be no reliable way to get rid of it, at least the Chrome options
              that are supposed to do it are not working properly

        loc_name: location name, e.g. a city; it is typed into the search box and compared (case-insensitively)
                  with what the "where to" pill shows after the search
        loc_country: country name; typed into the search box after loc_name, separated by a comma

        returns a tuple (loc_name, loc_id) where loc_id is the run of digits following '-g' in the URL
        of the page the search ended on (digits only, without the 'g'); returns None if the pill doesn't 
        show loc_name after the search or if there's no such ID in the URL
        """

        pill_selector = 'div[class^="brand-trip-search-geopill-"]>div.ui_pill'
    
        self.driver.get('https://www.tripadvisor.com.au')

        whereto_pill = WebDriverWait(self.driver, 20) \
                            .until(EC.visibility_of_element_located((By.CSS_SELECTOR, pill_selector)))

        whereto_pill.click()

        txtinput = WebDriverWait(self.driver, 20) \
                            .until(EC.element_to_be_clickable((By.CSS_SELECTOR, 
                                    'div[class^="input-text-input-UnderlinedTextInput__input_container"]' \
                                     '>input[type="text"][placeholder]')))

        txtinput.clear()
        txtinput.send_keys(', '.join([loc_name, loc_country]))
        txtinput.send_keys(Keys.ENTER)

        # locate the pill again after submitting the search and check what it shows now
        whereto_pill = WebDriverWait(self.driver, 20) \
                            .until(EC.visibility_of_element_located((By.CSS_SELECTOR, pill_selector)))

        if whereto_pill.text.lower().strip() != loc_name.lower().strip():
            return None

        # current url is now expected to look like 'https://www.tripadvisor.com.au/Home-g255060';
        # it has to be read from this object's own driver and not from some global Trip instance
        id_match = re.search(r'(?<=-g)\d+', self.driver.current_url)

        if id_match is None:
            print(f'reached {loc_name} main page but failed to extract attraction ID!')
            return None

        return loc_name, id_match.group(0)
                
    def get_hotel_info(self, loc_hotels_url='https://www.tripadvisor.com.au/Hotels-g255060-Sydney_New_South_Wales-Hotels.html'):
        
        """
        collect name, URL and rating for the hotels listed for a location

        loc_hotels_url: URL of the first page of the hotel list for the location; URLs of the following
                        pages are derived from it by inserting an offset ('oa30-', 'oa60-', ...) right 
                        after the location ID, e.g. Hotels-g255060-oa30-Sydney_New_South_Wales-Hotels.html

        returns a list with one dictionary per hotel; each has keys 'name' and 'url' plus 'rating' 
        if a rating was found; hotels without a name or URL are skipped; 
        returns None if the total number of accommodations can't be found on the first page
        """
        
        self.driver.get(loc_hotels_url)
        
        # the total itself isn't needed further on; it's only parsed to check that the page 
        # looks like a hotel list
        try:
            int(self.driver.find_element_by_css_selector('div[class^="hotels-sort-filter-header"]>span>span[class^="hotels-sort-filter-header"]') \
                               .text.split()[0].replace(',',''))
        except Exception:
            print('failed to find total number of accommodations, quitting..')
            return None
        
        pginator = self.driver.find_element_by_css_selector('div.unified.ui_pagination.standard_pagination.ui_section.listFooter[data-numpages]')
        ppages = int(pginator.get_attribute('data-numpages'))
        
        # offset step between consecutive pages in page URLs
        per_page = 30
        
        page_urls = [self.driver.current_url]
        
        # build URLs for page 2 and on from the URL we were given (and not from a hard-coded location)
        id_part = re.search(r'Hotels-g\d+-', loc_hotels_url)
        
        if id_part is None:
            print(f'failed to derive page URLs from {loc_hotels_url}, collecting the first page only..')
        else:
            head, tail = loc_hotels_url[:id_part.end()], loc_hotels_url[id_part.end():]
            page_urls.extend(f'{head}oa{per_page*(p-1)}-{tail}' for p in range(2, ppages + 1))
        
        hls = []
        
        for i, p in enumerate(page_urls, 1):
            
            # if Next is disabled on the page we are on, it's the last page and it has been collected 
            # already, so stop here instead of collecting the same hotels again
            if i > 1 and self.driver.find_elements_by_css_selector('span.nav.next.ui_button.primary.disabled'):
                break
            
            print(f'page #{i:.0f}...')
            
            # the first page is already open
            if i > 1:
                self.driver.get(p)
                    
            # fixed pause before reading the listings (presumably to let the page finish loading)
            time.sleep(5)
            
            for h in self.driver.find_elements_by_css_selector('div[class^="prw_rup"][data-prwidget-name][data-mlv]>div>div[data-locationid][data-url]'):
                
                # without a default factory this behaves like a plain dict; the type is kept as is 
                # so that callers get the same objects as before
                hotel = defaultdict()
                
                try:
                    hotel['name'] = h.find_element_by_css_selector('div>div>div.listing_title').text.strip()
                except Exception:
                    print('failed to find hotel name!')
                    continue
                    
                try:
                    hotel['url'] = 'https://www.tripadvisor.com.au' + h.get_attribute('data-url')
                except Exception:
                    print(f'failed to find hotel URL for {hotel["name"]}!')
                    continue
                    
                # rating is encoded in the class of the bubble element as a number after 'e_', 
                # ten times the actual rating; a hotel without rating is still kept
                try:
                    bubble_class = h.find_element_by_css_selector('div>div.main-cols>div.info-col>div>a[data-clicksource="BubbleRating"]') \
                                        .get_attribute('class')
                    hotel['rating'] = float(re.search(r'(?<=e_)\d+', bubble_class).group(0))/10.0
                except Exception:
                    print(f'failed to find rating for {hotel["name"]}!')
                    
                hls.append(hotel)
              
        return hls

In [None]:
if __name__ == '__main__':
    
    # bind t to the Trip instance itself rather than to the return value of the last call 
    # in the chain, so that t stays usable whatever that call returns
    t = Trip(filter={'traveller_rating': 'Excellent', 
                     'traveller_type': 'Friends', 
                     'time_of_year': 'Jun-Aug',  # note that this refers to review date (and NOT experience date)
                     'language': 'English'})
    
    # find attraction pages for the location, collect reviews from them and save everything
    t.get_attraction_pages('tasmania') \
        .get_reviews_from_attraction_pages(min_total_reviews=170, max_attr=2) \
            .save_reviews() \
                .save_attractions()

In [25]:
# create a Trip with the filter settings below; the following cells call its methods one at a time
t = Trip(filter=dict(traveller_rating='Excellent',
                     traveller_type='Friends',
                     time_of_year='Jun-Aug',  # note that this refers to review date (and NOT experience date)
                     language='English'))


[FILTER]: traveller_rating: Excellent | traveller_type: Friends | time_of_year: Jun-Aug | language: English



In [None]:
# process a single user profile page with verbose output on; the call is expected to return two values
# (presumably user info and that user's reviews, going by the names - confirm against process_user_profile)
ui, ur = t.process_user_profile('https://www.tripadvisor.com.au/Profile/JVD-Netherlands', verbose=True)

In [26]:
# collect hotel names, URLs and ratings using the default hotel list URL (Sydney, New South Wales)
t.get_hotel_info()

page #1...
page #2...
page #3...
page #4...
page #5...
page #6...
page #7...
page #8...
page #9...
failed to find rating for DD Apartments Fish Market!
failed to find rating for Buxton House!
failed to find rating for Stylish Waterfront Apt in Rhodes!
failed to find rating for Camperdown Rooms!
failed to find rating for The Lady Hampshire!
failed to find rating for Sky Garden Studios!
failed to find rating for DD Apartments on Darling Harbour!
page #10...
failed to find rating for Getaway Holiday Bankstown!
failed to find rating for Cozy 5 Bed House In Sydney!
page #11...
page #12...
failed to find rating for Sweet Home!
page #13...
failed to find rating for Astra Apartments North Sydney!
failed to find rating for The Bayside Hotel!
failed to find rating for Bedlam Mates Place Backpackers!
failed to find rating for Bligh Hotel!
failed to find rating for Blue Sky House in Western Sydney!
failed to find rating for BridgeStreet Windsor on Kent!
failed to find rating for Cass White Hotel!


[defaultdict(None,
             {'name': 'Shangri-La Hotel Sydney',
              'url': 'https://www.tripadvisor.com.au/Hotel_Review-g255060-d256528-Reviews-Shangri_La_Hotel_Sydney-Sydney_New_South_Wales.html',
              'rating': 4.5}),
 defaultdict(None,
             {'name': 'PARKROYAL Darling Harbour Sydney',
              'url': 'https://www.tripadvisor.com.au/Hotel_Review-g255060-d257797-Reviews-PARKROYAL_Darling_Harbour_Sydney-Sydney_New_South_Wales.html',
              'rating': 4.5}),
 defaultdict(None,
             {'name': 'Cambridge Hotel Sydney',
              'url': 'https://www.tripadvisor.com.au/Hotel_Review-g255060-d255423-Reviews-Cambridge_Hotel_Sydney-Sydney_New_South_Wales.html',
              'rating': 4.0}),
 defaultdict(None,
             {'name': 'Sofitel Sydney Darling Harbour',
              'url': 'https://www.tripadvisor.com.au/Hotel_Review-g255060-d11687464-Reviews-Sofitel_Sydney_Darling_Harbour-Sydney_New_South_Wales.html',
              'rating': 4.5