In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time
import arrow
import random

import numpy as np

import subprocess
import zipfile

from attraction import Attraction
from review import Review
from user import User

In [2]:
 class Trip:
    
    def __init__(self, filter):

        """
        load the known location ids, start a Chrome session and set up the containers used while scraping

        filter: dict of review filters keyed by 'traveller_rating', 'traveller_type', 'time_of_year'
                and 'language'; entries with a falsy value are left out of the summary line printed here

        raises whatever open()/json.load() raise if tradvisor_location_ids.json is missing or broken
        """

        # read the location ids BEFORE starting the browser: if the file is missing we fail right here
        # instead of leaving an orphaned Chrome window behind; 'with' makes sure the file gets closed
        with open('tradvisor_location_ids.json') as f:
            self.LOCATION_IDS = json.load(f)

        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--incognito')
        options.add_argument('--start-maximized')
        # presumably 2 stands for "block", i.e. no notification permission prompts - TODO confirm
        prefs = {"profile.default_content_setting_values.notifications" : 2}
        options.add_experimental_option("prefs",prefs)

        # options.add_argument('--headless')

        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)

        # number of reviews left after filtering, by attraction id
        self.FILTREVS = defaultdict()
        # all collected reviews, by review id
        self.REVIEWS = defaultdict()

        # counter to see how many annoying things get killed
        self.KILLED = defaultdict(int)

        self.FILTERS = filter

        print(f'filter: {" | ".join([k + ":" + str(v) for k, v in self.FILTERS.items() if v])}')
    
    def do_click(self, e, max_=3):
        
        """
        try to click on element e and return True if it worked or False otherwise
        """
        
        # assume not click
        _clicked = False
        _c = 0

        while (not _clicked) and (_c < max_):

            try:
                e.click()
                _clicked = True
            except:
                try:
                    self.driver.find_element_by_css_selector('span.ui_overlay>div.ui_close_x').click()
                    self.KILLED['overlay'] += 1
                except:
                    # try to catch and close all sliders
                    els = list(self.driver.find_elements_by_css_selector('div[class^="QSISlider"]>div'))
                    for i, d in enumerate(els):
                        if d.text.strip().lower() == 'Not right now, thanks.'.strip().lower():
                            try:
                                els[i-1].click()
                                self.KILLED['slide'] += 1
                                break
                            except:
                                continue
                    for _ in self.driver.find_elements_by_css_selector('div.sbx_close[onclick]'):
                        try:
                            _.click()
                            self.KILLED['infobar'] += 1
                        except:
                            pass        
            _c += 1

        return _clicked
    
    def process_paginator(self, pg, off=30, pref='oa'):
        
        print('processing paginator..')
        
        p_numbers = [int(s.text.strip()) for s in pg.find_elements_by_css_selector('div') if s.text.strip().isdigit()]
        
        if p_numbers:
            total_pages = p_numbers[-1]
        else:
            raise Exception('no pages in paginator!')
        
        current_page_url = self.driver.current_url
        last_page_url = self.driver.current_url
        
        page_urls = [current_page_url]
        
        for _ in pg.find_elements_by_css_selector('div>a'):
            if _.text.isdigit():
                last_page_url = _.get_attribute('href')
                
        if total_pages > 1:
            for i in range(1, total_pages):
                # starts from page 2 (page 1 has no -oa[number]- part)
                page_urls.append(re.sub('Activities-', 'Activities-' + pref + str(off*i) + '-', current_page_url))
                
        return total_pages, page_urls
        
    def get_attraction_pages(self, loc):
        
        """
        go to the home page for location loc and collect all attractions; if loc is a state, collect attractions
        for every location in that state
        
        """
        
        # home_urls will be a list of tuples like 
        # [('tasmania', 'hobart', 'https://www.tripadvisor.com.au/Home-g255097'), 
        #  ('tasmania', 'launceston', 'https://www.tripadvisor.com.au/Home-g255344')...
        
        home_urls = []
        
        self.collected_attractions = defaultdict(lambda: defaultdict(lambda: defaultdict()))
        
        """
        {'tasmania': {'hobart': {attr_id: {name: market,
                                            url: 'https:/www...'}}}
        """
        
        # if loc is state
        if loc in self.LOCATION_IDS:
            for city in self.LOCATION_IDS[loc]:
                home_urls.append((loc, city, f'https://www.tripadvisor.com.au/Home-{self.LOCATION_IDS[loc][city]}'))
        else:
            for state_ in self.LOCATION_IDS:
                if loc in self.LOCATION_IDS[state_]:
                    home_urls.append((state_, loc, f'https://www.tripadvisor.com.au/Home-{self.LOCATION_IDS[state_][loc]}'))
                    
        if not home_urls:
            print(f'no attractions to pick for {loc.upper()}!')
            return self
        
        for state_, city_, homeurl in home_urls:
            
            print('getting attractions for ' + '/'.join([state_, city_, homeurl]))
            
            self.driver.get(homeurl)
            
            # ---- find and click the Things to Do icon; assume it MUST be there
            
            thingstodo_clicked = False
            
            while not thingstodo_clicked:
                
                try:
                    things_to_do_icon = WebDriverWait(self.driver, 10) \
                                .until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.ui_icon.attractions + span')))
                    if things_to_do_icon.text.strip().lower() == 'Things to do'.strip().lower():
                        print('this is the right things to do span')
                except:
                    raise Exception('failed to find Things to do icon!')

                res = self.do_click(things_to_do_icon)

                if not res:
                    print('failed to click Things to do icon! retrying..')
                else:
                    thingstodo_clicked = True
                
            # ------- ok, so clicked the icon; now the question is whether there are many 'top' things to 
            # do or just a few; the latter means no need to look for the See More button
            
            moveon = False
            few_attractions = False
            
            # try to click See More first
            
            while 1:
                
                try:
                    c_mo = WebDriverWait(self.driver, 10) \
                            .until(EC.element_to_be_clickable((By.CSS_SELECTOR, 
                                    'div[class|="attractions-attraction-overview-main-TopPOIs__see_more"]')))

                    print('See More is available')
                except:
                    # if there's no See More it's just 1 page with attractions
                    page_urls = [self.driver.current_url]
                    few_attractions = True
                    break
                    
                # if we got to here, there is See More, let's try to click it..  
                print('trying to click See More..')
                
                seeless = False
                
                while not seeless:
                    
                    print('clicking See more now..')
                    res = self.do_click(c_mo)
                    
                    # didn't click..
                    if not res:
                        print('did NOT click See More')
                        try:
                            # there is this See More button but still folded, so apparently we didn't click properly
                            self.driver.find_element_by_css_selector('span.ui_icon.single-chevron-down')
                            print('There is See more, folded')
                        except:
                            print('there is no See More now..')
                            seeless = True
                    # look like clicked See More..
                    else:
                        print('clicked SEE MORE')
                        try:
                            print('checking for See Less')
                            WebDriverWait(self.driver, 10) \
                                .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                    'span.ui_icon.single-chevron-up')))

                            seeless = True
                            print('See Less is there')
                        except:
                            print('but no See Less, need to click See more again')
                break                
            
            # paginator only available if there are many attractions; however, there may be no paginator if See More
            # was clicked but there are still too few attractions
            
            if not few_attractions:
                
                try:
                    pg = WebDriverWait(self.driver, 10) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                    'div[class|="attractions-attraction-overview-main-Pagination__container"]')))
                except:
                    pg = None
                    print('no paginator even though there was See More..')
                
                if pg:
                    total_pages, page_urls = self.process_paginator(pg) 
                else:
                    page_urls = [self.driver.current_url]

            print('visiting attraction pages..')

            for i, attr_page_url in enumerate(page_urls, 1):
                
                print(f'page {i}/{len(page_urls)}..', end='')

                if attr_page_url != self.driver.current_url:
                    self.driver.get(attr_page_url)
                
                if i == 1:
                    
                    attr_topick_css = 'div[class|="attractions-attraction-overview-pois-PoiGrid__wrapper"]' + \
                                      '>li[class^="attractions-attraction-overview-pois-PoiCard__item"]' + \
                                      '>div[class|="attractions-attraction-overview-pois-PoiCard__card_info"]'
                            
                    attr_topick = len(self.driver.find_elements_by_css_selector(attr_topick_css))
                    
                    while attr_topick:
                        
                        for attr_card in self.driver.find_elements_by_css_selector(attr_topick_css):

                            try:
                                url_ = attr_card.find_element_by_css_selector('div>a[class|="attractions-attraction-overview-pois-PoiInfo__name"]') \
                                                        .get_attribute('href')
                            except:
                                print('didn\'t get attraction url! moving on to next attraction card..')
                                continue
                            
                            try:
                                id_ = re.search(r'(?<=-)d\d+(?=-)', url_).group(0)
                            except:
                                print('failed to extract attraction id!')
                                continue
                            
                            try:
                                self.collected_attractions[state_][city_][id_] = {'url': url_}
                                attr_topick -= 1
                            except:
                                pass

                elif i > 1:
                    
                    attr_topick_css = 'div.attraction_list>div' + \
                                      '>div>div.listing>div.listing_details>div.listing_info'
                        
                    attr_topick = len(self.driver.find_elements_by_css_selector(attr_topick_css))
                    
                    while attr_topick:
                        
                        for attr_card in self.driver.find_elements_by_css_selector(attr_topick_css):

                            try:
                                url_ = attr_card.find_element_by_css_selector('div.tracking_attraction_title.listing_title>a').get_attribute('href')
                            except:
                                print('didn\'t get attraction url! moving on to next attraction card..')
                                continue
                                
                            try:
                                id_ = re.search(r'(?<=-)d\d+(?=-)', url_).group(0)
                            except:
                                print('failed to extract attraction id!')
                                continue
                            
                            try:
                                self.collected_attractions[state_][city_][id_] = {'url': url_}
                                attr_topick -= 1
                            except:
                                pass
            
                print('ok')
    
        return self

    def select_filters(self):
        
        """
        apply filter on the attraction page and return the number of reviews available
        after the filter has been applied
        """
        
        d = {'traveller_rating': {'data-name': 'ta_rating',
                                  'input-values': {'Excellent': '5',
                                                   'Very good': '4',
                                                   'Average': '3',
                                                   'Poor': '2',
                                                   'Terrible': '1'},
                                 'pick': self.FILTERS['traveller_rating']},
            'traveller_type': {'data-name': 'traveler_filter',
                               'input-values': {'Families': '3',
                                                'Couples': '2',
                                                'Solo': '5',
                                                'Business': '1',
                                                'Friends': '4'},
                              'pick': self.FILTERS['traveller_type']},
            'time_of_year': {'data-name': 'season',
                             'input-values': {'Mar-May': '1',
                                              'Jun-Aug': '2',
                                              'Sep-Nov': '3',
                                              'Dec-Feb': '4'},
                            'pick': self.FILTERS['time_of_year']},
            'language': {'data-name': 'language',
                         'input-values': {'English': 'en',
                                          'Japanese': 'ja'},
                         'pick': self.FILTERS['language']}}

        def is_selected(css_selector_st):

            try:
                WebDriverWait(self.driver, 5) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                                   css_selector_st + '>input[checked="checked"]')))
                return True

            except:
                return False

        def _click(css_selector_st, max_attempts=3):

            times_tried = 0

            flag_before = is_selected(css_selector_st)
            flag_after = flag_before

            while (times_tried <= max_attempts) and (flag_after == flag_before):

                times_tried += 1   

                try:
                    e = WebDriverWait(self.driver, 20) \
                            .until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector_st)))
                except:
                    e = None
                    print(f'failed to find {css_selector_st}!')
                
                if e:
                    res = self.do_click(e)

                flag_after = is_selected(css_selector_st)    

            return (flag_after != flag_before)


        for filt in d:

            value = d[filt]['pick']

            # uncheck everything else
            to_uncheck = [other_value for other_value in d[filt]['input-values'] if other_value != value]

            if to_uncheck:

                for other_value in to_uncheck:

                    tr_pick = d[filt]['input-values'][other_value]
                    dname = d[filt]['data-name']
                    st = f'div.choices[data-name="{dname}"]>div[data-value="{tr_pick}"]'

                    if is_selected(st):
                        res = _click(st)

            if value:

                tr_pick = d[filt]['input-values'][value]
                dname = d[filt]['data-name']
                st = f'div.choices[data-name="{dname}"]>div[data-value="{tr_pick}"]'

                if is_selected(st):
                    continue
                else:
                    _selected =  _click(st)

        try:
            lang_code = d['language']['input-values'][d['language']['pick']] 
            css_count = f'div.choices[data-name="language"]>div[data-value="{lang_code}"]>label.label>span.count'
            c_txt = WebDriverWait(self.driver, 6) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, css_count))).text.strip()
            c = int(re.sub(r'[(,)]','',c_txt))
        except:
            # if no review count it's because there are not reviews left after filtering
            return 0
            
        return int(c)
            
    def attraction_pages(self, min_total_reviews=50):
        
        for state_ in self.collected_attractions:
            for city_ in self.collected_attractions[state_]:
                for a_id in self.collected_attractions[state_][city_]:

                    self.driver.get(self.collected_attractions[state_][city_][a_id]['url'])
                    
                    attr_details = defaultdict(name=None, total_reviews=None, category=None, rating_chart=None)
                    
                    try:
                        attr_details['name'] = self.driver.find_element_by_css_selector('div.attractionsHeader>h1#HEADING').text
                    except:
                        print(f'failed to find name for attraction {a_id}, skipping..')
                        continue
                        
                    print(f'{a_id}: {attr_details["name"]}...')
                    
                    # how namy reviews does this attractio have?
                    # if there's no rating (so no reviews) simply return 0 reviews right away
                    try:
                        self.driver.find_element_by_css_selector('div.section.rating>a.ui_bubble_rating.noReviewsBubbles')
                        revs = 0
                    except BaseException as e:
                        # so there are some reviews..
                        try:
                            # 21,121 Reviews
                            revs = int(self.driver.find_element_by_css_selector('div.ratingContainer>a>span.reviewCount') \
                                        .text.replace(',','').split()[0])
                        except:
                            print('failed to find the total number of reviews!')
                            revs = 0
                    
                    if revs < min_total_reviews:
                        # too few reviews, doesn't make sense to proceed with this attraction
                        print(f'only {revs:,} reviews, skipping..')
                        continue
                        
                    self.FILTREVS[a_id] = self.select_filters()
                    
                    print(f'filtered reviews: {self.FILTREVS[a_id]:,}')

                    if not self.FILTREVS[a_id]:
                        print('skipping..')
                        continue

                    try:
                        attr_details['total_reviews'] = self.driver.find_element_by_css_selector('div.ratingContainer').text.lower()
                    except:
                        pass

                    try:
                        attr_details['category'] = self.driver.find_element_by_css_selector('span.attractionCategories').text
                    except:
                        pass

                    try:
                        attr_details['rating_chart'] = [tuple(l.strip().split('\n')) for l in self.driver.find_element_by_css_selector('ul.ratings_chart').text.split('%') if l.strip()]
                    except:
                        pass

                    # reviews for THIS ATTRACTION
                    reviews = defaultdict()
                    reviews_on_page = [0]

                    while len(reviews) < self.FILTREVS[a_id]:

                        try:
                            reviews_this_page = int(WebDriverWait(self.driver, 10) \
                                .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                        'div[data-contextchoice="DETAIL"]>div.pagination-details'))).text.split('-')[1].strip().split()[0])
                            reviews_on_page.append(reviews_this_page)
                        except:
                            if self.FILTREVS[a_id] <= 10:
                                reviews_on_page.append(self.FILTREVS[a_id])
                            else:
                                raise Exception('failed to pick the number of reviews on this page!') 

                        picked_reviews = set()
                        to_pick = reviews_on_page[-1] - reviews_on_page[-2]

                        while len(picked_reviews) < to_pick:

                            # first unfold all reviews on the page

                            for c in self.driver.find_elements_by_css_selector('div.entry>p.partial_entry>span[class~="ulBlueLinks"][onclick]'):

                                self.do_click(c)

                            # and now collect all full review texts
                            review_ids_this_page = set()
                
                            while not review_ids_this_page:
                        
                                try:
                                    review_ids_this_page = {c.get_attribute('data-reviewid') for c in 
                                                         self.driver.find_elements_by_css_selector('div.reviewSelector')}
                                except:
                                    pass

                            review_ids_to_collect_this_page = review_ids_this_page - set(reviews)

                            if review_ids_to_collect_this_page:

                                tot_revs = len(review_ids_to_collect_this_page)

                            else:
                                tot_revs = 0
                                print('no new reviews on this page!')

                            p = 0

                            while p < tot_revs:

                                for c in self.driver.find_elements_by_css_selector('div.reviewSelector'):        

                                    try:

                                        review_id = c.get_attribute('data-reviewid')

                                        if review_id in review_ids_to_collect_this_page:

                                            reviews[review_id] = {'text': c.find_element_by_css_selector('div>div>div.entry>p.partial_entry').text.strip()}
                                            reviews[review_id].update({'user': c.find_element_by_css_selector('div.member_info').text})
                                            picked_reviews.add(review_id)

                                            p += 1

                                        else:
                                            print('this review already collected')
                                            continue
                                    except:
                                        continue

                        # now try to click Next
                        npage_url = None

                        if self.FILTREVS[a_id] <= 10:
                            last_page_url = self.driver.current_url
                        else:
                            try:
                                last_page_url = list(self.driver.find_elements_by_css_selector('div.mobile-more>div>div.unified.ui_pagination>div.pageNumbers>a[href]'))[-1].get_attribute('href')
                            except:
                                last_page_url = self.driver.current_url 

                        if self.driver.current_url != last_page_url:

                            # https://www.tripadvisor.com.au/Attraction_Review-g255097-d1063162-Reviews-or20-Mount_Wellington-Hobart_Greater_Hobart_Tasmania.html

                            try:
                                pref = int(re.search(r'(?<=-Reviews-or)\d+', self.driver.current_url).group(0))
                            except:
                                pref = None

                            if pref:
                                next_page_url = self.driver.current_url.replace('or' + str(pref), 'or' + str(pref+10))
                            else:
                                next_page_url = self.driver.current_url.replace('Reviews-', 'Reviews-' + 'or' + str(10) + '-')

                            if next_page_url != self.driver.current_url:
                                self.driver.get(next_page_url)

                        # add reviews for THIS attraction to the dictionary of ALL reviews
                        self.REVIEWS.update(reviews)
                
        return self
        
    def save_reviews(self):
            
        file = '_'.join(['reviews', self.FILTERS['traveller_type'], self.FILTERS['traveller_rating'], self.FILTERS['time_of_year']]) + '.json'
            
        json.dump(self.REVIEWS, open(file, 'w'))
            
        return self

In [3]:
if __name__ == '__main__':

    # keep hold of the scraper BEFORE running it: if any of the steps below fails, whatever has been
    # collected so far is still there in t.REVIEWS and t.collected_attractions
    t = Trip(filter={'traveller_rating': 'Excellent',
                      'traveller_type':'Solo',
                       'time_of_year': 'Jun-Aug',
                       'language':'English'})

    try:
        t.get_attraction_pages('tasmania').attraction_pages(min_total_reviews=500).save_reviews()
    finally:
        # close Chrome no matter what, otherwise every run leaves a browser behind
        t.driver.quit()

filter: traveller_rating:Excellent | traveller_type:Solo | time_of_year:Jun-Aug | language:English
getting attractions for tasmania/hobart/https://www.tripadvisor.com.au/Home-g255097
this is the right things to do span
clicked ok
See More is available
trying to click See More..
clicking See more now..
clicked ok
clicked SEE MORE
checking for See Less
See Less is there
processing paginator..
visiting attraction pages..
page 1/5..ok
page 2/5..ok
page 3/5..ok
page 4/5..ok
page 5/5..ok
getting attractions for tasmania/launceston/https://www.tripadvisor.com.au/Home-g255344
this is the right things to do span
clicked ok
See More is available
trying to click See More..
clicking See more now..
clicked ok
clicked SEE MORE
checking for See Less
See Less is there
processing paginator..
visiting attraction pages..
page 1/3..ok
page 2/3..ok
page 3/3..ok
getting attractions for tasmania/swansea/https://www.tripadvisor.com.au/Home-g504332
this is the right things to do span
clicked ok
visiting attrac