In [4]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time
import arrow
import random

import numpy as np

import subprocess
import zipfile

from attraction import Attraction
from review import Review
from user import User

In [25]:
class Trip:
    """Selenium-driven crawler for TripAdvisor attraction listings.

    Typical use: ``Trip().get_attraction_pages(url).attraction_pages()``.
    Attraction URLs found by ``get_attraction_pages`` are kept in
    ``self.collected_attractions`` as ``{attraction_id: {'url': ...}}``.
    """

    def __init__(self):
        """Start a Chrome session using the driver binary at ``webdriver/chromedriver``."""

        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--incognito')
        options.add_argument('--start-maximized')
        # presumably suppresses the browser notification prompt - confirm against Chrome prefs
        prefs = {"profile.default_content_setting_values.notifications" : 2}
        options.add_experimental_option("prefs",prefs)

        # options.add_argument('--headless')

        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)

        # filled by get_attraction_pages(); created here so attraction_pages()
        # can be called safely before anything has been collected
        # (defaultdict without a factory behaves like a plain dict)
        self.collected_attractions = defaultdict()

    def do_click(self, e):
        """Click element ``e``, dismissing blocking pop-ups between attempts.

        Up to three click attempts are made. After every failed attempt the
        known pop-ups (overlay, survey slide) are dismissed on a best-effort
        basis and the click is retried.

        Returns:
            True only if ``e.click()`` itself succeeded, otherwise False.
        """

        for _ in range(3):
            try:
                e.click()
                return True
            except Exception:
                # something is probably covering the element: clear it and retry
                self._dismiss_popups()

        return False

    def _dismiss_popups(self):
        """Close the overlay if one is present, otherwise try to close the survey slide."""

        try:
            self.driver.find_element_by_css_selector('span.ui_overlay>div.ui_close_x').click()
        except Exception:
            overlay_closed = False
        else:
            overlay_closed = True

        if overlay_closed:
            print('killed an overlay!')
            return

        els = list(self.driver.find_elements_by_css_selector('div[class^="QSISlider"]>div'))
        decline_text = 'Not right now, thanks.'.strip().lower()

        for i, d in enumerate(els):
            if d.text.strip().lower() == decline_text:
                try:
                    # the element right before the "decline" text is the one clicked
                    # NOTE(review): for i == 0 this wraps around to the last element - confirm intended
                    els[i-1].click()
                    print('killed a slide!')
                    break
                except Exception:
                    continue

    def process_paginator(self, pg, off=30, pref='oa'):
        """Derive the URLs of all listing pages from the paginator element.

        Args:
            pg: paginator element; the texts of its ``div`` descendants are
                scanned for page numbers.
            off: number of attractions per page, used as the URL offset step.
            pref: prefix put in front of the offset inside the URL.

        Returns:
            ``(total_pages, page_urls)``; ``page_urls[0]`` is the current URL.

        Raises:
            Exception: if the paginator shows no page numbers.
        """

        # read each label once: every .text access is a round trip to the browser
        labels = [s.text.strip() for s in pg.find_elements_by_css_selector('div')]
        p_numbers = [int(label) for label in labels if label.isdigit()]

        if not p_numbers:
            raise Exception('no pages in paginator!')

        total_pages = p_numbers[-1]
        current_page_url = self.driver.current_url

        page_urls = [current_page_url]

        for i in range(1, total_pages):
            # starts from page 2 (page 1 has no -oa[number]- part);
            # only the first 'Activities-' marker is rewritten so that a place
            # name containing the same text stays intact
            page_urls.append(current_page_url.replace('Activities-',
                                                      'Activities-' + pref + str(off*i) + '-', 1))

        return total_pages, page_urls

    def _collect_attraction_cards(self, cards, link_selector):
        """Store id -> url in ``self.collected_attractions`` for every card that has a usable link."""

        for attr_card in cards:
            try:
                url_ = attr_card.find_element_by_css_selector(link_selector).get_attribute('href')
                # ids look like "d" + digits, enclosed by dashes inside the URL
                id_ = re.search(r'(?<=-)d\d+(?=-)', url_).group(0)
            except Exception:
                print('failed to extract url or id!')
                continue

            self.collected_attractions[id_] = {'url': url_}

    def get_attraction_pages(self, location_home_url):
        """Collect the URLs of all attractions listed for a location.

        Opens the location home page, clicks through to the attraction
        overview, expands the list if a "see more" chevron shows up, then
        visits every listing page and records each attraction link.

        Args:
            location_home_url: URL of the location's home page.

        Returns:
            self, so that calls can be chained.

        Raises:
            Exception: if a required page element cannot be found or clicked.
        """

        self.driver.get(location_home_url)

        try:
            things_to_do_icon = WebDriverWait(self.driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.ui_icon.attractions + span')))
        except Exception as err:
            raise Exception('failed to find Things to do icon!') from err

        if not self.do_click(things_to_do_icon):
            raise Exception('failed to click Things to do icon!')

        # the chevron is optional: not finding it just means there is nothing to expand
        try:
            see_more_still_folded = WebDriverWait(self.driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR,
                                                                 'span.single-chevron-down')))
        except Exception:
            see_more_still_folded = None

        if see_more_still_folded:

            try:
                see_more_cl = self.driver.find_element_by_css_selector('div[class|="attractions-attraction-overview-main-TopPOIs__see_more"]')
            except Exception as err:
                raise Exception('failed to find See More option!') from err

            if not self.do_click(see_more_cl):
                raise Exception('failed to click See More option!')

        try:
            pg = WebDriverWait(self.driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR,
                                'div[class|="attractions-attraction-overview-main-Pagination__container"]')))
        except Exception as err:
            raise Exception('no paginator!') from err

        total_pages, page_urls = self.process_paginator(pg)

        print(f'total attraction pages: {total_pages}')

        print('page ', end='')

        self.collected_attractions = defaultdict()

        for i, attr_page_url in enumerate(page_urls, 1):

            print(f'#{i:02d}...', end='')

            if attr_page_url != self.driver.current_url:
                self.driver.get(attr_page_url)

            # the first page uses a different markup than the following ones
            if i == 1:
                cards = self.driver.find_elements_by_css_selector('div[class|="attractions-attraction-overview-pois-PoiGrid__wrapper"]'
                                                                  '>li[class^="attractions-attraction-overview-pois-PoiCard__item"]'
                                                                  '>div[class|="attractions-attraction-overview-pois-PoiCard__card_info"]')
                link_selector = 'div>a[class|="attractions-attraction-overview-pois-PoiInfo__name"]'
            else:
                cards = self.driver.find_elements_by_css_selector('div.attraction_list>div'
                                                                  '>div>div.listing>div.listing_details>div.listing_info')
                link_selector = 'div.tracking_attraction_title.listing_title>a'

            self._collect_attraction_cards(cards, link_selector)

        print('\n')
        print(f'done. collected: {len(self.collected_attractions):,} attractions')

        return self

    def select_filters(self, traveller_rating=None,
                               traveller_type='Solo',
                               time_of_year=None,
                               language='English',
                               max_attempts=3):
        """Set the review filters on the attraction page that is currently open.

        For every filter group all options other than the requested one are
        unchecked, then the requested option (if any) is checked. Finally the
        review count shown next to the chosen language is printed.

        Args:
            traveller_rating: 'Excellent', 'Very good', 'Average', 'Poor',
                'Terrible' or None for no selection.
            traveller_type: 'Families', 'Couples', 'Solo', 'Business',
                'Friends' or None.
            time_of_year: 'Mar-May', 'Jun-Aug', 'Sep-Nov', 'Dec-Feb' or None.
            language: 'English', 'Japanese' or None.
            max_attempts: how many times toggling a single option is tried.

        Raises:
            KeyError: if a requested value is not a known option; raised
                before the page is touched.
        """

        d = {'traveller_rating': {'data-name': 'ta_rating',
                                  'input-values': {'Excellent': '5',
                                                   'Very good': '4',
                                                   'Average': '3',
                                                   'Poor': '2',
                                                   'Terrible': '1'},
                                 'pick': traveller_rating},
            'traveller_type': {'data-name': 'traveler_filter',
                               'input-values': {'Families': '3',
                                                'Couples': '2',
                                                'Solo': '5',
                                                'Business': '1',
                                                'Friends': '4'},
                              'pick': traveller_type},
            'time_of_year': {'data-name': 'season',
                             'input-values': {'Mar-May': '1',
                                              'Jun-Aug': '2',
                                              'Sep-Nov': '3',
                                              'Dec-Feb': '4'},
                            'pick': time_of_year},
            'language': {'data-name': 'language',
                         'input-values': {'English': 'en',
                                          'Japanese': 'ja'},
                         'pick': language}}

        # reject unknown values up front instead of failing half-way through,
        # after other options have already been unchecked
        for filt, spec in d.items():
            if spec['pick'] and spec['pick'] not in spec['input-values']:
                raise KeyError(f'unknown value {spec["pick"]!r} for filter {filt}')

        def option_selector(dname, code):
            # CSS selector of the container of one filter option
            return f'div.choices[data-name="{dname}"]>div[data-value="{code}"]'

        def is_selected(css_selector_st):
            # waits up to 10 s for a checked input inside the option;
            # a timeout (or any other failure) counts as "not selected"
            try:
                WebDriverWait(self.driver, 10) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR,
                                                                   css_selector_st + '>input[checked="checked"]')))
                return True

            except Exception:
                return False

        def _click(css_selector_st):
            # toggle the option; True if its checked state changed
            # within max_attempts tries
            flag_before = is_selected(css_selector_st)

            for _ in range(max_attempts):

                try:
                    e = WebDriverWait(self.driver, 20) \
                            .until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector_st)))
                except Exception:
                    print(f'failed to find {css_selector_st}!')
                    # nothing to click in this round
                    continue

                e.click()

                if is_selected(css_selector_st) != flag_before:
                    return True

            return False

        for filt, spec in d.items():

            value = spec['pick']
            dname = spec['data-name']

            # uncheck everything else
            to_uncheck = [other_value for other_value in spec['input-values'] if other_value != value]
            print('need to uncheck ', to_uncheck)

            for other_value in to_uncheck:
                print(f'unchecking {other_value}..')
                st = option_selector(dname, spec['input-values'][other_value])

                if is_selected(st):
                    if _click(st):
                        print('unchecked')
                    else:
                        print(f'problem unchecking {other_value}')

            if value:

                st = option_selector(dname, spec['input-values'][value])

                print(f'selecting {filt}={value}...', end='')

                if is_selected(st) or _click(st):
                    print('ok')
                else:
                    # terminate the progress line so later output stays readable
                    print('failed')

        try:
            lang_code = d['language']['input-values'][d['language']['pick']]
            css_count = f'div.choices[data-name="language"]>div[data-value="{lang_code}"]>label.label>span.count'
            c_txt = WebDriverWait(self.driver, 10) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, css_count))).text.strip()
            # the counter is rendered like "(1,234)"
            c = int(re.sub(r'[(,)]','',c_txt))
            print(f'reviews: {c:,}')
        except Exception:
            print('failed to get review count!')

    def attraction_pages(self):
        """Open a collected attraction page and apply the review filters to it."""

        for a_id in self.collected_attractions:

            print('attraction ', a_id)

            self.driver.get(self.collected_attractions[a_id]['url'])

            self.select_filters(traveller_rating='Excellent', traveller_type='Solo', time_of_year='Sep-Nov')

            # NOTE(review): only the first attraction is processed - looks like a
            # development limit; confirm before removing
            break

In [26]:
if __name__ == '__main__':

    # Bind the crawler first and call attraction_pages() as a separate step, so
    # that `t` refers to the Trip instance (and its driver / collected data)
    # rather than to the result of the last call in the chain.
    t = Trip().get_attraction_pages('https://www.tripadvisor.com.au/Home-g255097')
    t.attraction_pages()

total attraction pages: 5
page #01...#02...#03...#04...#05...

done. collected: 147 attractions
attraction  d1063162
need to uncheck  ['Very good', 'Average', 'Poor', 'Terrible']
unchecking Very good..
unchecking Average..
unchecking Poor..
unchecking Terrible..
selecting traveller_rating=Excellent...ok
need to uncheck  ['Families', 'Couples', 'Business', 'Friends']
unchecking Families..
unchecking Couples..
unchecking Business..
unchecking Friends..
selecting traveller_type=Solo...ok
need to uncheck  ['Mar-May', 'Jun-Aug', 'Dec-Feb']
unchecking Mar-May..
unchecking Jun-Aug..
unchecking Dec-Feb..
selecting time_of_year=Sep-Nov...ok
need to uncheck  ['Japanese']
unchecking Japanese..
selecting language=English...ok
reviews: 54


In [190]:
def _parse_review_progress(progress_text):
    """Pull two numbers out of the pagination caption of the review list.

    Returns ``(total_filtered_reviews, reviews_this_page)``: the
    second-to-last whitespace-separated token (commas removed) and the first
    token after the first '-'. Either one is None when it cannot be parsed.
    """

    try:
        total_filtered_reviews = int(progress_text.replace(',','').split()[-2])
    except (IndexError, ValueError):
        total_filtered_reviews = None

    try:
        reviews_this_page = int(progress_text.split('-')[1].strip().split()[0])
    except (IndexError, ValueError):
        reviews_this_page = None

    return total_filtered_reviews, reviews_this_page


def process_attraction_page(page_url):
    """Scrape attraction details and the reviews shown on one attraction page.

    Relies on a module-level ``driver`` (a Selenium WebDriver) that is not
    defined in this function. Every detail lookup is best effort: a field
    that cannot be read is stored as None.

    Args:
        page_url: URL of the attraction page to open.

    Returns:
        dict with the keys 'details' (name, total_reviews, category,
        rating_chart), 'reviews' (review id -> {'text', 'user'}) and
        'next_page_url' (href of the "next" pagination link, or None).
    """

    driver.get(page_url)

    def _text_or_none(css_selector):
        # text of the first matching element, None if it cannot be read
        try:
            return driver.find_element_by_css_selector(css_selector).text
        except Exception:
            return None

    attr_details = {}

    attr_details['name'] = _text_or_none('h1#HEADING')

    total_reviews_text = _text_or_none('div.ratingContainer')
    attr_details['total_reviews'] = None if total_reviews_text is None else total_reviews_text.lower()

    attr_details['category'] = _text_or_none('span.attractionCategories')

    chart_text = _text_or_none('ul.ratings_chart')
    if chart_text is None:
        attr_details['rating_chart'] = None
    else:
        # the chart text is cut at every '%'; each piece is split into its lines
        attr_details['rating_chart'] = [tuple(l.strip().split('\n')) for l in chart_text.split('%') if l.strip()]

    # filtered reviews
    try:
        review_progress_text = WebDriverWait(driver, 10) \
                        .until(EC.visibility_of_element_located((By.CSS_SELECTOR,
                                'div[data-contextchoice="DETAIL"]>div.pagination-details'))).text
    except Exception:
        # no caption available: handled like a page without reviews
        review_progress_text = ''

    total_filtered_reviews, reviews_this_page = _parse_review_progress(review_progress_text or '')

    print('filtered_reviews=',total_filtered_reviews)
    print('reviews_this_page=', reviews_this_page)

    reviews = {}

    picked_reviews = set()

    while (reviews_this_page is not None) and (len(picked_reviews) < reviews_this_page):

        picked_before = len(picked_reviews)

        # first unfold all reviews on the page

        for c in driver.find_elements_by_css_selector('div.reviewSelector'):

            try:
                span_show_more_text = c.find_element_by_css_selector('div>div>div.entry>p.partial_entry>span[onclick]')
            except Exception:
                span_show_more_text = None

            if span_show_more_text and (span_show_more_text.text.lower().strip() == 'more'):

                try:
                    span_show_more_text.click()

                    fold_option = WebDriverWait(driver, 10) \
                        .until(EC.element_to_be_clickable((By.CSS_SELECTOR,
                                'div>div>div.entry>span[onclick]')))
                    if fold_option.text.lower().strip() == 'show less':
                        print('clicked to show more')
                except Exception:
                    print('failed to click on More to unfold review text!')

        # and now collect all full review texts
        # (elements are looked up again because the clicks above may have changed the page)

        for c in driver.find_elements_by_css_selector('div.reviewSelector'):

            try:
                review_id = c.get_attribute('data-reviewid')
            except Exception:
                continue

            # skip containers without an id as well as reviews already stored
            if (not review_id) or (review_id in picked_reviews):
                continue

            # now get the review text
            try:
                review_text = c.find_element_by_css_selector('div>div>div.entry>p.partial_entry').text.strip()
                review_user = c.find_element_by_css_selector('div.member_info').text
            except Exception:
                print('failed to extract review text or user!')
                continue

            picked_reviews.add(review_id)
            reviews[review_id] = {'text': review_text, 'user': review_user}

        if len(picked_reviews) == picked_before:
            # a full pass found nothing new (e.g. the page shows fewer reviews
            # than the caption number suggests): stop instead of looping forever
            break

    print(reviews)

    npage_url = None

    for a in driver.find_elements_by_css_selector('div.unified.ui_pagination>a[data-page-number]'):
        print('a text=', a.text.strip().lower())
        if a.text.strip().lower() == 'next':
            npage_url = a.get_attribute('href')
            if npage_url:
                break

    print(npage_url)

    return {'details': attr_details, 'reviews': reviews, 'next_page_url': npage_url}
       