In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time
import arrow
import random

import numpy as np

import subprocess
import zipfile

from attraction import Attraction
from review import Review
from user import User

In [2]:
# Chrome launch configuration shared by every driver created in this notebook.
options = webdriver.ChromeOptions()

# Command-line switches: tolerate certificate/SSL errors, browse incognito,
# and open the window maximised.
for _chrome_flag in ('--ignore-certificate-errors',
                     '--ignore-ssl-errors',
                     '--incognito',
                     '--start-maximized'):
    options.add_argument(_chrome_flag)

# Content-setting value 2 for notifications, passed through as a Chrome pref.
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)

# options.add_argument('--headless')

In [239]:
# Start a Chrome session via the chromedriver binary at the relative path
# 'webdriver/chromedriver'; `driver` is the module-level handle that the
# functions in this notebook use.
driver = webdriver.Chrome('webdriver/chromedriver', options=options)

In [17]:
# Load the Sydney Opera House review page in the current browser session.
# The recorded output below is a connection-refused error from the local
# driver endpoint, i.e. the session was not reachable when this cell ran.
driver.get('https://www.tripadvisor.com.au/Attraction_Review-g255060-d257278-Reviews-Sydney_Opera_House-Sydney_New_South_Wales.html')

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=49457): Max retries exceeded with url: /session/510120c124c2c22fc35c671fa123c135/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1157b2e48>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [240]:
def select_filters(traveller_rating=None, 
                   traveller_type='Solo', 
                   time_of_year=None,  
                   language='English', 
                   max_attempts=3):
    """Set the review filters on the page currently loaded in `driver`.

    For each filter group every option other than the requested one is
    unchecked, then the requested option (if any) is checked. Finally the
    review count shown next to the chosen language is printed.

    Parameters
    ----------
    traveller_rating : str or None
        One of 'Excellent', 'Very good', 'Average', 'Poor', 'Terrible'.
    traveller_type : str or None
        One of 'Families', 'Couples', 'Solo', 'Business', 'Friends'.
    time_of_year : str or None
        One of 'Mar-May', 'Jun-Aug', 'Sep-Nov', 'Dec-Feb'.
    language : str or None
        One of 'English', 'Japanese'.
    max_attempts : int
        Maximum number of click attempts per checkbox.

    Returns
    -------
    None. Progress is reported through print().

    Raises
    ------
    KeyError
        If a non-empty pick is not one of the options listed above.

    Notes
    -----
    Relies on the module-level `driver`. A falsy pick (None or '') means
    "leave this group with nothing selected".
    """
    
    d = {'traveller_rating': {'data-name': 'ta_rating',
                              'input-values': {'Excellent': '5',
                                               'Very good': '4',
                                               'Average': '3',
                                               'Poor': '2',
                                               'Terrible': '1'},
                             'pick': traveller_rating},
        'traveller_type': {'data-name': 'traveler_filter',
                           'input-values': {'Families': '3',
                                            'Couples': '2',
                                            'Solo': '5',
                                            'Business': '1',
                                            'Friends': '4'},
                          'pick': traveller_type},
        'time_of_year': {'data-name': 'season',
                         'input-values': {'Mar-May': '1',
                                          'Jun-Aug': '2',
                                          'Sep-Nov': '3',
                                          'Dec-Feb': '4'},
                        'pick': time_of_year},
        'language': {'data-name': 'language',
                     'input-values': {'English': 'en',
                                      'Japanese': 'ja'},
                     'pick': language}}
    
    def _selector(filt, option):
        """CSS selector of the <div> wrapping one option of one filter group."""
        dname = d[filt]['data-name']
        tr_pick = d[filt]['input-values'][option]
        return f'div.choices[data-name="{dname}"]>div[data-value="{tr_pick}"]'
    
    def is_selected(css_selector_st):
        """True if the option contains an input carrying checked="checked".

        NOTE: an option that is not selected costs the full 10 s wait before
        False is returned.
        """
        try:
            WebDriverWait(driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                               css_selector_st + '>input[checked="checked"]')))
            return True
        except Exception:
            return False
        
    def _click(css_selector_st, max_attempts=max_attempts):
        """Click the option until its checked state flips; True on success."""
        
        times_tried = 0
        
        flag_before = is_selected(css_selector_st)
        flag_after = flag_before
        
        # strictly '<' so that at most `max_attempts` clicks are made
        while (times_tried < max_attempts) and (flag_after == flag_before):
                
            times_tried += 1   
            print('clicking on ..', css_selector_st)
            
            try:
                e = WebDriverWait(driver, 20) \
                        .until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector_st)))
            except Exception:
                print(f'failed to find {css_selector_st}!')
                # nothing to click this round; do not fall through to e.click()
                continue
                
            e.click()
            
            flag_after = is_selected(css_selector_st)    
            
        return (flag_after != flag_before)

        
    for filt in d:
        
        value = d[filt]['pick']
        
        # uncheck everything else
        to_uncheck = [other_value for other_value in d[filt]['input-values'] if other_value != value]
        print('need to uncheck ', to_uncheck)
        
        for other_value in to_uncheck:
            print(f'unchecking {other_value}..')
            st = _selector(filt, other_value)
            
            if is_selected(st):
                if _click(st):
                    print('unchecked')
                else:
                    print(f'problem unchecking {other_value}') 
        
        if not value:
            continue
            
        st = _selector(filt, value)
            
        print(f'selecting {filt}={value}...', end='')
        
        if is_selected(st) or _click(st):
            print('ok')
        else:
            # terminate the 'selecting ...' line so the failure is visible
            print('failed')
                
    try:
        # a falsy language pick raises KeyError here and is reported below
        lang_code = d['language']['input-values'][d['language']['pick']] 
        css_count = f'div.choices[data-name="language"]>div[data-value="{lang_code}"]>label.label>span.count'
        c_txt = WebDriverWait(driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR, css_count))).text.strip()
        c = int(re.sub(r'[(,)]','',c_txt))
        print(f'reviews: {c:,}')
    except Exception:
        print('failed to get review count!')

In [70]:
# Filter to Excellent-rated reviews by solo travellers written Sep-Nov;
# `language` is left at its default ('English').
select_filters(traveller_rating='Excellent', traveller_type='Solo', time_of_year='Sep-Nov')

need to uncheck  ['Very good', 'Average', 'Poor', 'Terrible']
unchecking Very good..
unchecking Average..
unchecking Poor..
unchecking Terrible..
selecting traveller_rating=Excellent...clicking on .. div.choices[data-name="ta_rating"]>div[data-value="5"]
ok
need to uncheck  ['Families', 'Couples', 'Business', 'Friends']
unchecking Families..
unchecking Couples..
unchecking Business..
unchecking Friends..
selecting traveller_type=Solo...clicking on .. div.choices[data-name="traveler_filter"]>div[data-value="5"]
ok
need to uncheck  ['Mar-May', 'Jun-Aug', 'Dec-Feb']
unchecking Mar-May..
unchecking Jun-Aug..
unchecking Dec-Feb..
selecting time_of_year=Sep-Nov...clicking on .. div.choices[data-name="season"]>div[data-value="3"]
ok
need to uncheck  ['Japanese']
unchecking Japanese..
selecting language=English...ok
reviews: 17


In [258]:
def process_paginator(pg):
    """Print the first and last page numbers found in a paginator's text.

    Parameters
    ----------
    pg : object with a `.text` attribute
        Typically a Selenium WebElement for the pagination container.

    Returns
    -------
    None. Everything is reported through print().
    """
    print(pg.text)
    # isdecimal() rather than isdigit(): the latter accepts characters such
    # as superscripts that int() cannot parse
    p_numbers = [int(tok) for tok in pg.text.split() if tok.isdecimal()]

    if not p_numbers:
        # e.g. a single-page listing whose paginator shows no numbers
        print('no page numbers found')
        return
            
    print(f'min page: {p_numbers[0]}')
    print(f'last page: {p_numbers[-1]}')
    

def get_attraction_pages(location_home_url, max_click_attempts=5):
    """Open a location's home page and derive the URLs of its attraction listing pages.

    Steps: load `location_home_url`, click the attractions ("things to do")
    entry, expand the "see more" blocks, read the paginator, and build one
    URL per listing page by inserting an ``oa<offset>`` segment (offset is a
    multiple of 30) into the first listing page's URL.

    Parameters
    ----------
    location_home_url : str
        URL of the location home page.
    max_click_attempts : int
        How many times to try clicking the attractions entry before giving
        up; pop-ups are dismissed between attempts.

    Returns
    -------
    list of str
        Listing page URLs, first page first.

    Raises
    ------
    selenium TimeoutException
        If one of the awaited elements does not appear within 10 s.
    Exception
        The last click error, if the attractions entry could not be clicked.

    Notes
    -----
    Relies on the module-level `driver` and on `kill_slide` / `kill_overlay`.
    """
    
    per_page = 30  # listing offset step used by the oa<offset> URL segment
    
    driver.get(location_home_url)
    
    things_to_do_icon = WebDriverWait(driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.ui_icon.attractions + span')))
    
    # A pop-up may intercept the click: dismiss and retry, but not forever.
    for attempt in range(1, max_click_attempts + 1):
        try:
            things_to_do_icon.click()
            break
        except Exception:
            if attempt == max_click_attempts:
                raise
            kill_slide()
            kill_overlay()
            
    # wait until the collapsed "see more" chevron is present
    WebDriverWait(driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                                 'span.single-chevron-down')))
    
    for see_more in driver.find_elements_by_css_selector('div[class|="attractions-attraction-overview-main-TopPOIs__see_more"]'):
        try:
            see_more.click()
            print(see_more.text)
        except Exception:
            # best effort: clear pop-ups that may have blocked the click
            kill_slide()
            kill_overlay()
            
    el = WebDriverWait(driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                                 'span.single-chevron-up')))
    print(el, el.text)
            
    pg = WebDriverWait(driver, 10) \
                        .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                'div[class|="attractions-attraction-overview-main-Pagination__container"]')))

    for s in pg.find_elements_by_css_selector('div'):
        print(s.text)
    
    # the current page is rendered as a <span>, the other pages as links
    current_p = None
    for page_span in pg.find_elements_by_css_selector('div>span'):
        print(page_span.text)
        if page_span.text.isdigit():
            current_p = int(page_span.text)
    print(f'current page: {current_p}')
    
    current_page_url = driver.current_url
    print(f'current page url: {current_page_url}')
    
    # keep the href of the last numbered link
    last_page_url = None
    for page_link in pg.find_elements_by_css_selector('div>a'):
        if page_link.text.isdigit():
            last_page_url = page_link.get_attribute('href')
            
    print(f'last page url: {last_page_url}')
    
    # No numbered link, or no oa<offset> in it: treat as a single page.
    total_pages = 1
    if last_page_url:
        offset_match = re.search(r'(?<=Activities-oa)\d+', last_page_url)
        if offset_match:
            total_pages = int(offset_match.group(0))//per_page + 1
    
    print(f'total pages: {total_pages}')
    
    page_urls = [current_page_url]
    
    for i in range(1, total_pages):
        # starts from page 2 (page 1 has no -oa[number]- part)
        page_urls.append(re.sub('Activities-', 'Activities-oa' + str(per_page*i) + '-', 
                                current_page_url, count=1))
            
    print(page_urls)
    
    return page_urls
        

In [259]:
# Enumerate listing pages starting from the g255097 location home page.
# The recorded run below printed "See more" and then ended in a TimeoutException.
get_attraction_pages('https://www.tripadvisor.com.au/Home-g255097')

See more


TimeoutException: Message: 


In [37]:
def kill_slide():
    """Dismiss a QSISlider pop-up, if one is present.

    Looks through the direct <div> children of elements whose class starts
    with "QSISlider" for the one labelled "Not right now, thanks." and
    clicks the element immediately before it (presumably the clickable
    control sits just ahead of its label; confirm against the page).

    Returns None; the outcome is reported through print(). Relies on the
    module-level `driver`.
    """
    
    decline_label = 'not right now, thanks.'
    
    els = list(driver.find_elements_by_css_selector('div[class^="QSISlider"]>div'))
    
    for i, d in enumerate(els):
        
        if d.text.strip().lower() != decline_label:
            continue
        
        if i == 0:
            # no preceding element; els[-1] would silently click the LAST div
            continue

        try:
            els[i-1].click()
            print('killed a slide!')
            break
        except Exception:
            print('failed to kill a slide - couldn\'t click!')
            continue
            
def kill_overlay(max_attempts=5):
    """Close `span.ui_overlay` pop-ups by clicking their X button.

    The close button is looked up again before every click, so the function
    stops as soon as no overlay is left, a click fails, or `max_attempts`
    clicks have been made. It can therefore never spin forever.

    Parameters
    ----------
    max_attempts : int
        Upper bound on the number of close clicks.

    Returns None; the outcome is reported through print(). Relies on the
    module-level `driver`.
    """
    
    for _ in range(max_attempts):
        
        try:
            x = driver.find_element_by_css_selector('span.ui_overlay>div.ui_close_x')
        except Exception:
            # no (more) overlays on the page
            return
        
        try:
            x.click()
            print('killed an overlay!')
        except Exception:
            print('failed to kill an overlay - couldn\'t click X!')
            return

In [38]:
# Dismiss the QSISlider pop-up on the currently loaded page, if one is showing.
kill_slide()

In [62]:
def process_attraction_list(attrattion_page_url_lst, max_click_attempts=5):
    """Visit each attraction listing page and collect attraction ids and URLs.

    The first URL in the list is parsed with the "overview" card selectors
    (after expanding its "see more" block); every later URL is parsed with
    the plain listing selectors. This assumes the list starts with the
    first listing page, so verify that against the caller.

    Parameters
    ----------
    attrattion_page_url_lst : iterable of str
        Listing page URLs, in page order.
    max_click_attempts : int
        Upper bound on attempts to click "see more" on the first page.

    Returns
    -------
    defaultdict
        Maps an attraction id such as 'd256538' to ``{'url': <href>}``.

    Notes
    -----
    Relies on the module-level `driver` and on `kill_slide` / `kill_overlay`.
    """
    
    see_more_css = 'div[class|="attractions-attraction-overview-main-TopPOIs__see_more"]'
    
    # (card selector, link-inside-card selector) for the two page layouts
    first_page_css = ('div[class|="attractions-attraction-overview-pois-PoiGrid__wrapper"]'
                      '>li[class^="attractions-attraction-overview-pois-PoiCard__item"]'
                      '>div[class|="attractions-attraction-overview-pois-PoiCard__card_info"]',
                      'div>a[class|="attractions-attraction-overview-pois-PoiInfo__name"]')
    other_page_css = ('div.attraction_list>div'
                      '>div>div.listing>div.listing_details>div.listing_info',
                      'div.tracking_attraction_title.listing_title>a')
    
    collected_attractions = defaultdict()
     
    for i, attr_page_url in enumerate(attrattion_page_url_lst, 1):
        
        print(f'page #{i:02.0f}...', end='')
        
        if attr_page_url != driver.current_url:
            driver.get(attr_page_url)
        
        if i == 1:
            
            try:
                see_more = WebDriverWait(driver, 10) \
                            .until(EC.element_to_be_clickable((By.CSS_SELECTOR, see_more_css)))
            except Exception:
                see_more = None
            
            if see_more and (see_more.text.strip().lower() == 'see more'):
                
                # pop-ups may intercept the click: dismiss and retry, bounded
                for _attempt in range(max_click_attempts):
                    try:
                        see_more.click()
                        break
                    except Exception:
                        kill_slide()
                        kill_overlay()
                else:
                    print('failed to click see more!')
            
            cards_css, link_css = first_page_css
                          
        else:
            
            cards_css, link_css = other_page_css
        
        new_attr_ids = 0
        
        for attr_card in driver.find_elements_by_css_selector(cards_css):
            
            try:
                url_ = attr_card.find_element_by_css_selector(link_css).get_attribute('href')
                id_ = re.search(r'(?<=-)d\d+(?=-)', url_).group(0)
            except Exception:
                print('failed to extract url or id!')
                continue
            
            # count an id only the first time it is seen
            if id_ not in collected_attractions:
                new_attr_ids += 1
                
            collected_attractions[id_] = {'url': url_}
                
        print(f'{new_attr_ids} attractions found')
        
    print(f'done. total attractions collected: {len(collected_attractions)}')
    
    return collected_attractions

In [183]:
# Collect attraction ids/URLs from five Hobart listing pages (no offset, then oa30 ... oa120).
a = process_attraction_list(['https://www.tripadvisor.com.au/Attractions-g255097-Activities-Hobart_Greater_Hobart_Tasmania.html', 'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa30-Hobart_Greater_Hobart_Tasmania.html', 'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa60-Hobart_Greater_Hobart_Tasmania.html', 'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa90-Hobart_Greater_Hobart_Tasmania.html', 'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa120-Hobart_Greater_Hobart_Tasmania.html'])

page #01...30 attractions found
page #02...30 attractions found
page #03...30 attractions found
page #04...30 attractions found
page #05...27 attractions found
done. total attractions collected: 147


In [66]:
# Inspect the record collected for attraction id 'd256538'.
a['d256538']

{'url': 'https://www.tripadvisor.com.au/Attraction_Review-g255097-d256538-Reviews-Bonorong_Wildlife_Sanctuary-Hobart_Greater_Hobart_Tasmania.html'}

In [69]:
# Open that attraction's review page in the browser session.
driver.get(a['d256538']['url'])

In [190]:
def process_attraction_page(page_url):
    """Scrape one attraction review page: header details, reviews, next-page link.

    Parameters
    ----------
    page_url : str
        URL of the attraction review page to load.

    Returns
    -------
    dict with keys
        'details'       : defaultdict with 'name', 'total_reviews',
                          'category' and 'rating_chart' (each None when the
                          element is missing),
        'reviews'       : defaultdict mapping review id to
                          ``{'text': ..., 'user': ...}``,
        'next_page_url' : href of the "next" pagination link, or None.

    Raises
    ------
    selenium TimeoutException
        If the pagination-details element is not visible within 10 s.

    Notes
    -----
    Relies on the module-level `driver`. The same data is also printed, as
    before.
    """
    
    max_idle_passes = 3  # give up after this many passes that add no review
    
    driver.get(page_url)
    
    def _text_or_none(css):
        """Text of the first element matching `css`, or None if unavailable."""
        try:
            return driver.find_element_by_css_selector(css).text
        except Exception:
            return None
    
    attr_details = defaultdict()
    
    attr_details['name'] = _text_or_none('h1#HEADING')
    
    total_reviews_txt = _text_or_none('div.ratingContainer')
    attr_details['total_reviews'] = None if total_reviews_txt is None else total_reviews_txt.lower()
    
    attr_details['category'] = _text_or_none('span.attractionCategories')
    
    # chart text is split on '%' into chunks, each split on newlines
    chart_txt = _text_or_none('ul.ratings_chart')
    if chart_txt is None:
        attr_details['rating_chart'] = None
    else:
        attr_details['rating_chart'] = [tuple(l.strip().split('\n')) for l in chart_txt.split('%') if l.strip()]
        
    print(attr_details)
    
    # filtered reviews
    review_progress_text = WebDriverWait(driver, 10) \
                    .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                            'div[data-contextchoice="DETAIL"]>div.pagination-details'))).text

    # defaults so both names exist even when the progress text is empty
    total_filtered_reviews = None
    reviews_this_page = None
    
    if review_progress_text:
        
        try:
            # second-to-last token, thousands separators removed
            total_filtered_reviews = int(review_progress_text.replace(',','').split()[-2])
        except (ValueError, IndexError):
            total_filtered_reviews = None   
            
        try:
            # first token after the '-': the upper end of the shown range,
            # which equals the count on this page only for the first page
            reviews_this_page = int(review_progress_text.split('-')[1].strip().split()[0])
        except (ValueError, IndexError):
            reviews_this_page = None
    
    print('filtered_reviews=',total_filtered_reviews)
    print('reviews_this_page=', reviews_this_page)
    
    reviews = defaultdict()
    
    picked_reviews = set()
    
    expected_reviews = reviews_this_page or 0
    idle_passes = 0
    
    while (len(picked_reviews) < expected_reviews) and (idle_passes < max_idle_passes):
        
        picked_before = len(picked_reviews)
        
        # first unfold all reviews on the page
        
        for c in driver.find_elements_by_css_selector('div.reviewSelector'):
            
            try:
                span_show_more_text = c.find_element_by_css_selector('div>div>div.entry>p.partial_entry>span[onclick]')
            except Exception:
                span_show_more_text = None

            if span_show_more_text and (span_show_more_text.text.lower().strip() == 'more'):

                try:
                    span_show_more_text.click()

                    fold_option = WebDriverWait(driver, 10) \
                        .until(EC.element_to_be_clickable((By.CSS_SELECTOR, 
                                'div>div>div.entry>span[onclick]')))
                    if fold_option.text.lower().strip() == 'show less':
                        print('clicked to show more')
                except Exception:
                    print('failed to click on More to unfold review text!')
                    
        # and now collect all full review texts
        
        for c in driver.find_elements_by_css_selector('div.reviewSelector'):        
        
            try:
                review_id = c.get_attribute('data-reviewid')
            except Exception:
                continue
            
            if (not review_id) or (review_id in picked_reviews):
                continue
            
            # read everything first; mark the review as picked only on success
            try:
                review_text = c.find_element_by_css_selector('div>div>div.entry>p.partial_entry').text.strip()
                review_user = c.find_element_by_css_selector('div.member_info').text
            except Exception:
                print(f'failed to read review {review_id}!')
                continue
            
            picked_reviews.add(review_id)
            reviews[review_id] = {'text': review_text, 'user': review_user}
            
        if len(picked_reviews) == picked_before:
            # nothing new this pass; pause briefly in case content is still loading
            idle_passes += 1
            time.sleep(1)
        else:
            idle_passes = 0
            
    print(reviews)
    
    npage_url = None
    
    for page_link in driver.find_elements_by_css_selector('div.unified.ui_pagination>a[data-page-number]'):
        link_label = page_link.text.strip().lower()
        print('a text=', link_label)
        if link_label == 'next':
            npage_url = page_link.get_attribute('href')
            if npage_url:
                break
                              
    print(npage_url)
    
    return {'details': attr_details, 'reviews': reviews, 'next_page_url': npage_url}
       

In [191]:
# Scrape the review page of attraction 'd256538' collected above.
process_attraction_page(a['d256538']['url'])

defaultdict(None, {'name': 'Bonorong Wildlife Sanctuary', 'total_reviews': '1,842 reviews', 'category': 'Nature & Parks, Nature & Wildlife Areas', 'rating_chart': [('Excellent', '84'), ('Very good', '12'), ('Average', '2'), ('Poor', '1'), ('Terrible', '1')]})
filtered_reviews= 1739
reviews_this_page= 10
clicked to show more
defaultdict(None, {'688818028': {'text': 'Bonorong had an incredible variety of Tasmanian wildlife. However, we could only touch some animals if we paid additional fees.', 'user': 'B L\nRose Hill, Kansas\n91'}, '688788636': {'text': 'If you look at their website, you would think that it is a premium opportunity to encounter Tasmanian Devils, including babies, but we only saw one. I suggest that Un Zoo or other locations could be better for Devil experiences. Bonorong is a fantastic venue for international tourists to feed & touch kangaroos, I can’t think of anywhere better in Australia. We were ripped off $110 entry fee for myself &\nmy 3 daughters. I suggest that t