In [36]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time
import arrow
import random

import numpy as np

import subprocess
import zipfile

from attraction import Attraction
from review import Review
from user import User

In [37]:
# Chrome launch configuration shared by every scraping cell below.
options = webdriver.ChromeOptions()
for chrome_flag in ('--ignore-certificate-errors',
                    '--ignore-ssl-errors',
                    '--incognito',
                    '--start-maximized'):
    options.add_argument(chrome_flag)

# Content-setting preference for notifications
# (presumably 2 means "block" -- TODO confirm against the Chrome prefs docs).
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)

# options.add_argument('--headless')

In [38]:
# Start a Chrome session through the chromedriver binary at the relative path
# 'webdriver/chromedriver', using the `options` object configured above.
# NOTE(review): passing the driver path positionally looks like the Selenium 3
# calling convention -- confirm against the installed Selenium version.
driver = webdriver.Chrome('webdriver/chromedriver', options=options)

In [39]:
# Open the Sydney Opera House review page; the filter selection below acts on it.
driver.get('https://www.tripadvisor.com.au/Attraction_Review-g255060-d257278-Reviews-Sydney_Opera_House-Sydney_New_South_Wales.html')

In [40]:
def select_filters(traveller_rating=None,
                   traveller_type='Solo',
                   time_of_year=None,
                   language='English',
                   max_attempts=3):
    """Set the review filters on the page currently open in the global `driver`.

    For every filter group (rating, traveller type, season, language) all
    options other than the requested one are unticked if they are ticked,
    then the requested option (if any) is ticked.  Progress is printed, and
    finally the review count shown next to the chosen language is printed.

    Parameters
    ----------
    traveller_rating : str or None
        One of 'Excellent', 'Very good', 'Average', 'Poor', 'Terrible'.
    traveller_type : str or None
        One of 'Families', 'Couples', 'Solo', 'Business', 'Friends'.
    time_of_year : str or None
        One of 'Mar-May', 'Jun-Aug', 'Sep-Nov', 'Dec-Feb'.
    language : str or None
        One of 'English', 'Japanese'.
    max_attempts : int
        Maximum number of clicks tried per option before giving up on it.

    A falsy value (None, '') for a filter means "leave nothing ticked" in
    that group.

    Raises
    ------
    KeyError
        If a filter value is not one of the labels listed above.  The check
        runs before the page is touched, so no option is changed.
    """
    d = {'traveller_rating': {'data-name': 'ta_rating',
                              'input-values': {'Excellent': '5',
                                               'Very good': '4',
                                               'Average': '3',
                                               'Poor': '2',
                                               'Terrible': '1'},
                              'pick': traveller_rating},
         'traveller_type': {'data-name': 'traveler_filter',
                            'input-values': {'Families': '3',
                                             'Couples': '2',
                                             'Solo': '5',
                                             'Business': '1',
                                             'Friends': '4'},
                            'pick': traveller_type},
         'time_of_year': {'data-name': 'season',
                          'input-values': {'Mar-May': '1',
                                           'Jun-Aug': '2',
                                           'Sep-Nov': '3',
                                           'Dec-Feb': '4'},
                          'pick': time_of_year},
         'language': {'data-name': 'language',
                      'input-values': {'English': 'en',
                                       'Japanese': 'ja'},
                      'pick': language}}

    # Fail fast on an unknown label: otherwise the other options of the group
    # would already have been unticked before the lookup below blows up.
    for filt, spec in d.items():
        if spec['pick'] and spec['pick'] not in spec['input-values']:
            raise KeyError(f"unknown {filt} value {spec['pick']!r}; "
                           f"expected one of {list(spec['input-values'])}")

    def _selector(filt, label):
        """Build the CSS selector of the option `label` inside group `filt`."""
        dname = d[filt]['data-name']
        tr_pick = d[filt]['input-values'][label]
        return f'div.choices[data-name="{dname}"]>div[data-value="{tr_pick}"]'

    def is_selected(css_selector_st):
        """Return True if a checked <input> shows up under the option within 10 s."""
        try:
            WebDriverWait(driver, 10) \
                .until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, css_selector_st + '>input[checked="checked"]')))
            return True
        except Exception:
            # The wait timing out is the normal "not ticked" outcome.
            return False

    def _click(css_selector_st, max_attempts=max_attempts):
        """Click the option until its ticked state flips; return True on success."""
        times_tried = 0

        flag_before = is_selected(css_selector_st)
        flag_after = flag_before

        while times_tried < max_attempts and flag_after == flag_before:

            times_tried += 1
            print('clicking on ..', css_selector_st)

            try:
                e = WebDriverWait(driver, 20) \
                    .until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector_st)))
            except Exception:
                print(f'failed to find {css_selector_st}!')
                # Nothing to click in this round; use up the attempt and retry.
                continue

            e.click()

            flag_after = is_selected(css_selector_st)

        return flag_after != flag_before

    for filt in d:

        value = d[filt]['pick']

        # uncheck everything else
        to_uncheck = [other_value for other_value in d[filt]['input-values'] if other_value != value]
        print('need to uncheck ', to_uncheck)

        for other_value in to_uncheck:
            print(f'unchecking {other_value}..')
            st = _selector(filt, other_value)

            if is_selected(st):
                if _click(st):
                    print('unchecked')
                else:
                    print(f'problem unchecking {other_value}')

        if value:

            st = _selector(filt, value)

            print(f'selecting {filt}={value}...', end='')

            if is_selected(st) or _click(st):
                print('ok')
            else:
                # Terminate the progress line so a failure is visible in the log.
                print('failed')

    try:
        lang_code = d['language']['input-values'][d['language']['pick']]
        css_count = f'div.choices[data-name="language"]>div[data-value="{lang_code}"]>label.label>span.count'
        c_txt = WebDriverWait(driver, 10) \
            .until(EC.presence_of_element_located((By.CSS_SELECTOR, css_count))).text.strip()
        # The count is rendered like "(1,234)"; strip brackets and separators.
        c = int(re.sub(r'[(,)]', '', c_txt))
        print(f'reviews: {c:,}')
    except Exception:
        print('failed to get review count!')

In [41]:
# Keep only 'Excellent' reviews by 'Solo' travellers for 'Sep-Nov'
# (language is left at the function default).
select_filters(traveller_rating='Excellent', traveller_type='Solo', time_of_year='Sep-Nov')

need to uncheck  ['Very good', 'Average', 'Poor', 'Terrible']
unchecking Very good..
unchecking Average..
unchecking Poor..
unchecking Terrible..
selecting traveller_rating=Excellent...clicking on .. div.choices[data-name="ta_rating"]>div[data-value="5"]
ok
need to uncheck  ['Families', 'Couples', 'Business', 'Friends']
unchecking Families..
unchecking Couples..
unchecking Business..
unchecking Friends..
selecting traveller_type=Solo...clicking on .. div.choices[data-name="traveler_filter"]>div[data-value="5"]
ok
need to uncheck  ['Mar-May', 'Jun-Aug', 'Dec-Feb']
unchecking Mar-May..
unchecking Jun-Aug..
unchecking Dec-Feb..
selecting time_of_year=Sep-Nov...clicking on .. div.choices[data-name="season"]>div[data-value="3"]
ok
need to uncheck  ['Japanese']
unchecking Japanese..
selecting language=English...ok
reviews: 305


In [52]:
def get_attraction_pages(location_home_url, page_size=30, max_click_attempts=10):
    """Open a location home page and work out the URLs of all its attraction list pages.

    The function navigates the global `driver` to `location_home_url`, clicks
    the attractions entry and then the "see more" element, reads the
    pagination block and derives one URL per list page from the offset found
    in the last numbered pagination link (``...Activities-oa<offset>-...``).

    Parameters
    ----------
    location_home_url : str
        URL of the location's home page.
    page_size : int
        Offset step between consecutive list pages (30 in the URLs seen so far).
    max_click_attempts : int
        How many times each click is retried (calling `kill_slide()` between
        attempts) before giving up.

    Returns
    -------
    list of str
        URL of the current list page followed by the derived URLs of the
        remaining pages.  Only the current URL is returned when the
        pagination block has no numbered link.

    Raises
    ------
    ValueError
        If `page_size` is not positive.
    RuntimeError
        If a click still fails after `max_click_attempts` attempts.
    """
    if page_size <= 0:
        raise ValueError('page_size must be a positive integer')

    def _click_until_done(get_element, what):
        """Click the element from `get_element()`, dismissing the slide-in between tries."""
        last_error = None
        for _attempt in range(max_click_attempts):
            try:
                get_element().click()
                return
            except Exception as err:
                last_error = err
                kill_slide()
                # Give the overlay a moment to go away before the next try.
                time.sleep(0.5)
        raise RuntimeError(
            f'could not click {what} after {max_click_attempts} attempts') from last_error

    driver.get(location_home_url)

    things_to_do_icon = WebDriverWait(driver, 10) \
        .until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.ui_icon.attractions + span')))
    _click_until_done(lambda: things_to_do_icon, 'the attractions entry')

    see_more_css = 'div[class|="attractions-attraction-overview-main-TopPOIs__see_more"]'
    _click_until_done(
        lambda: WebDriverWait(driver, 10)
        .until(EC.element_to_be_clickable((By.CSS_SELECTOR, see_more_css))),
        'the "see more" element')

    pg = WebDriverWait(driver, 10) \
        .until(EC.presence_of_element_located((By.CSS_SELECTOR,
               'div[class|="attractions-attraction-overview-main-Pagination__wrapper"]>'
               'div[class|="attractions-attraction-overview-main-Pagination__container"]')))

    # The current page is the numeric <span>; the other pages are <a> links.
    current_p = None
    for span in pg.find_elements(By.CSS_SELECTOR, 'div>span'):
        if span.text.isdigit():
            current_p = int(span.text)
    print(f'current page: {current_p}')

    current_page_url = driver.current_url
    print(f'current page url: {current_page_url}')

    # The last numbered link wins, i.e. the link of the final page.
    last_page_url = None
    for link in pg.find_elements(By.CSS_SELECTOR, 'div>a'):
        if link.text.isdigit():
            last_page_url = link.get_attribute('href')

    print(f'last page url: {last_page_url}')

    # No numbered link (or no offset in it) means there is a single page.
    offset_match = re.search(r'(?<=Activities-oa)\d+', last_page_url or '')
    total_pages = int(offset_match.group(0)) // page_size + 1 if offset_match else 1

    print(f'total pages: {total_pages}')

    # starts from page 2 (page 1 has no -oa[number]- part)
    page_urls = [current_page_url] + [
        current_page_url.replace('Activities-', f'Activities-oa{page_size * i}-', 1)
        for i in range(1, total_pages)
    ]

    print(page_urls)

    return page_urls
        

In [53]:
# Derive the attraction list page URLs for location g255097.
get_attraction_pages('https://www.tripadvisor.com.au/Home-g255097')

current page: 1
current page url: https://www.tripadvisor.com.au/Attractions-g255097-Activities-Hobart_Greater_Hobart_Tasmania.html
last page url: https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa120-Hobart_Greater_Hobart_Tasmania.html
total pages: 5
['https://www.tripadvisor.com.au/Attractions-g255097-Activities-Hobart_Greater_Hobart_Tasmania.html', 'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa30-Hobart_Greater_Hobart_Tasmania.html', 'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa60-Hobart_Greater_Hobart_Tasmania.html', 'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa90-Hobart_Greater_Hobart_Tasmania.html', 'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa120-Hobart_Greater_Hobart_Tasmania.html']


In [34]:
def kill_slide():
    """Dismiss the slide-in overlay (class starting with "QSISlider"), if present.

    Best effort: when either the clickable element or the label span cannot
    be found in the page open in the global `driver`, nothing happens.  When
    both are found the label text is printed and the element is clicked.

    Returns None.  An exception raised by the click itself is not caught.
    """

    def _find_or_none(css_selector):
        """Return the first element matching `css_selector`, or None if lookup fails."""
        try:
            # Locator form accepted by both Selenium 3 and 4; the older
            # find_element_by_css_selector no longer exists in recent
            # releases, so the lookup would always fail and be swallowed here.
            return driver.find_element(By.CSS_SELECTOR, css_selector)
        except Exception:
            return None

    click_this_to_say_no = _find_or_none('div[class^="QSISlider"]>div:nth-of-type(8)>div')
    if click_this_to_say_no is None:
        return

    # Only click when the label is there too, so that it can be logged.
    no_span = _find_or_none('div[class^="QSISlider"]>div:nth-of-type(9)>div>div>strong>span>span')
    if no_span is None:
        return

    print(f'clicking on "{no_span.text}"')
    click_this_to_say_no.click()

In [35]:
# Dismiss the slide-in overlay on the currently open page, if there is one.
kill_slide()

In [66]:
def process_attraction_list(attrattion_page_url_lst, max_click_attempts=10):
    """Visit each attraction list page and collect name, category and URL of every card.

    The first URL of the list is handled with the "overview" layout
    (PoiGrid / PoiCard selectors, after clicking the "see more" element);
    every later URL is handled with the "listing" layout
    (`attraction_element_*` selectors).  Each page is loaded with the global
    `driver` unless it is already the current URL.  Every card's fields are
    printed as they are read.

    Parameters
    ----------
    attrattion_page_url_lst : iterable of str
        List page URLs, first page first.
    max_click_attempts : int
        How many times the initial "see more" click is retried (calling
        `kill_slide()` between attempts) before giving up.

    Returns
    -------
    list of dict
        One ``{'name': ..., 'category': ..., 'url': ...}`` record per card,
        in the order visited.

    Raises
    ------
    RuntimeError
        If the initial "see more" click still fails after
        `max_click_attempts` attempts.
    """
    see_more_css = 'div[class|="attractions-attraction-overview-main-TopPOIs__see_more"]'

    def _click_see_more():
        """Wait up to 10 s for the "see more" element to be clickable, then click it."""
        WebDriverWait(driver, 10) \
            .until(EC.element_to_be_clickable((By.CSS_SELECTOR, see_more_css))).click()

    attractions = []

    for i, attr_page_url in enumerate(attrattion_page_url_lst, 1):

        print(f'p#{i:02d}: {attr_page_url}')

        if attr_page_url != driver.current_url:
            driver.get(attr_page_url)

        if i == 1:

            # Bounded retry: an overlay may intercept the click, so dismiss it
            # between attempts instead of looping forever.
            last_error = None
            for _attempt in range(max_click_attempts):
                try:
                    _click_see_more()
                    break
                except Exception as err:
                    last_error = err
                    kill_slide()
                    time.sleep(0.5)
            else:
                raise RuntimeError(
                    f'could not click "see more" after {max_click_attempts} attempts') from last_error

            for attr_card in driver.find_elements(
                    By.CSS_SELECTOR,
                    'div[class|="attractions-attraction-overview-pois-PoiGrid__wrapper"]'
                    '>li[class^="attractions-attraction-overview-pois-PoiCard__item"]'
                    '>div[class|="attractions-attraction-overview-pois-PoiCard__card_info"]'):

                print(attr_card.text)

                # NOTE(review): "see more" is clicked again for every card and
                # followed by a 2 s pause.  The card list was captured before
                # the loop, so cards revealed by these clicks are not visited.
                # Kept as is -- confirm whether this is intended.
                _click_see_more()

                time.sleep(2)
                category_ = attr_card.find_element(
                    By.CSS_SELECTOR,
                    'div[class|="attractions-attraction-overview-pois-PoiInfo__geo_category_container"]').text
                # Name and URL come from the same link; look it up once.
                name_link = attr_card.find_element(
                    By.CSS_SELECTOR,
                    'div>a[class|="attractions-attraction-overview-pois-PoiInfo__name"]')
                name_ = name_link.text
                url_ = name_link.get_attribute('href')

                print('name: ', name_)
                print('category: ', category_)
                print('url: ', url_)
                attractions.append({'name': name_, 'category': category_, 'url': url_})
        else:

            for attr_card in driver.find_elements(
                    By.CSS_SELECTOR,
                    'div[class^="attraction_element_"]>div>div.listing>div.listing_details>div.listing_info'):

                category_ = attr_card.find_element(By.CSS_SELECTOR, 'div.tag_line').text
                name_ = attr_card.find_element(By.CSS_SELECTOR, 'div.tracking_attraction_title').text
                # NOTE(review): the URL is taken from the link inside the tag
                # line, not the title -- verify it points at the attraction.
                url_ = attr_card.find_element(By.CSS_SELECTOR, 'div.tag_line>div>a').get_attribute('href')

                print('name: ', name_)
                print('category: ', category_)
                print('url: ', url_)
                attractions.append({'name': name_, 'category': category_, 'url': url_})

    return attractions


In [67]:
# Walk the five Hobart attraction list pages (page 1 first).
process_attraction_list([
    'https://www.tripadvisor.com.au/Attractions-g255097-Activities-Hobart_Greater_Hobart_Tasmania.html',
    'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa30-Hobart_Greater_Hobart_Tasmania.html',
    'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa60-Hobart_Greater_Hobart_Tasmania.html',
    'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa90-Hobart_Greater_Hobart_Tasmania.html',
    'https://www.tripadvisor.com.au/Attractions-g255097-Activities-oa120-Hobart_Greater_Hobart_Tasmania.html',
])

p#01: https://www.tripadvisor.com.au/Attractions-g255097-Activities-Hobart_Greater_Hobart_Tasmania.html
NATURE & PARKS
Mount Wellington
7,145 reviews
name:  Mount Wellington
category:  NATURE & PARKS
url:  https://www.tripadvisor.com.au/Attraction_Review-g255097-d1063162-Reviews-Mount_Wellington-Hobart_Greater_Hobart_Tasmania.html
NATURE & PARKS
Bonorong Wildlife Sanctuary
1,840 reviews
name:  Bonorong Wildlife Sanctuary
category:  NATURE & PARKS
url:  https://www.tripadvisor.com.au/Attraction_Review-g255097-d256538-Reviews-Bonorong_Wildlife_Sanctuary-Hobart_Greater_Hobart_Tasmania.html
FOOD & DRINK
Cascade Brewery
1,444 reviews
name:  Cascade Brewery
category:  FOOD & DRINK
url:  https://www.tripadvisor.com.au/Attraction_Review-g255097-d256543-Reviews-Cascade_Brewery-Hobart_Greater_Hobart_Tasmania.html
NATURE & PARKS
Royal Tasmanian Botanical Gardens
2,437 reviews
name:  Royal Tasmanian Botanical Gardens
category:  NATURE & PARKS
url:  https://www.tripadvisor.com.au/Attraction_Review-