In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from collections import defaultdict
import re
import json
import os
import time
import arrow
import random

import numpy as np
import pandas as pd

import subprocess
import zipfile

from attraction import Attraction
from review import Review
from user import User

import scattertext as st
import spacy

# load spaCy's English pipeline through the 'en' shortcut name
# NOTE(review): shortcut names such as 'en' were removed in spaCy 3, where the full package
# name (e.g. 'en_core_web_sm') is required - confirm the installed spaCy version
nlp = spacy.load('en')

In [5]:
 class Trip:
    
    def __init__(self, filter):
        
        """
        set up the scraper: remember the filter, make sure the data folders exist, load the 
        location IDs and start a headless Chrome session
        
        filter: dict of filter name -> value to pick (or None when that filter is not used); 
                the names read later by select_filters are 'traveller_rating', 'traveller_type', 
                'time_of_year' and 'language'
        """
        
        self.FILTREVS = defaultdict()
        self.REVIEWS = defaultdict()
        
        # counter to see how many annoying things get killed
        self.KILLED = defaultdict(int)
        
        self.FILTERS = filter
        
        print(f'filter: {" | ".join([k + ":" + str(v) for k, v in self.FILTERS.items() if v])}')
        
        self.DATA_DIR = 'data'
        self.COLLECT_DIR = 'data-collected'
        
        for dir_ in (self.DATA_DIR, self.COLLECT_DIR):
            os.makedirs(dir_, exist_ok=True)
        
        # close the file as soon as it's been read
        with open(os.path.join(self.DATA_DIR, 'tradvisor_location_ids.json')) as f:
            self.LOCATION_IDS = json.load(f)
        
        # the browser is started last so that a bad filter or a missing location file
        # doesn't leave a Chrome process behind
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--incognito')
        options.add_argument('--start-maximized')
        # 2 = block notification prompts
        prefs = {"profile.default_content_setting_values.notifications" : 2}
        options.add_experimental_option("prefs",prefs)

        options.add_argument('--headless')
        
        self.driver = webdriver.Chrome('webdriver/chromedriver', options=options)
        
    def attr_id_from_url(self, url_):
        
        """
        extract attraction ID from attraction URL
        
        the ID is the 'd' + digits token enclosed in hyphens, e.g. 'd256898' in 
        '.../Attraction_Review-g255097-d256898-Reviews-...'; returns None if url_ has 
        no such token or is not a string
        """
        
        try:
            return re.search(r'(?<=-)d\d+(?=-)', url_).group(0)
        except (AttributeError, TypeError):
            # AttributeError: nothing matched (re.search returned None)
            # TypeError: url_ is not a string (e.g. None)
            return None
            
    def do_click(self, e, max_=3):
        
        """
        try to click on element e and return True if it worked or False otherwise
        """
        
        # assume not click
        _clicked = False
        _c = 0

        while (not _clicked) and (_c < max_):

            try:
                e.click()
                _clicked = True
            except:
                try:
                    self.driver.find_element_by_css_selector('span.ui_overlay>div.ui_close_x').click()
                    self.KILLED['overlay'] += 1
                except:
                    # try to catch and close all sliders
                    els = list(self.driver.find_elements_by_css_selector('div[class^="QSISlider"]>div'))
                    for i, d in enumerate(els):
                        if d.text.strip().lower() == 'Not right now, thanks.'.strip().lower():
                            try:
                                els[i-1].click()
                                self.KILLED['slide'] += 1
                                break
                            except:
                                continue
                    for _ in self.driver.find_elements_by_css_selector('div.sbx_close[onclick]'):
                        try:
                            _.click()
                            self.KILLED['infobar'] += 1
                        except:
                            pass        
            _c += 1

        return _clicked
    
    def process_paginator(self, pg, off=30, pref='oa'):
        
        p_numbers = [int(s.text.strip()) for s in pg.find_elements_by_css_selector('div') if s.text.strip().isdigit()]
        
        if p_numbers:
            total_pages = p_numbers[-1]
        else:
            raise Exception('no pages in paginator!')
        
        current_page_url = self.driver.current_url
        last_page_url = self.driver.current_url
        
        page_urls = [current_page_url]
        
        for _ in pg.find_elements_by_css_selector('div>a'):
            if _.text.isdigit():
                last_page_url = _.get_attribute('href')
                
        if total_pages > 1:
            for i in range(1, total_pages):
                # starts from page 2 (page 1 has no -oa[number]- part)
                page_urls.append(re.sub('Activities-', 'Activities-' + pref + str(off*i) + '-', current_page_url))
                
        return total_pages, page_urls
        
    def get_attraction_pages(self, loc):
        
        """
        go to the home page for location loc and collect all attractions; if loc is a state, collect attractions
        for every location in that state
        
        """
        
        self.MAIN_LOCATION = loc
        
        # home_urls will be a list of tuples like 
        # [('tasmania', 'hobart', 'https://www.tripadvisor.com.au/Home-g255097'), 
        #  ('tasmania', 'launceston', 'https://www.tripadvisor.com.au/Home-g255344')...
        
        home_urls = []
        
        self.collected_attractions = defaultdict(lambda: defaultdict(lambda: defaultdict()))
        
        """
        {'tasmania': {'hobart': {attr_id: {name: market,
                                            url: 'https:/www...'}}}
        """
        
        # if loc is state
        if self.MAIN_LOCATION in self.LOCATION_IDS:
            for city in self.LOCATION_IDS[loc]:
                home_urls.append((self.MAIN_LOCATION, city, f'https://www.tripadvisor.com.au/Home-{self.LOCATION_IDS[loc][city]}'))
        else:
            for state_ in self.LOCATION_IDS:
                if self.MAIN_LOCATION in self.LOCATION_IDS[state_]:
                    home_urls.append((state_, self.MAIN_LOCATION, f'https://www.tripadvisor.com.au/Home-{self.LOCATION_IDS[state_][loc]}'))
                    
        if not home_urls:
            print(f'no attractions to pick for {loc.upper()}!')
            return self
        
        for state_, city_, homeurl in home_urls:
            
            print(f'collecting attractions for {city_.upper()}, {state_.upper()}...')
            
            self.driver.get(homeurl)
            
            # ---- find and click the Things to Do icon; assume it MUST be there
            
            thingstodo_clicked = False
            
            while not thingstodo_clicked:
                
                try:
                    things_to_do_icon = WebDriverWait(self.driver, 10) \
                                .until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.ui_icon.attractions + span')))
                except:
                    raise Exception('failed to find Things to do icon!')
                
                if things_to_do_icon.text.strip().lower() != 'Things to do'.strip().lower():
                    continue
                    
                res = self.do_click(things_to_do_icon)
                    
                if not res:
                    print('failed to click Things to do icon! retrying..')
                else:
                    thingstodo_clicked = True
                
            # ------- ok, so clicked the icon; now the question is whether there are many 'top' things to 
            # do or just a few; the latter means no need to look for the See More button
            
            moveon = False
            few_attractions = False
            
            # try to click See More first
            
            while 1:
                
                try:
                    c_mo = WebDriverWait(self.driver, 10) \
                            .until(EC.element_to_be_clickable((By.CSS_SELECTOR, 
                                    'div[class|="attractions-attraction-overview-main-TopPOIs__see_more"]')))
                except:
                    # if there's no See More it's just 1 page with attractions
                    page_urls = [self.driver.current_url]
                    few_attractions = True
                    break
                
                seeless = False
                
                while not seeless:
                    
                    res = self.do_click(c_mo)
                    
                    # didn't click..
                    if not res:

                        try:
                            # there is this See More button but still folded, so apparently we didn't click properly
                            self.driver.find_element_by_css_selector('span.ui_icon.single-chevron-down')
                        except:
                            seeless = True
                    # look like clicked See More..
                    else:
                        try:
                            WebDriverWait(self.driver, 10) \
                                .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                    'span.ui_icon.single-chevron-up')))

                            seeless = True
                        except:
                            print('no See Less, need to click See more again')
                break                
            
            # paginator only available if there are many attractions; however, there may be no paginator if See More
            # was clicked but there are still too few attractions
            
            if not few_attractions:
                
                try:
                    pg = WebDriverWait(self.driver, 10) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                    'div[class|="attractions-attraction-overview-main-Pagination__container"]')))
                except:
                    pg = None
                
                if pg:
                    total_pages, page_urls = self.process_paginator(pg) 
                else:
                    page_urls = [self.driver.current_url]

            print('visiting attraction pages..')

            for i, attr_page_url in enumerate(page_urls, 1):
                
                print(f'page {i}/{len(page_urls)}..', end='')

                if attr_page_url != self.driver.current_url:
                    self.driver.get(attr_page_url)
                
                if i == 1:
                    
                    attr_topick_css = 'div[class|="attractions-attraction-overview-pois-PoiGrid__wrapper"]' + \
                                      '>li[class^="attractions-attraction-overview-pois-PoiCard__item"]' + \
                                      '>div[class|="attractions-attraction-overview-pois-PoiCard__card_info"]'
                            
                    attr_topick = len(self.driver.find_elements_by_css_selector(attr_topick_css))
                    
                    while attr_topick:
                        
                        for attr_card in self.driver.find_elements_by_css_selector(attr_topick_css):

                            try:
                                url_ = attr_card.find_element_by_css_selector('div>a[class|="attractions-attraction-overview-pois-PoiInfo__name"]') \
                                                        .get_attribute('href')
                            except:
                                print('didn\'t get attraction url! moving on to next attraction card..')
                                continue
                            
                            id_ = self.attr_id_from_url(url_)
                            
                            if not id_:
                                print('failed to extract attraction id!')
                                continue
                            
                            try:
                                self.collected_attractions[state_][city_][id_] = {'url': url_}
                                attr_topick -= 1
                            except:
                                pass

                elif i > 1:
                    
                    attr_topick_css = 'div.attraction_list>div' + \
                                      '>div>div.listing>div.listing_details>div.listing_info'
                        
                    attr_topick = len(self.driver.find_elements_by_css_selector(attr_topick_css))
                    
                    while attr_topick:
                        
                        for attr_card in self.driver.find_elements_by_css_selector(attr_topick_css):

                            try:
                                url_ = attr_card.find_element_by_css_selector('div.tracking_attraction_title.listing_title>a').get_attribute('href')
                            except:
                                print('didn\'t get attraction url! moving on to next attraction card..')
                                continue
                                
                            try:
                                id_ = re.search(r'(?<=-)d\d+(?=-)', url_).group(0)
                            except:
                                print('failed to extract attraction id!')
                                continue
                            
                            try:
                                self.collected_attractions[state_][city_][id_] = {'url': url_}
                                attr_topick -= 1
                            except:
                                pass
            
                print('ok')
    
        return self

    def select_filters(self) -> int:
        
        """
        apply filter on the attraction page and return the number of reviews available
        after the filter has been applied
        
        for every filter group whatever is checked but wasn't asked for gets unchecked first and 
        then the value asked for (if any) gets checked; the number of reviews is read from the 
        counter next to the selected language
        
        NOTE(review): since the number comes from the language counter, this returns 0 whenever 
        there's no language in the filter - confirm that's what is wanted
        
        raises KeyError if a value asked for is not among the known values for its filter group
        """
        
        # for every filter group: the data-name of its block on the page, data-value for every 
        # choice and the choice picked in self.FILTERS (None if the group is not in the filter)
        d = {'traveller_rating': {'data-name': 'ta_rating',
                                  'input-values': {'Excellent': '5',
                                                   'Very good': '4',
                                                   'Average': '3',
                                                   'Poor': '2',
                                                   'Terrible': '1'},
                                 'pick': self.FILTERS.get('traveller_rating')},
            'traveller_type': {'data-name': 'traveler_filter',
                               'input-values': {'Families': '3',
                                                'Couples': '2',
                                                'Solo': '5',
                                                'Business': '1',
                                                'Friends': '4'},
                              'pick': self.FILTERS.get('traveller_type')},
            'time_of_year': {'data-name': 'season',
                             'input-values': {'Mar-May': '1',
                                              'Jun-Aug': '2',
                                              'Sep-Nov': '3',
                                              'Dec-Feb': '4'},
                            'pick': self.FILTERS.get('time_of_year')},
            'language': {'data-name': 'language',
                         'input-values': {'English': 'en',
                                          'Japanese': 'ja'},
                         'pick': self.FILTERS.get('language')}}

        def is_selected(css_selector_st):
            
            # a choice is selected if there's a checked input right inside it (waits up to 5 sec)

            try:
                WebDriverWait(self.driver, 5) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, 
                                                                   css_selector_st + '>input[checked="checked"]')))
                return True

            except Exception:
                return False

        def _click(css_selector_st, max_attempts=3):
            
            # click on the choice until it flips between selected and not selected, max_attempts 
            # times at most; return True if it did flip

            flag_before = is_selected(css_selector_st)

            for _attempt in range(max_attempts):

                try:
                    e = WebDriverWait(self.driver, 20) \
                            .until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector_st)))
                except Exception:
                    e = None
                    print(f'failed to find {css_selector_st}!')
                
                if e is not None:
                    self.do_click(e)

                if is_selected(css_selector_st) != flag_before:
                    return True

            return False

        for filt in d:

            value = d[filt]['pick']
            dname = d[filt]['data-name']
            
            # note that even if value is None, attempt to uncheck everything that might be checked
            for other_value in d[filt]['input-values']:
                
                if other_value == value:
                    continue

                tr_pick = d[filt]['input-values'][other_value]
                choice_css = f'div.choices[data-name="{dname}"]>div[data-value="{tr_pick}"]'

                if is_selected(choice_css):
                    _click(choice_css)

            if value:

                tr_pick = d[filt]['input-values'][value]
                choice_css = f'div.choices[data-name="{dname}"]>div[data-value="{tr_pick}"]'

                if not is_selected(choice_css):
                    _click(choice_css)

        lang_pick = d['language']['pick']
        
        if not lang_pick:
            # no language, so no counter to read the number of reviews from
            return 0

        try:
            lang_code = d['language']['input-values'][lang_pick] 
            css_count = f'div.choices[data-name="language"]>div[data-value="{lang_code}"]>label.label>span.count'
            c_txt = WebDriverWait(self.driver, 6) \
                            .until(EC.presence_of_element_located((By.CSS_SELECTOR, css_count))).text.strip()
            # the counter looks like (1,234)
            return int(re.sub(r'[(,)]','',c_txt))
        except Exception:
            # if no review count it's because there are not reviews left after filtering
            return 0
    
    def get_reviews_this_attraction(self, attr_url, min_total_reviews):
        
        # first go to the attraction page
        self.driver.get(attr_url)
        
        a_id = self.attr_id_from_url(attr_url)
        
        attr_details = defaultdict(name=None, total_reviews=None, category=None, rating_chart=None)
                    
        try:
            attr_details['name'] = self.driver.find_element_by_css_selector('div.attractionsHeader>h1#HEADING').text
        except:
            try:
                # it's probably a tour provider
                tour_operator = self.driver.find_element_by_css_selector('h1[id="HEADING"][class|="attractions-supplier-profile-"]').text.strip()
                print(f'looks like a tour operator: {tour_operator}')
            except:
                pass
            
        # if this is attraction without name, something is wrong and we are not collecting anything
        if not attr_details['name']:
            return None
        
        # how namy reviews does this attractio have?
        # if there's no rating (so no reviews) simply return 0 reviews right away
        try:
            self.driver.find_element_by_css_selector('div.section.rating>a.ui_bubble_rating.noReviewsBubbles')
            revs = 0
        except BaseException as e:
            # so there are some reviews..
            try:
                # 21,121 Reviews
                revs = int(self.driver.find_element_by_css_selector('div.ratingContainer>a>span.reviewCount') \
                            .text.replace(',','').split()[0])
            except:
                print('failed to find the total number of reviews!')
                revs = 0
        
        if revs < min_total_reviews:
            # too few reviews, doesn't make sense to proceed with this attraction
            print(f'only {revs:,} reviews, skipping..')
            return None
        
        # save total reviews for this attraction after filtering
        nreviews_filtered = self.select_filters()
        
        print(f'filtered reviews: {nreviews_filtered:,}')

        if not nreviews_filtered:
            print('skipping..')
            return None

        try:
            attr_details['total_reviews'] = self.driver.find_element_by_css_selector('div.ratingContainer').text.lower()
        except:
            pass

        try:
            attr_details['category'] = self.driver.find_element_by_css_selector('span.attractionCategories').text
        except:
            pass

        try:
            attr_details['rating_chart'] = [tuple(l.strip().split('\n')) for l in self.driver.find_element_by_css_selector('ul.ratings_chart').text.split('%') if l.strip()]
        except:
            pass

        # reviews for THIS ATTRACTION
        reviews = defaultdict(lambda: defaultdict())
        reviews_on_page = [0]
        
        # keep going until total collected reviews reaches total filtered reviews
        while len(reviews) < nreviews_filtered:

            try:
                reviews_this_page = int(WebDriverWait(self.driver, 10) \
                    .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                            'div[data-contextchoice="DETAIL"]>div.pagination-details'))).text.split('-')[1].strip().split()[0]) 
            except:
                if nreviews_filtered <= 10:
                    reviews_this_page = nreviews_filtered
                else:
                    try:
                        # just count rating timestamps if any
                        reviews_this_page = len(self.driver.find_elements_by_css_selector('span.ratingDate'))
                    except:
                        raise Exception('failed to pick the number of reviews on this page!') 
            
            reviews_on_page.append(reviews_this_page)
            
            to_pick = reviews_on_page[-1] - reviews_on_page[-2]
            
            picked_reviews = set()

            while len(picked_reviews) < to_pick:
                
                # first unfold all reviews on the page

                for c in self.driver.find_elements_by_css_selector('div.entry>p.partial_entry>span[class~="ulBlueLinks"][onclick]'):

                    self.do_click(c)

                # and now collect all full review texts
                review_ids_this_page = set()
        
                n_review_blocks = len(self.driver.find_elements_by_css_selector('div[class="reviewSelector"][id^="review"]'))
                
                while len(review_ids_this_page) < n_review_blocks:
                        
                        for c in self.driver.find_elements_by_css_selector('div[class="reviewSelector"][id^="review"]'):
                            
                            try:
                                _id = c.get_attribute('data-reviewid')
                            except:
                                continue
                                
                            if _id:
                                review_ids_this_page.add(_id)
                            else:
                                try:
                                    # id here looks like review_489766616
                                    _id = c.get_attribute('id').split('_')[-1]
                                except BaseException as e:
                                    print(str(e))
                
                # a set of review ids to collect on this page
                review_ids_to_collect_this_page = review_ids_this_page - set(reviews)

                if review_ids_to_collect_this_page:

                    tot_revs = len(review_ids_to_collect_this_page)

                else:
                    tot_revs = 0
                    print('no new reviews on this page!')

                p = 0

                while p < tot_revs:

                    for c in self.driver.find_elements_by_css_selector('div.reviewSelector'):        

                        try:

                            review_id = c.get_attribute('data-reviewid')
                            
                            if not review_id:
                                continue

                            if review_id in review_ids_to_collect_this_page:

                                reviews[review_id]['text'] = c.find_element_by_css_selector('div>div>div.entry>p.partial_entry').text.strip()
                                
                                infotext_css = 'div.member_info>div>div.info_text'
                                
                                try:
                                    # this is where a username should sit
                                    username = c.find_element_by_css_selector(infotext_css).text.split('\n')[0]
                                except:
                                    # it turns out that sometimes there are legacy anonymous users called
                                    # A TripAdvisor Member; check if it's one of these - note a slightly different
                                    # structure
                                    try:
                                        username = c.find_element_by_css_selector('div.member_info>div.info_text').text.split('\n')[0]
                                        if 'member' in username.lower():
                                            print(f'found an anonymous user called {username}')
                                    except:
                                        break
                                
                                reviews[review_id]['user'] = username
                                reviews[review_id]['user_profile_url'] = f'https://www.tripadvisor.com.au/Profile/{username}'
                                
                                try:
                                    userloc = c.find_element_by_css_selector(infotext_css + '>div.userLoc').text
                                    reviews[review_id]['user_loc'] = userloc
                                except:
                                    reviews[review_id]['user_loc'] = None
                                    
                                try:
                                    reviews[review_id]['review_date'] = arrow.get(c.find_element_by_css_selector('div>div>div>span.ratingDate')
                                                                                    .get_attribute('title'), 'D MMMM YYYY').format('YYYY-MM-DD')
                                except:
                                    reviews[review_id]['review_date'] = None
                                    
                                try:
                                    reviews[review_id]['experience_date'] = arrow.get(c.find_element_by_css_selector('div>div>div>div.prw_reviews_stay_date_hsx').text, 'MMMM YYYY').format('YYYY-MM')
                                except:
                                    reviews[review_id]['experience_date'] = None
                                
                                try:
                                    class_full = c.find_element_by_css_selector('div>div>div>span.ui_bubble_rating').get_attribute('class')
                                    
                                    reviews[review_id]['rating'] = int(re.search(r'(?<=_)\d{1}', class_full).group(0))
                                except:
                                    reviews[review_id]['rating'] = None
                                          
                                reviews[review_id]['attr_name'] = attr_details['name']
                                reviews[review_id]['attr_id'] = a_id
                                reviews[review_id]['attr_loc'] = city_
                                
                                # remove this reviews id from the set of review ids to collct
                                review_ids_to_collect_this_page -= {review_id}
                                # and put this id into the set of already collected ids
                                picked_reviews.add(review_id)
                                # increment the total of collected reviews on this page
                                p += 1

                            else:
                                continue
                        except:
                            continue
        return reviews
    
    def _review_count_this_attr(self) -> int:
        
        """
        assuming you're on the attraction page, returns how many reviews there are in total for this attractions
        """
        
        revs = 0
        
        # if there's no rating (so no reviews) simply return 0 reviews right away
        
        try:
            # if this one if present there are no reviews
            self.driver.find_element_by_css_selector('div.section.rating>a.ui_bubble_rating.noReviewsBubbles')
            return revs
        except:
            pass

        # otherwise, there have to be some reviews..
        
        try:
            # the piece of text that says how many reviews looks like this: 21,121 Reviews
            revs = int(self.driver.find_element_by_css_selector('div.ratingContainer>a>span.reviewCount') \
                            .text.replace(',','').split()[0])
        except:
            print('failed to find the total number of reviews!')
        
        return revs
        
            
    def get_reviews_from_attraction_pages(self, min_total_reviews=50):
        
        # total number of attractions to check out
        natt = len({a_id for state_ in self.collected_attractions
                        for city_ in self.collected_attractions[state_]
                             for a_id in self.collected_attractions[state_][city_]})
        
        # counter for processed attractions
        catt = 0
        
        for state_ in self.collected_attractions:
            for city_ in self.collected_attractions[state_]:
                for a_id in self.collected_attractions[state_][city_]:
                    
                    catt += 1
                        
                    self.driver.get(self.collected_attractions[state_][city_][a_id]['url'])
                    
                    attr_details = defaultdict(name=None, total_reviews=None, category=None, rating_chart=None)
                    
                    try:
                        attr_details['name'] = self.driver.find_element_by_css_selector('div.attractionsHeader>h1#HEADING').text
                        self.collected_attractions[state_][city_][a_id]['attr_name'] = attr_details['name']
                    except:
                        try:
                            # it's probably a tour provider
                            tour_operatior_name = self.driver.find_element_by_css_selector('h1[id="HEADING"][class|="attractions-supplier-profile-"]').text.strip()
                            print(f'looks like a tour operator: {tour_operatior_name}; skipping')
                            continue
                        except:
                            print(f'failed to find name for attraction {a_id}, skipping..')
                            continue
                            
                    print(f'attraction #{catt:,}/{natt:,}: [ID: {a_id}][{attr_details["name"]}]...')
                    
                    # how many reviews does this attraction have?
                    revs = self._review_count_this_attr()
                    
                    if revs < min_total_reviews:
                        # too few reviews, doesn't make sense to proceed with this attraction
                        print(f'only {revs:,} reviews, skipping..')
                        continue
                    
                    # save total reviews for this attraction after filtering
                    self.FILTREVS[a_id] = self.select_filters()
                    
                    print(f'filtered reviews: {self.FILTREVS[a_id]:,}')

                    if not self.FILTREVS[a_id]:
                        print('skipping..')
                        continue

                    try:
                        attr_details['total_reviews'] = self.driver.find_element_by_css_selector('div.ratingContainer').text.lower()
                    except:
                        pass

                    try:
                        attr_details['category'] = self.driver.find_element_by_css_selector('span.attractionCategories').text
                    except:
                        pass

                    try:
                        attr_details['rating_chart'] = [tuple(l.strip().split('\n')) for l in self.driver.find_element_by_css_selector('ul.ratings_chart').text.split('%') if l.strip()]
                    except:
                        pass

                    # reviews for THIS ATTRACTION
                    reviews = defaultdict(lambda: defaultdict())
                    reviews_on_page = [0]

                    while len(reviews) < self.FILTREVS[a_id]:

                        try:
                            reviews_this_page = int(WebDriverWait(self.driver, 10) \
                                .until(EC.visibility_of_element_located((By.CSS_SELECTOR, 
                                        'div[data-contextchoice="DETAIL"]>div.pagination-details'))).text.split('-')[1].strip().split()[0]) 
                        except:
                            if self.FILTREVS[a_id] <= 10:
                                reviews_this_page = self.FILTREVS[a_id]
                            else:
                                try:
                                    # just count rating timestamps if any
                                    reviews_this_page = len(self.driver.find_elements_by_css_selector('span.ratingDate'))
                                except:
                                    raise Exception('failed to pick the number of reviews on this page!') 
                        
                        reviews_on_page.append(reviews_this_page)
                        
                        to_pick = reviews_on_page[-1] - reviews_on_page[-2]
                        
                        picked_reviews = set()

                        while len(picked_reviews) < to_pick:
                            
                            # first unfold all reviews on the page

                            for c in self.driver.find_elements_by_css_selector('div.entry>p.partial_entry>span[class~="ulBlueLinks"][onclick]'):

                                self.do_click(c)

                            # and now collect all full review texts
                            review_ids_this_page = set()
                
                            n_review_blocks = len(self.driver.find_elements_by_css_selector('div[class="reviewSelector"][id^="review"]'))
                            
                            while len(review_ids_this_page) < n_review_blocks:
                                    
                                    for c in self.driver.find_elements_by_css_selector('div[class="reviewSelector"][id^="review"]'):
                                        
                                        try:
                                            _id = c.get_attribute('data-reviewid')
                                        except:
                                            continue
                                            
                                        if _id:
                                            review_ids_this_page.add(_id)
                                        else:
                                            try:
                                                # id here looks like review_489766616
                                                _id = c.get_attribute('id').split('_')[-1]
                                            except BaseException as e:
                                                print(str(e))
                            
                            # a set of review ids to collect on this page
                            review_ids_to_collect_this_page = review_ids_this_page - set(reviews)

                            if review_ids_to_collect_this_page:

                                tot_revs = len(review_ids_to_collect_this_page)

                            else:
                                tot_revs = 0
                                print('no new reviews on this page!')

                            p = 0

                            while p < tot_revs:

                                for c in self.driver.find_elements_by_css_selector('div.reviewSelector'):        

                                    try:

                                        review_id = c.get_attribute('data-reviewid')
                                        
                                        if not review_id:
                                            continue

                                        if review_id in review_ids_to_collect_this_page:

                                            reviews[review_id]['text'] = c.find_element_by_css_selector('div>div>div.entry>p.partial_entry').text.strip()
                                            
                                            infotext_css = 'div.member_info>div>div.info_text'
                                            
                                            try:
                                                # this is where a username should sit
                                                username = c.find_element_by_css_selector(infotext_css).text.split('\n')[0]
                                            except:
                                                # it turns out that sometimes there are legacy anonymous users called
                                                # A TripAdvisor Member; check if it's one of these - note a slightly different
                                                # structure
                                                try:
                                                    username = c.find_element_by_css_selector('div.member_info>div.info_text').text.split('\n')[0]
                                                    if 'member' in username.lower():
                                                        print(f'found an anonymous user called {username}')
                                                except:
                                                    break
                                            
                                            reviews[review_id]['user'] = username
                                            reviews[review_id]['user_profile_url'] = f'https://www.tripadvisor.com.au/Profile/{username}'
                                            
                                            try:
                                                userloc = c.find_element_by_css_selector(infotext_css + '>div.userLoc').text
                                                reviews[review_id]['user_loc'] = userloc
                                            except:
                                                reviews[review_id]['user_loc'] = None
                                                
                                            try:
                                                reviews[review_id]['review_date'] = arrow.get(c.find_element_by_css_selector('div>div>div>span.ratingDate')
                                                                                                .get_attribute('title'), 'D MMMM YYYY').format('YYYY-MM-DD')
                                            except:
                                                reviews[review_id]['review_date'] = None
                                                
                                            try:
                                                reviews[review_id]['experience_date'] = arrow.get(c.find_element_by_css_selector('div>div>div>div.prw_reviews_stay_date_hsx').text, 'MMMM YYYY').format('YYYY-MM')
                                            except:
                                                reviews[review_id]['experience_date'] = None
                                            
                                            try:
                                                class_full = c.find_element_by_css_selector('div>div>div>span.ui_bubble_rating').get_attribute('class')
                                                
                                                reviews[review_id]['rating'] = int(re.search(r'(?<=_)\d{1}', class_full).group(0))
                                            except:
                                                reviews[review_id]['rating'] = None
                                                      
                                            reviews[review_id]['attr_name'] = attr_details['name']
                                            reviews[review_id]['attr_id'] = a_id
                                            reviews[review_id]['attr_loc'] = city_
                                            
                                            # remove this reviews id from the set of review ids to collct
                                            review_ids_to_collect_this_page -= {review_id}
                                            # and put this id into the set of already collected ids
                                            picked_reviews.add(review_id)
                                            # increment the total of collected reviews on this page
                                            p += 1

                                        else:
                                            continue
                                    except:
                                        continue

                        # now try to click Next
                        npage_url = None

                        if self.FILTREVS[a_id] <= 10:
                            last_page_url = self.driver.current_url
                        else:
                            try:
                                last_page_url = list(self.driver.find_elements_by_css_selector('div.mobile-more>div>div.unified.ui_pagination>div.pageNumbers>a[href]'))[-1].get_attribute('href')
                            except:
                                last_page_url = self.driver.current_url 

                        if self.driver.current_url != last_page_url:

                            # https://www.tripadvisor.com.au/Attraction_Review-g255097-d1063162-Reviews-or20-Mount_Wellington-Hobart_Greater_Hobart_Tasmania.html

                            try:
                                pref = int(re.search(r'(?<=-Reviews-or)\d+', self.driver.current_url).group(0))
                            except:
                                pref = None

                            if pref:
                                next_page_url = self.driver.current_url.replace('or' + str(pref), 'or' + str(pref+10))
                            else:
                                next_page_url = self.driver.current_url.replace('Reviews-', 'Reviews-' + 'or' + str(10) + '-')

                            if next_page_url != self.driver.current_url:
                                self.driver.get(next_page_url)

                        # add reviews for THIS attraction to the dictionary of ALL reviews
                        self.REVIEWS.update(reviews)
        
        self.driver.close()
        
        return self
        
    def save_reviews(self):
        """
        Write every collected review to a JSON file inside COLLECT_DIR.

        The file is named
        reviews-<MAIN_LOCATION>-<traveller_type>-<traveller_rating>-<time_of_year>.json
        where MAIN_LOCATION is upper-cased with spaces replaced by underscores.
        A filter that is unset (None or missing from FILTERS) contributes an empty
        name part, exactly as an empty string would, so an unset filter can no
        longer make the save fail after the scraping has already been done.

        Nothing is written when REVIEWS is empty.

        Returns
        -------
        self, so that calls can be chained
        """

        if not len(self.REVIEWS):
            print('no reviews so nothing to save...')
            return self

        name_parts = ['reviews', self.MAIN_LOCATION.replace(" ", "_").upper()]

        for filter_name in ('traveller_type', 'traveller_rating', 'time_of_year'):
            # str.join() only accepts strings; map None / missing to ''
            name_parts.append(str(self.FILTERS.get(filter_name) or ''))

        file = '-'.join(name_parts) + '.json'

        # context manager guarantees the data is flushed and the handle closed
        with open(os.path.join(self.COLLECT_DIR, file), 'w') as out:
            json.dump(self.REVIEWS, out)

        print(f'saved {len(self.REVIEWS):,} reviews')

        return self
    
    def save_attractions(self):
        """
        Write the collected attractions to
        COLLECT_DIR/attractions-<MAIN_LOCATION>.json, where MAIN_LOCATION is
        upper-cased with spaces replaced by underscores.

        Returns
        -------
        self, so that calls can be chained (and so that a chain ending in this
        method evaluates to the object rather than to None)
        """

        file = f'attractions-{self.MAIN_LOCATION.replace(" ","_").upper()}.json'

        # context manager guarantees the data is flushed and the handle closed
        with open(os.path.join(self.COLLECT_DIR, file), 'w') as out:
            json.dump(self.collected_attractions, out)

        return self

In [6]:
 class CountryDetector:
    """
    Label reviewers as 'UK' or 'USA' from the free-text location in their
    profile and compare the vocabulary of the two groups with scattertext.

    Reference lists (US states and cities, UK counties and cities, country
    names) are read from CSV files in DATA_DIR; collected review files are read
    from COLLECT_DIR.
    """

    def __init__(self):

        self.DATA_DIR = 'data'
        # directory holding the reviews-*.json files to analyse
        self.COLLECT_DIR = 'data-collected'

        # NOTE(review): locations are lower-cased before matching, so the entries
        # in these files are presumably lower case already - confirm
        self.us_states = pd.read_csv(os.path.join(self.DATA_DIR, 'usa_states.csv'))
        self.us_cities = set(pd.read_csv(os.path.join(self.DATA_DIR, 'usa_cities.csv'))['city'])
        self.uk_counties = set(pd.read_csv(os.path.join(self.DATA_DIR, 'uk_counties.csv'))['county'])
        self.uk_cities = set(pd.read_csv(os.path.join(self.DATA_DIR, 'uk_cities.csv'))['city'])

        self.countries = pd.read_csv(os.path.join(self.DATA_DIR, 'country_abbrs.csv'))

        self.reviews_as_df = pd.DataFrame()

    def ukus(self, loc_str: str) -> str:

        """
        return 'UK' or 'USA' depending on what's been found in the string loc_str

        The string is lower-cased, the characters , . : ; are turned into spaces
        and runs of whitespace are collapsed; every lookup is then a whole-word
        (space-delimited) match. Checks, in order:

          1. explicit country mention (united kingdom / uk, usa / united states / us / nyc)
          2. any other country name from the countries table -> None
          3. a US state (first two columns of us_states) -> 'USA'
          4. a UK county -> 'UK'
          5. a city that is found in exactly one of the two city lists

        Returns None if loc_str is not a string or nothing conclusive is found.
        """

        if not isinstance(loc_str, str):
            return None

        # punctuation must go BEFORE whitespace is collapsed, otherwise
        # "St. Albans" turns into "st  albans" and never matches "st albans"
        no_punct = re.sub(r'[,.:;]', ' ', loc_str.lower())
        padded = ' ' + re.sub(r'\s+', ' ', no_punct).strip() + ' '

        def found(term):
            # whole-word match: term must be surrounded by spaces
            return (' ' + term + ' ') in padded

        if found('united kingdom') or found('uk'):
            return 'UK'

        if found('usa') or found('united states') or found('us') or found('nyc'):
            return 'USA'

        # a mention of any third country overrides every state/county/city match
        foreign = set(self.countries['country']) - {'united kingdom', 'united states'}

        if any(found(c) for c in foreign):
            return None

        # plain tuples give positional access to the first two columns without
        # relying on positional indexing of a label-indexed Series
        for row in self.us_states.itertuples(index=False, name=None):
            if found(row[0]) or found(row[1]):
                return 'USA'

        if any(found(c) for c in self.uk_counties):
            return 'UK'

        us_city_hits = {c for c in self.us_cities if found(c)}
        uk_city_hits = {c for c in self.uk_cities if found(c)}

        # a city name that exists in both countries is not conclusive
        if us_city_hits and not (us_city_hits & uk_city_hits):
            return 'USA'

        if uk_city_hits and not (uk_city_hits & us_city_hits):
            return 'UK'

        return None

    def proc_review_files(self):

        """
        Load every reviews JSON file from COLLECT_DIR into one data frame
        (self.reviews_as_df, indexed by review id) and add two columns:

          type - 'friends' or 'solo' if that word occurs in the file name, else None
          ukus - result of self.ukus applied to the user_loc column

        Only files whose name contains both '.json' and 'reviews' and has at
        least three '-'-separated parts are considered. Files that cannot be
        read or parsed are reported and skipped.

        Raises
        ------
        ValueError if no usable review file was found

        Returns
        -------
        self, so that calls can be chained
        """

        frames = []

        # sorted so that the row order does not depend on the file system
        for fname in sorted(os.listdir(self.COLLECT_DIR)):

            if ('.json' not in fname) or ('reviews' not in fname):
                continue

            if len(fname.split('-')) < 3:
                print(f'skipping {fname}: unexpected file name')
                continue

            try:
                with open(os.path.join(self.COLLECT_DIR, fname)) as fh:
                    revs = json.load(fh)

                frame = pd.DataFrame.from_dict(revs, orient='index')
            except Exception as e:
                # best effort: one broken file must not stop the others from loading
                print(f'skipping {fname}: {e}')
                continue

            lowered = fname.lower()
            frame['type'] = 'friends' if 'friends' in lowered else 'solo' if 'solo' in lowered else None

            frames.append(frame)

        if not frames:
            raise ValueError(f'no review files found in {self.COLLECT_DIR}')

        # collected all reviews in a single data frame; now let's find the UK/US ones

        self.reviews_as_df = pd.concat(frames)

        if 'user_loc' in self.reviews_as_df.columns:
            self.reviews_as_df['ukus'] = self.reviews_as_df['user_loc'].apply(self.ukus)
        else:
            # no location information at all, so nothing can be labelled
            self.reviews_as_df['ukus'] = None

        print(f'{len(self.reviews_as_df.index):,} reviews')

        return self

    def create_corpus(self, traveller_type, only_nouns=True):

        """
        Build a scattertext corpus from the UK and USA reviews of one traveller
        type and save the interactive explorer as analysis/<traveller_type>.html

        Parameters
        ----------
        traveller_type: value of the 'type' column to keep (e.g. 'friends')
        only_nouns: if True, every review text is replaced by the lemmas of its
                    nouns (as tagged by the spaCy pipeline nlp) before the corpus is built

        Returns
        -------
        self, so that calls can be chained
        """

        wanted = self.reviews_as_df['ukus'].isin(['UK', 'USA']) & (self.reviews_as_df['type'] == traveller_type)

        # explicit copy: the text column is overwritten below and that must not
        # be an assignment on a view of self.reviews_as_df
        df = self.reviews_as_df[wanted].copy()

        if only_nouns:
            df['text'] = df['text'].apply(lambda x: ' '.join([w.lemma_ for w in nlp(x.lower()) if w.pos_ == 'NOUN']))

        corpus = st.CorpusFromPandas(df, category_col='ukus', text_col='text', nlp=nlp).build()

        html = st.produce_scattertext_explorer(corpus,
                                       category='UK',  # actual column name
                                       category_name='UK' + ' ' + traveller_type.title(),  # extended name for the category
                                       not_category_name='USA' + ' ' + traveller_type.title(),  # extended name for the OTHER category
                                       width_in_pixels=1000)

        os.makedirs('analysis', exist_ok=True)

        with open(f'analysis/{traveller_type}.html', 'wb') as out:
            out.write(html.encode('utf-8'))

        return self

In [None]:
if __name__ == '__main__':

    # full scraping run: find the attraction pages, collect the filtered
    # reviews from them, then persist the reviews and the attractions
    t = (
        Trip(filter={
            'traveller_rating': 'Excellent',
            'traveller_type': 'Friends',
            'time_of_year': 'Jun-Aug',  # note that this refers to review date (and NOT experience date)
            'language': 'English',
        })
        .get_attraction_pages('tasmania')
        .get_reviews_from_attraction_pages(min_total_reviews=170)
        .save_reviews()
        .save_attractions()
    )

filter: traveller_rating:Excellent | traveller_type:Friends | time_of_year:Jun-Aug | language:English
collecting attractions for HOBART, TASMANIA...
visiting attraction pages..
page 1/5..ok
page 2/5..ok
page 3/5..ok
page 4/5..ok
page 5/5..ok
collecting attractions for LAUNCESTON, TASMANIA...
visiting attraction pages..
page 1/3..ok
page 2/3..ok
page 3/3..ok
collecting attractions for SWANSEA, TASMANIA...
visiting attraction pages..
page 1/1..ok
collecting attractions for PORT ARTHUR, TASMANIA...
visiting attraction pages..
page 1/2..ok
page 2/2..ok
collecting attractions for GEORGE TOWN, TASMANIA...
visiting attraction pages..
page 1/1..ok
collecting attractions for CRADLE MOUNTAIN LAKE ST CLAIR NP, TASMANIA...
visiting attraction pages..
page 1/1..ok
collecting attractions for COLES BAY, TASMANIA...
visiting attraction pages..
page 1/1..ok
collecting attractions for BICHENO, TASMANIA...
visiting attraction pages..
page 1/1..ok
collecting attractions for RICHMOND, TASMANIA...
visiting 

In [None]:
# load the collected reviews, label reviewers as UK/USA and build the
# scattertext comparison for the 'friends' traveller type
country_det = (
    CountryDetector()
    .proc_review_files()
    .create_corpus(traveller_type='friends')
)