In [None]:
import requests
import pandas as pd
import re
import nltk
import geopandas as gpd
import datetime
import time 

from math import radians, sqrt, sin, cos, asin
from ast import literal_eval
from requests_html import HTMLSession
from bs4 import BeautifulSoup, SoupStrainer
from shapely.geometry import Point

In [None]:
def _extract_or_none(extract):
    """Run one scraping expression and return None when the expected markup is absent.

    A failed lookup (``find`` returning None, a missing attribute, text in an
    unexpected format) is treated as "value not available" instead of aborting
    the whole scrape.
    """
    try:
        return extract()
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        return None


def _parse_review_count(text):
    """Turn a scraped review count such as '1.234' or '1,234 reviews' into an int.

    Both '.' and ',' are accepted as thousands separators, because both the
    Dutch and the English Tripadvisor site can be scraped. Returns None when
    ``text`` is None or contains no digits.
    """
    if text is None:
        return None
    number = re.search(r'\d[\d.,]*', text)
    if number is None:
        return None
    return int(re.sub(r'[.,]', '', number.group()))


def scrape_page(url):
    """Scrape the name, rating and number of reviews from a Tripadvisor page.

    Four page layouts are handled, selected from the URL: '/ShowUser',
    '/Restaurant', '/Hotel' and, as the fallback, attraction pages. Each layout
    uses different markup, hence the separate branches.

    Parameters
    ----------
    url : str
        Address of the Tripadvisor page to scrape.

    Returns
    -------
    tuple
        ``(name, rating, reviews)`` with ``name`` a str, ``rating`` a float and
        ``reviews`` an int. Every element that could not be found on the page
        is None.

    Notes
    -----
    Errors raised by the HTTP request itself are not caught. The scraped values
    are also printed, as a progress trace.
    """
    # The context manager closes the session once the page has been downloaded.
    with HTMLSession() as session:
        response = session.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    rating = None
    if '/ShowUser' in url:
        name = _extract_or_none(lambda: soup.find('div', {'id': "heading"}).text)
        # The rating is read from the alt text of the element following the bubble container.
        rating = _extract_or_none(lambda: float(
            str(soup.find('div', {'class': "prw_rup prw_common_bubble_rating rating"}).next)
            .split('<span alt="')[1].split()[0].replace(',', '.')
        ))
        review_text = _extract_or_none(lambda: soup.find('span', {'class': 'reviewCount'}).text)
    elif '/Restaurant' in url:
        name = _extract_or_none(lambda: soup.find('h1', {'data-test-target': "top-info-header"}).text)
        rating = _extract_or_none(lambda: float(
            soup.find('svg', {'class': "RWYkj d H0"})['title'].split()[0].replace(',', '.')
        ))
        review_text = _extract_or_none(lambda: soup.find('span', {'class': "eBTWs"}).text)
    elif '/Hotel' in url:
        name = _extract_or_none(lambda: soup.find('h1', {'id': "HEADING"}).text)
        review_text = _extract_or_none(lambda: soup.find('span', {'class': "HFUqL"}).text)

        def hotel_rating():
            # The two characters after the last '_' of the following element
            # are read as the rating digits, e.g. '45' -> 4.5.
            digits = str(soup.find('a', {'class': "fbhUA q eRJGA _T Gi"}).next).split('_')[-1][:2]
            return float(digits[0] + '.' + digits[1])

        rating = _extract_or_none(hotel_rating)
    else:
        name = _extract_or_none(lambda: soup.find('div', {'class': "Xewee"}).text)
        review_text = _extract_or_none(lambda: soup.find('span', {'class': "cfIVb"}).text)
        # The last matching element that carries a parseable aria-label wins.
        for star in soup.find_all('div', {'class': "RTVWf o W f u w eeCyE"}):
            if 'aria-label' in star.attrs:
                parsed = _extract_or_none(
                    lambda: float(star['aria-label'].split()[0].replace(',', '.'))
                )
                if parsed is not None:
                    rating = parsed

    reviews = _parse_review_count(review_text)
    print(name, rating, reviews, '\n')
    return name, rating, reviews

# Places that are skipped on purpose: addresses, places with no Tripadvisor page
# or no suitable alternative. Terms are matched exactly, and several entries
# carry a trailing space, so the strings must not be normalised.
_STOP_PLACES = frozenset([
    'muntplein ', 'johan cruijff boulevard', 'bijlmerbajes', 'spoorwegbassin', 'voc-kamer amsterdam', 'grote en kleine verfdoos', 'hallen van stork',
    'remise tollensstraat', 'vijzelstraat 31 ', 'nieuwe passeerdersstraat 1', 'kohnstammhuis', 'koningsplein ', 'goslerhuisje',
    'grote synagoge ', 'vondelparkpaviljoen', 'zitbanken vrolikstraat 8', 'apollohal', 'sint-ritakerk ', 'schollenbrug ',
    'amsterdamse school ', 'amstelrust', 'toegangspoort frankendael', 'emmakerk ', 'johan van hulstbrug', 'spinhuis ', 'peperbrug',
    'villa heineken', 'wilhelmina gasthuis', 'willem witsen', 'zeemanlaboratorium', 'meerpadkerkje', 'hogesluis', 'neptunus ',
    'dr. sarphatihuis', 'pakhuizen serdang, langkat en deli', 'overtoom 241, amsterdam', 'sint-rosaklooster',
    'eduard douwes dekker', 'teekenschool ', 'het nieuwe huis', 'de joodse invalide', 'brug 352', 'schellingwouderkerk',
    'berlage lyceum', 'oranje-nassau kazerne', 'algemene nederlandse diamantbewerkersbond', 'droogbak ', 'lloyd-complex', 'gemeentetram amsterdam', 'haarlemmerpoort ',
    'brug 283', 'amsterdamse effectenbeurs', 'de bazel ', 'europaschool', 'brug 604', 'tuindorp nieuwendam', 'kindertjesbrug', 'oostertoegang',
    'de 1100 roe', 'vondelpark 1', 'amsterdams lyceum', 'lucky luyk', 'koninklijke asscher diamant maatschappij', 'remise lekstraat', 'olympic experience amsterdam',
    'tuindorp oostzaan', 'bijzondere collecties van de universiteit van amsterdam', 'veem house for performance', 'rozentheater',
    'filmhuis cavia', 'bilderdijkpark', 'lange bretten', 'eendrachtspark', 'diemerpark', 'gerbrandypark ', 'natuurpark vrije geer',
    'nieuwe meer ', 'buikslotermeerplein', 'lindengracht', 'mosplein', 'haarlemmerplein', 'stadionplein', 'sierplein',
])

# Tripadvisor pages that were added manually for specific places.
_MANUAL_URLS = {
    'stopera': 'https://www.tripadvisor.nl/Attraction_Review-g188590-d245251-Reviews-Dutch_National_Opera_Ballet-Amsterdam_North_Holland_Province.html',
    'waag': 'https://www.tripadvisor.nl/Attraction_Review-g188590-d191192-Reviews-In_de_Waag-Amsterdam_North_Holland_Province.html',
    'conservatorium van amsterdam': 'https://www.tripadvisor.nl/Restaurant_Review-g188590-d3155520-Reviews-Conservatorium_Brasserie_Lounge_Amsterdam-Amsterdam_North_Holland_Province.html',
    'ostadetheater': 'https://www.tripadvisor.nl/Attraction_Review-g188590-d6454317-Reviews-Ostadetheater-Amsterdam_North_Holland_Province.html',
}

_GOOGLE_SEARCH_URL = 'http://www.google.nl/search'

# Fragments of a Google result title that mark a scrapeable attraction page.
# NOTE(review): '- 2022' hard-codes a year; confirm whether it should be updated.
_ATTRACTION_TEXT_MARKERS = ('(amsterdam)', 'beoordelingen, amsterdam', '- 2022')

# A restaurant result is only accepted when its title is closer than this many
# edits to the search term; otherwise it is taken to be a cafe or restaurant
# near the place (e.g. 'De Blije XYZ op het XXXXplein' for 'XXXXplein').
_MAX_RESTAURANT_EDIT_DISTANCE = 10


def _is_tripadvisor_href(href):
    """Return True when ``href`` points at the Dutch or English Tripadvisor site.

    NOTE(review): the '/ShowTopic' and '/Location' exclusion only applies to
    tripadvisor.com links, exactly as in the original condition. Confirm whether
    tripadvisor.nl links were meant to be excluded as well.
    """
    if 'https://www.tripadvisor.nl/' in href:
        return True
    return ('https://www.tripadvisor.com/' in href
            and not ('/ShowTopic' in href or '/Location' in href))


def _accept_candidate(url, href, link_text, term):
    """Decide whether a Tripadvisor search result is accepted for ``term``.

    ``link_text`` is the lower-cased text of the Google result link.
    """
    if '/ShowUser' in url or '/Hotel' in url:
        # Links that contain either hotel or showuser are usually good to go.
        return True
    if '/Restaurant' in url:
        listed_name = link_text.split(', amsterdam')[0]
        return nltk.edit_distance(listed_name, term) < _MAX_RESTAURANT_EDIT_DISTANCE
    if '/AttractionToursAndTickets' in url:
        return False
    if any(marker in link_text for marker in _ATTRACTION_TEXT_MARKERS):
        return True
    return '/Attraction' in href


def scrape_trip(term):
    """Find the Tripadvisor page for a place through the first Google result page.

    Tripadvisor is mainly meant for attractions, restaurants and hotels, so not
    every place has a page. For some squares the closest restaurant is caught
    instead of the square itself; the output of this function therefore still
    requires manual screening.

    Parameters
    ----------
    term : str
        Lower-cased place name. It is compared verbatim against the stop list
        and the manually added places.

    Returns
    -------
    str or None
        The Tripadvisor URL, or None when the place is on the stop list, no
        acceptable result was found, or the accepted result is a '/Locat...'
        page.
    """
    if term in _STOP_PLACES:
        return None
    # Manually added places are answered before searching, so they no longer
    # depend on Google returning a usable result for them.
    if term in _MANUAL_URLS:
        return _MANUAL_URLS[term]

    # Passing the query through ``params`` lets requests URL-encode the term.
    response = requests.get(_GOOGLE_SEARCH_URL, params={'q': 'tripadvisor ' + term})
    soup = BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a'))

    # Only top-level <a> tags that actually have an href are inspected.
    for link in soup.find_all('a', href=True, recursive=False):
        href = link['href']
        if not _is_tripadvisor_href(href):
            continue
        link_text = link.text.lower()
        # Found a Tripadvisor page, but is the place situated in Amsterdam?
        if 'amsterdam' not in link_text:
            continue
        # Skip Tripadvisor pages that propose other places nearby.
        if 'de 10 beste' in link_text or 'in de buurt' in link_text:
            continue
        # Drop whatever precedes the actual 'https://' address in the href.
        _, scheme, remainder = href.partition('https://')
        url = scheme + remainder
        if _accept_candidate(url, href, link_text, term):
            return None if '/Locat' in url else url
    return None

In [None]:
#SCRAPE THE BUILDINGS PROVIDED BY THE CULTURAL FEATURES FILE, THIS CONTAINS THE LIST OF POINTS OF INTEREST

CF = pd.read_csv("Culturalfeaturesfinal.csv")

#Predefine the columns for the Tripadvisor features
for column in ('TA_name', 'TA_rating', 'TA_reviews', 'TA_url'):
    CF[column] = None

for index, row in CF.iterrows():
    place_name = row['name']
    #Rows without a usable name (e.g. an empty cell read as NaN) cannot be searched for.
    if not isinstance(place_name, str):
        continue
    url = scrape_trip(place_name.lower())
    if url is None:
        continue
    ta_name, ta_rating, ta_reviews = scrape_page(url)

    #Write through .at: chained indexing (CF['col'][index] = value) may assign to a
    #temporary copy and leave CF unchanged.
    CF.at[index, 'TA_name'] = ta_name
    CF.at[index, 'TA_rating'] = ta_rating
    CF.at[index, 'TA_reviews'] = ta_reviews
    CF.at[index, 'TA_url'] = url
