In [1]:
# Import BeautifulSoup and splinter
from bs4 import BeautifulSoup as soup
from splinter import Browser

# Import other dependencies
import time
import pandas as pd
import numpy as np
from pathlib import Path
import requests
import json

# Import geoapify api key
from config import geoapify_key

### Scraping the Top 25 pages (one page each for the 4 categories that share a common page layout - Hotels, Beaches, Restaurants & Things to do)

In [27]:
# Geoapify forward-geocoding endpoint
base_url = "https://api.geoapify.com/v1/geocode/search"

# Pages to scrape; url_list[i] belongs to the category in category_list[i]
url_list = [
    'https://www.tripadvisor.com/TravelersChoice-Beaches',
    "https://www.tripadvisor.com/TravelersChoice-ThingsToDo",
    "https://www.tripadvisor.com/TravelersChoice-Restaurants",
    "https://www.tripadvisor.com/TravelersChoice-Hotels",
]
category_list = [
    'beaches',
    "things",
    "restaurants",
    "hotels",
]

In [28]:
# Loop through the url list and scrape each page into static/data/<category>.csv
# (done only if the csv of data is not already present)

for index, item in enumerate(url_list):

    search_file = "static/data/" + category_list[index] + ".csv"
    csv_file = Path(search_file)

    if csv_file.exists():
        print(f"File {csv_file} already extracted. Skipping scraping")
        # No request was sent to the website, so there is nothing to pause for
        continue

    # Set up Splinter
    browser = Browser('chrome')
    try:
        # Visit the website - sleep provided to avoid continuous calls
        browser.visit(item)
        time.sleep(30)
        # Optional delay for loading the page
        browser.is_element_present_by_css('div.list_text', wait_time=1)

        # Scrape the website html
        html = browser.html
    finally:
        # Always close the browser, even when the visit fails; everything
        # below works on the captured html only
        browser.quit()

    # Create a BeautifulSoup object from the scraped HTML
    data = soup(html, 'html.parser')

    # Get Rank and name from the link text (rank before the first '.', name after it)
    namelist = []
    ranklist = []
    for entry in data.find_all(class_="mainName extra"):
        text_data = entry.find("a").text
        # Split on the first '.' only, so a name that itself contains a
        # period is kept whole instead of being cut at its first period
        parts = text_data.split('.', 1)
        ranklist.append(parts[0])
        namelist.append(parts[1].strip())

    # Get City and Country (single string) and geocode each location
    citylist = []
    lat = []
    lon = []
    for entry in data.find_all(class_="smaller"):
        city_loc = entry.find("a").text
        citylist.append(city_loc)

        # call geoapify to get lat/lon for location
        params = {
            "text": city_loc,
            "apiKey": geoapify_key }
        # Run request
        response = requests.get(base_url, params=params).json()

        # Use the first match; store None when the lookup has no match so
        # the lat/lon columns stay aligned with the location column
        features = response.get("features") or []
        properties = features[0]["properties"] if features else {}
        lat.append(properties.get("lat"))
        lon.append(properties.get("lon"))

    # Get image urls
    iurls = [thumb.find("img")["src"]
             for thumb in data.find_all(class_="sizedThumb_container")]

    # Get description / a customer review
    # removing classes with "quot quot2" used for second quote as we need just 1 quote per target
    quot_tags = [tag for tag in data.find_all(class_="quot")
                 if 'quot2' not in ''.join(tag['class'])]
    desclist = [tag.find("i").next_sibling.strip() for tag in quot_tags]

    # Get url to go retrieve rating and reviews - this will be used later to get details
    url_ary = [lnk.find("a")['href'] for lnk in data.find_all(class_="firstone")]

    # Create array with category and count of rank array
    cat_list = np.repeat([category_list[index]], len(ranklist))

    # Load arrays as columns of dataframe (all columns must have the same length)
    df = pd.DataFrame({"category": cat_list, "rank": ranklist, "name": namelist, "location": citylist,
                       "imageurl": iurls, "description": desclist, "latitude": lat, "longitude": lon,
                       "ratingurl": url_ary})

    # save dataframe as csv (create the output folder on first use)
    csv_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(search_file, encoding="utf-8", index=False, header=True)

    # pause for 10 seconds before calling next url (providing break to avoid continuous pings to website)
    time.sleep(10)

File static\data\beaches.csv already extracted. Skipping scraping
File static\data\things.csv already extracted. Skipping scraping
File static\data\restaurants.csv already extracted. Skipping scraping


### Scraping Destinations - this page is missing some of the required elements, so it is handled separately

In [13]:
# Scrape the Destinations page into static/data/destinations.csv
# (done only if the csv of data is not already present)
url_dest = "https://www.tripadvisor.com/TravelersChoice-Destinations"
cat_dest = "destinations"
# Register the category only once so re-running this cell does not add duplicates
if cat_dest not in category_list:
    category_list.append(cat_dest)

search_file = "static/data/" + cat_dest + ".csv"
csv_file = Path(search_file)

if csv_file.exists():
    print(f"File {csv_file} already extracted. Skipping scraping")
else:

    # Set up Splinter
    browser = Browser('chrome')
    try:
        # Visit the website
        browser.visit(url_dest)

        # Optional delay for loading the page
        browser.is_element_present_by_css('div.list_text', wait_time=1)

        # Scrape the website html
        html = browser.html
    finally:
        # Always close the browser, even when the visit fails
        browser.quit()

    # Create a BeautifulSoup object from the scraped HTML
    data = soup(html, 'html.parser')

    # Get Rank and name  (City same as name)
    citylist = []
    lat = []
    lon = []
    namelist = []
    ranklist = []
    for entry in data.find_all(class_="mainName"):
        text_data = entry.find("a").text
        # Split on the first '.' only, so a name that itself contains a
        # period is kept whole instead of being cut at its first period
        parts = text_data.split('.', 1)
        city_loc = parts[1].strip()
        ranklist.append(parts[0])
        namelist.append(city_loc)
        citylist.append(city_loc)

        # call geoapify to get lat/lon for location
        params = {
            "text": city_loc,
            "apiKey": geoapify_key }
        # Run request
        response = requests.get(base_url, params=params).json()

        # Use the first match; store None when the lookup has no match so
        # the lat/lon columns stay aligned with the location column
        features = response.get("features") or []
        properties = features[0]["properties"] if features else {}
        lat.append(properties.get("lat"))
        lon.append(properties.get("lon"))

    # Get image urls
    iurls = [thumb.find("img")["src"]
             for thumb in data.find_all(class_="sizedThumb_container")]

    # Get url to go retrieve rating and reviews - destinations is new so capture for df to csv load (data not used)
    url_ary = []
    desclist = []
    for lnk in data.find_all(class_="firstone"):
        link = lnk.find("a")['href']
        url_ary.append(link)

        # Use rating url to get to next page and get description - rating not present due to this being a relatively new category
        desc_url = "https://www.tripadvisor.com" + link

        # setup Splinter (a fresh browser per detail page, closed even on failure)
        browser = Browser('chrome')
        try:
            browser.visit(desc_url)
            # Scrape the website html
            html = browser.html
        finally:
            browser.quit()

        # Create a BeautifulSoup object from the scraped HTML
        raters = soup(html, 'html.parser')
        # sleep provided to avoid continuous calls
        time.sleep(5)

        # Get description from here; store None when the element is missing
        # so the description column stays aligned with the other columns
        desc_text = raters.find(class_="GYFPJ wESPJ _J B- G- Wh _S")
        desclist.append(desc_text.text if desc_text is not None else None)

    # Create array with category and count of rank array
    cat_list = np.repeat([cat_dest], len(ranklist))

    # Load arrays as columns of dataframe (all columns must have the same length)
    df = pd.DataFrame({"category": cat_list, "rank": ranklist, "name": namelist, "location": citylist,
                       "imageurl": iurls, "description": desclist, "latitude": lat, "longitude": lon,
                       "ratingurl": url_ary})

    # save dataframe as csv (create the output folder on first use)
    csv_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(search_file, encoding="utf-8", index=False, header=True)

### Scraping Reviews for Beaches Category

In [None]:
# Per-beach result columns; every list receives exactly one value per beach
rates = []
rvcount = []
exclnt = []
vgood = []
averg = []
poor = []
trrbl = []
rank_key = []

# Get rating and misc info for all beaches
prefix = "https://www.tripadvisor.com"

beaches_df = pd.read_csv("static/data/beaches.csv")
# loop thru beaches and scrape rating information
for index, row in beaches_df.iterrows():
    rank_key.append(row['rank'])
    url = prefix + row['ratingurl']

    # setup Splinter (closed even if the visit fails)
    browser = Browser('chrome')
    try:
        browser.visit(url)
        # Scrape the website html
        html = browser.html
    finally:
        browser.quit()

    # Create a BeautifulSoup object from the scraped HTML
    raters = soup(html, 'html.parser')
    # sleep provided to avoid continuous calls
    time.sleep(15)

    # extract data and populate arrays
    # get overall rating (has decimals)
    rating_text = raters.find(class_="biGQs _P fiohW hzzSG uuBRH").text
    rates.append(float(rating_text))

    # get review count: remove all ',' separators and cast as int
    rcount_text = raters.find(class_="yyzcQ").text
    rvcount.append(int(rcount_text.replace(',', '')))

    # get the different types of ratings and their values, keyed by label;
    # the first occurrence of a label wins
    breakdown = {}
    for block in raters.find_all(class_="IMmqe"):
        rv = block.find(class_="biGQs _P pZUbB osNWb").text.replace(',', '')
        rt = block.find(class_="biGQs _P pZUbB hmDzD").text
        breakdown.setdefault(rt, rv)

    # Append exactly one value per list (None when a label is absent) so all
    # columns keep the same length and stay aligned with rank_key
    exclnt.append(breakdown.get("Excellent"))
    vgood.append(breakdown.get('Very good'))
    averg.append(breakdown.get('Average'))
    poor.append(breakdown.get('Poor'))
    trrbl.append(breakdown.get('Terrible'))


In [None]:
# Assemble the scraped beach review columns into one frame and write it to csv
catlist = np.repeat(["beaches"], len(rank_key))
df1 = pd.DataFrame({
    "category": catlist,
    "rank": rank_key,
    "rate": rates,
    "total_reviews": rvcount,
    "excellent": exclnt,
    "very_good": vgood,
    "average": averg,
    "poor": poor,
    "terrible": trrbl,
})
# Output location for the beach reviews
filename = "static/data/beachreviews.csv"
df1.to_csv(filename, header=True, index=False, encoding="utf-8")

### Scraping Reviews for Things to do Category

In [None]:
# Per-item result columns; every list receives exactly one value per thing-to-do
rates = []
rvcount = []
exclnt = []
vgood = []
averg = []
poor = []
trrbl = []
rank_key = []

# Get rating and misc info for all things-to-do
prefix = "https://www.tripadvisor.com"

# Own name for this frame so the beaches frame is not overwritten
things_df = pd.read_csv("static/data/things.csv")
# loop thru things-to-do and scrape rating information
for index, row in things_df.iterrows():
    rank_key.append(row['rank'])
    url = prefix + row['ratingurl']

    # setup Splinter (closed even if the visit fails)
    browser = Browser('chrome')
    try:
        browser.visit(url)
        # Scrape the website html
        html = browser.html
    finally:
        browser.quit()

    # Create a BeautifulSoup object from the scraped HTML
    raters = soup(html, 'html.parser')
    # sleep provided to avoid continuous calls
    time.sleep(15)

    # extract data and populate arrays
    # get overall rating (has decimals)
    rating_text = raters.find(class_="biGQs _P fiohW hzzSG uuBRH").text
    rates.append(float(rating_text))

    # get review count from the last matching span
    rary_text = raters.find_all('span', class_="biGQs _P pZUbB KxBGd")
    rcount_text = rary_text[-1].text

    # remove all ',' separators, keep the leading number (drops the
    # " reviews" / " review" suffix) and cast as int
    rvcount.append(int(rcount_text.replace(',', '').split()[0]))

    # get the different types of ratings and their values, keyed by label;
    # the first occurrence of a label wins
    breakdown = {}
    for block in raters.find_all(class_="IMmqe"):
        rv = block.find(class_="biGQs _P pZUbB osNWb").text.replace(',', '')
        rt = block.find(class_="biGQs _P pZUbB hmDzD").text
        breakdown.setdefault(rt, rv)

    # Append exactly one value per list (None when a label is absent) so all
    # columns keep the same length and stay aligned with rank_key
    exclnt.append(breakdown.get("Excellent"))
    vgood.append(breakdown.get('Very good'))
    averg.append(breakdown.get('Average'))
    poor.append(breakdown.get('Poor'))
    trrbl.append(breakdown.get('Terrible'))

In [None]:
# Assemble the scraped things-to-do review columns into one frame and write it to csv
catlist = np.repeat(["things"], len(rank_key))
df2 = pd.DataFrame({
    "category": catlist,
    "rank": rank_key,
    "rate": rates,
    "total_reviews": rvcount,
    "excellent": exclnt,
    "very_good": vgood,
    "average": averg,
    "poor": poor,
    "terrible": trrbl,
})
# Output location for the things-to-do reviews
filename = "static/data/thingsreviews.csv"
df2.to_csv(filename, header=True, index=False, encoding="utf-8")

### Scraping reviews for Restaurants category

In [None]:
# Per-restaurant result columns; every list receives exactly one value per restaurant
rates = []
rvcount = []
exclnt = []
vgood = []
averg = []
poor = []
trrbl = []
rank_key = []

# Get rating and misc info for all restaurants
prefix = "https://www.tripadvisor.com"

inns_df = pd.read_csv("static/data/restaurants.csv")
# loop thru restaurants and scrape rating information
for index, row in inns_df.iterrows():
    rank_key.append(row['rank'])
    url = prefix + row['ratingurl']

    # setup Splinter (closed even if the visit fails)
    browser = Browser('chrome')
    try:
        browser.visit(url)
        # Scrape the website html
        html = browser.html
    finally:
        browser.quit()

    # Create a BeautifulSoup object from the scraped HTML
    raters = soup(html, 'html.parser')
    # sleep provided to avoid continuous calls
    time.sleep(15)

    # extract data and populate arrays
    # get overall rating (has decimals)
    rating_text = raters.find('span', class_="ZDEqb").text
    rates.append(float(rating_text))

    # get review count: remove all ',' separators, keep the leading number
    # (drops the " reviews" / " review" suffix) and cast as int
    rary_text = raters.find('a', class_="IcelI").text
    rvcount.append(int(rary_text.replace(',', '').split()[0]))

    # get the different types of ratings and their values; the page lists them
    # by position: Excellent, Very good, Average, Poor, Terrible.
    # ',' separators are stripped so the counts are plain digit strings.
    counts = [tag.text.replace(',', '')
              for tag in raters.find_all(class_="row_num is-shown-at-tablet")]

    # Append exactly one value per list (None when the page has fewer than
    # five rows) so all columns keep the same length and stay aligned
    for position, bucket in enumerate((exclnt, vgood, averg, poor, trrbl)):
        bucket.append(counts[position] if position < len(counts) else None)

In [None]:
# Assemble the scraped restaurant review columns into one frame and write it to csv
catlist = np.repeat(["restaurants"], len(rank_key))
df3 = pd.DataFrame({
    "category": catlist,
    "rank": rank_key,
    "rate": rates,
    "total_reviews": rvcount,
    "excellent": exclnt,
    "very_good": vgood,
    "average": averg,
    "poor": poor,
    "terrible": trrbl,
})
# Output location for the restaurant reviews
filename = "static/data/restaurantreviews.csv"
df3.to_csv(filename, header=True, index=False, encoding="utf-8")

### Scraping Reviews for Hotels category

In [29]:
# Per-hotel result columns; every list receives exactly one value per hotel
rates = []
rvcount = []
exclnt = []
vgood = []
averg = []
poor = []
trrbl = []
rank_key = []

# Get rating and misc info for all hotels
prefix = "https://www.tripadvisor.com"

inns_df = pd.read_csv("static/data/hotels.csv")
# loop thru hotels and scrape rating information
for index, row in inns_df.iterrows():
    rank_key.append(row['rank'])
    url_h = prefix + row['ratingurl']

    # setup Splinter (closed even if the visit fails)
    browser = Browser('chrome')
    try:
        browser.visit(url_h)
        # Scrape the website html
        html = browser.html
    finally:
        browser.quit()

    # Create a BeautifulSoup object from the scraped HTML
    raters = soup(html, 'html.parser')
    # sleep provided to avoid continuous calls
    time.sleep(5)

    # extract data and populate arrays
    # get overall rating (has decimals)
    rating_text = raters.find('span', class_="uwJeR P").text
    rates.append(float(rating_text))

    # get review count: remove all ',' separators, keep the leading number
    # (drops the " reviews" / " review" suffix) and cast as int
    rvwcount_text = raters.find('span', class_="hkxYU q Wi z Wc").text
    rvcount.append(int(rvwcount_text.replace(',', '').split()[0]))

    # get the different types of ratings and their values; the page lists them
    # by position: Excellent, Very good, Average, Poor, Terrible.
    # ',' separators are stripped so the counts are plain digit strings.
    counts = [tag.text.replace(',', '') for tag in raters.find_all(class_="NLuQa")]

    # Append exactly one value per list (None when the page has fewer than
    # five rows) so all columns keep the same length and stay aligned
    for position, bucket in enumerate((exclnt, vgood, averg, poor, trrbl)):
        bucket.append(counts[position] if position < len(counts) else None)

In [30]:
# Assemble the scraped hotel review columns into one frame and write it to csv
catlist = np.repeat(["hotels"], len(rank_key))
df3 = pd.DataFrame({
    "category": catlist,
    "rank": rank_key,
    "rate": rates,
    "total_reviews": rvcount,
    "excellent": exclnt,
    "very_good": vgood,
    "average": averg,
    "poor": poor,
    "terrible": trrbl,
})
# Output location for the hotel reviews
filename = "static/data/hotelreviews.csv"
df3.to_csv(filename, header=True, index=False, encoding="utf-8")