In [1]:
# from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as soup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
import pickle
import os
import urllib.parse
import random
import requests
from pprint import pprint
from dotenv import load_dotenv
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from Smartproxy_residential.extension import proxies
import json


# Week 4 Progress

In [71]:
### Constants ###

TRIP_HOTELLST_URL = "https://www.tripadvisor.com/Hotels-g"
TRIP_LOC_SEARCH = "https://api.content.tripadvisor.com/api/v1/location/search?"
BASE_URL = "https://www.tripadvisor.com"


In [72]:
os.chdir('..')

load_dotenv()

TRIPADVISOR_API = os.getenv('TRIPADVISOR_API')
SP_WEBCRAWL_USER = os.getenv('SP_WEBCRAWL_USER')
SP_WEBCRAWL_PAS = os.getenv('SP_WEBCRAWL_PAS')
SP_RESI_USER = os.getenv('SP_RESI_USER')
SP_RESI_PAS = os.getenv('SP_RESI_PAS')
PROXY_HOST = 'us.smartproxy.com'
PROXY_PORT = '10000'

In [65]:
def prep_cityname_api(cityname):
    '''
    Prepares string representing the city's name for API call by removing 
    whitespace, replacing spaces with %20, and converting to lowercase.
    '''
    cityname = cityname.strip()
    cityname = cityname.replace(" ", "%20")
    cityname = cityname.lower()
    
    return cityname


def tripadvisor_api_call(cityname):
    ''' 
    Calls the Tripadvisor API to search for the city and returns the JSON 
    response.
    '''
    cityname = prep_cityname_api(cityname)
    query_params = f"&searchQuery={cityname}" + "&category=geos&language=en"
    
    url = TRIP_LOC_SEARCH + f"key={TRIPADVISOR_API}" + query_params
    
    print("The url is: ", url)
    
    headers = {"accept": "application/json"}
    tripadvisor_response = requests.get(url, headers=headers)
    print(tripadvisor_response.text)
    
    return tripadvisor_response.json()


def tripadvisor_city_info(tripadvisor_response_json, cityname):
    ''' 
    Extracts the location_id, name, secondary_name, and state from the
    Tripadvisor API response JSON for the provided city.
    '''
    data = tripadvisor_response_json['data']
    
    if len(data) == 0:
        raise ValueError(f"Tripadvisor API returned no results for {cityname}")
    
    for index, city in enumerate(data):
        if city["name"].lower().strip() == cityname.lower().strip():
            city_info = city
            break
        
        if index == len(data) - 1:
            raise ValueError(f"{cityname} was not found via the Tripadvisor API")
    
    return city_info


def city_info_values(city_info):
    ''' 
    Extracts the location_id, name, secondary_name, and state from the
    Tripadvisor API response JSON for the provided city.
    '''
    location_id = city_info["location_id"]
    name = city_info["name"].replace(" ", "_")
    
    if "city" in city_info["address_obj"]:
        secondary_name = city_info["address_obj"]["city"].replace(" ", "_")
    else:
        secondary_name = None
    state = city_info["address_obj"]["state"].replace(" ", "_")
    
    return location_id, name, secondary_name, state
    

def tripadvisor_city_hotels(city_info, base_url=TRIP_HOTELLST_URL):
    ''' 
    Creates the URL for the hotel list for the city.
    '''
    location_id, name, secondary_name, state = city_info_values(city_info)
    if secondary_name:
        city_hotellst_url = base_url + f"{location_id}-{name}_{secondary_name}_{state}-Hotels.html"
    else:
        city_hotellst_url = base_url + f"{location_id}-{name}_{state}-Hotels.html"
    
    return city_hotellst_url


def get_additional_hotelpage(city_hotellst_url):
    '''
    Goes to the next page of the hotel list for the city.
    '''
    g_location_id = city_hotellst_url.split("-")[1]

    if "_" in city_hotellst_url.split("-")[2]:
        city_hotellst_url = city_hotellst_url.replace(g_location_id, 
                g_location_id + "-oa30"
    ) 
        
    else:
        oa_old = city_hotellst_url.split("-")[2]
        oa_old_num = int(oa_old[2:])
        glocationid_oa = g_location_id + "-" + oa_old
        oa_new_num = oa_old_num + 30
        oa_new = "oa" + str(oa_new_num)
        city_hotellst_url = city_hotellst_url.replace(glocationid_oa, 
                g_location_id + "-" + oa_new
        )
    
    return city_hotellst_url
    

## Hilo

Builing the URL from the name of the city

In [39]:
response = tripadvisor_api_call("Hilo")

The url is:  https://api.content.tripadvisor.com/api/v1/location/search?key=32E24EF8375842D99E9369D814E4221C&searchQuery=hilo&category=geos&language=en
{
  "data": [
    {
      "location_id": "60583",
      "name": "Hilo",
      "address_obj": {
        "city": "Island of Hawaii",
        "state": "Hawaii",
        "country": "United States",
        "address_string": "Hilo, Island of Hawaii, HI"
      }
    },
    {
      "location_id": "10141483",
      "name": "Hilo Intl Airport",
      "address_obj": {
        "street1": "2450 Kekuanaoa St",
        "city": "Hilo",
        "state": "Hawaii",
        "country": "United States",
        "postalcode": "96720",
        "address_string": "2450 Kekuanaoa St, Hilo, Island of Hawaii, HI 96720"
      }
    }
  ]
}


In [40]:
city_info = tripadvisor_city_info(response, "Hilo")

print(city_info)

{'location_id': '60583', 'name': 'Hilo', 'address_obj': {'city': 'Island of Hawaii', 'state': 'Hawaii', 'country': 'United States', 'address_string': 'Hilo, Island of Hawaii, HI'}}


In [43]:
location_id, name, secondary_name, state = city_info_values(city_info)

print(location_id, name, secondary_name, state)

60583 Hilo Island_of_Hawaii Hawaii


In [41]:
city_hotellst_url = tripadvisor_city_hotels(city_info)

print(city_hotellst_url)

https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html


In [47]:
get_additional_hotelpage(city_hotellst_url)

'https://www.tripadvisor.com/Hotels-g60583-oa30-Hilo_Island_of_Hawaii_Hawaii-Hotels.html'

## Jersey City

Builing the URL from the name of the city

In [52]:
response = tripadvisor_api_call("Jersey City")
# "https://api.content.tripadvisor.com/api/v1/location/search?key=32E24EF8375842D99E9369D814E4221C&searchQuery=jersey%20city&language=en"

The url is:  https://api.content.tripadvisor.com/api/v1/location/search?key=32E24EF8375842D99E9369D814E4221C&searchQuery=jersey%20city&category=geos&language=en
{
  "data": [
    {
      "location_id": "46531",
      "name": "Jersey City",
      "address_obj": {
        "state": "New Jersey",
        "country": "United States",
        "address_string": "Jersey City, NJ"
      }
    },
    {
      "location_id": "29750",
      "name": "Atlantic City",
      "address_obj": {
        "city": "Jersey Shore",
        "state": "New Jersey",
        "country": "United States",
        "address_string": "Atlantic City, NJ"
      }
    },
    {
      "location_id": "46696",
      "name": "Ocean City",
      "address_obj": {
        "city": "Jersey Shore",
        "state": "New Jersey",
        "country": "United States",
        "address_string": "Ocean City, NJ"
      }
    },
    {
      "location_id": "46807",
      "name": "Sea Isle City",
      "address_obj": {
        "city": "Jersey Sho

In [53]:
print(response)

{'data': [{'location_id': '46531', 'name': 'Jersey City', 'address_obj': {'state': 'New Jersey', 'country': 'United States', 'address_string': 'Jersey City, NJ'}}, {'location_id': '29750', 'name': 'Atlantic City', 'address_obj': {'city': 'Jersey Shore', 'state': 'New Jersey', 'country': 'United States', 'address_string': 'Atlantic City, NJ'}}, {'location_id': '46696', 'name': 'Ocean City', 'address_obj': {'city': 'Jersey Shore', 'state': 'New Jersey', 'country': 'United States', 'address_string': 'Ocean City, NJ'}}, {'location_id': '46807', 'name': 'Sea Isle City', 'address_obj': {'city': 'Jersey Shore', 'state': 'New Jersey', 'country': 'United States', 'address_string': 'Sea Isle City, NJ'}}, {'location_id': '46600', 'name': 'Margate City', 'address_obj': {'city': 'Jersey Shore', 'state': 'New Jersey', 'country': 'United States', 'address_string': 'Margate City, NJ'}}, {'location_id': '46884', 'name': 'Ventnor City', 'address_obj': {'city': 'Jersey Shore', 'state': 'New Jersey', 'cou

In [61]:
city_info = tripadvisor_city_info(response, "Jersey City")

print(city_info)

{'location_id': '46531', 'name': 'Jersey City', 'address_obj': {'state': 'New Jersey', 'country': 'United States', 'address_string': 'Jersey City, NJ'}}


In [60]:
location_id, name, secondary_name, state = city_info_values(city_info)

print(location_id, name, secondary_name, state)

46531 Jersey_City None New_Jersey


In [62]:
city_hotellst_url = tripadvisor_city_hotels(city_info)

print(city_hotellst_url)

https://www.tripadvisor.com/Hotels-g46531-Jersey_City_New_Jersey-Hotels.html


In [63]:
city_hotellst_url = get_additional_hotelpage(city_hotellst_url)

print(city_hotellst_url)

https://www.tripadvisor.com/Hotels-g46531-oa30-Jersey_City_New_Jersey-Hotels.html


In [66]:
city_hotellst_url = get_additional_hotelpage(city_hotellst_url)

print(city_hotellst_url)

https://www.tripadvisor.com/Hotels-g46531-oa60-Jersey_City_New_Jersey-Hotels.html


In [None]:
# Class under which page list counts are stored "gBgtO"
# If items on page pulled are less than 30

## Working on viability of the Smartproxy Residential IPs

In [52]:
url = 'https://ip.smartproxy.com/json'
username = SP_RESI_USER
password = SP_RESI_PAS
proxy = f"https://{username}:{password}@us.smartproxy.com:10000"

result = requests.get(url, proxies={
    'http': proxy,
    'https': proxy
})

print(result.text)


{
    "browser": {
        "name": "",
        "version": ""
    },
    "platform": {
        "os": "undefined undefined"
    },
    "engine": {},
    "isp": {
        "isp": "Optimum Online",
        "asn": 6128,
        "domain": "optonline.net",
        "organization": "Optimum Online"
    },
    "city": {
        "name": "Wayne",
        "code": "NJ",
        "state": "New Jersey",
        "time_zone": "America/New_York",
        "zip_code": "07470"
    },
    "proxy": {
        "ip": "68.197.162.44",
        "is_anonymous_proxy": false
    },
    "country": {
        "code": "US",
        "name": "United States",
        "continent": "North America"
    }
}


In [53]:
url = 'https://ip.smartproxy.com/json'
username = SP_RESI_USER
password = SP_RESI_PAS
proxy = f"https://{username}:{password}@us.smartproxy.com:10000"

result = requests.get(url, proxies={
    'http': proxy,
    'https': proxy
})

print(result.text)

{
    "browser": {
        "name": "",
        "version": ""
    },
    "platform": {
        "os": "undefined undefined"
    },
    "engine": {},
    "isp": {
        "isp": "CenturyLink",
        "asn": 209,
        "domain": "qwest.net",
        "organization": "CenturyLink"
    },
    "city": {
        "name": "Phoenix",
        "code": "AZ",
        "state": "Arizona",
        "time_zone": "America/Phoenix",
        "zip_code": "85042"
    },
    "proxy": {
        "ip": "184.98.251.132",
        "is_anonymous_proxy": false
    },
    "country": {
        "code": "US",
        "name": "United States",
        "continent": "North America"
    }
}


In [59]:
username = SP_RESI_USER
password = SP_RESI_PAS
proxy_host = 'us.smartproxy.com'
proxy_port = '10000'

chrome_options = webdriver.ChromeOptions()

proxies_extension = proxies(username, password, proxy_host, proxy_port)
chrome_options.add_extension(proxies_extension)

# chrome_options.add_argument("--headless=new")

chrome_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

url = "https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html"

chrome_driver.get(url)

wait = WebDriverWait(chrome_driver, random.randint(60, 100))

page_source_code = soup(chrome_driver.page_source, 'lxml')
chrome_driver.quit()

In [60]:
print(page_source_code)

<html lang="en-US"><head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="en" http-equiv="content-language"/><link as="font" crossorigin="anonymous" href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans-VF.woff2?v1.002" rel="preload" type="font/woff2"/><link href="https://static.tacdn.com/favicon.ico?v2" id="favicon" rel="icon" type="image/x-icon"/><link color="#000000" href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" rel="mask-icon" sizes="any"/><meta content="#34e0a1" name="theme-color"/><meta content="telephone=no" name="format-detection"/><meta content="TripAdvisor" property="al:ios:app_name"/><meta content="284876795" property="al:ios:app_store_id"/><meta content="284876795" name="twitter:app:id:ipad" property="twitter:app:id:ipad"/><meta content="284876795" name="twitter:app:id:iphone" property="twitter:app:id:iphone"/><meta content="tripadvisor://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_H

In [70]:
BASE_URL = "https://www.tripadvisor.com"

In [77]:
hilo_hotel_lst = []
galveston_hotel_lst = []
sunnyvale_hotel_lst = []
jerseycity_hotel_lst = []

In [95]:
# hilo_urls = ['https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html', "https://www.tripadvisor.com/Hotels-g60583-oa30-Hilo_Island_of_Hawaii_Hawaii-Hotels.html"]
# galveston_urls = ['https://www.tripadvisor.com/Hotels-g55879-Galveston_Galveston_Island_Texas-Hotels.html', "https://www.tripadvisor.com/Hotels-g55879-oa30-Galveston_Galveston_Island_Texas-Hotels.html", "https://www.tripadvisor.com/Hotels-g55879-oa60-Galveston_Galveston_Island_Texas-Hotels.html", "https://www.tripadvisor.com/Hotels-g55879-oa90-Galveston_Galveston_Island_Texas-Hotels.html"]
# sunnyvale_urls = ['https://www.tripadvisor.com/Hotels-g33146-Sunnyvale_California-Hotels.html', "https://www.tripadvisor.com/Hotels-g33146-oa30-Sunnyvale_California-Hotels.html"]
# jerseycity_urls = ['https://www.tripadvisor.com/Hotels-g46531-Jersey_City_New_Jersey-Hotels.html', "https://www.tripadvisor.com/Hotels-g46531-oa30-Jersey_City_New_Jersey-Hotels.html", "https://www.tripadvisor.com/Hotels-g46531-oa60-Jersey_City_New_Jersey-Hotels.html", "https://www.tripadvisor.com/Hotels-g46531-oa90-Jersey_City_New_Jersey-Hotels.html"]

In [78]:
def selen_crwl(url, save_path = False, web_driver_install = False):
    '''
    Crawls a webpage using Selenium and returns the page source code.
    '''
    if web_driver_install:
        chrome_driver = webdriver.Chrome(ChromeDriverManager().install())
    else:
        chrome_driver = webdriver.Chrome()
    chrome_driver.get(url)
    chrome_driver.implicitly_wait(random.randint(60, 100))
    page_source_code = soup(chrome_driver.page_source, 'lxml')
    chrome_driver.quit()
    
    if save_path:
        save_path = save_path + ".pkl"
        with open(save_path, 'wb') as f:
            pickle.dump(page_source_code, f)
            
    return page_source_code
            
            
def smrtprxy_residnt_selen_crwl(url, save_path = False):
    ''' 
    Crawls a webpage using Selenium and Smartproxy residential proxies and
    returns the page source code.
    '''
    chrome_options = webdriver.ChromeOptions()

    proxies_extension = proxies(SP_RESI_USER, SP_RESI_PAS, 
            PROXY_HOST, PROXY_PORT
    )
    chrome_options.add_extension(proxies_extension)

    # chrome_options.add_argument("--headless=new")

    chrome_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    chrome_driver.get(url)

    chrome_driver.implicitly_wait(random.randint(60, 100))
    
    # try:
    #     see_all_button = chrome_driver.find_element(By.CLASS_NAME, "biGQs _P ttuOS")
    #     if see_all_button:
    #         see_all_button.click()
    # except Exception as e:
    #     print(e)
        

    page_source_code = soup(chrome_driver.page_source, 'lxml')
    chrome_driver.quit()
    
    if save_path:
        save_path = save_path + ".pkl"
        with open(save_path, 'wb') as f:
            pickle.dump(page_source_code, f)
    
    
    return page_source_code

def smrtprxy_web_crwl(url, save_path = False):
    ''' 
    Uses Smartproxy Web Scrape to scrape a webpage and returns the page source 
    code.
    '''
    task_params = {
        "target": "universal",
        "url": url # hotel_tripadvisor_url
        }

    response = requests.post(
        "https://scrape.smartproxy.com/v1/tasks",
        json = task_params,
        auth=(SP_WEBCRAWL_USER, SP_WEBCRAWL_PAS)
    )
    
    # print(response.json())
    html_string = response.json()["results"][0]["content"]
    page_source_code  = soup(html_string, 'lxml')
    if save_path:
        save_path = save_path + ".pkl"
        with open(save_path, 'wb') as f:
            pickle.dump(page_source_code, f)
    
    return page_source_code
    


def crawl_hotels_lst(loc_hotellst_url, hotel_info_lst, count, crawl_method,  
            save_path = False, web_driver_install = False
    ):
    '''
    Crawls the a city's list of hotels on Tripadvisor.
    '''
    if crawl_method == "selen_crwl":
        page_source_code = selen_crwl(loc_hotellst_url, save_path, web_driver_install)
    elif crawl_method == "smrtprxy_residnt_selen_crwl":
        page_source_code = smrtprxy_residnt_selen_crwl(loc_hotellst_url, save_path)
    elif crawl_method == "smrtprxy_web_crwl":
        page_source_code = smrtprxy_web_crwl(loc_hotellst_url, save_path)
    
    hotel_divs = page_source_code.select('div[class*="rlqQt"]') # Number of hotels on a page
    
    if len(hotel_divs) < 30:
        print("Number of hotels on page: ", len(hotel_divs))
        return hotel_info_lst

    else:
        print("Number of hotels on page: ", len(hotel_divs))
    
        loc_hotellst_url = get_additional_hotelpage(loc_hotellst_url)
    count += 1
    
    return hotel_info_lst    


def get_hotel_info(page_source_code, hotel_info_lst):
    '''
    Creates a lists within a list of hotel information from the page source 
    code from a Tripadvisor city's list of hotels webpage.
    '''
    hotel_divs = page_source_code.select('div[class*="rlqQt"]')
    # print(hotel_divs)

    for hotel in hotel_divs:
        location_name = hotel.select('h3[class*="nBrpc"]')
        location_name = location_name[0].get_text().split(".")[1].strip()
        
        # parent_location_url = hotel.select('a[class*="lqfZ"]')
        location_url_div = hotel.select('div[class*="jsTLT"]')
        location_url_a_tag = location_url_div[0].select('a')
        location_url_href = location_url_a_tag[0]['href']
        location_url = urllib.parse.urljoin(BASE_URL, location_url_href)
        # print(location_url)
        
        # Example: <div class="luFhX o W f u w JSdbl" aria-label="4.5 of 5 bubbles. 1,909 reviews">
        review_info_div = hotel.select('div[class*="luFhX"]')
        review_info = review_info_div[0].get('aria-label')
        # print(review_info)
        
        if "bubbles." in review_info:
            rating = review_info.split("bubbles.")[0].strip().split(" ")[0]
            review_count = review_info.split("bubbles.")[1].strip().split(" ")[0]
            # print(rating, "-", review_count)
        else:
            review_info.strip().split(" ")[0]
            rating = np.nan
            review_count = 0
            
        hotel_info_lst.append([location_name, location_url, rating, review_count])
    
    return hotel_info_lst


def lst_to_df(hotel_info_lst):
    df = pd.DataFrame(hotel_info_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count"])
    return df

In [65]:
crawl_method = "selen_crwl"
loc_hotellst_url = "https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html"
save_path = False
web_driver_install = False

if crawl_method == "selen_crwl":
    page_source_code = selen_crwl(loc_hotellst_url, save_path, web_driver_install)
elif crawl_method == "smrtprxy_residnt_selen_crwl":
    page_source_code = smrtprxy_residnt_selen_crwl(loc_hotellst_url, save_path)
elif crawl_method == "smrtprxy_web_crwl":
    page_source_code = smrtprxy_web_crwl(loc_hotellst_url, save_path)

hotel_divs = page_source_code.select('div[class*="rlqQt"]') # Number of hotels on a page

if len(hotel_divs) < 30:
    print("Number of hotels on page: ", len(hotel_divs))

else:
    print("Number of hotels on page: ", len(hotel_divs))

Number of hotels on page:  30


In [68]:
crawl_method = "smrtprxy_residnt_selen_crwl"
loc_hotellst_url = "https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html"
save_path = False

if crawl_method == "selen_crwl":
    page_source_code = selen_crwl(loc_hotellst_url, save_path, web_driver_install)
elif crawl_method == "smrtprxy_residnt_selen_crwl":
    page_source_code = smrtprxy_residnt_selen_crwl(loc_hotellst_url, save_path)
elif crawl_method == "smrtprxy_web_crwl":
    page_source_code = smrtprxy_web_crwl(loc_hotellst_url, save_path)

hotel_divs = page_source_code.select('div[class*="rlqQt"]') # Number of hotels on a page

if len(hotel_divs) < 30:
    print("Number of hotels on page: ", len(hotel_divs))

else:
    print("Number of hotels on page: ", len(hotel_divs))
    

Number of hotels on page:  10


In [87]:
print(hotel_divs)

[<div class="rlqQt _T A"><span><span class="listItem"><span class="organic"><div class="tkvEM Gi z Re o" data-automation="non-plus-hotel-offer-1" id="hotel-listing-0"><div class="pmRuN _T"><div class="qeraN _T qMONr iOIte iJfMg ndRxi rcibp FKwyn"><div class="yYtes _T Fl y"><div class="MMdJi w"><div class="afQPz eXZKw o pABFk w _Z Gm A"><div class="WTWEM w _Z"><a class="BMQDV _F Gv wSSLS SwZTJ FGwzt PaRlG" href="/Hotel_Review-g60583-d113098-Reviews-SCP_Hilo_Hotel-Hilo_Island_of_Hawaii_Hawaii.html?lk=d58fce13-fff7-4b3c-b6cb-427e205c4a14" target="_blank"><div class="_T w _Z" data-clicksource="Photo"><div aria-atomic="true" aria-label="Carousel of images" aria-live="polite" class="IdURT w carousel UznXc wSSLS" role="figure"><div class="yMdQy w"><ul class="zHxHb"><li class="CyFNY _A MBoCH"><picture class="NhWcC _R mdkdE afQPz eXZKw"><source media="(max-width: 400px)" srcset="https://dynamic-media-cdn.tripadvisor.com/media/photo-o/23/69/fa/73/enjoy-our-outdoor-swimming.jpg?w=400&amp;h=400&am

In [79]:
crawl_method = "smrtprxy_residnt_selen_crwl"
loc_hotellst_url = "https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html"
save_path = False

if crawl_method == "selen_crwl":
    page_source_code = selen_crwl(loc_hotellst_url, save_path, web_driver_install)
elif crawl_method == "smrtprxy_residnt_selen_crwl":
    page_source_code = smrtprxy_residnt_selen_crwl(loc_hotellst_url, save_path)
elif crawl_method == "smrtprxy_web_crwl":
    page_source_code = smrtprxy_web_crwl(loc_hotellst_url, save_path)

hotel_divs = page_source_code.select('div[class*="rlqQt"]') # Number of hotels on a page

if len(hotel_divs) < 30:
    print("Number of hotels on page: ", len(hotel_divs))

else:
    print("Number of hotels on page: ", len(hotel_divs))
    

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=123.0.6312.107)
Stacktrace:
0   chromedriver                        0x0000000104bfc474 chromedriver + 4326516
1   chromedriver                        0x0000000104bf493c chromedriver + 4294972
2   chromedriver                        0x0000000104820088 chromedriver + 278664
3   chromedriver                        0x00000001047fb270 chromedriver + 127600
4   chromedriver                        0x00000001048879ac chromedriver + 702892
5   chromedriver                        0x000000010489ac0c chromedriver + 781324
6   chromedriver                        0x00000001048574e4 chromedriver + 505060
7   chromedriver                        0x0000000104857f5c chromedriver + 507740
8   chromedriver                        0x0000000104bbfa10 chromedriver + 4078096
9   chromedriver                        0x0000000104bc47c8 chromedriver + 4097992
10  chromedriver                        0x0000000104ba65b4 chromedriver + 3974580
11  chromedriver                        0x0000000104bc50e0 chromedriver + 4100320
12  chromedriver                        0x0000000104b97ba4 chromedriver + 3914660
13  chromedriver                        0x0000000104be56e8 chromedriver + 4232936
14  chromedriver                        0x0000000104be5864 chromedriver + 4233316
15  chromedriver                        0x0000000104bf45b0 chromedriver + 4294064
16  libsystem_pthread.dylib             0x000000019d99af94 _pthread_start + 136
17  libsystem_pthread.dylib             0x000000019d995d34 thread_start + 8


In [73]:
hotel_info_lst = []

for hotel in hotel_divs:
    location_name = hotel.select('h3[class*="nBrpc"]')
    location_name = location_name[0].get_text().split(".")[1].strip()
    
    # parent_location_url = hotel.select('a[class*="lqfZ"]')
    location_url_div = hotel.select('div[class*="jsTLT"]')
    location_url_a_tag = location_url_div[0].select('a')
    location_url_href = location_url_a_tag[0]['href']
    location_url = urllib.parse.urljoin(BASE_URL, location_url_href)
    # print(location_url)
    
    # Example: <div class="luFhX o W f u w JSdbl" aria-label="4.5 of 5 bubbles. 1,909 reviews">
    review_info_div = hotel.select('div[class*="luFhX"]')
    review_info = review_info_div[0].get('aria-label')
    # print(review_info)
    
    if "bubbles." in review_info:
        rating = review_info.split("bubbles.")[0].strip().split(" ")[0]
        review_count = review_info.split("bubbles.")[1].strip().split(" ")[0]
        # print(rating, "-", review_count)
    else:
        review_info.strip().split(" ")[0]
        rating = np.nan
        review_count = 0
        
    hotel_info_lst.append([location_name, location_url, rating, review_count])
    
print(hotel_info_lst)

[['SCP Hilo Hotel', 'https://www.tripadvisor.com/Hotel_Review-g60583-d113098-Reviews-SCP_Hilo_Hotel-Hilo_Island_of_Hawaii_Hawaii.html?lk=b4cd50a6-40df-41f5-a30e-62d20b92d7d1', '4.5', '226'], ['The Inn at Kulaniapia Falls', 'https://www.tripadvisor.com/Hotel_Review-g60583-d146823-Reviews-The_Inn_at_Kulaniapia_Falls-Hilo_Island_of_Hawaii_Hawaii.html?lk=1f1e2d54-fd73-4c91-842c-5a60e6f9735e', '4.5', '1,165'], ['Hilo Honu Inn Bed and Breakfast', 'https://www.tripadvisor.com/Hotel_Review-g60583-d626798-Reviews-Hilo_Honu_Inn_Bed_and_Breakfast-Hilo_Island_of_Hawaii_Hawaii.html?lk=98f29f2c-e712-44c4-8ef0-745b7be58e99', '5.0', '312'], ['Grand Naniloa Hotel Hilo - A Doubletree By Hilton', 'https://www.tripadvisor.com/Hotel_Review-g60583-d113096-Reviews-Grand_Naniloa_Hotel_Hilo_A_Doubletree_By_Hilton-Hilo_Island_of_Hawaii_Hawaii.html?lk=b62d37ca-7d9a-46f3-894d-c6dbae7cd96c', '3.5', '1,761'], ['Hilo Hawaiian Hotel', 'https://www.tripadvisor.com/Hotel_Review-g60583-d209311-Reviews-Hilo_Hawaiian_Hote

In [78]:
crawl_method = "smrtprxy_web_crwl"
loc_hotellst_url = "https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html"
save_path = False
web_driver_install = False

if crawl_method == "selen_crwl":
    page_source_code = selen_crwl(loc_hotellst_url, save_path, web_driver_install)
elif crawl_method == "smrtprxy_residnt_selen_crwl":
    page_source_code = smrtprxy_residnt_selen_crwl(loc_hotellst_url, save_path)
elif crawl_method == "smrtprxy_web_crwl":
    page_source_code = smrtprxy_web_crwl(loc_hotellst_url, save_path)

# hotel_divs = page_source_code.select('div[class*="rlqQt"]') # Number of hotels on a page

# if len(hotel_divs) < 30:
#     print("Number of hotels on page: ", len(hotel_divs))

# else:
#     print("Number of hotels on page: ", len(hotel_divs))

{'error': 'Too many requests. (Fixed (successful only)).'}


In [84]:
# Old "smrtprxy_web_crwl" Data (NOTE: WEB CRAWL STILL WORKS JUST FREE TRAIL ENDED)
with open('/Users/John/Downloads/data.json', 'r') as file:
    response = json.load(file)

html_string = response["results"][0]["content"]
page_source_code  = soup(html_string, 'lxml')
hotel_divs = page_source_code.select('div[class*="rlqQt"]') # Number of hotels on a page

if len(hotel_divs) < 30:
    print("Number of hotels on page: ", len(hotel_divs))

else:
    print("Number of hotels on page: ", len(hotel_divs))

Number of hotels on page:  30


In [66]:
print(hotel_divs)

[<div class="rlqQt _T A"><span><span class="listItem"><span class="organic"><div class="tkvEM Gi z Re o" data-automation="non-plus-hotel-offer-1" id="hotel-listing-0"><div class="pmRuN _T"><div class="qeraN _T qMONr iOIte iJfMg ndRxi rcibp FKwyn"><div class="yYtes _T Fl y"><div class="MMdJi w"><div class="afQPz eXZKw o pABFk w _Z Gm A"><div class="WTWEM w _Z"><a class="BMQDV _F Gv wSSLS SwZTJ FGwzt PaRlG" href="/Hotel_Review-g60583-d113098-Reviews-SCP_Hilo_Hotel-Hilo_Island_of_Hawaii_Hawaii.html?lk=f7acc2d2-8d8a-418d-a67e-9e6add69b639" target="_blank"><div class="_T w _Z" data-clicksource="Photo"><div aria-atomic="true" aria-label="Carousel of images" aria-live="polite" class="IdURT w carousel UznXc wSSLS" role="figure"><div class="yMdQy w"><ul class="zHxHb"><li class="CyFNY _A MBoCH"><picture class="NhWcC _R afQPz eXZKw"><source media="(max-width: 400px)" srcset="https://dynamic-media-cdn.tripadvisor.com/media/photo-o/23/69/fa/73/enjoy-our-outdoor-swimming.jpg?w=400&amp;h=400&amp;s=1 

# Prior Work

## Hilo, Hawaii

In [80]:
partial_hilo_hotel_lst = crawl_hotels_lst(hilo_urls, hilo_hotel_lst, save_path = "hilo_hotel_source_code", web_driver_install = False)

In [81]:
partial_hilo_df = lst_to_df(partial_hilo_hotel_lst)

In [106]:
partial_hilo_df.shape

(37, 4)

In [83]:
partial_hilo_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count
0,SCP Hilo Hotel,https://www.tripadvisor.com/Hotel_Review-g6058...,4.5,226
1,The Inn at Kulaniapia Falls,https://www.tripadvisor.com/Hotel_Review-g6058...,4.5,1164
2,Hilo Honu Inn Bed and Breakfast,https://www.tripadvisor.com/Hotel_Review-g6058...,5.0,312
3,Grand Naniloa Hotel Hilo - A Doubletree By Hilton,https://www.tripadvisor.com/Hotel_Review-g6058...,3.5,1759
4,Hilo Hawaiian Hotel,https://www.tripadvisor.com/Hotel_Review-g6058...,3.5,2420


In [130]:
partial_hilo_df.to_csv("partial_hilo_hotel_info.csv", index = False)

In [129]:
task_params = {
        "target": "universal",
        "url": url # hotel_tripadvisor_url
}

response = requests.post(
    "https://scrape.smartproxy.com/v1/tasks",
    json = task_params,
    auth=(SP_WEBCRAWL_USER, SP_WEBCRAWL_PAS)
)

pprint(response.json())

{'results': [{'content': '<!DOCTYPE html><html lang="en-US"><head><meta '
                         'http-equiv="content-type" content="text/html; '
                         'charset=utf-8"/><meta http-equiv="content-language" '
                         'content="en"/><link '
                         'href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans-VF.woff2?v1.002" '
                         'rel="preload" as="font" type="font/woff2" '
                         'crossorigin="anonymous"/><link rel="icon" '
                         'id="favicon" '
                         'href="https://static.tacdn.com/favicon.ico?v2" '
                         'type="image/x-icon"/><link rel="mask-icon" '
                         'sizes="any" '
                         'href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" '
                         'color="#000000"/><meta name="theme-color" '
                         'content="#34e0a1"/><meta name="format-dete

In [143]:
html_string = response.json()['results'][0]['content']



In [141]:
type(response.json()['results'][0]['content'])

str

In [176]:
def get_room_number(incomplete_hotel_lst):
    '''
    Use Smartproxy API to scrape the the hotel address and number of rooms 
    from the hotel's tripadvisor page.
    '''
    for index, (_, url, _, _) in enumerate(incomplete_hotel_lst):
        failed_urls = []
        
        task_params = {
        "target": "universal",
        "url": url # hotel_tripadvisor_url
        }

        response = requests.post(
            "https://scrape.smartproxy.com/v1/tasks",
            json = task_params,
            auth=(SP_WEBCRAWL_USER, SP_WEBCRAWL_PAS) # New password
        )
        
        html_string = response.json()['results'][0]['content']

        hotel_source_html = soup(html_string, 'lxml')
        # <span class="CdhWK _S "><span class="biGQs _P pZUbB KxBGd">126 Banyan Way, Hilo, Island of Hawaii, HI 96720</span>
        try:
            address_parent_span = hotel_source_html.select_one('span[class*="CdhWK"]')
            address_span = address_parent_span.select('span[class*="biGQs"]')
            address = address_span[0].text.strip()
        except Exception as e:
            address = np.nan
            print(f"400: {url}")
            failed_urls.append((url, "address", e))

        # rooms <div class="IhqAp Ci">140</div>
        try:
            label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')
            number_of_rooms = label_div.find_next('div').text.strip() if label_div else np.nan
        except Exception as e:
            number_of_rooms = np.nan
            print(f"400: {url}")
            failed_urls.append((url, "number of rooms", e))
        # print('rooms:', number_of_rooms)
        incomplete_hotel_lst[index] = incomplete_hotel_lst[index] + [address, number_of_rooms]
    
    print("Completed!")
    return incomplete_hotel_lst, failed_urls
        

In [178]:
partial_hilo_hotel_lst = pd.read_csv("partial_hilo_hotel_info.csv").values.tolist()

In [179]:
complete_hilo_hotel_lst, hilo_failed_url = get_room_number(partial_hilo_hotel_lst)

  label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')


400: https://www.tripadvisor.com/Hotel_Review-g60583-d626798-Reviews-Hilo_Honu_Inn_Bed_and_Breakfast-Hilo_Island_of_Hawaii_Hawaii.html?lk=449b96a8-9c1f-4242-9343-31adb5894b33
400: https://www.tripadvisor.com/Hotel_Review-g60583-d8454507-Reviews-Kama_aina_Inn-Hilo_Island_of_Hawaii_Hawaii.html?lk=e7785ae5-1baf-4bd3-a0e9-3e3fb0af5fda
400: https://www.tripadvisor.com/Hotel_Review-g60583-d6123125-Reviews-Shaka_Shak_Garden_Inn-Hilo_Island_of_Hawaii_Hawaii.html?lk=0ddce907-e677-46d6-946c-c7c4e896518d
Completed!


In [180]:
complete_hilo_df = pd.DataFrame(complete_hilo_hotel_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count", "Address", "Number of Rooms"])

In [181]:
complete_hilo_df.shape

(37, 6)

In [98]:
complete_hilo_df.head(5)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/5d/ntxdl44j0m57__z99jhcwvw00000gn/T/ipykernel_10645/3148504103.py", line 1, in <module>
    complete_hilo_df.head(5)
    ^^^^^^^^^^^^^^^^
NameError: name 'complete_hilo_df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Cellar/pygments/2.14.0/libexec/lib/python3.11/site-packages/pygments/styles/__init__.py", line 82, in get_style_by_name
ModuleNotFoundError: No module named 'pygments.styles.default'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 2168, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^

In [None]:
ninety_room_threshold = len(complete_hilo_df.loc[complete_hilo_df.loc[:, "Number of Rooms"].astype(float) >= 90, :])
print(f'Number of hotels with 90 or more rooms: {ninety_room_threshold}')
ninety_room_percentage = (ninety_room_threshold / len(complete_hilo_df)) * 100
print(f'Percentage of hotels with 90 or more rooms: {round(ninety_room_percentage, 2)}%')
roomnan_percentage_hilo = (complete_hilo_df["Number of Rooms"].isna().sum() / len(complete_hilo_df)) * 100
print(f'Percentage of nan rooms {round(roomnan_percentage_hilo, 2)}%')

Number of hotels with 90 or more rooms: 4
Percentage of hotels with 90 or more rooms: 10.81%
Percentage of nan rooms 13.51%


## Galveston, Texas

In [None]:
partial_galveston_hotel_lst = crawl_hotels_lst(galveston_urls, galveston_hotel_lst, save_path = "galveston_hotel_source_code", web_driver_install = False)


In [None]:
partial_galveston_df = lst_to_df(partial_galveston_hotel_lst)

In [None]:
partial_galveston_df.shape

(111, 4)

In [None]:
partial_galveston_df.to_csv("partial_galveston_hotel_info.csv", index = False)

In [None]:
partial_galveston_hotel_lst = pd.read_csv("partial_galveston_hotel_info.csv").values.tolist()

complete_galveston_hotel_lst, galveston_failed_url = get_room_number(partial_galveston_hotel_lst)

complete_galveston_df = pd.DataFrame(complete_galveston_hotel_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count", "Address", "Number of Rooms"])

  label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')


400: https://www.tripadvisor.com/Hotel_Review-g55879-d619548-Reviews-Baymont_by_Wyndham_Galveston-Galveston_Galveston_Island_Texas.html?lk=3b1f7c01-9b01-40b4-ab07-8687c9ff3ca8
400: https://www.tripadvisor.com/Hotel_Review-g55879-d1601122-Reviews-Diamond_Beach-Galveston_Galveston_Island_Texas.html?lk=7a1d8d82-7a63-4147-b5ea-ab1daa4805ad
400: https://www.tripadvisor.com/Hotel_Review-g55879-d26487915-Reviews-Casa_Del_Mar_Ocean_View-Galveston_Galveston_Island_Texas.html?lk=390589f5-571c-4a44-be46-3bc867eac7a4
Completed!


In [None]:
complete_galveston_df.shape

(111, 6)

In [None]:
complete_galveston_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count,Address,Number of Rooms
0,"The San Luis Resort, Spa and Conference Center",https://www.tripadvisor.com/Hotel_Review-g5587...,4.5,4117,"5222 Seawall Blvd, Galveston, Galveston Island...",250
1,The Tremont House Hotel,https://www.tripadvisor.com/Hotel_Review-g5587...,4.5,3458,"2300 Ship Mechanic Row St., Galveston, Galvest...",133
2,"Moody Gardens Hotel, Spa and Convention Center",https://www.tripadvisor.com/Hotel_Review-g5587...,4.5,2489,"7 Hope Blvd, Galveston, Galveston Island, TX 7...",423
3,Hilton Galveston Island Resort,https://www.tripadvisor.com/Hotel_Review-g5587...,4.0,3810,"5400 Seawall Boulevard, Galveston, Galveston I...",240
4,"Holiday Inn Resort Galveston-on the Beach, an ...",https://www.tripadvisor.com/Hotel_Review-g5587...,4.0,2506,"5002 Seawall Boulevard, Galveston, Galveston I...",181


In [None]:
ninety_room_threshold = len(complete_galveston_df.loc[complete_galveston_df.loc[:, "Number of Rooms"].astype(float) >= 90, :])
print(f'Number of hotels with 90 or more rooms: {ninety_room_threshold}')
ninety_room_percentage = (ninety_room_threshold / len(complete_galveston_df)) * 100
print(f'Percentage of hotels with 90 or more rooms: {round(ninety_room_percentage, 2)}%')
roomnan_percentage_galveston = (complete_galveston_df["Number of Rooms"].isna().sum() / len(complete_galveston_df)) * 100
print(f'Percentage of nan rooms {round(roomnan_percentage_galveston, 2)}%')

Number of hotels with 90 or more rooms: 31
Percentage of hotels with 90 or more rooms: 27.93%
Percentage of nan rooms 30.63%


## Sunnyvale, California

In [None]:
partial_sunnyvale_hotel_lst = crawl_hotels_lst(sunnyvale_urls, sunnyvale_hotel_lst, save_path = "sunnyvale_hotel_info_", web_driver_install = False)

In [None]:
partial_sunnyvale_df = lst_to_df(partial_sunnyvale_hotel_lst)

In [None]:
partial_sunnyvale_df.shape

(48, 4)

In [None]:
print(partial_sunnyvale_hotel_lst)

[['Radisson Hotel Sunnyvale - Silicon Valley', 'https://www.tripadvisor.com/Hotel_Review-g33146-d82296-Reviews-Radisson_Hotel_Sunnyvale_Silicon_Valley-Sunnyvale_California.html?lk=b32528b3-7092-41e8-bddc-a377a71d74a5', '4.5', '1,203'], ['The Grand', 'https://www.tripadvisor.com/Hotel_Review-g33146-d225415-Reviews-The_Grand-Sunnyvale_California.html?lk=3a412f7b-5785-478b-a63c-5e8d1783408f', '4.5', '1,909'], ['Maple Tree Inn', 'https://www.tripadvisor.com/Hotel_Review-g33146-d84761-Reviews-Maple_Tree_Inn-Sunnyvale_California.html?lk=d57b1303-8962-4be5-82ea-ffde480324ff', '4.5', '491'], ['Wild Palms Hotel, A Jdv By Hyatt Hotel', 'https://www.tripadvisor.com/Hotel_Review-g33146-d217277-Reviews-Wild_Palms_Hotel_A_Jdv_By_Hyatt_Hotel-Sunnyvale_California.html?lk=7cc350c3-0e4c-4f0e-b55f-cad7fac51ae5', '4.0', '1,698'], ['Hilton Garden Inn Sunnyvale', 'https://www.tripadvisor.com/Hotel_Review-g33146-d17726425-Reviews-Hilton_Garden_Inn_Sunnyvale-Sunnyvale_California.html?lk=5976db44-7a79-4972-837

In [None]:
partial_sunnyvale_df.to_csv("partial_sunnyvale_hotel_info.csv", index = False)

In [None]:
partial_sunnyvale_hotel_lst = pd.read_csv("partial_sunnyvale_hotel_info.csv").values.tolist()

In [None]:
complete_sunnyvale_hotel_lst, sunnyvale_failed_url = get_room_number(partial_sunnyvale_hotel_lst)

  label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')


400: https://www.tripadvisor.com/Hotel_Review-g33146-d225415-Reviews-The_Grand-Sunnyvale_California.html?lk=835fe2bf-6065-45e9-9523-9bfeb664f1d3
400: https://www.tripadvisor.com/Hotel_Review-g33146-d21504015-Reviews-Tetra_Hotel_Autograph_Collection-Sunnyvale_California.html?lk=1c76fc62-036b-47cd-ac32-f741dbb06e76
400: https://www.tripadvisor.com/Hotel_Review-g33146-d21248236-Reviews-Redwood_Place_in_Heart_of_Silicon_Valley-Sunnyvale_California.html?lk=70326539-9d28-4d4d-b904-93cb98bb9622
Completed!


In [None]:
complete_sunnyvale_df = pd.DataFrame(complete_sunnyvale_hotel_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count", "Address", "Number of Rooms"])


In [None]:
complete_sunnyvale_df.shape

(48, 6)

In [None]:
complete_sunnyvale_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count,Address,Number of Rooms
0,Radisson Hotel Sunnyvale - Silicon Valley,https://www.tripadvisor.com/Hotel_Review-g3314...,4.5,1203,"1300 Chesapeake Terrace, Sunnyvale, CA 94089",180
1,The Grand,https://www.tripadvisor.com/Hotel_Review-g3314...,4.5,1909,,104
2,Maple Tree Inn,https://www.tripadvisor.com/Hotel_Review-g3314...,4.5,491,"711 E El Camino Real, Sunnyvale, CA 94087-2900",178
3,"Wild Palms Hotel, A Jdv By Hyatt Hotel",https://www.tripadvisor.com/Hotel_Review-g3314...,4.0,1698,"910 E Fremont Ave, Sunnyvale, CA 94087-3702",207
4,Hilton Garden Inn Sunnyvale,https://www.tripadvisor.com/Hotel_Review-g3314...,4.5,59,"767 N. Mathilda, Sunnyvale, CA 94085",238


In [None]:
ninety_room_threshold = len(complete_sunnyvale_df.loc[complete_sunnyvale_df.loc[:, "Number of Rooms"].astype(float) >= 90, :])
print(f'Number of hotels with 90 or more rooms: {ninety_room_threshold}')
ninety_room_percentage = (ninety_room_threshold / len(complete_sunnyvale_df)) * 100
print(f'Percentage of hotels with 90 or more rooms: {round(ninety_room_percentage, 2)}%')
roomnan_percentage_sunnyvale = (complete_sunnyvale_df["Number of Rooms"].isna().sum() / len(complete_sunnyvale_df)) * 100
print(f'Percentage of nan rooms {round(roomnan_percentage_sunnyvale, 2)}%')

Number of hotels with 90 or more rooms: 25
Percentage of hotels with 90 or more rooms: 52.08%
Percentage of nan rooms 22.92%


## Jersey City, New Jersey

In [None]:
def api_crawlhotels_lst(url_lst, hotel_info_lst, save_path = False):
    '''
    Use a Smartproxy API to scrape the hotel information from a list of hotel 
    urls.
    '''
    for url in url_lst:
        count = 0
        task_params = {
        "target": "universal",
        "url": url # hotel_tripadvisor_url
        }

        response = requests.post(
            "https://scrape.smartproxy.com/v1/tasks",
            json = task_params,
            auth=("U0000158310", "PW1264e87a63ebb47fbe403aff536802abf")
        )
        html_string = response.json()['results'][0]['content']
        
        page_source_code = soup(html_string, 'lxml')

        if save_path:
            save_path = save_path + str(count) + ".pkl"
            with open(save_path, 'wb') as f:
                pickle.dump(page_source_code, f)
            count += 1
        
        hotel_info_lst = get_hotel_info(page_source_code, hotel_info_lst)
        
    return hotel_info_lst

In [None]:
partial_jerseycity_hotel_lst = api_crawlhotels_lst(jerseycity_urls, jerseycity_hotel_lst, save_path = "jerseycity_hotel_info_")

In [None]:
partial_jerseycity_df = lst_to_df(partial_jerseycity_hotel_lst)

In [None]:
partial_jerseycity_df.shape

(131, 4)

In [None]:
partial_jerseycity_df.to_csv("partial_jerseycity_hotel_info.csv", index = False)

In [None]:
partial_jerseycity_hotel_lst = pd.read_csv("partial_jerseycity_hotel_info.csv").values.tolist()

In [None]:
complete_jerseycity_hotel_lst, jerseycity_failed_url = get_room_number(partial_jerseycity_hotel_lst)

  label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')


400: https://www.tripadvisor.com/Hotel_Review-g46531-d20173142-Reviews-Canopy_by_Hilton_Jersey_City_Arts_District-Jersey_City_New_Jersey.html?lk=bc889c6f-86f1-4e75-a8ef-0e8fbf133519
400: https://www.tripadvisor.com/Hotel_Review-g46531-d225883-Reviews-Sonesta_Simply_Suites_Jersey_City-Jersey_City_New_Jersey.html?lk=23e6d6f0-df27-4245-b8fa-40884444b7df
400: https://www.tripadvisor.com/Hotel_Review-g46531-d225883-Reviews-Sonesta_Simply_Suites_Jersey_City-Jersey_City_New_Jersey.html?lk=812c43bf-9670-4d62-85dd-e253e9f14210
400: https://www.tripadvisor.com/Hotel_Review-g46531-d13994921-Reviews-Kensho_homes-Jersey_City_New_Jersey.html?lk=11e24cb4-29cc-4a59-ba52-2e5c62b41947
400: https://www.tripadvisor.com/Hotel_Review-g46531-d92414-Reviews-Days_Inn_by_Wyndham_Jersey_City-Jersey_City_New_Jersey.html?lk=96c0e7e9-8a1d-4a5d-9174-1f29cf85419a
400: https://www.tripadvisor.com/Hotel_Review-g46531-d5535264-Reviews-The_Pier-Jersey_City_New_Jersey.html?lk=6d0db5ac-410c-4bc8-80cc-ca9958e294c9
400: http

In [None]:
complete_jerseycity_df = pd.DataFrame(complete_jerseycity_hotel_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count", "Address", "Number of Rooms"])

In [None]:
complete_jerseycity_df.shape

(131, 6)

In [None]:
complete_jerseycity_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count,Address,Number of Rooms
0,The Westin Jersey City Newport,https://www.tripadvisor.com/Hotel_Review-g4653...,4.5,1720,"479 Washington Blvd, Jersey City, NJ 07310-1972",429
1,Hyatt Regency Jersey City on the Hudson,https://www.tripadvisor.com/Hotel_Review-g4653...,4.5,4289,"2 Exchange Pl, Jersey City, NJ 07302-3901",351
2,Hyatt House Jersey City,https://www.tripadvisor.com/Hotel_Review-g4653...,4.0,1269,"1 Exchange Pl, Jersey City, NJ 07302-3920",258
3,The Holland Hotel,https://www.tripadvisor.com/Hotel_Review-g4653...,4.0,779,"175 12th St, Jersey City, NJ 07310-1405",70
4,DoubleTree by Hilton Hotel & Suites Jersey City,https://www.tripadvisor.com/Hotel_Review-g4653...,4.0,1891,"455 Washington Boulevard, Jersey City, NJ 07310",198


In [None]:
ninety_room_threshold = len(complete_jerseycity_df.loc[complete_jerseycity_df.loc[:, "Number of Rooms"].astype(float) >= 90, :])
print(f'Number of hotels with 90 or more rooms: {ninety_room_threshold}')
ninety_room_percentage = (ninety_room_threshold / len(complete_jerseycity_df)) * 100
print(f'Percentage of hotels with 90 or more rooms: {round(ninety_room_percentage, 2)}%')
roomnan_percentage_jerserycity = (complete_jerseycity_df["Number of Rooms"].isna().sum() / complete_jerseycity_df.shape[0]) * 100
print(f'Percentage of nan rooms {round(roomnan_percentage_jerserycity, 2)}%')

Number of hotels with 90 or more rooms: 31
Percentage of hotels with 90 or more rooms: 23.66%
Percentage of nan rooms 49.62%
