In [125]:
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as soup
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
import time
import pickle
import os
import urllib.parse
import random
import requests
from pprint import pprint


In [97]:
BASE_URL = "https://www.tripadvisor.com"

In [96]:
hilo_hotel_lst = []
galveston_hotel_lst = []
sunnyvale_hotel_lst = []
jerseycity_hotel_lst = []

In [95]:
hilo_urls = ['https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html', "https://www.tripadvisor.com/Hotels-g60583-oa30-Hilo_Island_of_Hawaii_Hawaii-Hotels.html"]
galveston_urls = ['https://www.tripadvisor.com/Hotels-g55879-Galveston_Galveston_Island_Texas-Hotels.html', "https://www.tripadvisor.com/Hotels-g55879-oa30-Galveston_Galveston_Island_Texas-Hotels.html", "https://www.tripadvisor.com/Hotels-g55879-oa60-Galveston_Galveston_Island_Texas-Hotels.html", "https://www.tripadvisor.com/Hotels-g55879-oa90-Galveston_Galveston_Island_Texas-Hotels.html"]
sunnyvale_urls = ['https://www.tripadvisor.com/Hotels-g33146-Sunnyvale_California-Hotels.html', "https://www.tripadvisor.com/Hotels-g33146-oa30-Sunnyvale_California-Hotels.html"]
jerseycity_urls = ['https://www.tripadvisor.com/Hotels-g46531-Jersey_City_New_Jersey-Hotels.html', "https://www.tripadvisor.com/Hotels-g46531-oa30-Jersey_City_New_Jersey-Hotels.html", "https://www.tripadvisor.com/Hotels-g46531-oa60-Jersey_City_New_Jersey-Hotels.html", "https://www.tripadvisor.com/Hotels-g46531-oa90-Jersey_City_New_Jersey-Hotels.html"]

In [192]:
def crawl_hotels_lst(url_lst, hotel_info_lst, save_path = False, web_driver_install = False):
    '''
    Crawls the a city's list of hotels on Tripadvisor.
    '''
    for url in url_lst:
        count = 0
        
        if web_driver_install:
            driver = webdriver.Chrome(ChromeDriverManager().install())
        else:
            driver = webdriver.Chrome()
        driver.get(url)
        driver.implicitly_wait(random.randint(30, 100))
        page_source_code = soup(driver.page_source, 'lxml')
        driver.quit()
        
        if save_path:
            save_path = save_path + str(count) + ".pkl"
            with open(save_path, 'wb') as f:
                pickle.dump(page_source_code, f)
            count += 1
        
        hotel_info_lst = get_hotel_info(page_source_code, hotel_info_lst)
        
    return hotel_info_lst


def get_hotel_info(page_source_code, hotel_info_lst):
    '''
    Creates a lists within a list of hotel information from the page source 
    code from a Tripadvisor city's list of hotels webpage.
    '''
    hotel_divs = page_source_code.select('div[class*="rlqQt"]')
    # print(hotel_divs)

    for hotel in hotel_divs:
        location_name = hotel.select('h3[class*="nBrpc"]')
        location_name = location_name[0].get_text().split(".")[1].strip()
        
        # parent_location_url = hotel.select('a[class*="lqfZ"]')
        location_url_div = hotel.select('div[class*="jsTLT"]')
        location_url_a_tag = location_url_div[0].select('a')
        location_url_href = location_url_a_tag[0]['href']
        location_url = urllib.parse.urljoin(BASE_URL, location_url_href)
        # print(location_url)
        
        # Example: <div class="luFhX o W f u w JSdbl" aria-label="4.5 of 5 bubbles. 1,909 reviews">
        review_info_div = hotel.select('div[class*="luFhX"]')
        review_info = review_info_div[0].get('aria-label')
        # print(review_info)
        
        if "bubbles." in review_info:
            rating = review_info.split("bubbles.")[0].strip().split(" ")[0]
            review_count = review_info.split("bubbles.")[1].strip().split(" ")[0]
            # print(rating, "-", review_count)
        else:
            review_info.strip().split(" ")[0]
            rating = np.nan
            review_count = 0
            
        hotel_info_lst.append([location_name, location_url, rating, review_count])
    
    return hotel_info_lst
        
def lst_to_df(hotel_info_lst):
    df = pd.DataFrame(hotel_info_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count"])
    return df

## Hilo, Hawaii

In [80]:
partial_hilo_hotel_lst = crawl_hotels_lst(hilo_urls, hilo_hotel_lst, save_path = "hilo_hotel_source_code", web_driver_install = False)

In [81]:
partial_hilo_df = lst_to_df(partial_hilo_hotel_lst)

In [106]:
partial_hilo_df.shape

(37, 4)

In [83]:
partial_hilo_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count
0,SCP Hilo Hotel,https://www.tripadvisor.com/Hotel_Review-g6058...,4.5,226
1,The Inn at Kulaniapia Falls,https://www.tripadvisor.com/Hotel_Review-g6058...,4.5,1164
2,Hilo Honu Inn Bed and Breakfast,https://www.tripadvisor.com/Hotel_Review-g6058...,5.0,312
3,Grand Naniloa Hotel Hilo - A Doubletree By Hilton,https://www.tripadvisor.com/Hotel_Review-g6058...,3.5,1759
4,Hilo Hawaiian Hotel,https://www.tripadvisor.com/Hotel_Review-g6058...,3.5,2420


In [130]:
partial_hilo_df.to_csv("partial_hilo_hotel_info.csv", index = False)

In [129]:
task_params = {
        "target": "universal",
        "url": url # hotel_tripadvisor_url
}

response = requests.post(
    "https://scrape.smartproxy.com/v1/tasks",
    json = task_params,
    auth=("U0000158310", "PW1264e87a63ebb47fbe403aff536802abf")
)

label_div = soup.find('div', text='NUMBER OF ROOMS')
number_of_rooms = label_div.find_next('div').text.strip() if label_div else np.nan



pprint(response.json())


{'results': [{'content': '<!DOCTYPE html><html lang="en-US"><head><meta '
                         'http-equiv="content-type" content="text/html; '
                         'charset=utf-8"/><meta http-equiv="content-language" '
                         'content="en"/><link '
                         'href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans-VF.woff2?v1.002" '
                         'rel="preload" as="font" type="font/woff2" '
                         'crossorigin="anonymous"/><link rel="icon" '
                         'id="favicon" '
                         'href="https://static.tacdn.com/favicon.ico?v2" '
                         'type="image/x-icon"/><link rel="mask-icon" '
                         'sizes="any" '
                         'href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" '
                         'color="#000000"/><meta name="theme-color" '
                         'content="#34e0a1"/><meta name="format-dete

In [143]:
html_string = response.json()['results'][0]['content']



In [141]:
type(response.json()['results'][0]['content'])

str

In [176]:
def get_room_number(incomplete_hotel_lst):
    '''
    Use Smartproxy API to scrape the the hotel address and number of rooms 
    from the hotel's tripadvisor page.
    '''
    for index, (_, url, _, _) in enumerate(incomplete_hotel_lst):
        failed_urls = []
        
        task_params = {
        "target": "universal",
        "url": url # hotel_tripadvisor_url
        }

        response = requests.post(
            "https://scrape.smartproxy.com/v1/tasks",
            json = task_params,
            auth=("U0000158310", "Wznl9MXYhvtuku1_12") # New password
        )
        
        html_string = response.json()['results'][0]['content']

        hotel_source_html = soup(html_string, 'lxml')
        # <span class="CdhWK _S "><span class="biGQs _P pZUbB KxBGd">126 Banyan Way, Hilo, Island of Hawaii, HI 96720</span>
        try:
            address_parent_span = hotel_source_html.select_one('span[class*="CdhWK"]')
            address_span = address_parent_span.select('span[class*="biGQs"]')
            address = address_span[0].text.strip()
        except Exception as e:
            address = np.nan
            print(f"400: {url}")
            failed_urls.append((url, "address", e))

        # rooms <div class="IhqAp Ci">140</div>
        try:
            label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')
            number_of_rooms = label_div.find_next('div').text.strip() if label_div else np.nan
        except Exception as e:
            number_of_rooms = np.nan
            print(f"400: {url}")
            failed_urls.append((url, "number of rooms", e))
        # print('rooms:', number_of_rooms)
        incomplete_hotel_lst[index] = incomplete_hotel_lst[index] + [address, number_of_rooms]
    
    print("Completed!")
    return incomplete_hotel_lst, failed_urls
        

In [178]:
partial_hilo_hotel_lst = pd.read_csv("partial_hilo_hotel_info.csv").values.tolist()

In [179]:
complete_hilo_hotel_lst, hilo_failed_url = get_room_number(partial_hilo_hotel_lst)

  label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')


400: https://www.tripadvisor.com/Hotel_Review-g60583-d626798-Reviews-Hilo_Honu_Inn_Bed_and_Breakfast-Hilo_Island_of_Hawaii_Hawaii.html?lk=449b96a8-9c1f-4242-9343-31adb5894b33
400: https://www.tripadvisor.com/Hotel_Review-g60583-d8454507-Reviews-Kama_aina_Inn-Hilo_Island_of_Hawaii_Hawaii.html?lk=e7785ae5-1baf-4bd3-a0e9-3e3fb0af5fda
400: https://www.tripadvisor.com/Hotel_Review-g60583-d6123125-Reviews-Shaka_Shak_Garden_Inn-Hilo_Island_of_Hawaii_Hawaii.html?lk=0ddce907-e677-46d6-946c-c7c4e896518d
Completed!


In [180]:
complete_hilo_df = pd.DataFrame(complete_hilo_hotel_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count", "Address", "Number of Rooms"])

In [181]:
complete_hilo_df.shape

(37, 6)

In [182]:
complete_hilo_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count,Address,Number of Rooms
0,SCP Hilo Hotel,https://www.tripadvisor.com/Hotel_Review-g6058...,4.5,226,"126 Banyan Way, Hilo, Island of Hawaii, HI 96720",140
1,The Inn at Kulaniapia Falls,https://www.tripadvisor.com/Hotel_Review-g6058...,4.5,1164,"100 Kulaniapia Dr, Hilo, Island of Hawaii, HI ...",13
2,Hilo Honu Inn Bed and Breakfast,https://www.tripadvisor.com/Hotel_Review-g6058...,5.0,312,,3
3,Grand Naniloa Hotel Hilo - A Doubletree By Hilton,https://www.tripadvisor.com/Hotel_Review-g6058...,3.5,1759,"93 Banyan Dr, Hilo, Island of Hawaii, HI 96720...",378
4,Hilo Hawaiian Hotel,https://www.tripadvisor.com/Hotel_Review-g6058...,3.5,2420,"71 Banyan Drive, Hilo, Island of Hawaii, HI 96...",286


In [222]:
ninety_room_threshold = len(complete_hilo_df.loc[complete_hilo_df.loc[:, "Number of Rooms"].astype(float) >= 90, :])
print(f'Number of hotels with 90 or more rooms: {ninety_room_threshold}')
ninety_room_percentage = (ninety_room_threshold / len(complete_hilo_df)) * 100
print(f'Percentage of hotels with 90 or more rooms: {round(ninety_room_percentage, 2)}%')
roomnan_percentage_hilo = (complete_hilo_df["Number of Rooms"].isna().sum() / len(complete_hilo_df)) * 100
print(f'Percentage of nan rooms {round(roomnan_percentage_hilo, 2)}%')

Number of hotels with 90 or more rooms: 4
Percentage of hotels with 90 or more rooms: 10.81%
Percentage of nan rooms 13.51%


## Galveston, Texas

In [84]:
partial_galveston_hotel_lst = crawl_hotels_lst(galveston_urls, galveston_hotel_lst, save_path = "galveston_hotel_source_code", web_driver_install = False)


In [89]:
partial_galveston_df = lst_to_df(partial_galveston_hotel_lst)

In [90]:
partial_galveston_df.shape

(111, 4)

In [131]:
partial_galveston_df.to_csv("partial_galveston_hotel_info.csv", index = False)

In [183]:
partial_galveston_hotel_lst = pd.read_csv("partial_galveston_hotel_info.csv").values.tolist()

complete_galveston_hotel_lst, galveston_failed_url = get_room_number(partial_galveston_hotel_lst)

complete_galveston_df = pd.DataFrame(complete_galveston_hotel_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count", "Address", "Number of Rooms"])

  label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')


400: https://www.tripadvisor.com/Hotel_Review-g55879-d619548-Reviews-Baymont_by_Wyndham_Galveston-Galveston_Galveston_Island_Texas.html?lk=3b1f7c01-9b01-40b4-ab07-8687c9ff3ca8
400: https://www.tripadvisor.com/Hotel_Review-g55879-d1601122-Reviews-Diamond_Beach-Galveston_Galveston_Island_Texas.html?lk=7a1d8d82-7a63-4147-b5ea-ab1daa4805ad
400: https://www.tripadvisor.com/Hotel_Review-g55879-d26487915-Reviews-Casa_Del_Mar_Ocean_View-Galveston_Galveston_Island_Texas.html?lk=390589f5-571c-4a44-be46-3bc867eac7a4
Completed!


In [184]:
complete_galveston_df.shape

(111, 6)

In [185]:
complete_galveston_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count,Address,Number of Rooms
0,"The San Luis Resort, Spa and Conference Center",https://www.tripadvisor.com/Hotel_Review-g5587...,4.5,4117,"5222 Seawall Blvd, Galveston, Galveston Island...",250
1,The Tremont House Hotel,https://www.tripadvisor.com/Hotel_Review-g5587...,4.5,3458,"2300 Ship Mechanic Row St., Galveston, Galvest...",133
2,"Moody Gardens Hotel, Spa and Convention Center",https://www.tripadvisor.com/Hotel_Review-g5587...,4.5,2489,"7 Hope Blvd, Galveston, Galveston Island, TX 7...",423
3,Hilton Galveston Island Resort,https://www.tripadvisor.com/Hotel_Review-g5587...,4.0,3810,"5400 Seawall Boulevard, Galveston, Galveston I...",240
4,"Holiday Inn Resort Galveston-on the Beach, an ...",https://www.tripadvisor.com/Hotel_Review-g5587...,4.0,2506,"5002 Seawall Boulevard, Galveston, Galveston I...",181


In [220]:
ninety_room_threshold = len(complete_galveston_df.loc[complete_galveston_df.loc[:, "Number of Rooms"].astype(float) >= 90, :])
print(f'Number of hotels with 90 or more rooms: {ninety_room_threshold}')
ninety_room_percentage = (ninety_room_threshold / len(complete_galveston_df)) * 100
print(f'Percentage of hotels with 90 or more rooms: {round(ninety_room_percentage, 2)}%')
roomnan_percentage_galveston = (complete_galveston_df["Number of Rooms"].isna().sum() / len(complete_galveston_df)) * 100
print(f'Percentage of nan rooms {round(roomnan_percentage_galveston, 2)}%')

Number of hotels with 90 or more rooms: 31
Percentage of hotels with 90 or more rooms: 27.93%
Percentage of nan rooms 30.63%


## Sunnyvale, California

In [100]:
partial_sunnyvale_hotel_lst = crawl_hotels_lst(sunnyvale_urls, sunnyvale_hotel_lst, save_path = "sunnyvale_hotel_info_", web_driver_install = False)

In [101]:
partial_sunnyvale_df = lst_to_df(partial_sunnyvale_hotel_lst)

In [134]:
partial_sunnyvale_df.shape

(48, 4)

In [73]:
print(partial_sunnyvale_hotel_lst)

[['Radisson Hotel Sunnyvale - Silicon Valley', 'https://www.tripadvisor.com/Hotel_Review-g33146-d82296-Reviews-Radisson_Hotel_Sunnyvale_Silicon_Valley-Sunnyvale_California.html?lk=b32528b3-7092-41e8-bddc-a377a71d74a5', '4.5', '1,203'], ['The Grand', 'https://www.tripadvisor.com/Hotel_Review-g33146-d225415-Reviews-The_Grand-Sunnyvale_California.html?lk=3a412f7b-5785-478b-a63c-5e8d1783408f', '4.5', '1,909'], ['Maple Tree Inn', 'https://www.tripadvisor.com/Hotel_Review-g33146-d84761-Reviews-Maple_Tree_Inn-Sunnyvale_California.html?lk=d57b1303-8962-4be5-82ea-ffde480324ff', '4.5', '491'], ['Wild Palms Hotel, A Jdv By Hyatt Hotel', 'https://www.tripadvisor.com/Hotel_Review-g33146-d217277-Reviews-Wild_Palms_Hotel_A_Jdv_By_Hyatt_Hotel-Sunnyvale_California.html?lk=7cc350c3-0e4c-4f0e-b55f-cad7fac51ae5', '4.0', '1,698'], ['Hilton Garden Inn Sunnyvale', 'https://www.tripadvisor.com/Hotel_Review-g33146-d17726425-Reviews-Hilton_Garden_Inn_Sunnyvale-Sunnyvale_California.html?lk=5976db44-7a79-4972-837

In [133]:
partial_sunnyvale_df.to_csv("partial_sunnyvale_hotel_info.csv", index = False)

In [186]:
partial_sunnyvale_hotel_lst = pd.read_csv("partial_sunnyvale_hotel_info.csv").values.tolist()

In [188]:
complete_sunnyvale_hotel_lst, sunnyvale_failed_url = get_room_number(partial_sunnyvale_hotel_lst)

  label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')


400: https://www.tripadvisor.com/Hotel_Review-g33146-d225415-Reviews-The_Grand-Sunnyvale_California.html?lk=835fe2bf-6065-45e9-9523-9bfeb664f1d3
400: https://www.tripadvisor.com/Hotel_Review-g33146-d21504015-Reviews-Tetra_Hotel_Autograph_Collection-Sunnyvale_California.html?lk=1c76fc62-036b-47cd-ac32-f741dbb06e76
400: https://www.tripadvisor.com/Hotel_Review-g33146-d21248236-Reviews-Redwood_Place_in_Heart_of_Silicon_Valley-Sunnyvale_California.html?lk=70326539-9d28-4d4d-b904-93cb98bb9622
Completed!


In [189]:
complete_sunnyvale_df = pd.DataFrame(complete_sunnyvale_hotel_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count", "Address", "Number of Rooms"])


In [190]:
complete_sunnyvale_df.shape

(48, 6)

In [191]:
complete_sunnyvale_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count,Address,Number of Rooms
0,Radisson Hotel Sunnyvale - Silicon Valley,https://www.tripadvisor.com/Hotel_Review-g3314...,4.5,1203,"1300 Chesapeake Terrace, Sunnyvale, CA 94089",180
1,The Grand,https://www.tripadvisor.com/Hotel_Review-g3314...,4.5,1909,,104
2,Maple Tree Inn,https://www.tripadvisor.com/Hotel_Review-g3314...,4.5,491,"711 E El Camino Real, Sunnyvale, CA 94087-2900",178
3,"Wild Palms Hotel, A Jdv By Hyatt Hotel",https://www.tripadvisor.com/Hotel_Review-g3314...,4.0,1698,"910 E Fremont Ave, Sunnyvale, CA 94087-3702",207
4,Hilton Garden Inn Sunnyvale,https://www.tripadvisor.com/Hotel_Review-g3314...,4.5,59,"767 N. Mathilda, Sunnyvale, CA 94085",238


In [221]:
ninety_room_threshold = len(complete_sunnyvale_df.loc[complete_sunnyvale_df.loc[:, "Number of Rooms"].astype(float) >= 90, :])
print(f'Number of hotels with 90 or more rooms: {ninety_room_threshold}')
ninety_room_percentage = (ninety_room_threshold / len(complete_sunnyvale_df)) * 100
print(f'Percentage of hotels with 90 or more rooms: {round(ninety_room_percentage, 2)}%')
roomnan_percentage_sunnyvale = (complete_sunnyvale_df["Number of Rooms"].isna().sum() / len(complete_sunnyvale_df)) * 100
print(f'Percentage of nan rooms {round(roomnan_percentage_sunnyvale, 2)}%')

Number of hotels with 90 or more rooms: 25
Percentage of hotels with 90 or more rooms: 52.08%
Percentage of nan rooms 22.92%


## Jersey City, New Jersey

In [196]:
def api_crawlhotels_lst(url_lst, hotel_info_lst, save_path = False):
    '''
    Use a Smartproxy API to scrape the hotel information from a list of hotel 
    urls.
    '''
    for url in url_lst:
        count = 0
        task_params = {
        "target": "universal",
        "url": url # hotel_tripadvisor_url
        }

        response = requests.post(
            "https://scrape.smartproxy.com/v1/tasks",
            json = task_params,
            auth=("U0000158310", "PW1264e87a63ebb47fbe403aff536802abf")
        )
        html_string = response.json()['results'][0]['content']
        
        page_source_code = soup(html_string, 'lxml')

        if save_path:
            save_path = save_path + str(count) + ".pkl"
            with open(save_path, 'wb') as f:
                pickle.dump(page_source_code, f)
            count += 1
        
        hotel_info_lst = get_hotel_info(page_source_code, hotel_info_lst)
        
    return hotel_info_lst

In [198]:
partial_jerseycity_hotel_lst = api_crawlhotels_lst(jerseycity_urls, jerseycity_hotel_lst, save_path = "jerseycity_hotel_info_")

In [199]:
partial_jerseycity_df = lst_to_df(partial_jerseycity_hotel_lst)

In [200]:
partial_jerseycity_df.shape

(131, 4)

In [201]:
partial_jerseycity_df.to_csv("partial_jerseycity_hotel_info.csv", index = False)

In [202]:
partial_jerseycity_hotel_lst = pd.read_csv("partial_jerseycity_hotel_info.csv").values.tolist()

In [203]:
complete_jerseycity_hotel_lst, jerseycity_failed_url = get_room_number(partial_jerseycity_hotel_lst)

  label_div = hotel_source_html.find('div', text='NUMBER OF ROOMS')


400: https://www.tripadvisor.com/Hotel_Review-g46531-d20173142-Reviews-Canopy_by_Hilton_Jersey_City_Arts_District-Jersey_City_New_Jersey.html?lk=bc889c6f-86f1-4e75-a8ef-0e8fbf133519
400: https://www.tripadvisor.com/Hotel_Review-g46531-d225883-Reviews-Sonesta_Simply_Suites_Jersey_City-Jersey_City_New_Jersey.html?lk=23e6d6f0-df27-4245-b8fa-40884444b7df
400: https://www.tripadvisor.com/Hotel_Review-g46531-d225883-Reviews-Sonesta_Simply_Suites_Jersey_City-Jersey_City_New_Jersey.html?lk=812c43bf-9670-4d62-85dd-e253e9f14210
400: https://www.tripadvisor.com/Hotel_Review-g46531-d13994921-Reviews-Kensho_homes-Jersey_City_New_Jersey.html?lk=11e24cb4-29cc-4a59-ba52-2e5c62b41947
400: https://www.tripadvisor.com/Hotel_Review-g46531-d92414-Reviews-Days_Inn_by_Wyndham_Jersey_City-Jersey_City_New_Jersey.html?lk=96c0e7e9-8a1d-4a5d-9174-1f29cf85419a
400: https://www.tripadvisor.com/Hotel_Review-g46531-d5535264-Reviews-The_Pier-Jersey_City_New_Jersey.html?lk=6d0db5ac-410c-4bc8-80cc-ca9958e294c9
400: http

In [204]:
complete_jerseycity_df = pd.DataFrame(complete_jerseycity_hotel_lst, columns = ["Location Name", "Location URL", "Rating", "Review Count", "Address", "Number of Rooms"])

In [205]:
complete_jerseycity_df.shape

(131, 6)

In [206]:
complete_jerseycity_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count,Address,Number of Rooms
0,The Westin Jersey City Newport,https://www.tripadvisor.com/Hotel_Review-g4653...,4.5,1720,"479 Washington Blvd, Jersey City, NJ 07310-1972",429
1,Hyatt Regency Jersey City on the Hudson,https://www.tripadvisor.com/Hotel_Review-g4653...,4.5,4289,"2 Exchange Pl, Jersey City, NJ 07302-3901",351
2,Hyatt House Jersey City,https://www.tripadvisor.com/Hotel_Review-g4653...,4.0,1269,"1 Exchange Pl, Jersey City, NJ 07302-3920",258
3,The Holland Hotel,https://www.tripadvisor.com/Hotel_Review-g4653...,4.0,779,"175 12th St, Jersey City, NJ 07310-1405",70
4,DoubleTree by Hilton Hotel & Suites Jersey City,https://www.tripadvisor.com/Hotel_Review-g4653...,4.0,1891,"455 Washington Boulevard, Jersey City, NJ 07310",198


In [218]:
ninety_room_threshold = len(complete_jerseycity_df.loc[complete_jerseycity_df.loc[:, "Number of Rooms"].astype(float) >= 90, :])
print(f'Number of hotels with 90 or more rooms: {ninety_room_threshold}')
ninety_room_percentage = (ninety_room_threshold / len(complete_jerseycity_df)) * 100
print(f'Percentage of hotels with 90 or more rooms: {round(ninety_room_percentage, 2)}%')
roomnan_percentage_jerserycity = (complete_jerseycity_df["Number of Rooms"].isna().sum() / complete_jerseycity_df.shape[0]) * 100
print(f'Percentage of nan rooms {round(roomnan_percentage_jerserycity, 2)}%')

Number of hotels with 90 or more rooms: 31
Percentage of hotels with 90 or more rooms: 23.66%
Percentage of nan rooms 49.62%
