In [63]:
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
import time
import pickle
import os
import urllib.parse
import random


In [64]:
# Site root; relative hotel links scraped from listing pages are joined onto this.
BASE_URL = "https://www.tripadvisor.com"

In [76]:
# One independent accumulator per city. crawl_hotels_lst hands these to
# get_hotel_info, which appends rows in place, so re-run this cell before
# re-crawling to avoid duplicated rows.
hilo_hotel_lst, galveston_hotel_lst, sunnyvale_hotel_lst, jerseycity_hotel_lst = [], [], [], []

In [75]:
# Listing pages per city. The "-oaNN-" segment appears on every page after
# the first, in steps of 30.
hilo_urls = [
    'https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html',
    "https://www.tripadvisor.com/Hotels-g60583-oa30-Hilo_Island_of_Hawaii_Hawaii-Hotels.html",
]
galveston_urls = [
    'https://www.tripadvisor.com/Hotels-g55879-Galveston_Galveston_Island_Texas-Hotels.html',
    "https://www.tripadvisor.com/Hotels-g55879-oa30-Galveston_Galveston_Island_Texas-Hotels.html",
    "https://www.tripadvisor.com/Hotels-g55879-oa60-Galveston_Galveston_Island_Texas-Hotels.html",
    "https://www.tripadvisor.com/Hotels-g55879-oa90-Galveston_Galveston_Island_Texas-Hotels.html",
]
sunnyvale_urls = [
    'https://www.tripadvisor.com/Hotels-g33146-Sunnyvale_California-Hotels.html',
    "https://www.tripadvisor.com/Hotels-g33146-oa30-Sunnyvale_California-Hotels.html",
]
jerseycity_urls = [
    'https://www.tripadvisor.com/Hotels-g46531-Jersey_City_New_Jersey-Hotels.html',
    "https://www.tripadvisor.com/Hotels-g46531-oa30-Jersey_City_New_Jersey-Hotels.html",
    "https://www.tripadvisor.com/Hotels-g46531-oa60-Jersey_City_New_Jersey-Hotels.html",
    "https://www.tripadvisor.com/Hotels-g46531-oa90-Jersey_City_New_Jersey-Hotels.html",
]

In [88]:
def crawl_hotels_lst(url_lst, hotel_info_lst, save_path = False, web_driver_install = False):
    """Load each listing page in Chrome, parse it, and collect hotel rows.

    Args:
        url_lst: Listing-page URLs to visit, in order.
        hotel_info_lst: List handed to ``get_hotel_info`` for every page;
            that function appends one row per hotel to it.
        save_path: Falsy to skip saving. Otherwise a file-name prefix: the
            parsed page for the i-th URL (0-based) is pickled to
            ``f"{save_path}{i}.pkl"``.
        web_driver_install: When true, the chromedriver location comes from
            ``ChromeDriverManager().install()``; otherwise
            ``webdriver.Chrome()`` is created with its defaults.

    Returns:
        Whatever the last ``get_hotel_info`` call returned, or
        ``hotel_info_lst`` itself when ``url_lst`` is empty.
    """
    for page_index, url in enumerate(url_lst):
        if web_driver_install:
            # NOTE(review): passing the driver path positionally is the older
            # Selenium calling convention; confirm it matches the installed version.
            driver = webdriver.Chrome(ChromeDriverManager().install())
        else:
            driver = webdriver.Chrome()

        try:
            driver.get(url)
            # NOTE: implicitly_wait only sets the element-lookup timeout; it
            # does not pause before page_source is read below.
            driver.implicitly_wait(random.randint(30, 100))
            page_source_code = soup(driver.page_source, 'lxml')
        finally:
            # Close the browser even when loading or parsing fails.
            driver.quit()

        if save_path:
            # Derive the file name from the untouched prefix and the page
            # index, so every page gets its own "<prefix><i>.pkl" file.
            page_dump_path = f"{save_path}{page_index}.pkl"
            with open(page_dump_path, 'wb') as f:
                pickle.dump(page_source_code, f)

        hotel_info_lst = get_hotel_info(page_source_code, hotel_info_lst)

    return hotel_info_lst


def get_hotel_info(page_source_code, hotel_info_lst):
    """Append one ``[name, url, rating, review_count]`` row per hotel card.

    Args:
        page_source_code: Parsed listing page; must support CSS ``select``.
        hotel_info_lst: List that receives the rows (mutated in place).

    Returns:
        ``hotel_info_lst``, after appending.

    Notes:
        * Rating and review count are kept as the text found in the card's
          ``aria-label`` (e.g. ``"4.5"`` and ``"1,909"``). When the label is
          missing or has no "bubbles." part, they are ``np.nan`` and ``0``.
        * Cards without a name heading or a link are skipped.
    """
    for hotel in page_source_code.select('div[class*="rlqQt"]'):
        name_tags = hotel.select('h3[class*="nBrpc"]')
        link_wrappers = hotel.select('div[class*="jsTLT"]')
        link_tags = link_wrappers[0].select('a') if link_wrappers else []
        if not name_tags or not link_tags:
            # Matched container is not a complete hotel card.
            continue

        # Headings look like "1. SCP Hilo Hotel". Strip only a leading numeric
        # rank so that names containing "." (e.g. "St. Regis") stay intact.
        heading = name_tags[0].get_text().strip()
        rank, dot, remainder = heading.partition(".")
        if dot and rank.strip().isdigit():
            location_name = remainder.strip()
        else:
            location_name = heading

        location_url = urllib.parse.urljoin(BASE_URL, link_tags[0]['href'])

        # Example: <div class="luFhX o W f u w JSdbl" aria-label="4.5 of 5 bubbles. 1,909 reviews">
        review_tags = hotel.select('div[class*="luFhX"]')
        review_info = (review_tags[0].get('aria-label') if review_tags else None) or ""

        if "bubbles." in review_info:
            rating_text, _, count_text = review_info.partition("bubbles.")
            rating = rating_text.strip().split(" ")[0]
            review_count = count_text.strip().split(" ")[0]
        else:
            rating = np.nan
            review_count = 0

        hotel_info_lst.append([location_name, location_url, rating, review_count])

    return hotel_info_lst
        
def lst_to_df(hotel_info_lst):
    """Wrap the scraped hotel rows in a DataFrame with labelled columns."""
    column_labels = ["Location Name", "Location URL", "Rating", "Review Count"]
    return pd.DataFrame(data=hotel_info_lst, columns=column_labels)

In [80]:
# Crawl the Hilo listing pages; rows accumulate in hilo_hotel_lst and parsed pages are pickled under the given prefix.
partial_hilo_hotel_lst = crawl_hotels_lst(hilo_urls, hilo_hotel_lst, save_path = "hilo_hotel_source_code", web_driver_install = False)

In [81]:
# Tabulate the scraped Hilo rows.
partial_hilo_df = lst_to_df(partial_hilo_hotel_lst)

In [82]:
# (rows, columns) of the Hilo table.
partial_hilo_df.shape

(37, 4)

In [83]:
# Preview the first five Hilo rows.
partial_hilo_df.head(5)

Unnamed: 0,Location Name,Location URL,Rating,Review Count
0,SCP Hilo Hotel,https://www.tripadvisor.com/Hotel_Review-g6058...,4.5,226
1,The Inn at Kulaniapia Falls,https://www.tripadvisor.com/Hotel_Review-g6058...,4.5,1164
2,Hilo Honu Inn Bed and Breakfast,https://www.tripadvisor.com/Hotel_Review-g6058...,5.0,312
3,Grand Naniloa Hotel Hilo - A Doubletree By Hilton,https://www.tripadvisor.com/Hotel_Review-g6058...,3.5,1759
4,Hilo Hawaiian Hotel,https://www.tripadvisor.com/Hotel_Review-g6058...,3.5,2420


In [84]:
# Crawl the Galveston listing pages; rows accumulate in galveston_hotel_lst and parsed pages are pickled under the given prefix.
partial_galveston_hotel_lst = crawl_hotels_lst(galveston_urls, galveston_hotel_lst, save_path = "galveston_hotel_source_code", web_driver_install = False)


In [89]:
# Tabulate the scraped Galveston rows.
partial_galveston_df = lst_to_df(partial_galveston_hotel_lst)

In [90]:
# (rows, columns) of the Galveston table.
partial_galveston_df.shape

(111, 4)

In [None]:
# Crawl the Sunnyvale listing pages; rows accumulate in sunnyvale_hotel_lst and parsed pages are pickled under the given prefix.
partial_sunnyvale_hotel_lst = crawl_hotels_lst(sunnyvale_urls, sunnyvale_hotel_lst, save_path = "sunnyvale_hotel_info_", web_driver_install = False)

In [73]:
# Dump every scraped Sunnyvale row as plain text.
print(partial_sunnyvale_hotel_lst)

[['Radisson Hotel Sunnyvale - Silicon Valley', 'https://www.tripadvisor.com/Hotel_Review-g33146-d82296-Reviews-Radisson_Hotel_Sunnyvale_Silicon_Valley-Sunnyvale_California.html?lk=b32528b3-7092-41e8-bddc-a377a71d74a5', '4.5', '1,203'], ['The Grand', 'https://www.tripadvisor.com/Hotel_Review-g33146-d225415-Reviews-The_Grand-Sunnyvale_California.html?lk=3a412f7b-5785-478b-a63c-5e8d1783408f', '4.5', '1,909'], ['Maple Tree Inn', 'https://www.tripadvisor.com/Hotel_Review-g33146-d84761-Reviews-Maple_Tree_Inn-Sunnyvale_California.html?lk=d57b1303-8962-4be5-82ea-ffde480324ff', '4.5', '491'], ['Wild Palms Hotel, A Jdv By Hyatt Hotel', 'https://www.tripadvisor.com/Hotel_Review-g33146-d217277-Reviews-Wild_Palms_Hotel_A_Jdv_By_Hyatt_Hotel-Sunnyvale_California.html?lk=7cc350c3-0e4c-4f0e-b55f-cad7fac51ae5', '4.0', '1,698'], ['Hilton Garden Inn Sunnyvale', 'https://www.tripadvisor.com/Hotel_Review-g33146-d17726425-Reviews-Hilton_Garden_Inn_Sunnyvale-Sunnyvale_California.html?lk=5976db44-7a79-4972-837

In [85]:
# Crawl the Jersey City listing pages; rows accumulate in jerseycity_hotel_lst and parsed pages are pickled under the given prefix.
partial_jerseycity_hotel_lst = crawl_hotels_lst(jerseycity_urls, jerseycity_hotel_lst, save_path = "jerseycity_hotel_info_", web_driver_install = False)

In [86]:
# Tabulate the scraped Jersey City rows.
partial_jerseycity_df = lst_to_df(partial_jerseycity_hotel_lst)

In [87]:
# (rows, columns) of the Jersey City table.
partial_jerseycity_df.shape

(40, 4)

## Hilo, Hawaii

In [None]:
# NOTE(review): this prints the Sunnyvale rows although it sits under the Hilo heading — likely a leftover cell; confirm.
print(partial_sunnyvale_hotel_lst)

In [6]:
# Fetch the first Hilo listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(12)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_hilo = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [8]:
# Print the whole parsed page, indented, for manual inspection (very large output).
print(soup_hilo.prettify())

<html lang="en-US">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="en" http-equiv="content-language"/>
  <link as="font" crossorigin="anonymous" href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans-VF.woff2?v1.002" rel="preload" type="font/woff2"/>
  <link href="https://static.tacdn.com/favicon.ico?v2" id="favicon" rel="icon" type="image/x-icon"/>
  <link color="#000000" href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" rel="mask-icon" sizes="any"/>
  <meta content="#34e0a1" name="theme-color"/>
  <meta content="telephone=no" name="format-detection"/>
  <meta content="TripAdvisor" property="al:ios:app_name"/>
  <meta content="284876795" property="al:ios:app_store_id"/>
  <meta content="284876795" name="twitter:app:id:ipad" property="twitter:app:id:ipad"/>
  <meta content="284876795" name="twitter:app:id:iphone" property="twitter:app:id:iphone"/>
  <meta content="tripadvisor://www.tripadvisor.com

In [25]:
# Print the whole parsed page as raw markup (very large output).
print(soup_hilo)

<html lang="en-US"><head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="en" http-equiv="content-language"/><link as="font" crossorigin="anonymous" href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans-VF.woff2?v1.002" rel="preload" type="font/woff2"/><link href="https://static.tacdn.com/favicon.ico?v2" id="favicon" rel="icon" type="image/x-icon"/><link color="#000000" href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" rel="mask-icon" sizes="any"/><meta content="#34e0a1" name="theme-color"/><meta content="telephone=no" name="format-detection"/><meta content="TripAdvisor" property="al:ios:app_name"/><meta content="284876795" property="al:ios:app_store_id"/><meta content="284876795" name="twitter:app:id:ipad" property="twitter:app:id:ipad"/><meta content="284876795" name="twitter:app:id:iphone" property="twitter:app:id:iphone"/><meta content="tripadvisor://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_H

In [58]:
# Scratch check of the card selectors against the first Hilo listing page.
hotel_divs = soup_hilo.select('div[class*="rlqQt"]')

for hotel in hotel_divs:
    location_name = hotel.select('h3[class*="nBrpc"]')
    # Headings look like "1. SCP Hilo Hotel". Strip only a leading numeric
    # rank so that names containing "." are not truncated.
    heading = location_name[0].get_text().strip()
    rank, dot, remainder = heading.partition(".")
    print(remainder.strip() if dot and rank.strip().isdigit() else heading)

    location_url_div = hotel.select('div[class*="jsTLT"]')
    location_url_a_tag = location_url_div[0].select('a')
    location_url_href = location_url_a_tag[0]['href']
    location_url = urllib.parse.urljoin(BASE_URL, location_url_href)
    print(location_url)

    # Example: <div class="luFhX o W f u w JSdbl" aria-label="4.5 of 5 bubbles. 1,909 reviews">
    review_info_div = hotel.select('div[class*="luFhX"]')
    review_info = review_info_div[0].get('aria-label')

    if "bubbles." in review_info:
        rating = review_info.split("bubbles.")[0].strip().split(" ")[0]
        review_count = review_info.split("bubbles.")[1].strip().split(" ")[0]
        print(rating, "-", review_count)
    else:
        rating = np.nan
        review_count = 0

    # Only the first card is inspected.
    break

SCP Hilo Hotel
https://www.tripadvisor.com/Hotel_Review-g60583-d113098-Reviews-SCP_Hilo_Hotel-Hilo_Island_of_Hawaii_Hawaii.html?lk=60aa012b-05ab-44ca-a5c6-4646667c44c0
4.5 - 226


In [None]:
# XPath to the hotel-name text node: /html/body/div[1]/main/div[2]/div/div[3]/div/div[3]/div/div[3]/span/span/span/div/div/div/div[2]/header/div/div/a/h3/text()[2]

In [None]:
# Example hotel-name heading: <h3 class="nBrpc Wd o W RSTby">1. SCP Hilo Hotel</h3>

In [16]:
# Persist the parsed first Hilo page so it can be reloaded without re-fetching.
with open('soup_hilo.pkl', 'wb') as file:
    pickle.dump(soup_hilo, file, protocol=pickle.HIGHEST_PROTOCOL)

# Confirm the file now exists on disk.
print("Pickle file saved:", os.path.isfile('soup_hilo.pkl'))

Pickle file saved: True


In [23]:
# Show which tree builder (parser backend) parsed soup_hilo.
soup_hilo.builder

<bs4.builder._lxml.LXMLTreeBuilder at 0x12c192bd0>

In [7]:
# Fetch the second Hilo listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get("https://www.tripadvisor.com/Hotels-g60583-oa30-Hilo_Island_of_Hawaii_Hawaii-Hotels.html")
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(9)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_hilo_2 = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [11]:
# Print the whole parsed page as raw markup (very large output).
print(soup_hilo_2)

<html lang="en-US"><head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="en" http-equiv="content-language"/><link as="font" crossorigin="anonymous" href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans-VF.woff2?v1.002" rel="preload" type="font/woff2"/><link href="https://static.tacdn.com/favicon.ico?v2" id="favicon" rel="icon" type="image/x-icon"/><link color="#000000" href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" rel="mask-icon" sizes="any"/><meta content="#34e0a1" name="theme-color"/><meta content="telephone=no" name="format-detection"/><meta content="TripAdvisor" property="al:ios:app_name"/><meta content="284876795" property="al:ios:app_store_id"/><meta content="284876795" name="twitter:app:id:ipad" property="twitter:app:id:ipad"/><meta content="284876795" name="twitter:app:id:iphone" property="twitter:app:id:iphone"/><meta content="tripadvisor://www.tripadvisor.com/Hotels-g60583-oa30-Hilo_Island_of_Haw

In [17]:
# Persist the parsed second Hilo page so it can be reloaded without re-fetching.
with open("soup_hilo_2.pkl", "wb") as file:
    pickle.dump(soup_hilo_2, file, protocol=pickle.HIGHEST_PROTOCOL)

# Confirm the file now exists on disk.
print("Pickle file saved:", os.path.isfile("soup_hilo_2.pkl"))

Pickle file saved: True


## Galveston, Texas

In [20]:
# Fetch the first Galveston listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g55879-Galveston_Galveston_Island_Texas-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(10)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_galv = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [21]:
# Print the whole parsed page as raw markup (very large output).
print(soup_galv)

<html lang="en-US"><head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="en" http-equiv="content-language"/><link as="font" crossorigin="anonymous" href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans-VF.woff2?v1.002" rel="preload" type="font/woff2"/><link href="https://static.tacdn.com/favicon.ico?v2" id="favicon" rel="icon" type="image/x-icon"/><link color="#000000" href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" rel="mask-icon" sizes="any"/><meta content="#34e0a1" name="theme-color"/><meta content="telephone=no" name="format-detection"/><meta content="TripAdvisor" property="al:ios:app_name"/><meta content="284876795" property="al:ios:app_store_id"/><meta content="284876795" name="twitter:app:id:ipad" property="twitter:app:id:ipad"/><meta content="284876795" name="twitter:app:id:iphone" property="twitter:app:id:iphone"/><meta content="tripadvisor://www.tripadvisor.com/Hotels-g55879-Galveston_Galveston_Isl

In [None]:
# Persist the parsed first Galveston page so it can be reloaded without re-fetching.
with open("soup_galv.pkl", "wb") as file:
    pickle.dump(soup_galv, file, protocol=pickle.HIGHEST_PROTOCOL)

# Confirm the file now exists on disk.
print("Pickle file saved:", os.path.isfile("soup_galv.pkl"))

In [None]:
# Fetch the second Galveston listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g55879-oa30-Galveston_Galveston_Island_Texas-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(11)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_galv_2 = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [None]:
# Print the whole parsed page as raw markup (very large output).
print(soup_galv_2)


In [None]:
# Fetch the third Galveston listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g55879-oa60-Galveston_Galveston_Island_Texas-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(15)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_galv_3 = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [None]:
# Fetch the fourth Galveston listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g55879-oa90-Galveston_Galveston_Island_Texas-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(7)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_galv_4 = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

## Sunnyvale, California

In [None]:
# Fetch the first Sunnyvale listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g33146-Sunnyvale_California-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(12)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_suny = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [None]:
# Fetch the second Sunnyvale listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g33146-oa30-Sunnyvale_California-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(9)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_suny_2 = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

## Jersey City, New Jersey

In [None]:
# Fetch the first Jersey City listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g46531-Jersey_City_New_Jersey-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(15)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_jc = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [None]:
# Fetch the second Jersey City listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g46531-oa30-Jersey_City_New_Jersey-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(9)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_jc_2 = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [None]:
# Fetch the third Jersey City listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g46531-oa60-Jersey_City_New_Jersey-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(14)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_jc_3 = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [None]:
# Fetch the fourth Jersey City listing page and parse the rendered HTML.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g46531-oa90-Jersey_City_New_Jersey-Hotels.html')
    # NOTE: implicitly_wait only sets the element-lookup timeout; it does not pause here.
    driver.implicitly_wait(19)
    # Name the parser explicitly (as crawl_hotels_lst does) so the tree does
    # not depend on which parsers happen to be installed.
    soup_jc_4 = soup(driver.page_source, 'lxml')
finally:
    # Close the browser even if the page load fails.
    driver.quit()

In [None]:
#############################################
#                  Old Code                 #
#############################################

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def get_hotel_name(base_city_url):
    """Open a listing page in headless Chrome and print the first name heading element.

    Args:
        base_city_url: URL of the city listing page to load.

    Returns:
        None. The located element (or the lookup error) is printed.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # NOTE(review): machine-specific chromedriver location; adjust per environment.
    service = Service(executable_path='/opt/homebrew/bin/chromedriver')
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(base_city_url)
        try:
            # By.CLASS_NAME takes a single class name; several classes must be
            # written as a CSS selector with one ".class" per name.
            element = driver.find_element(By.CSS_SELECTOR, '.nBrpc.Wd.o.W')
            print(element)
        except Exception as e:
            print("Error: ", e)
    finally:
        # Close the browser even if the page load itself fails.
        driver.quit()

# Try the lookup against the first Hilo listing page.
get_hotel_name("https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html")

In [None]:
# List the hotel-name headings on the first Hilo page via live Selenium lookups.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get("https://www.tripadvisor.com/Hotels-g60583-Hilo_Island_of_Hawaii_Hawaii-Hotels.html")

    # Applies to the find_elements call below: wait up to 10 s for a match.
    driver.implicitly_wait(10)

    elements = driver.find_elements(By.CSS_SELECTOR, "h3.nBrpc.Wd.o.W")

    # Read the text while the browser session is still open.
    resort_names = [element.text for element in elements]
finally:
    # Close the browser even if the lookup fails.
    driver.quit()

for name in resort_names:
    print(name)

In [None]:
# Open the Galveston listing page in Chrome. Nothing is read from the browser
# here, so close it straight away instead of leaving the session running.
driver = webdriver.Chrome()
try:
    driver.get('https://www.tripadvisor.com/Hotels-g55879-Galveston_Galveston_Island_Texas-Hotels.html')
    driver.implicitly_wait(10)
finally:
    driver.quit()

# Fetch the same page with plain HTTP and parse the static markup.
html = requests.get('https://www.tripadvisor.com/Hotels-g55879-Galveston_Galveston_Island_Texas-Hotels.html')
# A bare expression in the middle of a cell is never displayed; print it instead.
print("HTTP status:", html.status_code)

bsobj = soup(html.content, 'lxml')

# find_all is the current name of the legacy findAll alias.
hotel = [name.text.strip() for name in bsobj.find_all('div', {'class': 'listing_title'})]

ratings = [rating['alt'] for rating in bsobj.find_all('a', {'class': 'ui_bubble_rating'})]

reviews = [review.text.strip() for review in bsobj.find_all('a', {'class': 'review_count'})]

In [None]:
# Request a single hotel review page and show the HTTP status code.
url = "https://www.tripadvisor.com/Hotel_Review-g55879-d122738-Reviews-Hilton_Galveston_Island_Resort-Galveston_Galveston_Island_Texas.html"

response = requests.get(url)
# The status lives on the response object, not on the URL string.
response.status_code

In [None]:
import requests
from bs4 import BeautifulSoup

# Fetch the Galveston listing page with plain HTTP and print every listing title.
url = 'https://www.tripadvisor.com/Hotels-g55879-Galveston_Galveston_Island_Texas-Hotels.html'

response = requests.get(url)

# Do not call this variable "soup": that name is the BeautifulSoup alias the
# scraping cells call, and rebinding it would silently break them.
listing_soup = BeautifulSoup(response.text, 'html.parser')

hotel_name_elements = listing_soup.find_all(class_='listing_title')

for element in hotel_name_elements:
    print(element.text.strip())

In [None]:
# Fetch one hotel review page and pull out a link, the hotel name and the rating.
# The tag and class names below are placeholders, not real selectors.
# url = "https://www.tripadvisor.com/Hotel_Review-g60583-d113096-Reviews-Grand_Naniloa_Hotel_Hilo_A_Doubletree_By_Hilton-Hilo_Island_of_Hawaii_Hawaii.html"
url = "https://www.tripadvisor.com/Hotel_Review-g55879-d122738-Reviews-Hilton_Galveston_Island_Resort-Galveston_Galveston_Island_Texas.html"

response = requests.get(url)
html_content = response.text

# Do not call this variable "soup": that name is the BeautifulSoup alias the
# scraping cells call, and rebinding it would silently break them.
hotel_page_soup = BeautifulSoup(html_content, "html.parser")

link_element = hotel_page_soup.find("a", class_="placeholder-class-for-link")
link = link_element["href"] if link_element else None

hotel_name_element = hotel_page_soup.find("h1", class_="placeholder-class-for-hotel-name")
hotel_name = hotel_name_element.text if hotel_name_element else None

rating_element = hotel_page_soup.find("some-tag", class_="placeholder-class-for-rating")
rating = rating_element.text.split(" ")[0] if rating_element else None

print(f"Link: {link}")
print(f"Hotel Name: {hotel_name}")
print(f"Rating: {rating}")