# Booking.com

In [15]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

# Path to your WebDriver executable
service = Service('D:/Estiven/Trabajo/Freelancer/booking_scraping_analysis/chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

url = 'https://www.booking.com/searchresults.html?ss=New+York&checkin_monthday=1&checkin_year_month=2024-07&checkout_monthday=2&checkout_year_month=2024-07&group_adults=2&no_rooms=1&lang=en-us&soz=1&lang_changed=1&selected_currency=USD'

# Navigate to the page
driver.get(url)

# Wait until the review scores are present
WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-testid="review-score"]')))

# Get the page source and parse it with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

# Prepare a list to hold the data
data = []

# Find all the hotel containers
hotel_containers = soup.find_all('div', {'data-testid': 'property-card'})
print('CONTAINERS', hotel_containers)

# Loop through each hotel container to extract the required information
for hotel in hotel_containers:
    try:
        hotel_name = hotel.find('div', {'class': 'fa4a3a8221 b121bc708f'}).get_text(strip=True)

        score_tag = hotel.find('div', {'data-testid': 'review-score'})
        score_text = score_tag.find('div', {'class': 'f13857cc8c e008572b71'}).get_text(strip=True).split()[1] if score_tag else None

        distance_tag = hotel.find('span', {'data-testid': 'distance'})
        distance = distance_tag.get_text(strip=True).split('from')[0] if distance_tag else None

        if re.search(r'\bm\b', distance):
            distance = float(re.split(r'\bm\b', distance)[0]) / 1000
        elif 'km' in distance:
            distance = float(re.split(r'km', distance)[0])

        price_tag = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
        price_text = price_tag.get_text(strip=True).split('$')[1] if price_tag else None

        taxes_tag = hotel.find('div', {'data-testid': 'taxes-and-charges'})
        taxes_text = taxes_tag.get_text(strip=True).split('$')[1].split(' ')[0] if taxes_tag else None

        total_price = float(price_text) + float(taxes_text)

        nights_adults_tag = hotel.find('div', {'data-testid': 'price-for-x-nights'})
        nights_adults_text = nights_adults_tag.get_text(strip=True) if nights_adults_tag else None

        card_deal_tag = hotel.find('span', {'data-testid': 'property-card-deal'})
        card_deal_text = card_deal_tag.get_text(strip=True) if card_deal_tag else None

        stars_tag = hotel.find('div', {'data-testid': 'rating-stars'})
        stars = len(stars_tag.find_all('svg')) if stars_tag else None

        # Subway Access
        subway_access_tag = hotel.find('span', {'class': 'f5113518a6'})
        subway_access = True if subway_access_tag else False

        # Neighbourhood name
        neighborhood_tag = hotel.find('span', {'data-testid': 'address'})
        neighborhood = neighborhood_tag.get_text(strip=True).split(', ')[0] if neighborhood_tag else None

        # Room type
        room_type_tag = hotel.find('h4', {'class': 'b290e5dfa6 cf1a0708d9'})
        room_type = room_type_tag.get_text(strip=True) if room_type_tag else None

        # Bed type
        bed_type_tag = hotel.find('div', {'class': 'ded2b5e753'}).find('div', {'class': 'b290e5dfa6'})
        bed_type = bed_type_tag.get_text(strip=True) if bed_type_tag else None

        # Initialize variables to store the policies
        cancellation_policy = None
        payment_policy = None

        # Find all li tags with the class 'a6a38de85e' within the hotel element
        li_tags = hotel.find_all('li', class_='deaf462b24')

        # Iterate over each li tag and extract the respective policy based on the presence of unique icons or identifiers
        for li in li_tags:
            # Check for the cancellation policy icon
            if li.find('span', {'data-testid': 'cancellation-policy-icon'}):
                cancellation_policy_tag = li.find('div', {'class': 'daa8593c50 a1af39b461'}).find('div', {'class': 'b290e5dfa6 b0eee6023f'})
                cancellation_policy = cancellation_policy_tag.get_text(strip=True) if cancellation_policy_tag else None
            # Check for the payment policy icon
            elif li.find('span', {'data-testid': 'prepayment-policy-icon'}):
                payment_policy_tag = li.find('div', {'class': 'daa8593c50 a1af39b461'}).find('div', {'class': 'b290e5dfa6 b0eee6023f'})
                payment_policy = payment_policy_tag.get_text(strip=True) if payment_policy_tag else None

        # Review class
        review_class_tag = hotel.find('div', {'class': 'e98ee79976 daa8593c50 fd9c2cba1d'}).find('div', {'class': 'f13857cc8c e6314e676b a287ba9834'})
        review_class = review_class_tag.get_text(strip=True) if review_class_tag else None

        # Number of reviews
        number_of_reviews_tag = hotel.find('div', {'class': 'e98ee79976 daa8593c50 fd9c2cba1d'}).find('div', {'class': 'b290e5dfa6 a5cc9f664c c4b07b6aa8'})
        number_of_reviews = number_of_reviews_tag.get_text(strip=True).split('reviews')[0].replace(',', '') if number_of_reviews_tag else None

        data.append([
            hotel_name, score_text, distance, 
            price_text, taxes_text, total_price, nights_adults_text, card_deal_text, 
            stars, subway_access, neighborhood, room_type,
            bed_type, cancellation_policy, payment_policy, review_class, number_of_reviews
            ])

    except Exception as e:
        print(f"Encountered an exception: {e}")
        continue

# Write the data to a CSV file
with open('hotel_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow([
        "Hotel Name", "Score", "Distance to Center, (km)", "Price", "Taxes and Fees", "Total price",
        "Nights and Adults", "Card Deal", 
        "Stars", "Subway Access", 
        "Neighborhood", "Room Type", "Bed Type", "Cancellation Policy", "Payment Policy",
        "Review Class", "Number of Reviews"
    ])
    writer.writerows(data)

print("Data extraction completed and saved to hotel_data.csv")

CONTAINERS [<div aria-label="Property" class="fa298e29e2 b74446e476 e40c0c68b1 ea1d0cfcb7 d8991ab7ae e8b7755ec7 ad0e783e41" data-testid="property-card" role="group" style="--bui_box_spaced_padding--s:4"><div class="efa3f4d6ac e54292ee17" data-testid="property-card-container"><div class="a89c002b3e"><div class="f2243ed1db e3431fcd81"><a aria-hidden="true" data-testid="property-card-desktop-single-image" href="https://www.booking.com/hotel/us/pod-times-square.html?aid=304142&amp;label=gen173nr-1FCAQoggJCD3NlYXJjaF9uZXcgeW9ya0gxWARoMogBAZgBMbgBF8gBDNgBAegBAfgBA4gCAagCA7gCqMLBswbAAgHSAiRkZjViZmMzMi04N2QzLTQxYzQtYTllNS1hODI0N2UzMGY1ZDfYAgXgAgE&amp;ucfs=1&amp;arphpl=1&amp;checkin=2024-07-01&amp;checkout=2024-07-02&amp;group_adults=2&amp;req_adults=2&amp;no_rooms=1&amp;group_children=0&amp;req_children=0&amp;hpos=1&amp;hapos=1&amp;sr_order=popularity&amp;srpvid=3ca6725474370339&amp;srepoch=1718640938&amp;all_sr_blocks=237726011_101250051_2_0_0&amp;highlighted_blocks=237726011_101250051_2_0_0&

In [5]:
import requests
from bs4 import BeautifulSoup
import re
import csv

url = 'https://www.booking.com/searchresults.html?ss=New+York&checkin_monthday=1&checkin_year_month=2024-07&checkout_monthday=2&checkout_year_month=2024-07&group_adults=2&no_rooms=1&lang=en-us&soz=1&lang_changed=1&selected_currency=USD'

# Send a GET request to the URL
response = requests.get(url)
response.raise_for_status()

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)
# Prepare a list to hold the data
data = []

# Find all the hotel containers
hotel_containers = soup.find_all('div', {'data-testid': 'property-card'})

# Loop through each hotel container to extract the required information
for hotel in hotel_containers:
    try:
        hotel_name = hotel.find('div', {'data-testid': 'title'}).get_text(strip=True)

        score_tag = hotel.find('div', {'data-testid': 'review-score'})
        score_text = score_tag.find('div', {'class': 'a3b8729ab1 d86cee9b25'}).get_text(strip=True).split()[1] if score_tag else None

        distance_tag = hotel.find('span', {'data-testid': 'distance'})
        distance = distance_tag.get_text(strip=True).split('from')[0] if distance_tag else None

        if re.search(r'\bm\b', distance):
            distance = float(re.split(r'\bm\b', distance)[0]) / 1000
        elif 'km' in distance:
            distance = float(re.split(r'km', distance)[0])

        hotel_link_tag = hotel.find('a', {'class': 'a78ca197d0'})
        hotel_link = hotel_link_tag['href'] if hotel_link_tag else None

        price_tag = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
        price_text = price_tag.get_text(strip=True).split('$')[1] if price_tag else None

        taxes_tag = hotel.find('div', {'data-testid': 'taxes-and-charges'})
        taxes_text = taxes_tag.get_text(strip=True).split('$')[1].split(' ')[0] if taxes_tag else None

        total_price = float(price_text) + float(taxes_text)

        nights_adults_tag = hotel.find('div', {'data-testid': 'price-for-x-nights'})
        nights_adults_text = nights_adults_tag.get_text(strip=True) if nights_adults_tag else None

        card_deal_tag = hotel.find('span', {'data-testid': 'property-card-deal'})
        card_deal_text = card_deal_tag.get_text(strip=True) if card_deal_tag else None

        # Cleanliness
        cleanliness_tag = hotel.find('span', {'class': 'a3332d346a'})
        cleanliness = cleanliness_tag.get_text(strip=True).split('Location')[1] if cleanliness_tag else None

        stars_tag = hotel.find('div', {'data-testid': 'rating-stars'})
        stars = len(stars_tag.find_all('svg')) if stars_tag else None

        # Subway Access
        subway_access_tag = hotel.find('span', {'class': 'f419a93f12'})
        subway_access = True if subway_access_tag else False

        # Neighbourhood name
        neighborhood_tag = hotel.find('span', {'data-testid': 'address'})
        neighborhood = neighborhood_tag.get_text(strip=True).split(', ')[0] if neighborhood_tag else None

        # Room type
        room_type_tag = hotel.find('h4', {'class': 'abf093bdfe e8f7c070a7'})
        room_type = room_type_tag.get_text(strip=True) if room_type_tag else None

        # Bed type
        bed_type_tag = hotel.find('div', {'class': 'fc367255e6'}).find('div', {'class': 'abf093bdfe'})
        bed_type = bed_type_tag.get_text(strip=True) if bed_type_tag else None

        # Initialize variables to store the policies
        cancellation_policy = None
        payment_policy = None

        # Find all li tags with the class 'a6a38de85e' within the hotel element
        li_tags = hotel.find_all('li', class_='a6a38de85e')

        # Iterate over each li tag and extract the respective policy based on the presence of unique icons or identifiers
        for li in li_tags:
            # Check for the cancellation policy icon
            if li.find('span', {'data-testid': 'cancellation-policy-icon'}):
                cancellation_policy_tag = li.find('div', {'class': 'abf093bdfe d068504c75'})
                cancellation_policy = cancellation_policy_tag.get_text(strip=True) if cancellation_policy_tag else None
            # Check for the payment policy icon
            elif li.find('span', {'data-testid': 'prepayment-policy-icon'}):
                payment_policy_tag = li.find('div', {'class': 'abf093bdfe d068504c75'})
                payment_policy = payment_policy_tag.get_text(strip=True) if payment_policy_tag else None

        # Review class
        review_class_tag = hotel.find('div', {'class': 'dc5041d860 c72df67c95 a29749fd9f'}).find('div', {'class': 'a3b8729ab1 e6208ee469 cb2cbb3ccb'})
        review_class = review_class_tag.get_text(strip=True) if review_class_tag else None

        # Number of reviews
        number_of_reviews_tag = hotel.find('div', {'class': 'dc5041d860 c72df67c95 a29749fd9f'}).find('div', {'class': 'abf093bdfe f45d8e4c32 d935416c47'})
        number_of_reviews = number_of_reviews_tag.get_text(strip=True).split('reviews')[0].replace(',', '') if number_of_reviews_tag else None

        data.append([
            hotel_name, score_text, distance, hotel_link, price_text, 
            taxes_text, total_price, nights_adults_text, card_deal_text, 
            stars, cleanliness, subway_access, neighborhood, room_type,
            bed_type, cancellation_policy, payment_policy, review_class, number_of_reviews
        ])

    except Exception as e:
        print(f"Encountered an exception: {e}")
        continue

# Write the data to a CSV file
with open('hotel_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow([
        "Hotel Name", "Score", "Distance to Center, (km)", "Hotel Link", "Price", 
        "Taxes and Fees", "Total price", "Nights and Adults", "Card Deal", 
        "Stars", "Cleanliness", "Subway Access", 
        "Neighborhood", "Room Type", "Bed Type", "Cancellation Policy", "Payment Policy",
        "Review Class", "Number of Reviews"
    ])
    writer.writerows(data)

print("Data extraction completed and saved to hotel_data.csv")

<!DOCTYPE html>

<!--
You know you could be getting paid to poke around in our code?
We're hiring designers and developers to work in Amsterdam:
https://careers.booking.com/
-->
<!-- wdot-802 -->
<script nonce="W6W1t3YVtJ6a99w" type="text/javascript">
document.addEventListener('DOMContentLoaded', function () {
/**
* provides the current user's cookie consent
* in order to use it:
* 1. inline privacy/cookieConsent.js in the page you need to use it.
* please note that this library relies on window.PCM.isCountryNeedCookieBanner to be initialised
* before using (calling getValue function) it
* 2. in your js file:
*
* var privacyCookieConsent = B.require('privacyCookieConsent');
* var consent = privacyCookieConsent.getValue();
*/
B.define('privacyCookieConsent', function () {
var consentGroupIsAllowed = {
analytical: 'C0002%3A1',
marketing: 'C0004%3A1'
};
var optanonConsentCookieName = 'OptanonConsent';
var optanonBoxClosedCookieName = 'OptanonAlertBoxClosed';
var halfOfYearMillis = 180 * 2

In [4]:
import pandas as pd

data = pd.read_csv('scraped_data/expedia_data.csv', encoding='utf-8')
data['Hotel Name'].unique()

array(['Tempo by Hilton New York Times Square', 'Hotel AKA NoMad',
       'Hyatt Centric Times Square New York'], dtype=object)

# Expedia.com

In [28]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# Path to your WebDriver executable
service = Service('D:/Estiven/Trabajo/Freelancer/booking_scraping_analysis/chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

url ='https://www.expedia.com/Hotel-Search?destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America&d1=2024-06-06&d2=2024-06-07&adults=2&rooms=1&startDate=2024-06-20&endDate=2024-06-21&regionId=178293&theme=&userIntent=&semdtl=&useRewards=false&sort=RECOMMENDED'

# Navigate to the page
driver.get(url)

# Wait until the hotel name elements are present
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3.uitk-heading-5')))

# Get the page source and parse it with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

# Prepare a list to hold the data
data = []

# Find all the hotel containers
hotel_containers = soup.find_all('div', {'data-stid': 'lodging-card-responsive'})

# Loop through each hotel container to extract the required information
for hotel in hotel_containers:
    try:
        hotel_name_tag = hotel.find('h3', {'class': 'uitk-heading-5'})
        hotel_name = hotel_name_tag.get_text(strip=True) if hotel_name_tag else None

        # Find the price before taxes
        price_before_tax_tag = hotel.find('div', {'class': 'uitk-text uitk-type-300 uitk-text-default-theme is-visually-hidden'})
        price_before_tax = price_before_tax_tag.get_text(strip=True) if price_before_tax_tag else None

        # Find the price after taxes
        price_after_tax_tag = hotel.find('div', {'class': 'uitk-text uitk-type-end uitk-type-200 uitk-text-default-theme'})
        price_after_tax = price_after_tax_tag.get_text(strip=True) if price_after_tax_tag else None

        # Find the rating
        rating_tag = hotel.find('span', {'class': 'uitk-badge-base-text'})
        rating = rating_tag.get_text(strip=True) if rating_tag else None

        # Find the classification
        classification_tag = hotel.find('span', {'class': 'uitk-text uitk-type-300 uitk-type-medium uitk-text-emphasis-theme'})
        classification = classification_tag.get_text(strip=True) if classification_tag else None

        # Find the total number of reviews
        reviews_tag = hotel.find('span', {'class': 'uitk-text uitk-type-200 uitk-type-regular uitk-text-default-theme'})
        reviews = reviews_tag.get_text(strip=True) if reviews_tag else None

        # Find the stay type and the bed type
        stay_type_tag = hotel.find('div', {'class': 'uitk-text uitk-text-spacing-half truncate-lines-2 uitk-type-300 uitk-text-default-theme', 'aria-hidden': 'true'})
        stay_type = stay_type_tag.get_text(strip=True).split(',')[0] if stay_type_tag else None
        bed_type = stay_type_tag.get_text(strip=True).split(', ')[1] if stay_type_tag else None

        # Find the neighborhood
        neighborhood_tag = hotel.find('div', {'class': 'uitk-text uitk-text-spacing-half truncate-lines-2 uitk-type-300 uitk-text-default-theme', 'aria-hidden': 'false'})
        neighborhood = neighborhood_tag.get_text(strip=True) if neighborhood_tag else None

        data.append([hotel_name, price_before_tax, price_after_tax, rating, classification, reviews, stay_type, bed_type, neighborhood])

    except Exception as e:
        print(f"Encountered an exception: {e}")
        continue

# Write the data to a CSV file
with open('hotel_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Hotel Name", "Price Before Taxes", "Price After Taxes", "Rating", "Classification", "Reviews", "Stay Type", "Bed type", "Neighborhood"])
    writer.writerows(data)

print("Data extraction completed and saved to hotel_data.csv")

Data extraction completed and saved to hotel_data.csv


In [2]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Path to your WebDriver executable
service = Service('D:/Estiven/Trabajo/Freelancer/booking_scraping_analysis/chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

url = 'https://www.expedia.com/Hotel-Search?destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America&d1=2024-06-06&d2=2024-06-07&adults=2&rooms=1&startDate=2024-06-20&endDate=2024-06-21&regionId=178293&theme=&userIntent=&semdtl=&useRewards=false&sort=RECOMMENDED'

# Navigate to the page
driver.get(url)

# Wait until the hotel name elements are present
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3.uitk-heading-5')))

# Prepare a list to hold the data
data = []

# Find all the hotel containers
hotel_containers = driver.find_elements(By.CSS_SELECTOR, 'div[data-stid="lodging-card-responsive"]')

# Loop through each hotel container to extract the required information
for hotel in hotel_containers:
    try:
        hotel_name = hotel.find_element(By.CSS_SELECTOR, 'h3.uitk-heading-5').text

        # Find the price before taxes
        price_before_tax = hotel.find_element(By.CSS_SELECTOR, 'div.uitk-text.uitk-type-300.uitk-text-default-theme.is-visually-hidden').text

        # Find the price after taxes
        price_after_tax = hotel.find_element(By.CSS_SELECTOR, 'div.uitk-text.uitk-type-end.uitk-type-200.uitk-text-default-theme').text

        # Find the rating
        rating = hotel.find_element(By.CSS_SELECTOR, 'span.uitk-badge-base-text').text

        # Find the classification
        classification = hotel.find_element(By.CSS_SELECTOR, 'span.uitk-text.uitk-type-300.uitk-type-medium.uitk-text-emphasis-theme').text

        # Find the total number of reviews
        reviews = hotel.find_element(By.CSS_SELECTOR, 'span.uitk-text.uitk-type-200.uitk-type-regular.uitk-text-default-theme').text

        # Find the stay type and the bed type
        stay_type_tag = hotel.find_element(By.CSS_SELECTOR, 'div.uitk-text.uitk-text-spacing-half.truncate-lines-2.uitk-type-300.uitk-text-default-theme[aria-hidden="true"]')
        stay_type = stay_type_tag.text.split(',')[0]
        bed_type = stay_type_tag.text.split(', ')[1]

        # Find the neighborhood
        neighborhood = hotel.find_element(By.CSS_SELECTOR, 'div.uitk-text.uitk-text-spacing-half.truncate-lines-2.uitk-type-300.uitk-text-default-theme[aria-hidden="false"]').text

        data.append([hotel_name, price_before_tax, price_after_tax, rating, classification, reviews, stay_type, bed_type, neighborhood])

    except Exception as e:
        print(f"Encountered an exception: {e}")
        continue

# Write the data to a CSV file
with open('hotel_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Hotel Name", "Price Before Taxes", "Price After Taxes", "Rating", "Classification", "Reviews", "Stay Type", "Bed Type", "Neighborhood"])
    writer.writerows(data)

print("Data extraction completed and saved to hotel_data.csv")

# Close the driver
driver.quit()


Data extraction completed and saved to hotel_data.csv


In [2]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# Path to your WebDriver executable
service = Service('D:/Estiven/Trabajo/Freelancer/booking_scraping_analysis/chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

url = 'https://www.expedia.com/Hotel-Search?destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America&d1=2024-06-06&d2=2024-06-07&adults=2&rooms=1&startDate=2024-06-20&endDate=2024-06-21&regionId=178293&theme=&userIntent=&semdtl=&useRewards=false&sort=RECOMMENDED'

# Navigate to the page
driver.get(url)

# Wait until the hotel name elements are present
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3.uitk-heading-5')))

# Get the page source and parse it with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

# Prepare a list to hold the data
data = []

# Find all the hotel containers
hotel_containers = soup.find_all('div', {'data-stid': 'lodging-card-responsive'})

# Loop through each hotel container to extract the required information
for hotel in hotel_containers:
    try:
        hotel_name_tag = hotel.find('h3', {'class': 'uitk-heading-5'})
        hotel_name = hotel_name_tag.get_text(strip=True) if hotel_name_tag else None

        # Find the price before taxes
        price_before_tax_tag = hotel.find('div', {'class': 'uitk-text uitk-type-300 uitk-text-default-theme is-visually-hidden'})
        price_before_tax = price_before_tax_tag.get_text(strip=True) if price_before_tax_tag else None

        # Find the price after taxes
        price_after_tax_tag = hotel.find('div', {'class': 'uitk-text uitk-type-end uitk-type-200 uitk-text-default-theme'})
        price_after_tax = price_after_tax_tag.get_text(strip=True) if price_after_tax_tag else None

        # Find the rating
        rating_tag = hotel.find('span', {'class': 'uitk-badge-base-text'})
        rating = rating_tag.get_text(strip=True) if rating_tag else None

        # Find the classification
        classification_tag = hotel.find('span', {'class': 'uitk-text uitk-type-300 uitk-type-medium uitk-text-emphasis-theme'})
        classification = classification_tag.get_text(strip=True) if classification_tag else None

        # Find the total number of reviews
        reviews_tag = hotel.find('span', {'class': 'uitk-text uitk-type-200 uitk-type-regular uitk-text-default-theme'})
        reviews = reviews_tag.get_text(strip=True) if reviews_tag else None

        # Find the stay type and the bed type
        stay_type_tag = hotel.find('div', {'class': 'uitk-text uitk-text-spacing-half truncate-lines-2 uitk-type-300 uitk-text-default-theme', 'aria-hidden': 'true'})
        stay_type = stay_type_tag.get_text(strip=True).split(',')[0] if stay_type_tag else None
        bed_type = stay_type_tag.get_text(strip=True).split(', ')[1] if stay_type_tag else None

        # Find the neighborhood
        neighborhood_tag = hotel.find('div', {'class': 'uitk-text uitk-text-spacing-half truncate-lines-2 uitk-type-300 uitk-text-default-theme', 'aria-hidden': 'false'})
        neighborhood = neighborhood_tag.get_text(strip=True) if neighborhood_tag else None

        data.append([hotel_name, price_before_tax, price_after_tax, rating, classification, reviews, stay_type, bed_type, neighborhood])

    except Exception as e:
        print(f"Encountered an exception: {e}")
        continue

# Write the data to a CSV file
with open('hotel_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Hotel Name", "Price Before Taxes", "Price After Taxes", "Rating", "Classification", "Reviews", "Stay Type", "Bed Type", "Neighborhood"])
    writer.writerows(data)

print("Data extraction completed and saved to hotel_data.csv")


Encountered an exception: list index out of range
Data extraction completed and saved to hotel_data.csv


In [2]:
import pandas as pd

In [7]:
pd.read_csv('booking_data.csv', encoding='utf-8')['ttt'].unique()

array([1], dtype=int64)

In [1]:
import csv
import time
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

# Path to your WebDriver executable
service = Service('D:/Estiven/Trabajo/Freelancer/booking_scraping_analysis/chromedriver-win64/chromedriver.exe')

# Create a new Options object
options = webdriver.ChromeOptions()

# Add options
options.add_argument("start-maximized") # open Browser in maximized mode
options.add_argument("disable-infobars") # disabling infobars
options.add_argument("--disable-extensions") # disabling extensions
options.add_argument("--disable-gpu") # applicable to windows os only
options.add_argument("--disable-dev-shm-usage") # overcome limited resource problems
options.add_argument("--no-sandbox") # Bypass OS security model
# options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3')
options.add_argument('accept-encoding=gzip, deflate, br')
options.add_argument('accept=text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7')
options.add_argument('referer=https://www.expedia.com/')
options.add_argument('upgrade-insecure-requests=1')

# Pass the options when creating the WebDriver
driver = webdriver.Chrome(service=service, options=options)

def generate_booking_url(ttt, los, snapshot_date):
    checkin_date = snapshot_date + timedelta(days=ttt)
    checkout_date = checkin_date + timedelta(days=los)
    return f'https://www.booking.com/searchresults.html?ss=New+York&checkin_monthday={checkin_date.day}&checkin_year_month={checkin_date.strftime("%Y-%m")}&checkout_monthday={checkout_date.day}&checkout_year_month={checkout_date.strftime("%Y-%m")}&group_adults=2&no_rooms=1&lang=en-us&soz=1&lang_changed=1&selected_currency=USD'

def generate_expedia_url(ttt, los, snapshot_date):
    checkin_date = (snapshot_date + timedelta(days=ttt)).strftime('%Y-%m-%d')
    checkout_date = (snapshot_date + timedelta(days=ttt + los)).strftime('%Y-%m-%d')
    return f'https://www.expedia.com/Hotel-Search?destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America&d1={checkin_date}&d2={checkout_date}&adults=2&rooms=1'

def scrape_booking_page(soup, ttt, los):
    data = []
    hotel_containers = soup.find_all('div', {'data-testid': 'property-card'})
    for hotel in hotel_containers:
        try:
            hotel_name = hotel.find('div', {'data-testid': 'title'}).get_text(strip=True)
            score_tag = hotel.find('div', {'data-testid': 'review-score'})
            score_text = score_tag.find('div', {'class': 'a3b8729ab1 d86cee9b25'}).get_text(strip=True).split()[1] if score_tag else None
            distance_tag = hotel.find('span', {'data-testid': 'distance'})
            distance = distance_tag.get_text(strip=True) if distance_tag else None
            hotel_link_tag = hotel.find('a', {'class': 'a78ca197d0'})
            hotel_link = hotel_link_tag['href'] if hotel_link_tag else None
            price_tag = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
            price_text = price_tag.get_text(strip=True) if price_tag else None
            taxes_tag = hotel.find('div', {'data-testid': 'taxes-and-charges'})
            taxes_text = taxes_tag.get_text(strip=True) if taxes_tag else None
            nights_adults_tag = hotel.find('div', {'data-testid': 'price-for-x-nights'})
            nights_adults_text = nights_adults_tag.get_text(strip=True) if nights_adults_tag else None
            card_deal_tag = hotel.find('span', {'data-testid': 'property-card-deal'})
            card_deal_text = card_deal_tag.get_text(strip=True) if card_deal_tag else None
            cleanliness_tag = hotel.find('span', {'class': 'a3332d346a'})
            cleanliness = cleanliness_tag.get_text(strip=True) if cleanliness_tag else None
            stars_tag = hotel.find('div', {'data-testid': 'rating-stars'})
            stars = len(stars_tag.find_all('svg')) if stars_tag else None
            subway_access_tag = hotel.find('span', {'class': 'f419a93f12'})
            subway_access = True if subway_access_tag else False
            neighborhood_tag = hotel.find('span', {'data-testid': 'address'})
            neighborhood = neighborhood_tag.get_text(strip=True).split(', ')[0] if neighborhood_tag else None
            room_type_tag = hotel.find('h4', {'class': 'abf093bdfe e8f7c070a7'})
            room_type = room_type_tag.get_text(strip=True) if room_type_tag else None
            bed_type_tag = hotel.find('div', {'class': 'fc367255e6'}).find('div', {'class': 'abf093bdfe'})
            bed_type = bed_type_tag.get_text(strip=True) if bed_type_tag else None

            cancellation_policy, payment_policy = None, None
            li_tags = hotel.find_all('li', class_='a6a38de85e')
            for li in li_tags:
                if li.find('span', {'data-testid': 'cancellation-policy-icon'}):
                    cancellation_policy_tag = li.find('div', {'class': 'abf093bdfe d068504c75'})
                    cancellation_policy = cancellation_policy_tag.get_text(strip=True) if cancellation_policy_tag else None
                elif li.find('span', {'data-testid': 'prepayment-policy-icon'}):
                    payment_policy_tag = li.find('div', {'class': 'abf093bdfe d068504c75'})
                    payment_policy = payment_policy_tag.get_text(strip=True) if payment_policy_tag else None

            review_class_tag = hotel.find('div', {'class': 'dc5041d860 c72df67c95 a29749fd9f'}).find('div', {'class': 'a3b8729ab1 e6208ee469 cb2cbb3ccb'})
            review_class = review_class_tag.get_text(strip=True) if review_class_tag else None
            number_of_reviews_tag = hotel.find('div', {'class': 'dc5041d860 c72df67c95 a29749fd9f'}).find('div', {'class': 'abf093bdfe f45d8e4c32 d935416c47'})
            number_of_reviews = number_of_reviews_tag.get_text(strip=True) if number_of_reviews_tag else None

            data.append([
                hotel_name, score_text, distance, hotel_link, price_text, 
                taxes_text, nights_adults_text, card_deal_text, 
                stars, cleanliness, subway_access, neighborhood, room_type,
                bed_type, cancellation_policy, payment_policy, review_class, number_of_reviews,
                ttt, los
            ])
        except Exception as e:
            print(f"Encountered an exception: {e}")
            continue
    return data

def scrape_expedia_page(soup, ttt, los):
    data = []
    hotel_containers = soup.find_all('div', {'data-stid': 'lodging-card-responsive'})
    for hotel in hotel_containers:
        try:
            hotel_name_tag = hotel.find('h3', {'class': 'uitk-heading-5'})
            hotel_name = hotel_name_tag.get_text(strip=True) if hotel_name_tag else None
            price_before_tax_tag = hotel.find('div', {'class': 'uitk-text uitk-type-300 uitk-text-default-theme is-visually-hidden'})
            price_before_tax = price_before_tax_tag.get_text(strip=True) if price_before_tax_tag else None
            price_after_tax_tag = hotel.find('div', {'class': 'uitk-text uitk-type-end uitk-type-200 uitk-text-default-theme'})
            price_after_tax = price_after_tax_tag.get_text(strip=True) if price_after_tax_tag else None
            rating_tag = hotel.find('span', {'class': 'uitk-badge-base-text'})
            rating = rating_tag.get_text(strip=True) if rating_tag else None
            classification_tag = hotel.find('span', {'class': 'uitk-text uitk-type-300 uitk-type-medium uitk-text-emphasis-theme'})
            classification = classification_tag.get_text(strip=True) if classification_tag else None
            reviews_tag = hotel.find('span', {'class': 'uitk-text uitk-type-200 uitk-type-regular uitk-text-default-theme'})
            reviews = reviews_tag.get_text(strip=True) if reviews_tag else None
            stay_type_tag = hotel.find('div', {'class': 'uitk-text uitk-text-spacing-half truncate-lines-2 uitk-type-300 uitk-text-default-theme', 'aria-hidden': 'true'})
            stay_type = stay_type_tag.get_text(strip=True).split(',')[0] if stay_type_tag else None
            bed_type = stay_type_tag.get_text(strip=True).split(', ')[1] if stay_type_tag else None
            neighborhood_tag = hotel.find('div', {'class': 'uitk-text uitk-text-spacing-half truncate-lines-2 uitk-type-300 uitk-text-default-theme', 'aria-hidden': 'false'})
            neighborhood = neighborhood_tag.get_text(strip=True) if neighborhood_tag else None

            data.append([hotel_name, price_before_tax, price_after_tax, rating, classification, reviews, stay_type, bed_type, neighborhood, ttt, los])
        except Exception as e:
            print(f"Encountered an exception: {e}")
            continue
    return data

def scrape_all_pages(url, site, ttt, los):
    driver.get(url)
    all_data = []
    while True:
        try:
            if site == 'booking':
                WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-testid="property-card"]')))
            elif site == 'expedia':
                WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3.uitk-heading-5')))

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            page_data = scrape_booking_page(soup, ttt, los) if site == 'booking' else scrape_expedia_page(soup, ttt, los)
            all_data.extend(page_data)

            if site == 'booking':
                next_button = driver.find_elements(By.CSS_SELECTOR, 'button[data-testid="pagination-next"]')
            # else:
            #     next_button = driver.find_elements(By.XPATH, '//button[contains(@class, "uitk-button") and contains(@class, "uitk-button-secondary") and text()="Show more"]')
            #     print(next_button)
                
            if next_button and next_button[0].is_displayed() and len(all_data) < 100:
                next_button[0].click()
                time.sleep(2)
            else:
                break
        except Exception as e:
            print(f"Encountered an exception while scraping {site}: {e}")
            break
    return all_data


def main():
    snapshot_dates = [datetime.today(), datetime.today() + timedelta(days=7), datetime.today() + timedelta(days=14)]
    ttt_range = range(1, 2)
    los_range = range(1, 2)
    
    all_booking_data = []
    all_expedia_data = []

    for snapshot_date in snapshot_dates:
        for ttt in ttt_range:
            for los in los_range:
                booking_url = generate_booking_url(ttt, los, snapshot_date)
                expedia_url = generate_expedia_url(ttt, los, snapshot_date)
                print(expedia_url)
                
                booking_data = scrape_all_pages(booking_url, 'booking', ttt, los)
                all_booking_data.extend(booking_data)
                
                expedia_data = scrape_all_pages(expedia_url, 'expedia', ttt, los)
                all_expedia_data.extend(expedia_data)
                
    with open('booking_data.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([
            "Hotel Name", "Score", "Distance to Center", "Hotel Link", "Price", 
            "Taxes and Fees", "Nights and Adults", "Card Deal", 
            "Stars", "Cleanliness", "Subway Access", "Neighborhood", "Room Type", "Bed Type", 
            "Cancellation Policy", "Payment Policy", "Review Class", "Number of Reviews", "ttt", "los"
        ])
        writer.writerows(all_booking_data)

    with open('expedia_data.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Hotel Name", "Price Before Taxes", "Price After Taxes", 
                        "Rating", "Classification", "Reviews", 
                        "Stay Type", "Bed type", "Neighborhood",
                        "ttt", "los"])
        writer.writerows(all_expedia_data)

    print("Data extraction completed and saved to booking_data.csv and expedia_data.csv")

if __name__ == "__main__":
    main()
    driver.quit()

https://www.expedia.com/Hotel-Search?destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America&d1=2024-06-07&d2=2024-06-08&adults=2&rooms=1
Encountered an exception while scraping expedia: cannot access local variable 'next_button' where it is not associated with a value
https://www.expedia.com/Hotel-Search?destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America&d1=2024-06-14&d2=2024-06-15&adults=2&rooms=1
Encountered an exception while scraping expedia: Message: 

https://www.expedia.com/Hotel-Search?destination=New%20York%20%28and%20vicinity%29%2C%20New%20York%2C%20United%20States%20of%20America&d1=2024-06-21&d2=2024-06-22&adults=2&rooms=1
Encountered an exception while scraping expedia: Message: 

Data extraction completed and saved to booking_data.csv and expedia_data.csv


In [4]:
import time
import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as pw:
        browser = await pw.firefox.launch(headless=False)
        page = await browser.new_page()
        await page.goto('https://www.expedia.com/Hotel-Search?adults=2&d1=2024-06-01&d2=2024-06-07&destination=Milan%20%28and%20vicinity%29%2C%20Lombardy%2C%20Italy&endDate=2024-06-07&latLong=45.47179%2C9.18617&regionId=180012&rooms=1&semdtl=&sort=RECOMMENDED&startDate=2024-06-01&theme=&useRewards=false&userIntent=')
        await asyncio.sleep(2)

        # scrape hotels
        cards = page.locator('[data-stid="lodging-card-responsive"]').all()
        hotels = []

        for card in cards: 
            content = card.locator('div.uitk-card-content-section')
            title = await content.locator('h3').text_content()

            if await content.locator('span.uitk-badge-base-text').is_visible():
                rating = await content.locator('span.uitk-badge-base-text').text_content()
            else:
                rating = False

            if await content.locator('div.uitk-type-500').is_visible():
                price = await content.locator('div.uitk-type-500').text_content()
            else:
                price = False

            hotel = {
                'title': title,
                'rating': rating,
                'price': price
            }

            hotels.append(hotel)

        print(hotels)
        await browser.close()

# Run the function
main()

<coroutine object main at 0x000001C6A7937940>

In [13]:
import pandas as pd
pd.read_csv('FinalExpedia.csv').head(1)

Unnamed: 0,Name,DateOfSearch,Checkin,Checkout,NumOfWeekendDays,Nights,District,Score,Rating,Review,Includes,Refundable,Price
0,The Hoxton Williamsburg,3/1/2023,3/2/2023,3/3/2023,0.0,1.0,Williamsburg,9.0,Wonderful,"(1,000 reviews)",,Fully refundable,$347 total


In [6]:
pd.read_csv('scraped_data/expedia_data.csv').head(1)

Unnamed: 0,Hotel Name,Price Before Taxes,Price After Taxes,Rating,Classification,Reviews,Stay Type,Bed type,Neighborhood,ttt,los
0,Hotel AKA NoMad,The price is $295,$382 total,Ad,Excellent,"1,005 reviews",Premium Room,1 King Bed (NoMad),New York,1,1


In [16]:
pd.read_csv('scraped_data/booking_data.csv')[['Cancellation Policy', 'Payment Policy']]

Unnamed: 0,Cancellation Policy,Payment Policy
0,,
1,,
2,,
3,,
4,,
...,...,...
11940,Free cancellation,
11941,Free cancellation,No prepayment needed– pay at the property
11942,,
11943,Free cancellation,No prepayment needed– pay at the property
