In [None]:
import time, random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [11]:
# -------------------
# USER INPUT
# -------------------
from urllib.parse import urlencode

# Search parameters for the Booking.com results page.
city = "Amsterdam"
checkin = "2025-12-15"    # ISO date, first night
checkout = "2025-12-22"   # ISO date, departure day
csv_file = "booking_hotels_clean.csv"  # destination of the scraped listing

# Build the query string with urlencode so that city names containing spaces,
# '&' or non-ASCII characters are percent-encoded instead of producing a
# malformed URL. For the values above the result is identical to plain
# string interpolation.
url = "https://www.booking.com/searchresults.html?" + urlencode(
    {"ss": city, "checkin": checkin, "checkout": checkout}
)

# -------------------
# SELENIUM SETUP
# -------------------
# Resolve the chromedriver binary first, then start Chrome with a fixed
# 1920x1080 window and open the search results page.
chromedriver_path = ChromeDriverManager().install()

options = webdriver.ChromeOptions()
options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)
driver.get(url)

# -------------------
# SMART SCROLL FUNCTION
# -------------------
def scroll_until_stop(driver, already_scraped, max_tries=10):
    """Scroll to the page bottom until more property cards are present.

    Each attempt scrolls to the bottom of the document, pauses for a random
    1-2 seconds, and re-counts the property cards in the page.

    Args:
        driver: Selenium WebDriver showing the search results page.
        already_scraped: Number of cards the caller has already processed.
        max_tries: Upper bound on scroll attempts (safety limit).

    Returns:
        The list of all property-card elements found by the most recent
        lookup. It is returned as soon as it holds more than
        ``already_scraped`` cards, or after ``max_tries`` attempts without
        growth. An empty list is returned when ``max_tries`` is 0 or less.
    """
    # Bound before the loop so that max_tries <= 0 returns an empty list
    # instead of raising UnboundLocalError at the final return.
    cards = []
    tries = 0
    while tries < max_tries:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(1, 2))

        cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="property-card"]')
        if len(cards) > already_scraped:
            break  # new hotels appeared
        tries += 1
    return cards

def load_more(driver, wait_time=2):
    """Click the 'Load more results' button if it is present.

    The button label is located by XPath, scrolled into view, and clicked via
    JavaScript after a short random pause. After a successful click the
    function prints a message and sleeps for ``wait_time`` plus up to two
    extra random seconds.

    Args:
        driver: Selenium WebDriver showing the search results page.
        wait_time: Base number of seconds to wait after a successful click.

    Returns:
        None. A missing or unclickable button is silently ignored.
    """
    # Imported here so the function needs no extra top-of-file import.
    from selenium.common.exceptions import WebDriverException

    try:
        button_label = driver.find_element(
            By.XPATH, "//button//span[contains(text(), 'Load more results')]"
        )
        driver.execute_script("arguments[0].scrollIntoView();", button_label)
        time.sleep(random.uniform(0.5, 1.5))
        driver.execute_script("arguments[0].click();", button_label)
    except WebDriverException:
        # Best-effort: no button (or a failed click) simply means nothing was
        # loaded. Only Selenium errors are absorbed, so KeyboardInterrupt and
        # programming errors still surface.
        return

    print("Clicked 'Load more results'...")
    time.sleep(wait_time + random.uniform(0, 2))

# -------------------
# SCRAPER LOOP
# -------------------
from selenium.common.exceptions import WebDriverException


def _card_value(card, selector, attribute=None):
    """Return the text (or `attribute`) of the first CSS match inside `card`, or None if the lookup fails."""
    try:
        element = card.find_element(By.CSS_SELECTOR, selector)
        if attribute is None:
            return element.text
        return element.get_attribute(attribute)
    except WebDriverException:
        return None


def _all_digits_as_int(text):
    """Join every digit character of `text` into one int; None if `text` is None or has no usable digits."""
    if text is None:
        return None
    try:
        return int("".join(c for c in text if c.isdigit()))
    except ValueError:
        return None


def _first_digit_token(text):
    """Return the first whitespace-separated all-digit token of `text` as an int, else None."""
    if text is None:
        return None
    try:
        return int(next(tok for tok in text.split() if tok.isdigit()))
    except (StopIteration, ValueError):
        return None


def _leading_float(text):
    """Parse the first token of `text` as a float, accepting a decimal comma; None on failure."""
    if text is None:
        return None
    try:
        return float(text.split()[0].replace(",", "."))
    except (IndexError, ValueError):
        return None


hotels_data = []      # one dict per scraped hotel card
already_scraped = 0   # number of cards processed so far
page = 1              # counter used only for progress output

try:
    while True:
        print(f"Scraping page {page}...")

        # Keep scrolling until all hotels on current page are loaded
        hotel_cards = scroll_until_stop(driver, already_scraped)

        # Slice only newly loaded hotels
        new_cards = hotel_cards[already_scraped:]
        if not new_cards:
            print("⚠️ No new hotels found on this load. Stopping.")
            break

        print(f"Found {len(new_cards)} new hotels on this load.")

        for card in new_cards:
            time.sleep(random.uniform(0.2, 0.8))  # mimic human

            # Property name; a card without one is skipped entirely.
            name = _card_value(card, 'div[data-testid="title"]')
            if name is None:
                continue

            # Price: every digit in the price text, joined into one integer.
            price = _all_digits_as_int(
                _card_value(card, 'span[data-testid="price-and-discounted-price"]')
            )

            # Nights: first purely numeric word of the "price for x nights" text.
            nights = _first_digit_token(
                _card_value(card, 'div[data-testid="price-for-x-nights"]')
            )

            # Star rating: first word of the "... out of ..." aria-label.
            star_label = _card_value(
                card, 'div.ebc566407a[aria-label*="out of"]', attribute="aria-label"
            )
            star_rating = star_label.split(" ")[0] if star_label is not None else None

            # Review score (kept as the raw element text)
            review_score = _card_value(card, 'div.f63b14ab7a.dff2e52086')

            # Review count
            review_count = _all_digits_as_int(
                _card_value(card, 'div.fff1944c52.fb14de7f14.eaa8455879')
            )

            # Room type
            room_type = _card_value(card, 'h4.fff1944c52.f254df5361')

            # Part of the city: the address text up to the first comma
            # (the whole text when there is no comma).
            address_text = _card_value(card, 'span[data-testid="address"]')
            part_of_city = (
                address_text.split(",")[0].strip() if address_text is not None else None
            )

            # Distance from center: leading number of the distance text.
            # NOTE(review): the unit word after the number is discarded, so
            # values given in different units would be stored as bare numbers
            # on different scales - confirm against real page text.
            distance = _leading_float(_card_value(card, 'span[data-testid="distance"]'))

            # URL
            link = _card_value(card, 'a[data-testid="title-link"]', attribute="href")

            hotels_data.append({
                "name": name,
                "price": price,
                "nro_of_nights": nights,
                "star_rating": star_rating,
                "review_score": review_score,
                "review_count": review_count,
                "room_type": room_type,
                "part_of_city": part_of_city,
                "distance_from_center": distance,
                "url": link
            })

            # Save every 50 hotels
            if len(hotels_data) % 50 == 0:
                pd.DataFrame(hotels_data).to_csv(csv_file, index=False)
                print(f"💾 Saved {len(hotels_data)} hotels so far...")

        # Update marker
        already_scraped = len(hotel_cards)

        # Try load more
        load_more(driver)
        page += 1
finally:
    # -------------------
    # CLOSE & SAVE
    # -------------------
    # Runs on normal completion, on errors and on Ctrl-C alike, so collected
    # rows are written out and the browser process is always released.
    try:
        pd.DataFrame(hotels_data).to_csv(csv_file, index=False)
    finally:
        driver.quit()

print(f"✅ Scraping finished. Total hotels saved: {len(hotels_data)}")

Scraping page 1...
Found 26 new hotels on this load.
Scraping page 2...
Found 25 new hotels on this load.
💾 Saved 50 hotels so far...
Scraping page 3...
Found 25 new hotels on this load.
Clicked 'Load more results'...
Scraping page 4...
Found 25 new hotels on this load.
💾 Saved 100 hotels so far...
Clicked 'Load more results'...
Scraping page 5...
Found 24 new hotels on this load.
Clicked 'Load more results'...
Scraping page 6...
Found 25 new hotels on this load.
💾 Saved 150 hotels so far...
Clicked 'Load more results'...
Scraping page 7...
Found 25 new hotels on this load.
Clicked 'Load more results'...
Scraping page 8...
Found 25 new hotels on this load.
💾 Saved 200 hotels so far...
Clicked 'Load more results'...
Scraping page 9...
Found 25 new hotels on this load.
Clicked 'Load more results'...
Scraping page 10...
Found 25 new hotels on this load.
💾 Saved 250 hotels so far...
Clicked 'Load more results'...
Scraping page 11...
Found 25 new hotels on this load.
Clicked 'Load more resu

In [12]:
# -------------------
# INPUT
# -------------------
csv_file = "booking_hotels_clean.csv"              # listing produced by the search scrape
output_csv = "booking_hotels_with_address.csv"     # listing plus the address column

# Load existing CSV
df = pd.read_csv(csv_file)

# Add address column if not present
if "address" not in df.columns:
    df["address"] = None
else:
    # An all-empty column is read back as float; make it object-typed so
    # address strings can be stored in it.
    df["address"] = df["address"].astype(object)

# -------------------
# SELENIUM SETUP
# -------------------
options = webdriver.ChromeOptions()
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# CSS selector of the address element on a hotel detail page.
address_selector = 'div.b99b6ef58f.cb4b7a25d9.b06461926f'

# -------------------
# SCRAPE ADDRESSES
# -------------------
try:
    for idx, row in df.iterrows():
        if pd.notna(row["address"]):
            continue  # skip if already collected

        url = row["url"]
        # Missing CSV cells are read as NaN, which is truthy, so `not url`
        # alone would not skip them; check for NaN explicitly.
        if pd.isna(url) or not url:
            continue

        try:
            driver.get(url)
            # Wait for address element; the wait returns the located element,
            # so no second lookup is needed.
            address_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, address_selector))
            )
            address = address_element.text
            df.at[idx, "address"] = address
            print(f"[{idx+1}/{len(df)}] Collected address for: {row['name']}")

            time.sleep(random.uniform(0.5, 1.5))  # human-like delay
        except Exception as e:
            # One failing page must not abort the run: report it and move on.
            print(f"[{idx+1}/{len(df)}] Failed for {row['name']}: {e}")
            continue
finally:
    # -------------------
    # SAVE CSV
    # -------------------
    # Executed even when the loop is interrupted, so addresses collected so
    # far are written out and the browser process is always released.
    try:
        df.to_csv(output_csv, index=False)
    finally:
        driver.quit()

print(f"✅ Done! Addresses saved to {output_csv}")

[1/800] Collected address for: The Muse Amsterdam - Boutique Hotel
[2/800] Collected address for: citizenM Amsterdam South
[3/800] Collected address for: Ruby Emma Hotel Amsterdam
[4/800] Collected address for: Inntel Hotels Amsterdam Landmark
[5/800] Collected address for: Urban Lodge Hotel
[6/800] Collected address for: The Social Hub Amsterdam City
[7/800] Collected address for: Via Amsterdam
[8/800] Collected address for: Joy Hotel
[9/800] Collected address for: Hotel Casa Amsterdam
[10/800] Collected address for: Holiday Inn Express Amsterdam - Sloterdijk Station by IHG
[11/800] Collected address for: ClinkNOORD Hostel
[12/800] Collected address for: Holiday Inn Express Amsterdam Arena Towers by IHG
[13/800] Collected address for: ibis Amsterdam Centre
[14/800] Collected address for: Hotel Estheréa
[15/800] Collected address for: Novotel Amsterdam City
[16/800] Collected address for: Tribe Amsterdam City
[17/800] Collected address for: Holiday Inn Express Amsterdam - North Riversi