---
author: "Robert Ritz"
draft: true
echo: false
---

# Data Collection

In [6]:
import pandas as pd

from bs4 import BeautifulSoup
from selenium.common.exceptions import ElementNotVisibleException, ElementNotSelectableException, TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from tqdm.auto import tqdm
import time

In [7]:
# Command-line switches handed to every Chrome instance the scraper launches.
# The strings are passed through to Chrome unchanged, in this order.
options = Options()
for chrome_flag in (
    "--headless=new",
    "disable-infobars",
    "--disable-extensions",
    "--disable-dev-shm-usage",
    "--no-sandbox",
):
    options.add_argument(chrome_flag)

In [8]:
# Every search uses the same check-in date and is repeated once per checkout
# date listed below.
checkin = "2024-03-25"
checkout_list = [
    "2024-03-27",
    "2024-03-28",
    "2024-03-29",
    "2024-03-30",
    "2024-03-31",
]

# City slugs, inserted verbatim into the search URL path.
cities = [
    "New-York--NY--United-States",
    "Los-Angeles--CA--United-States",
    "Chicago--IL--United-States",
    "Austin--TX--United-States",
    "Las-Vegas--NV--United-States",
]

In [None]:
# Tax rate per city, keyed by the same slug strings used in `cities`.
# NOTE(review): presumably a fractional rate (0.10 == 10%) to apply on top of
# the scraped "total before taxes" price -- confirm the source of these
# figures, and whether 0 for New York is intentional or a placeholder.
tax_rates = {
    "New-York--NY--United-States": 0,
    "Los-Angeles--CA--United-States": 0.10,
    "Chicago--IL--United-States": 0.22,
    "Austin--TX--United-States": 0.0525,
    "Las-Vegas--NV--United-States": 0.116,
}

In [9]:
def _count_result_pages(driver):
    """Return the page count shown in the pagination bar, or None.

    Reads the text of the second-to-last link inside the last <nav> element
    on the page (the last link is the one clicked to advance to the next
    page). Returns None when there is no <nav>, when it holds fewer than two
    links, or when the link text is not an integer.
    """
    navs = driver.find_elements(By.TAG_NAME, "nav")
    if not navs:
        return None
    links = navs[-1].find_elements(By.TAG_NAME, "a")
    if len(links) < 2:
        return None
    try:
        return int(links[-2].text)
    except ValueError:
        return None


def _parse_listing_card(item):
    """Extract name, url, card title and price text from one listing card.

    Returns a dict with keys 'name', 'url', 'card_title' and 'price', or
    None when any of the expected nodes is missing from the card, so that a
    single malformed card cannot abort the whole scrape.
    """
    name_tag = item.find('meta', {'itemprop': 'name'})
    url_tag = item.find('meta', {'itemprop': 'url'})
    title_tag = item.find('div', {'data-testid': 'listing-card-title'})
    price_tag = item.find(lambda tag: tag.name == 'div' and "total before taxes" in tag.text)
    if any(tag is None for tag in (name_tag, url_tag, title_tag, price_tag)):
        return None
    return {
        'name': name_tag['content'],
        'url': url_tag['content'],
        'card_title': title_tag.text,
        # Keep only the text that follows the "total before taxes" label.
        'price': price_tag.text.split("total before taxes")[1].strip(),
    }


# Scrape every (checkout, adults, city) combination. `listings` accumulates
# one dict per listing card and is re-written to airbnb_listings.csv after
# every results page, so an interrupted run keeps what it has collected.
listings = []
for checkout in tqdm(checkout_list, desc='Checkout Date', position=0):
    for adults in tqdm(range(2, 7), desc="Adults", position=1, leave=False):  # adults ranging from 2 to 6
        for city in tqdm(cities, desc="City", position=2, leave=False):
            url = (
                f"https://www.airbnb.com/s/{city}/homes"
                "?tab_id=home_tab"
                "&refinement_paths%5B%5D=%2Fhomes"
                "&price_filter_input_type=0"
                "&date_picker_type=calendar"
                "&source=structured_search_input_header"
                "&search_type=filter_change"
                f"&adults={adults}&checkin={checkin}&checkout={checkout}"
            )

            # A fresh browser per search; the finally clause guarantees it is
            # closed on `continue`, on errors and on KeyboardInterrupt.
            driver = Chrome(options=options)
            try:
                # Load root city listings
                driver.get(url)
                wait = WebDriverWait(
                    driver,
                    timeout=10,
                    poll_frequency=1,
                    ignored_exceptions=[ElementNotVisibleException, ElementNotSelectableException],
                )
                try:
                    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, """nav[aria-label='Search results pagination']""")))
                except TimeoutException:
                    # Best effort: carry on; the checks below skip this
                    # search if the pagination bar really is missing.
                    pass

                # Click the toggle to see the full price, if it is present.
                switches = driver.find_elements(By.CSS_SELECTOR, 'button[role="switch"]')
                if switches:
                    switches[0].click()
                    time.sleep(3)

                # Determine number of pages in listings
                pages = _count_result_pages(driver)
                if pages is None:
                    continue

                # Navigate through each successive page in the listings
                for page in range(pages):
                    # Name the parser explicitly so results do not depend on
                    # which optional parsers happen to be installed.
                    soup = BeautifulSoup(driver.page_source, "html.parser")
                    for item in soup.find_all("div", {'itemprop': 'itemListElement'}):
                        features = _parse_listing_card(item)
                        if features is None:
                            continue
                        features.update(city=city, checkin=checkin, checkout=checkout, adults=adults)
                        listings.append(features)
                    pd.DataFrame(listings).to_csv("airbnb_listings.csv", index=False)

                    # Go to the next page and give it time to load. Nothing
                    # is parsed after the last page, so skip the click there.
                    if page < pages - 1:
                        driver.find_elements(By.TAG_NAME, "nav")[-1].find_elements(By.TAG_NAME, "a")[-1].click()
                        time.sleep(3)
            finally:
                driver.quit()

Checkout Date:   0%|          | 0/5 [00:00<?, ?it/s]

Adults:   0%|          | 0/5 [00:00<?, ?it/s]

City:   0%|          | 0/5 [00:00<?, ?it/s]

City:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 