---
author: "Robert Ritz"
draft: true
echo: false
---

# Data Collection

In [1]:
import pandas as pd

from bs4 import BeautifulSoup
from selenium.common.exceptions import ElementNotVisibleException, ElementNotSelectableException, TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from tqdm.auto import tqdm
import time

In [2]:
# Chrome launch switches for an unattended (headless) scraping session.
# They are registered on the shared `options` object in the order listed.
options = Options()
for chrome_switch in (
    "--headless=new",
    "disable-infobars",
    "--disable-extensions",
    "--disable-dev-shm-usage",
    "--no-sandbox",
):
    options.add_argument(chrome_switch)

In [3]:
# Airbnb search-location slugs, formatted `City--ST--United-States`.
# Each slug is interpolated into the search URL path (`/s/{city}/homes`) by the
# scraping loop, so the spelling and hyphenation here must match what the site
# accepts. The list holds 50 US cities.
cities = [
    "New-York--NY--United-States",
    "Los-Angeles--CA--United-States",
    "Chicago--IL--United-States",
    "Houston--TX--United-States",
    "Phoenix--AZ--United-States",
    "Philadelphia--PA--United-States",
    "San-Antonio--TX--United-States",
    "San-Diego--CA--United-States",
    "Dallas--TX--United-States",
    "San-Jose--CA--United-States",
    "Austin--TX--United-States",
    "Jacksonville--FL--United-States",
    "Fort-Worth--TX--United-States",
    "Columbus--OH--United-States",
    "Charlotte--NC--United-States",
    "San-Francisco--CA--United-States",
    "Indianapolis--IN--United-States",
    "Seattle--WA--United-States",
    "Denver--CO--United-States",
    "Washington--DC--United-States",
    "Boston--MA--United-States",
    "El-Paso--TX--United-States",
    "Nashville--TN--United-States",
    "Detroit--MI--United-States",
    "Oklahoma-City--OK--United-States",
    "Portland--OR--United-States",
    "Las-Vegas--NV--United-States",
    "Memphis--TN--United-States",
    "Louisville--KY--United-States",
    "Baltimore--MD--United-States",
    "Milwaukee--WI--United-States",
    "Albuquerque--NM--United-States",
    "Tucson--AZ--United-States",
    "Fresno--CA--United-States",
    "Mesa--AZ--United-States",
    "Sacramento--CA--United-States",
    "Atlanta--GA--United-States",
    "Kansas-City--MO--United-States",
    "Colorado-Springs--CO--United-States",
    "Miami--FL--United-States",
    "Raleigh--NC--United-States",
    "Omaha--NE--United-States",
    "Long-Beach--CA--United-States",
    "Virginia-Beach--VA--United-States",
    "Oakland--CA--United-States",
    "Minneapolis--MN--United-States",
    "Tulsa--OK--United-States",
    "Arlington--TX--United-States",
    "Tampa--FL--United-States",
    "New-Orleans--LA--United-States"
]

In [None]:
# Stay window applied to every search; the strings are passed verbatim into the URL.
checkin = '2024-03-25'
checkout = '2024-03-27'

# Upper bound (seconds) to wait for the pagination bar after loading a search page.
PAGE_LOAD_TIMEOUT_SECONDS = 10
# Pause (seconds) after a click so the results grid can re-render before it is read.
RENDER_PAUSE_SECONDS = 3
# Phrase that precedes the price text on a card once the price toggle has been clicked.
PRICE_MARKER = "total before taxes"
PAGINATION_SELECTOR = """nav[aria-label='Search results pagination']"""


def _pagination_links(driver):
    """Return the <a> elements inside the last <nav> on the page, or [] if there is no <nav>."""
    navs = driver.find_elements(By.TAG_NAME, "nav")
    return navs[-1].find_elements(By.TAG_NAME, "a") if navs else []


def _count_result_pages(driver):
    """Read the number of result pages from the pagination bar.

    The second-to-last link of the last <nav> is expected to carry the highest
    page number (the last link being the "next" control).

    Returns:
        The page count as an int, or None when the bar is missing, has fewer
        than two links, or the link text is not an integer.
    """
    links = _pagination_links(driver)
    if len(links) < 2:
        return None
    try:
        return int(links[-2].text)
    except ValueError:
        return None


def _parse_listing_card(item):
    """Extract the per-listing fields from one `itemListElement` card.

    Args:
        item: BeautifulSoup tag for a single search-result card.

    Returns:
        A dict with 'name', 'url', 'card_title' and 'price', or None when any
        of the expected elements is absent, so one malformed card cannot abort
        the whole scrape.
    """
    name_meta = item.find('meta', {'itemprop': 'name'})
    url_meta = item.find('meta', {'itemprop': 'url'})
    title_div = item.find('div', {'data-testid': 'listing-card-title'})
    price_div = item.find(lambda tag: tag.name == 'div' and PRICE_MARKER in tag.text)
    if any(node is None for node in (name_meta, url_meta, title_div, price_div)):
        return None
    return {
        'name': name_meta.get('content'),
        'url': url_meta.get('content'),
        'card_title': title_div.text,
        # Keep only the text that follows the marker phrase.
        'price': price_div.text.split(PRICE_MARKER)[1].strip(),
    }


listings = []
for adults in tqdm(range(2, 7), desc="Adults"):  # adults ranging from 2 to 6
    # position=1 keeps the city bar on its own row instead of overwriting the
    # outer bar; leave=False clears it once each pass over the cities is done.
    for city in tqdm(cities, position=1, leave=False, desc="City"):
        url = f"https://www.airbnb.com/s/{city}/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&price_filter_input_type=0&date_picker_type=calendar&source=structured_search_input_header&search_type=filter_change&adults={adults}&checkin={checkin}&checkout={checkout}"
        # Load root city listings in a fresh browser session
        driver = Chrome(options=options)
        try:
            driver.get(url)
            wait = WebDriverWait(
                driver,
                timeout=PAGE_LOAD_TIMEOUT_SECONDS,
                poll_frequency=1,
                ignored_exceptions=[ElementNotVisibleException, ElementNotSelectableException],
            )
            try:
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PAGINATION_SELECTOR)))
            except TimeoutException:
                # Best effort: carry on and let the guards below decide whether
                # the page is usable.
                tqdm.write(f"Timed out waiting for pagination: {city} (adults={adults})")

            # Click the switch so cards show the full price; PRICE_MARKER is
            # only expected on the cards after this click.
            toggles = driver.find_elements(By.CSS_SELECTOR, 'button[role="switch"]')
            if not toggles:
                tqdm.write(f"Price toggle not found, skipping: {city} (adults={adults})")
                continue
            toggles[0].click()
            time.sleep(RENDER_PAUSE_SECONDS)

            # Determine number of pages in listings
            pages = _count_result_pages(driver)
            if pages is None:
                tqdm.write(f"Could not read page count, skipping: {city} (adults={adults})")
                continue

            # Navigate through each successive page in the listings
            skipped_cards = 0
            for page in range(pages):
                # Name the parser explicitly so results do not depend on which
                # optional parsers happen to be installed.
                soup = BeautifulSoup(driver.page_source, "html.parser")
                for item in soup.find_all("div", {'itemprop': 'itemListElement'}):
                    features = _parse_listing_card(item)
                    if features is None:
                        skipped_cards += 1
                        continue
                    features.update(city=city, checkin=checkin, checkout=checkout, adults=adults)
                    listings.append(features)
                # Checkpoint everything collected so far after every page, so a
                # crash loses at most the page in progress.
                pd.DataFrame(listings).to_csv("airbnb_listings.csv", index=False)

                # Nothing to advance to after the final page.
                if page == pages - 1:
                    break
                # Go to next page and give it time to load
                links = _pagination_links(driver)
                if not links:
                    break
                links[-1].click()
                time.sleep(RENDER_PAUSE_SECONDS)

            if skipped_cards:
                tqdm.write(f"Skipped {skipped_cards} malformed cards: {city} (adults={adults})")
        finally:
            # Always release the browser, including on `continue` and on errors.
            driver.quit()

Adults:   0%|          | 0/5 [00:00<?, ?it/s]

City:   0%|          | 0/50 [00:00<?, ?it/s]