In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from typing import List, Dict

# --- Configuration ---
# Placeholder: replace with the address of a real StubHub event page before running.
# The value is handed unchanged to scrape_stubhub_tickets() in the execution block.
STUBHUB_EVENT_URL = "YOUR_STUBHUB_EVENT_URL_HERE" 
# Fixed pause, in seconds, applied after the page is opened so that its JavaScript
# has time to render the ticket listings before the HTML is captured.
WAIT_TIME = 5

def scrape_stubhub_tickets(url: str, event_id: str) -> List[Dict]:
    """
    Scrape ticket listings from a single StubHub event URL using Selenium.

    The page is opened in headless Chrome, given ``WAIT_TIME`` seconds for its
    JavaScript to render, and the resulting HTML is parsed with BeautifulSoup.

    Args:
        url: Address of the StubHub event page to load.
        event_id: Identifier copied verbatim into every returned record.

    Returns:
        One dict per parsed listing with the keys ``event_id``,
        ``extraction_time``, ``ticket_price``, ``section`` and ``ticket_row``.
        Fields whose element is absent from a listing are ``None``.
        An empty list is returned when the browser step fails or when no
        element matches the listing selector.

    Note:
        The CSS class names used below are EXAMPLES; inspect the live page and
        replace them with the current selectors.
    """
    print(f"-> Starting scrape for event: {event_id}")

    # `driver` stays None until Chrome has actually started, so the `finally`
    # clause knows whether there is a browser process to shut down.
    driver = None
    try:
        # webdriver_manager downloads/locates a matching chromedriver executable.
        service = Service(ChromeDriverManager().install())

        # Run in headless mode for efficiency (no visible browser window).
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        driver = webdriver.Chrome(service=service, options=options)
        driver.get(url)

        # IMPORTANT: wait for the JavaScript to execute and load the listings.
        print(f"   Waiting {WAIT_TIME} seconds for listings to load...")
        time.sleep(WAIT_TIME)

        # Capture the fully rendered page source.
        page_source = driver.page_source

    except Exception as e:
        print(f"!!! Error during WebDriver operation: {e}")
        return []
    finally:
        # Always release the browser, including when navigation or the wait
        # failed; otherwise every failed run leaves a Chrome process behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"!!! Error while closing WebDriver: {quit_error}")

    # --- BeautifulSoup Parsing ---
    soup = BeautifulSoup(page_source, 'html.parser')
    ticket_listings = []
    skipped_rows = 0

    # Find all ticket listing containers (EXAMPLE class name).
    listing_rows = soup.find_all('div', class_='listing-row')

    for row in listing_rows:
        try:
            # Target variable: price, with currency symbol and thousands
            # separators stripped before conversion.
            price_element = row.find('span', class_='price-value')  # EXAMPLE SELECTOR
            if price_element is not None:
                price = float(price_element.text.replace('$', '').replace(',', '').strip())
            else:
                price = None

            # Features: section and row.
            section_element = row.find('div', class_='section-name')  # EXAMPLE SELECTOR
            section = section_element.text.strip() if section_element else None

            row_element = row.find('div', class_='row-name')  # EXAMPLE SELECTOR
            ticket_row = row_element.text.strip() if row_element else None

            ticket_listings.append({
                "event_id": event_id,
                "extraction_time": pd.Timestamp.now(),
                "ticket_price": price,
                "section": section,
                "ticket_row": ticket_row,
            })

        except Exception:
            # Best effort: a malformed row (e.g. a non-numeric price) is
            # dropped, but counted so the loss is visible in the log.
            skipped_rows += 1
            continue

    if skipped_rows:
        print(f"   Skipped {skipped_rows} listing(s) that could not be parsed.")

    print(f"-> Successfully extracted {len(ticket_listings)} ticket listings.")
    return ticket_listings

# --- Execution ---
# Runs the Selenium scraper once and stores the result as a parquet file.
if __name__ == "__main__":
    from pathlib import Path

    # Example usage
    EVENT_ID = "Your-Concert-ID-from-Bandsintown"
    output_path = Path("data/processed/stubhub_ticket_panel.parquet")

    all_data = scrape_stubhub_tickets(STUBHUB_EVENT_URL, EVENT_ID)

    if all_data:
        df = pd.DataFrame(all_data)
        # Create the target folder first; to_parquet does not create
        # missing parent directories.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Save to parquet to match the project's structure.
        df.to_parquet(output_path, index=False)
        print("\n--- Scrape Complete ---")
        print(f"Data saved to {output_path.as_posix()}")
        print(df.head())
    else:
        # An empty result covers both a WebDriver failure and selectors that
        # matched nothing, so the hint mentions both causes.
        print("\n--- Scrape Failed ---")
        print("Check any WebDriver error printed above, the URL, and the CSS selectors.")

-> Starting scrape for event: Your-Concert-ID-from-Bandsintown
!!! Error during WebDriver operation: Message: unknown error: cannot find Chrome binary
Stacktrace:
#0 0x610df43414e3 <unknown>
#1 0x610df4070c76 <unknown>
#2 0x610df4097757 <unknown>
#3 0x610df4096029 <unknown>
#4 0x610df40d4ccc <unknown>
#5 0x610df40d447f <unknown>
#6 0x610df40cbde3 <unknown>
#7 0x610df40a12dd <unknown>
#8 0x610df40a234e <unknown>
#9 0x610df43013e4 <unknown>
#10 0x610df43053d7 <unknown>
#11 0x610df430fb20 <unknown>
#12 0x610df4306023 <unknown>
#13 0x610df42d41aa <unknown>
#14 0x610df432a6b8 <unknown>
#15 0x610df432a847 <unknown>
#16 0x610df433a243 <unknown>
#17 0x78c9d7ca9aa4 <unknown>
#18 0x78c9d7d36c6c <unknown>


--- Scrape Failed ---
Check the URL and ensure the CSS selectors are correct.


In [None]:
# StubHub scraper: more recent variant using undetected-chromedriver

import time
import pandas as pd
from bs4 import BeautifulSoup
from typing import List, Dict

# Use undetected_chromedriver for stealth
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# --- Configuration ---
# Placeholders: replace both values before running the cell.
# STUBHUB_EVENT_URL is the page the browser opens.
STUBHUB_EVENT_URL = "YOUR_STUBHUB_EVENT_URL_HERE" 
# EVENT_ID is copied into the "event_id" field of every scraped record.
EVENT_ID = "YOUR-EVENT-ID-HERE" 

def scrape_stubhub_tickets_stealth(
    url: str,
    event_id: str,
    wait_timeout: float = 20,
    settle_delay: float = 2,
) -> List[Dict]:
    """
    Scrape ticket listings from a single StubHub event URL using
    undetected-chromedriver.

    The browser waits until an element matching the listing selector is
    present, pauses briefly, then the rendered HTML is parsed with
    BeautifulSoup.

    Args:
        url: Address of the StubHub event page to load.
        event_id: Identifier copied verbatim into every returned record.
        wait_timeout: Maximum number of seconds to wait for the listing
            element to appear (default 20).
        settle_delay: Extra pause in seconds after the element is found
            (default 2).

    Returns:
        One dict per parsed listing with the keys ``event_id``,
        ``extraction_time``, ``ticket_price``, ``section`` and ``ticket_row``.
        Fields whose element is absent from a listing are ``None``.
        An empty list is returned when the browser cannot be started, when
        navigation or the wait fails, or when no row matches the selector.

    Note:
        Every selector below is a PLACEHOLDER; replace them with the actual
        attributes/classes found by inspecting a live page.
    """
    print(f"-> Starting stealth scrape for event: {event_id}")

    # `driver` stays None until the browser has started, so `finally` knows
    # whether there is anything to shut down.
    driver = None
    try:
        options = uc.ChromeOptions()
        # Headless mode is intentionally left off so the run can be observed
        # while the selectors are being confirmed.
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        # Started inside the `try` so a start-up failure (e.g. a missing
        # Chrome binary) is reported and yields [] instead of raising.
        driver = uc.Chrome(options=options)
        driver.get(url)

        # CRITICAL STEP: wait for the dynamic content (ticket listings) to appear.
        # Replace this selector with a unique id, class or data-qa attribute
        # of the main ticket section.
        ticket_listing_selector = (By.CSS_SELECTOR, 'div[data-qa="ticket-listing"]')  # PLACEHOLDER

        print("   Waiting for listings to load...")
        WebDriverWait(driver, wait_timeout).until(
            EC.presence_of_element_located(ticket_listing_selector)
        )

        # Mimic human behavior with a small delay.
        time.sleep(settle_delay)

        page_source = driver.page_source

    except Exception as e:
        print(f"!!! Error during stealth scrape: {e}")
        return []
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"!!! Error while closing browser: {quit_error}")

    # --- BeautifulSoup Parsing ---
    soup = BeautifulSoup(page_source, 'html.parser')
    ticket_listings = []
    skipped_rows = 0

    listing_rows = soup.find_all('div', class_='listing-row-class')  # PLACEHOLDER for the entire ticket row

    for row in listing_rows:
        try:
            # Target variable: price, with currency symbol and thousands
            # separators stripped before conversion.
            price_element = row.find('span', class_='price-value-class')  # PLACEHOLDER
            if price_element is not None:
                price = float(price_element.text.replace('$', '').replace(',', '').strip())
            else:
                price = None

            # Features: section and row.
            section_element = row.find('div', class_='section-name-class')  # PLACEHOLDER
            section = section_element.text.strip() if section_element else None

            row_element = row.find('div', class_='row-name-class')  # PLACEHOLDER
            ticket_row = row_element.text.strip() if row_element else None

            ticket_listings.append({
                "event_id": event_id,
                "extraction_time": pd.Timestamp.now(),
                "ticket_price": price,
                "section": section,
                "ticket_row": ticket_row,
            })

        except Exception:
            # Best effort: a malformed row is dropped, but counted so the
            # loss is visible in the log.
            skipped_rows += 1
            continue

    if skipped_rows:
        print(f"   Skipped {skipped_rows} listing(s) that could not be parsed.")

    print(f"-> Successfully extracted {len(ticket_listings)} ticket listings.")
    return ticket_listings

# --- Execution Block (runs when executed as a script or as a notebook cell) ---
# Runs the stealth scraper once and stores the result as a parquet file.
if __name__ == "__main__":
    from pathlib import Path

    output_path = Path("data/processed/stubhub_ticket_panel.parquet")

    all_data = scrape_stubhub_tickets_stealth(STUBHUB_EVENT_URL, EVENT_ID)

    if all_data:
        df = pd.DataFrame(all_data)
        # Create the target folder first; to_parquet does not create
        # missing parent directories.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_parquet(output_path, index=False)
        print("\n--- Scrape Complete ---")
        try:
            # to_markdown() relies on the optional `tabulate` package.
            print(df.head().to_markdown())
        except ImportError:
            # Fall back to the plain repr rather than failing after a
            # successful scrape and save.
            print(df.head())
    else:
        print("\n--- Scrape Failed ---")