# Web Scraping Ticket Prices from StubHub

This Jupyter Notebook demonstrates how to scrape ticket prices from StubHub using Selenium and process the data with pandas. The workflow includes:

1. Setting up the Selenium WebDriver with custom options.
2. Reading ticket sales data from an Excel file.
3. Logging into StubHub and navigating to the search results.
4. Extracting ticket prices for specified artists.
5. Saving the scraped data back to an Excel file for further analysis.

Below are the detailed steps and code implementation.


In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from math import nan
from datetime import datetime
import re
import time
import json




# Get today's date
current_date = datetime.today()

In [2]:
# Define a test user-agent string to simulate a browser request
test_ua = 'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

# Initialize Chrome options for the WebDriver
options = Options()

# Uncomment the following line if you want to run the browser in headless mode (no GUI)
# options.add_argument("--headless")

# Set the window size for the browser
options.add_argument("--window-size=1920,1080")

# Set the user-agent to the test user-agent defined above
options.add_argument(f'--user-agent={test_ua}')

# Add additional options to improve stability and compatibility
options.add_argument('--no-sandbox')  # Required for running in some environments
options.add_argument("--disable-extensions")  # Disable extensions to avoid potential conflicts
# options.add_argument("user-data-dir=C:\\Users\\eric9\\AppData\\Local\\Google\\Chrome\\User Data\\Default")

# Initialize the Chrome WebDriver with the specified options
driver = webdriver.Chrome(options=options)


In [3]:
# The following code reads an Excel file and loads the data into a pandas DataFrame
path = "../../Documents/Ticket Sales.xlsx"
events = pd.read_excel(path, sheet_name ="Events")
# events = pd.read_excel(path, sheet_name ="Sheet1")


In [4]:
"""
This code snippet is designed to automate the process of logging into the StubHub website using Selenium WebDriver. Here's a step-by-step breakdown of what the code does:
1. Navigates to StubHub's homepage.
2. Clicks on the 'Sign In' button.
3. Waits for the email input field to be present and enters the email.
4. Enters the password and submits the form.
5. Attempts to click the submit button if it appears.
Note: Ensure that the necessary imports for Selenium WebDriver, WebDriverWait, and expected conditions (EC) are included in your script.
"""
# Navigate to StubHub's homepage
driver.get("https://www.stubhub.ca")

# Click on the 'Sign In' button
driver.find_element(By.XPATH, "//*[text() ='Sign In']").click()

# Wait for the email input field to be present and enter the email
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='email']")))
driver.find_element(By.CSS_SELECTOR, "input[type='email']").send_keys("eric9090909090@hotmail.com")

# Enter the password and submit the form
driver.find_element(By.CSS_SELECTOR, "input[type=password]").send_keys("BlckPnk39!@!" + Keys.ENTER)
time.sleep(5)

# Attempt to click the submit button if it appears
try:
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
except:
    pass
time.sleep(5)

## StubHub Event Scraper Functions

In [None]:
from bs4 import BeautifulSoup

def generate_stubhub_url(artist, location= "Toronto"):
    """
    Generates a StubHub search URL for a given string.
    This function takes a string input, replaces spaces with plus signs,
    and appends it to a predefined StubHub search URL. If the input is 
    None or NaN, it returns a placeholder string "lol".
    
    Args:
        search_query (str): The search query string.
    
    Returns:
        str: A formatted StubHub search URL or "lol" if the input is None or NaN.
    """
    if pd.isna(artist):
        artist = "lol"
    if pd.isna(location):
        location = "Toronto"
    return "https://www.stubhub.ca/secure/search?q=" + artist + "%20" + location

def get_event_link(event_grid, date):
    """
    Retrieves the href attribute of the event link from the specified XPath.
    
    Returns:
        str: The href attribute of the event link.
    """
    events =  event_grid["0"]["items"]
    if pd.isna(date):
        return events[0]["url"] + "&betterValueTickets=false" + "&estimatedFees=false"
    for event in events:
        if event["formattedDate"] == date.strftime("%d %b %Y"):
            return event["url"] + "&betterValueTickets=false" + "&estimatedFees=false"
    return events[0]["url"] + "&betterValueTickets=false" + "&estimatedFees=false"+"&quantity=0"

def get_date(event_grid):
    """
    Extracts a date from a given URL string using regex.
    
    Args:
        url (str): The URL string to extract the date from.
    
    Returns:
        datetime: The extracted date as a datetime object, or None if no date is found.
    """
    if len(event_grid["0"]["items"]) == 0:
        return None
    date_str = event_grid["0"]["items"][0]["formattedDate"]

    # Check if the date string matches the format '27 Nov' or '27 Nov 2025'
    if re.match(r'\d{2} \w{3} \d{4}', date_str):
        date = datetime.strptime(date_str, "%d %b %Y")
    elif re.match(r'\d{2} \w{3}', date_str):
        date = datetime.strptime(date_str + " " + str(current_date.year), "%d %b %Y")
    else:
        return None
    return date


venues = ["Budweiser", "History", " Rogers Stadium", "Massey Hall", "DPRTMNT", "Rogers Arena", "Axis", "Noir", "Rebel", "Cabana", "Woodbine Park", "CODA", "Metropolis"]
def get_venue_text(event_grid):
    """
    Retrieves the text of the venue from the specified XPath from the search query page
    
    Args:
    
    Returns:
        str: The text of the venue.
    """                                                
    venue = event_grid["0"]["items"][0]["venueName"]
    for v in venues:
        if v in venue:
            return v
    return venue

def get_location(event_grid):
    """
    Retrieves the text of the location from the specified XPath.
    
    Returns:
        str: The text of the city.
    """
    return event_grid["0"]["items"][0]["venueCity"]

def get_ticket_prices():
    """
    Retrieves ticket prices from a web page using Selenium.
    This function finds elements on the page that contain ticket prices, extracts the prices,
    and returns the three lowest prices, the user's ticket price (if available), and the maximum resell price.
    Returns:
        tuple: A tuple containing:
            - ticket_prices (list): A list of the three lowest ticket prices.
            - user_ticket_price (float): The price of the user's ticket, if available.
            - max_resell (float): The maximum resell price found.
    """

    ticket_price_elements = driver.find_elements(By.CLASS_NAME, "sc-1bp3ico-0")
    listing_count = len(ticket_price_elements)
    ticket_prices = [0, 0, 0]
    user_ticket_price = 0
    max_resell = 0
    for i in range(listing_count):
        try:
            price = ticket_price_elements[i].get_attribute("data-price")
        except:
            ticket_price_elements = driver.find_elements(By.CLASS_NAME, "sc-1bp3ico-0")
            if len(ticket_price_elements) != 0:
                price = ticket_price_elements[i].get_attribute("data-price")
            else:
                price = 0
            pass
        if i < 3:
            ticket_prices[i]=(float(re.sub(r'[^\d.]', '', price )))
        if len(ticket_price_elements[i].find_elements(By.CLASS_NAME, "sc-1l8fa2j-14")) > 0:
            try:
                user_ticket_price = (float(re.sub(r'[^\d.]', '', price )))
            except:
                pass
        if len(ticket_price_elements[i].find_elements(By.CLASS_NAME, "sc-cm4cry-3")) > 0:
            new_high =  (float(re.sub(r'[^\d.]', '', price )))
            if new_high > max_resell:
                try:
                    max_resell = (float(re.sub(r'[^\d.]', '', price )))
                except:
                    pass
        

        
    if (len(ticket_price_elements) > 5) & (max_resell == 0):    
        max_resell = ticket_prices[0]
    return ticket_prices, user_ticket_price, max_resell


def get_ticket_prices():
    """
    Retrieves ticket prices from a web page using Selenium and BeautifulSoup.
    This function finds elements on the page that contain ticket prices, extracts the prices,
    and returns the three lowest prices, the user's ticket price (if available), and the maximum resell price.
    Returns:
        tuple: A tuple containing:
            - ticket_prices (list): A list of the three lowest ticket prices.
            - user_ticket_price (float): The price of the user's ticket, if available.
            - max_resell (float): The maximum resell price found.
    """

    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    ticket_price_elements = soup.find_all(class_="sc-1bp3ico-0")
    listing_count = len(ticket_price_elements)
    ticket_prices = [0, 0, 0]
    user_ticket_price = 0
    max_resell = 0

    for i in range(listing_count):
        try:
            price = ticket_price_elements[i].get('data-price')
        except:
            ticket_price_elements = soup.find_all(class_="sc-1bp3ico-0")
            if len(ticket_price_elements) != 0:
                price = ticket_price_elements[i].get('data-price')
            else:
                price = 0
            pass
        if i < 3:
            ticket_prices[i] = float(re.sub(r'[^\d.]', '', price))
        if ticket_price_elements[i].find(class_="sc-1l8fa2j-14"):
            try:
                user_ticket_price = float(re.sub(r'[^\d.]', '', price))
            except:
                pass
        if ticket_price_elements[i].find(class_="sc-cm4cry-3"):
            new_high = float(re.sub(r'[^\d.]', '', price))
            if new_high > max_resell:
                try:
                    max_resell = float(re.sub(r'[^\d.]', '', price))
                except:
                    pass

    if (len(ticket_price_elements) > 5) & (max_resell == 0):
        max_resell = ticket_prices[0]
    return ticket_prices, user_ticket_price, max_resell


## Helper Functions for StubHub Automation

In [6]:
def close_prompts():
    """
    Closes any modal that appears and applies ticket filters.
    """
    # Close any modal that appears
    driver.find_element(By.XPATH, '//*[@id="modal-root"]/div/div/div/div[2]/div[3]/button').click()
   
def apply_ticket_filters():   
    driver.find_element(By.CSS_SELECTOR, "div.sc-xrltsx-2").click()
    reccomended_filter = driver.find_element(By.XPATH, "//*[@id='stubhub-event-detail-popular-filters']/div/div/div/div[2]/div/div/div/div[2]/div/input")
    if reccomended_filter.get_attribute("value") == "true":
        time.sleep(3)
        reccomended_filter.click()


def filter_tickets_by_venue(venue_name):
    """
    Filters tickets by venue and clicks on the appropriate ticket filter.

    Args:
        venue_name (str): The name of the venue to filter tickets by.
    """
    ticket_filter = driver.find_element(By.XPATH, "//*[@id='stubhub-event-detail-ticket-class-filter']")
    ticket_filter.click()
    div_elements = driver.find_elements(By.CLASS_NAME, "sc-vt2wmu-3")
    venue_filters = {
        'scotiabank arena': 'Lower',
        'massey': 'Main Floor',
        'place bell': 'Floor',
        'rogers stadium': 'Floor',
        'budweiser': 'Lawn',
        'cola': 'Floor',
        'rogers center': 'Field',
        'rogers arena': 'Floor',
    }
    for venue, filter_text in venue_filters.items():
        if venue in venue_name.lower():
            for div in div_elements:
                if filter_text in div.text:
                    div.click()
                    return


In [7]:

def process_artist(index, row):
    """
    Processes an artist's event by navigating to the search URL, applying filters, and retrieving ticket prices.
    
    Args:
        row (pd.Series): A row from the sales DataFrame containing artist and event information.
    
    Returns:
        tuple: A tuple containing the artist's name, the ticket price from StubHub, and the user's ticket price.
    """

    artist_search_url = generate_stubhub_url(row["Artist"], row["Location"])
    driver.get("view-source:" + artist_search_url)
    event_grid = driver.page_source
    soup = BeautifulSoup(event_grid, 'html.parser')
    event_grid = soup.find('td', class_='line-content', string=lambda x: x and 'eventGrids' in x)
    json_str = event_grid.text.strip()
    data = json.loads(json_str)
    
    event_grid = data["eventGrids"]
    if len(event_grid["0"]["items"]) == 0:
        return row["Artist"], [0, 0, 0], 0
    date = get_date(event_grid)
    venue = get_venue_text(event_grid)
    location = get_location(event_grid)
    if pd.isna(row["Location"]):
        events.at[index, "Location"] = location
    if pd.isna(row["Date"]):
        events.at[index, "Date"] = date
    if pd.isna(row["Venue"]):
        events.at[index, "Venue"] = venue
    if pd.isna(row["Min Cost"]):
        events.at[index, "Min Cost"] = 0
    if pd.isna(row["Max Resell"]):
        events.at[index, "Max Resell"] = 0
    
    event_link = get_event_link(event_grid, row["Date"])
    driver.get(event_link)
    # Close any modal that appears
    
    try:
        close_prompts()
        time.sleep(1)
        apply_ticket_filters()
        filter_tickets_by_venue( events.at[index, "Venue"])
        time.sleep(5)
    except:
        pass
    ticket_price, user_ticket_price, max_resell = get_ticket_prices()
    if (max_resell > events.at[index, "Max Resell"] or pd.isna(row["Max Resell"])):
        events.at[index, "Max Resell"] = ticket_price[0]
    return row["Artist"], ticket_price, user_ticket_price





## Processing Artists and Ticket Prices


In [8]:
# Initialize lists to store results
processed_artists = []
ticket_prices = []
user_ticket_prices = []
event_dates = []


# Iterate over each row in the sales DataFrame
for index, row in events.iterrows():
    if (current_date < row["Date"] or pd.isna(row["Date"])):
        artist, ticket_price, user_ticket_price = process_artist(index, row)
        if artist:
            processed_artists.append(artist)
            ticket_prices.append(ticket_price)
            user_ticket_prices.append(user_ticket_price)
            event_dates.append(row["Date"])
            print(artist, ticket_price, user_ticket_price)
            


acraze [0, 0, 0] 0
Alan Walker [120.0, 121.0, 222.0] 0
Angrybaby [31.0, 167.0, 0] 0
Azzeca [44.0, 0, 0] 0
Baynk [42.0, 47.0, 76.0] 0
BBNO$ [76.0, 85.0, 85.0] 0
Becky Hill [81.0, 85.0, 86.0] 0
Ben Bohmer [96.0, 115.0, 118.0] 0
Berry Can't Swim [74.0, 79.0, 82.0] 0
Billie Eilish [213.0, 221.0, 223.0] 0
Billy Joel [216.0, 217.0, 225.0] 0
Black Tiger Sex Machine [117.0, 198.0, 200.0] 0
Blanke [34.0, 61.0, 0] 0
BUNT [138.0, 138.0, 152.0] 0
caribou [70.0, 124.0, 126.0] 0
carol ades [138.0, 0, 0] 0
Chelsea Cutler and Jeremy Zucker [94.0, 116.0, 123.0] 0
Chris avant garde [0, 0, 0] 0
Chris Luno [41.0, 0, 0] 0
Cloonee [76.0, 80.0, 80.0] 0
ColdPlay [534.0, 557.0, 558.0] 0
ColdPlay [534.0, 557.0, 558.0] 0
ColdPlay [534.0, 557.0, 558.0] 0
ColdPlay [534.0, 557.0, 558.0] 0
Counterparts [50.0, 50.0, 50.0] 0
Creed [54.0, 55.0, 56.0] 0
Dabin [86.0, 0, 0] 0
dance with the dead [58.0, 0, 0] 0
DeathPack [0, 0, 0] 0
Destroy Lonely [74.0, 75.0, 76.0] 0
Dua Lipa [181.0, 198.0, 202.0] 0
ENTOURAGE FEST: OGUZ, 

StaleElementReferenceException: Message: stale element reference: stale element not found in the current frame
  (Session info: chrome=130.0.6723.117); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6532738A5+3004357]
	(No symbol) [0x00007FF652F09970]
	(No symbol) [0x00007FF652DB582A]
	(No symbol) [0x00007FF652DBC2E2]
	(No symbol) [0x00007FF652DBE637]
	(No symbol) [0x00007FF652DBE6F0]
	(No symbol) [0x00007FF652E0563A]
	(No symbol) [0x00007FF652E05E7C]
	(No symbol) [0x00007FF652DF979C]
	(No symbol) [0x00007FF652E2BC1F]
	(No symbol) [0x00007FF652DF92A6]
	(No symbol) [0x00007FF652E2BDF0]
	(No symbol) [0x00007FF652E4BA4C]
	(No symbol) [0x00007FF652E2B983]
	(No symbol) [0x00007FF652DF7628]
	(No symbol) [0x00007FF652DF8791]
	GetHandleVerifier [0x00007FF65329A00D+3161901]
	GetHandleVerifier [0x00007FF6532EE060+3506048]
	GetHandleVerifier [0x00007FF6532E400D+3465005]
	GetHandleVerifier [0x00007FF653060EEB+830987]
	(No symbol) [0x00007FF652F1467F]
	(No symbol) [0x00007FF652F109D4]
	(No symbol) [0x00007FF652F10B6D]
	(No symbol) [0x00007FF652F00149]
	BaseThreadInitThunk [0x00007FFA64D9259D+29]
	RtlUserThreadStart [0x00007FFA66D8AF38+40]


In [None]:
events["Date"] = pd.to_datetime(events["Date"]).dt.date
events["Presale"] = pd.to_datetime(events["Presale"]).dt.date


## Combining and Saving Processed Data

In [None]:
stubhub  = pd.concat([pd.Series(processed_artists), pd.Series(event_dates), pd.Series(ticket_prices), pd.Series(user_ticket_prices)], axis=1)

# Set the column names for the DataFrame
stubhub.columns = ["Artist", "Dates", "Stubhub", "Me"]

# Sort the DataFrame by the 'Me' column in descending order
stubhub = stubhub.sort_values(by="Me", ascending=False)

stubhub["Dates"] = pd.to_datetime(stubhub["Dates"]).dt.date

In [None]:
# Expand the 'Stubhub' column into separate columns for each listing
stubhub[['Stubhub_1', 'Stubhub_2', 'Stubhub_3']] = pd.DataFrame(stubhub['Stubhub'].tolist(), index=stubhub.index)


# Filter the DataFrame to find shows where 'Me' price is higher than 'Stubhub' price
higher_price_shows = stubhub[stubhub["Me"] > stubhub["Stubhub_1"]]

# Display the filtered DataFrame
print(higher_price_shows)

In [None]:
higher_price_shows = stubhub[(stubhub["Stubhub_2"]- stubhub["Me"] >2) & (stubhub["Me"] > 0)]
higher_price_shows

In [None]:
# Write the DataFrame to an Excel file, replacing the existing sheet if it exists
with pd.ExcelWriter(path, mode='a', engine="openpyxl",date_format= 'YYYY-MM-DD', datetime_format='YYYY-MM-DD',  if_sheet_exists="replace") as writer:
    stubhub.to_excel(writer, sheet_name="stubhub", header=True, index=False)



with pd.ExcelWriter(path, mode='a', engine="openpyxl",date_format= 'YYYY-MM-DD', datetime_format='YYYY-MM-DD', if_sheet_exists="replace") as writer:
    events.to_excel(writer, sheet_name="Events", header=True, index=False)
 