# Web Scraping Ticket Prices from StubHub

This Jupyter Notebook demonstrates how to scrape ticket prices from StubHub using Selenium and process the data with pandas. The workflow includes:

1. Setting up the Selenium WebDriver with custom options.
2. Reading ticket sales data from an Excel file.
3. Logging into StubHub and navigating to the search results.
4. Extracting ticket prices for specified artists.
5. Saving the scraped data back to an Excel file for further analysis.

Below are the detailed steps and code implementation.


In [2]:
import json
import os
import re
import time
from datetime import datetime
from getpass import getpass
from math import nan
from urllib.parse import quote

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait




# Naive local timestamp (date AND time) taken once when this cell runs;
# later cells compare event dates against it.
current_date = datetime.now()

In [3]:
# User-agent string the browser reports instead of its real one
test_ua = 'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

# Chrome command-line switches, applied in this order.
# Optional switches that are currently disabled:
#   "--headless"   run the browser without a GUI
#   "user-data-dir=C:\\Users\\eric9\\AppData\\Local\\Google\\Chrome\\User Data\\Default"
chrome_arguments = [
    "--window-size=1920,1080",    # browser window size
    f'--user-agent={test_ua}',    # report the user-agent defined above
    '--no-sandbox',               # required for running in some environments
    "--disable-extensions",       # avoid potential conflicts with extensions
]

options = Options()
for chrome_argument in chrome_arguments:
    options.add_argument(chrome_argument)

# Start Chrome with the options assembled above
driver = webdriver.Chrome(options=options)


In [4]:
# Workbook holding the ticket data (path is relative to the working directory)
path = "../../Documents/Ticket Sales.xlsx"

# Load the "Events" sheet into a DataFrame.
# Alternative sheet, currently unused:
# events = pd.read_excel(path, sheet_name="Sheet1")
events = pd.read_excel(path, sheet_name="Events")


In [5]:
"""
This code snippet is designed to automate the process of logging into the StubHub website using Selenium WebDriver. Here's a step-by-step breakdown of what the code does:
1. Navigates to StubHub's homepage.
2. Clicks on the 'Sign In' button.
3. Waits for the email input field to be present and enters the email.
4. Enters the password and submits the form.
5. Attempts to click the submit button if it appears.
Note: Ensure that the necessary imports for Selenium WebDriver, WebDriverWait, and expected conditions (EC) are included in your script.
"""
# Navigate to StubHub's homepage
driver.get("https://www.stubhub.ca")

# Click on the 'Sign In' button
driver.find_element(By.XPATH, "//*[text() ='Sign In']").click()

# Wait for the email input field to be present and enter the email
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='email']")))
driver.find_element(By.CSS_SELECTOR, "input[type='email']").send_keys("eric9090909090@hotmail.com")

# Enter the password and submit the form
driver.find_element(By.CSS_SELECTOR, "input[type=password]").send_keys("BlckPnk39!@!" + Keys.ENTER)
time.sleep(5)

# Attempt to click the submit button if it appears
try:
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
except:
    pass
time.sleep(5)

## StubHub Event Scraper Functions

In [6]:


# def generate_stubhub_url(artist, location= "Toronto"):
#     """
#     Generates a StubHub search URL for a given string.
#     This function takes a string input, replaces spaces with plus signs,
#     and appends it to a predefined StubHub search URL. If the input is 
#     None or NaN, it returns a placeholder string "lol".
    
#     Args:
#         search_query (str): The search query string.
    
#     Returns:
#         str: A formatted StubHub search URL or "lol" if the input is None or NaN.
#     """
#     if pd.isna(artist):
#         artist = "lol"
#     if pd.isna(location):
#         location = "Toronto"
#     return "https://www.stubhub.ca/secure/search?q=" + artist + "%20" + location

# def get_event_link(event_grid, date):
#     """
#     Retrieves the href attribute of the event link from the specified XPath.
    
#     Returns:
#         str: The href attribute of the event link.
#     """
#     events =  event_grid["0"]["items"]
#     if pd.isna(date):
#         return events[0]["url"] + "&betterValueTickets=false" + "&estimatedFees=false"
#     for event in events:
#         if event["formattedDate"] == date.strftime("%d %b %Y"):
#             return event["url"] + "&betterValueTickets=false" + "&estimatedFees=false"
#     return events[0]["url"] + "&betterValueTickets=false" + "&estimatedFees=false"+"&quantity=0"

# def get_date(event_grid):
#     """
#     Extracts a date from a given URL string using regex.
    
#     Args:
#         url (str): The URL string to extract the date from.
    
#     Returns:
#         datetime: The extracted date as a datetime object, or None if no date is found.
#     """
#     date_str = event_grid["0"]["items"][0]["formattedDate"]

#     # Check if the date string matches the format '27 Nov' or '27 Nov 2025'
#     if re.match(r'\d{2} \w{3} \d{4}', date_str):
#         date = datetime.strptime(date_str, "%d %b %Y")
#     elif re.match(r'\d{2} \w{3}', date_str):
#         date = datetime.strptime(date_str + " " + str(current_date.year), "%d %b %Y")
#     else:
#         date = None

#     return date


# venues = ["Budweiser", "History", " Rogers Stadium", "Massey Hall", "DPRTMNT", "Rogers Arena", "Axis", "Noir", "Rebel", "Cabana", "Woodbine Park", "CODA", "Metropolis"]
# def get_venue_text(event_grid):
#     """
#     Retrieves the text of the venue from the specified XPath from the search query page
    
#     Args:
    
#     Returns:
#         str: The text of the venue.
#     """                                                
#     venue = event_grid["0"]["items"][0]["venueName"]
#     for v in venues:
#         if v in venue:
#             return v
#     return venue

# def get_location(event_grid):
#     """
#     Retrieves the text of the location from the specified XPath.
    
#     Returns:
#         str: The text of the city.
#     """
#     return event_grid["0"]["items"][0]["venueCity"]




# def get_user_ticket_price():
#     """
#     Retrieves the user's ticket price from the list of ticket listings.
    
#     Returns:
#         float: The user's ticket price, or 0 if no user ticket price is found.
#     """
#     user_ticket_price = 0
#     ticket_listings = driver.find_elements(By.CLASS_NAME, "sc-57jg3s-0")
#     for listing in ticket_listings:
#         try:
#             listing.find_element(By.CLASS_NAME, "sc-1l8fa2j-14")
#             user_ticket_price = listing.find_element(By.CLASS_NAME, "sc-1bp3ico-0").get_attribute("data-price")
#             user_ticket_price = float(re.sub(r'[^\d.]', '', user_ticket_price))
#             break
#         except:
#             pass    
#     return user_ticket_price

# def get_lowest_ticket_price():
#     # Retrieve the price from the listing
#     ticket_price_elements = driver.find_elements(By.CLASS_NAME, "sc-1bp3ico-0")
#     ticket_prices = [0, 0, 0]
#     for i in range(min(3, len(ticket_price_elements))):
#         price = ticket_price_elements[i].get_attribute("data-price")
#         ticket_prices[i]=(float(re.sub(r'[^\d.]', '', price )))  

#     return ticket_prices


In [7]:
def generate_stubhub_url(artist, location="Toronto"):
    """
    Build the StubHub search URL for an artist in a given city.

    The search text is "<artist> <location>". It is percent-encoded as a whole,
    so spaces, accented letters and reserved characters such as "&" or "#" in
    an artist name cannot truncate or corrupt the query string.

    Args:
        artist (str): Artist name. None/NaN is replaced by the placeholder "lol".
        location (str): City to search in. None/NaN falls back to "Toronto".

    Returns:
        str: The StubHub search URL.
    """
    if pd.isna(artist):
        artist = "lol"
    if pd.isna(location):
        location = "Toronto"
    # safe="" also encodes "/" so that names like "AC/DC" stay inside the query value
    search_text = f"{artist} {location}"
    return "https://www.stubhub.ca/secure/search?q=" + quote(search_text, safe="")

def get_event_link(event_grid, date):
    """
    Pick the URL of the event to scrape from the parsed search results.

    Args:
        event_grid (dict): The "eventGrids" object of the search page; the
            candidate events are read from event_grid["0"]["items"].
        date (datetime): Wanted event date, or None/NaN/NaT when unknown.

    Returns:
        str: The URL of the first event whose "formattedDate" matches `date`,
        or the URL of the first listed event when `date` is missing or nothing
        matches. Extra query parameters are appended in every case; the
        "nothing matches" fallback additionally gets "&quantity=0".

    Raises:
        IndexError: If the search results contain no events.
    """
    url_suffix = "&betterValueTickets=false" + "&estimatedFees=false"
    listed_events = event_grid["0"]["items"]
    if pd.isna(date):
        return listed_events[0]["url"] + url_suffix

    # formattedDate comes as '27 Nov 2025' or as '27 Nov' (see get_date, which
    # reads a year-less value as the current year), so accept the short form
    # too when the wanted date lies in the current year.
    accepted_labels = {date.strftime("%d %b %Y")}
    if date.year == datetime.today().year:
        accepted_labels.add(date.strftime("%d %b"))

    for event in listed_events:
        if event["formattedDate"] in accepted_labels:
            return event["url"] + url_suffix

    # No event on the wanted date: fall back to the first result.
    # NOTE(review): only this branch appends "&quantity=0" - kept as in the
    # original; confirm whether the other branches should do the same.
    return listed_events[0]["url"] + url_suffix + "&quantity=0"

def get_date(event_grid):
    """
    Parse the date of the first event in the search results.

    The value is read from event_grid["0"]["items"][0]["formattedDate"] and is
    expected to look like '27 Nov 2025' or, without a year, '27 Nov'. A
    year-less value is interpreted as a date in the current year.

    Args:
        event_grid (dict): The "eventGrids" object of the search page.

    Returns:
        datetime: The parsed date, or None if the value is missing or does not
        have one of the two expected formats.
    """
    date_str = event_grid["0"]["items"][0]["formattedDate"]
    if not isinstance(date_str, str):
        return None
    date_str = date_str.strip()

    # Try the full form first, then the year-less form completed with the
    # current year. The year is appended before parsing (rather than parsing
    # '%d %b' on its own) so that '29 Feb' is validated against the real year.
    candidates = (date_str, f"{date_str} {datetime.today().year}")
    for candidate in candidates:
        try:
            return datetime.strptime(candidate, "%d %b %Y")
        except ValueError:
            continue
    return None


# Short venue names used to normalise the venue names returned by StubHub.
# They are checked in order and the first one contained in the name wins.
# ("Rogers Stadium" previously carried a leading space, so names that start
# with "Rogers Stadium" were never normalised.)
venues = ["Budweiser", "History", "Rogers Stadium", "Massey Hall", "DPRTMNT", "Rogers Arena", "Axis", "Noir", "Rebel", "Cabana", "Woodbine Park", "CODA", "Metropolis"]


def get_venue_text(event_grid):
    """
    Return the (normalised) venue name of the first event in the search results.

    Args:
        event_grid (dict): The "eventGrids" object of the search page; the
            venue is read from event_grid["0"]["items"][0]["venueName"].

    Returns:
        str: The first entry of `venues` that occurs (case-sensitively) inside
        the venue name, or the unmodified venue name if none does.
    """
    venue = event_grid["0"]["items"][0]["venueName"]
    for short_name in venues:
        if short_name in venue:
            return short_name
    return venue

def get_location(event_grid):
    """
    Return the city of the first event in the search results.

    Args:
        event_grid (dict): The "eventGrids" object of the search page.

    Returns:
        str: The "venueCity" value of event_grid["0"]["items"][0].
    """
    first_event = event_grid["0"]["items"][0]
    return first_event["venueCity"]






def _parse_price(raw_price):
    """Return the numeric part of a price string as a float, or None if there is none."""
    if not raw_price:
        return None
    try:
        return float(re.sub(r'[^\d.]', '', raw_price))
    except ValueError:
        return None


def get_ticket_prices():
    """
    Read the listing prices from the event page currently open in `driver`.

    Every element with class "sc-1bp3ico-0" is treated as one listing whose
    price is stored in its "data-price" attribute. Listings without a usable
    price are skipped instead of aborting the scrape.

    Returns:
        tuple: (ticket_prices, user_ticket_price, max_resell) where
            ticket_prices (list): prices of the first three listings, 0 for
                positions that have no (usable) listing;
            user_ticket_price (float): price of the last listing containing an
                element with class "sc-1l8fa2j-14" (presumably the marker of
                the user's own listing - TODO confirm), 0 if there is none;
            max_resell (float): highest price among listings containing an
                element with class "sc-cm4cry-3" (meaning of that marker not
                visible here - TODO confirm). If there is none and the page
                shows more than 5 listings, the first listing price is used.
    """
    price_locator = (By.CLASS_NAME, "sc-1bp3ico-0")
    ticket_price_elements = driver.find_elements(*price_locator)
    listing_count = len(ticket_price_elements)
    ticket_prices = [0, 0, 0]
    user_ticket_price = 0
    max_resell = 0

    for i in range(listing_count):
        # The list may have shrunk after a re-query below
        if i >= len(ticket_price_elements):
            break
        try:
            raw_price = ticket_price_elements[i].get_attribute("data-price")
        except WebDriverException:
            # The page re-rendered and the element went stale: look the
            # listings up again and retry this position once.
            ticket_price_elements = driver.find_elements(*price_locator)
            if i >= len(ticket_price_elements):
                break
            raw_price = ticket_price_elements[i].get_attribute("data-price")

        price = _parse_price(raw_price)
        if price is None:
            continue

        listing = ticket_price_elements[i]
        if i < 3:
            ticket_prices[i] = price
        if listing.find_elements(By.CLASS_NAME, "sc-1l8fa2j-14"):
            user_ticket_price = price
        if listing.find_elements(By.CLASS_NAME, "sc-cm4cry-3"):
            max_resell = max(max_resell, price)

    if len(ticket_price_elements) > 5 and max_resell == 0:
        max_resell = ticket_prices[0]
    return ticket_prices, user_ticket_price, max_resell


## Helper Functions for StubHub Automation

In [8]:
def close_prompts():
    """
    Click the button of the modal dialog rendered under the "modal-root" element.

    Raises whatever `driver.find_element` raises when no such button exists.
    """
    modal_button_xpath = '//*[@id="modal-root"]/div/div/div/div[2]/div[3]/button'
    modal_button = driver.find_element(By.XPATH, modal_button_xpath)
    modal_button.click()
   
def apply_ticket_filters():
    """
    Open the filter control and switch off one of the "popular filters".

    Clicks the element matching "div.sc-xrltsx-2", then looks up an input
    inside "stubhub-event-detail-popular-filters". If that input's "value"
    attribute is "true", it is clicked after a 2 second pause.
    """
    driver.find_element(By.CSS_SELECTOR, "div.sc-xrltsx-2").click()
    recommended_toggle = driver.find_element(By.XPATH, "//*[@id='stubhub-event-detail-popular-filters']/div/div/div/div[2]/div/div/div/div[2]/div/input")
    toggle_is_on = recommended_toggle.get_attribute("value") == "true"
    if not toggle_is_on:
        return
    time.sleep(2)
    recommended_toggle.click()


def filter_tickets_by_venue(venue_name):
    """
    Restrict the listings to a venue-specific ticket class.

    For each rule whose venue test matches, the ticket-class filter is opened
    and the first option whose text contains the rule's label is clicked.
    Venues that match no rule are left unfiltered.

    Args:
        venue_name (str): The name of the venue to filter tickets by.
    """
    # (venue test, text to look for in the filter options). Rules are
    # evaluated in this order and every matching rule is applied.
    section_rules = (
        (lambda name: 'scotiabank arena' in name.lower(), "Lower"),
        (lambda name: 'massey hall' in name.lower(), "Main Floor"),
        (lambda name: name == "Place Bell", "Floor"),
        (lambda name: "rogers stadium" in name.lower(), "Floor"),
        (lambda name: 'budweiser' in name.lower(), "Lawn"),
    )

    for venue_matches, option_label in section_rules:
        if not venue_matches(venue_name):
            continue
        # Open the ticket-class filter, then pick the first matching option
        driver.find_element(By.XPATH, "//*[@id='stubhub-event-detail-ticket-class-filter']").click()
        for option in driver.find_elements(By.CLASS_NAME, "sc-vt2wmu-3"):
            if option_label in option.text:
                option.click()
                break

In [9]:

def process_artist(index, row):
    """
    Scrape the current StubHub prices for one row of the events table.

    Steps:
      1. Load the source of the search page for the row's artist/location and
         parse the JSON that contains "eventGrids".
      2. Fill empty Location / Date / Venue cells of the global `events`
         DataFrame from the first search result; empty "Min Cost" and
         "Max Resell" cells are set to 0.
      3. Open the event page, dismiss the modal and apply the ticket filters
         (both best-effort).
      4. Read the listing prices and raise the stored "Max Resell" if the
         scraped maximum is higher.

    Args:
        index: Label of `row` in the global `events` DataFrame.
        row (pd.Series): A row of `events` with the columns Artist, Location,
            Date, Venue, Min Cost and Max Resell.

    Returns:
        tuple: (artist, ticket_prices, user_ticket_price). `artist` is None
        (with zero prices) when the search page contains no event data, so
        that the caller can skip the row.
    """
    artist_search_url = generate_stubhub_url(row["Artist"], row["Location"])
    # "view-source:" shows the raw page source; the JSON payload is searched
    # for in a <td class="line-content"> cell of that view.
    driver.get("view-source:" + artist_search_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    grid_cell = soup.find('td', class_='line-content', string=lambda x: x and 'eventGrids' in x)
    if grid_cell is None:
        print(f"No event data found for {row['Artist']!r}; skipping")
        return None, [0, 0, 0], 0

    data = json.loads(grid_cell.text.strip())
    event_grid = data["eventGrids"]
    if not event_grid["0"]["items"]:
        print(f"No events listed for {row['Artist']!r}; skipping")
        return None, [0, 0, 0], 0

    date = get_date(event_grid)
    venue = get_venue_text(event_grid)
    location = get_location(event_grid)
    if pd.isna(row["Location"]):
        events.at[index, "Location"] = location
    if pd.isna(row["Date"]):
        events.at[index, "Date"] = date
    if pd.isna(row["Venue"]):
        events.at[index, "Venue"] = venue
    if pd.isna(row["Min Cost"]):
        events.at[index, "Min Cost"] = 0
    if pd.isna(row["Max Resell"]):
        events.at[index, "Max Resell"] = 0

    event_link = get_event_link(event_grid, row["Date"])
    driver.get(event_link)

    # Dismissing the modal is optional: when there is none, carry on and still
    # apply the filters (previously a missing modal skipped them silently).
    try:
        close_prompts()
        time.sleep(1)
    except WebDriverException:
        pass

    # Filters are best-effort as well, but a failure is reported because the
    # prices read below are then unfiltered.
    try:
        apply_ticket_filters()
        venue_name = events.at[index, "Venue"]
        if isinstance(venue_name, str):
            filter_tickets_by_venue(venue_name)
        time.sleep(5)  # give the listing time to reload with the filters applied
    except WebDriverException as exc:
        print(f"Could not apply ticket filters for {row['Artist']!r}: {type(exc).__name__}")

    ticket_price, user_ticket_price, max_resell = get_ticket_prices()
    # Store the scraped maximum itself (the lowest listing price used to be
    # written here by mistake).
    if pd.isna(row["Max Resell"]) or max_resell > events.at[index, "Max Resell"]:
        events.at[index, "Max Resell"] = max_resell
    return row["Artist"], ticket_price, user_ticket_price





## Processing Artists and Ticket Prices


In [10]:
# Result lists, filled in parallel (one entry per successfully processed row)
processed_artists = []
ticket_prices = []
user_ticket_prices = []
event_dates = []


# Scrape every event that is still upcoming or has no date yet
for index, row in events.iterrows():
    # if row["Artist"] == "Rüfüs Du Sol":
    event_date = row["Date"]
    # Test for a missing date first: comparing a datetime with a float NaN
    # (object-dtype column) raises TypeError.
    if not (pd.isna(event_date) or current_date < event_date):
        continue

    artist, ticket_price, user_ticket_price = process_artist(index, row)
    if artist:
        processed_artists.append(artist)
        ticket_prices.append(ticket_price)
        user_ticket_prices.append(user_ticket_price)
        event_dates.append(event_date)
        print(artist, ticket_price, user_ticket_price)
            


Lawrence [27.0, 30.0, 30.0] 0
Metallica [0, 0, 0] 0
Metallica [0, 0, 0] 0
Emorfik [52.0, 66.0, 99.0] 66.0
Tylar Hubbard [59.0, 61.0, 61.0] 0
Kaivon [60.0, 119.0, 965.0] 60.0
Lavern [54.0, 54.0, 54.0] 54.0
Timmy Trumpet [80.0, 80.0, 90.0] 94.0
Alley CVT [116.0, 169.0, 201.0] 0
AC Slater [135.0, 139.0, 0] 0
Pitbull [151.0, 174.0, 180.0] 0
Myles Smith [58.0, 80.0, 80.0] 0
Polo G [40.0, 50.0, 58.0] 0
Chase Atlantic [99.0, 108.0, 108.0] 0
Myles Smith [58.0, 80.0, 80.0] 0
Illenium  [63.0, 66.0, 66.0] 0
Maddix [79.0, 80.0, 81.0] 81.0
Vini Vici [36.0, 52.0, 54.0] 0
Tinashe [29.0, 43.0, 44.0] 0
Kiss of Life [164.0, 167.0, 170.0] 0
Taylor Swift [2235.0, 2250.0, 2250.0] 0
Taylor Swift [2235.0, 2250.0, 2250.0] 0
Taylor Swift [2235.0, 2250.0, 2250.0] 0
Don Toliver [0, 0, 0] 0
Jessica Audifred [52.0, 53.0, 53.0] 0
Taylor Switft [2235.0, 2250.0, 2250.0] 0
Sullivan King [73.0, 76.0, 84.0] 73.0
Ship Wrek [64.0, 65.0, 70.0] 64.0
Atliens [39.0, 40.0, 40.0] 54.0
IsoKnock [100.0, 103.0, 105.0] 0
Becky Hill

In [11]:
# Reduce both date columns to plain calendar dates (time of day dropped)
for date_column in ("Date", "Presale"):
    events[date_column] = pd.to_datetime(events[date_column]).dt.date


## Combining and Saving Processed Data

In [12]:
# One Series per result list, placed side by side as the columns of a frame
result_series = [
    pd.Series(values)
    for values in (processed_artists, event_dates, ticket_prices, user_ticket_prices)
]
stubhub = pd.concat(result_series, axis=1)

# Name the columns in the same order as the lists above
stubhub.columns = ["Artist", "Dates", "Stubhub", "Me"]

# Rows with the highest own ('Me') price come first
stubhub = stubhub.sort_values(by="Me", ascending=False)

# Keep only the calendar date of each event
stubhub["Dates"] = pd.to_datetime(stubhub["Dates"]).dt.date

In [13]:
# Split each price list in 'Stubhub' into one column per listing
listing_columns = ['Stubhub_1', 'Stubhub_2', 'Stubhub_3']
expanded_prices = pd.DataFrame(stubhub['Stubhub'].tolist(), index=stubhub.index)
stubhub[listing_columns] = expanded_prices


# Shows where my own price ('Me') is above the first StubHub listing price
is_priced_higher = stubhub["Me"] > stubhub["Stubhub_1"]
higher_price_shows = stubhub[is_priced_higher]

# Display the filtered DataFrame
print(higher_price_shows)

           Artist       Dates                Stubhub     Me  Stubhub_1  \
61    Alan Walker  2025-02-07  [119.0, 121.0, 122.0]  123.0      119.0   
7   Timmy Trumpet  2024-11-01     [80.0, 80.0, 90.0]   94.0       80.0   
16         Maddix  2024-11-08     [79.0, 80.0, 81.0]   81.0       79.0   
42           Hol!  2024-12-07        [78.0, 79.0, 0]   79.0       78.0   
3         Emorfik  2024-10-31     [52.0, 66.0, 99.0]   66.0       52.0   
28        Atliens  2024-11-16     [39.0, 40.0, 40.0]   54.0       39.0   

    Stubhub_2  Stubhub_3  
61      121.0      122.0  
7        80.0       90.0  
16       80.0       81.0  
42       79.0        0.0  
3        66.0       99.0  
28       40.0       40.0  


In [14]:
# Shows where I have a listing ('Me' > 0) and the second StubHub listing is
# more than 2 above my price
gap_to_second_listing = stubhub["Stubhub_2"] - stubhub["Me"]
has_own_listing = stubhub["Me"] > 0
higher_price_shows = stubhub[(gap_to_second_listing > 2) & has_own_listing]
higher_price_shows

Unnamed: 0,Artist,Dates,Stubhub,Me,Stubhub_1,Stubhub_2,Stubhub_3
50,Lilly Palmer,2024-12-26,"[90.0, 96.0, 0]",90.0,90.0,96.0,0.0
49,Markus Schulz,2024-12-20,"[80.0, 103.0, 965.0]",80.0,80.0,103.0,965.0
26,Sullivan King,2024-11-15,"[73.0, 76.0, 84.0]",73.0,73.0,76.0,84.0
56,Glaive,2025-01-27,"[68.0, 72.0, 73.0]",68.0,68.0,72.0,73.0
5,Kaivon,2024-11-01,"[60.0, 119.0, 965.0]",60.0,60.0,119.0,965.0


In [15]:
# Write both frames back into the workbook they were read from. The file is
# opened in append mode and an existing sheet of the same name is replaced.
writer_options = dict(
    mode='a',
    engine="openpyxl",
    date_format='YYYY-MM-DD',
    datetime_format='YYYY-MM-DD',
    if_sheet_exists="replace",
)

# Same order as before: the scraped prices first, then the updated events
for frame, sheet in ((stubhub, "stubhub"), (events, "Events")):
    with pd.ExcelWriter(path, **writer_options) as writer:
        frame.to_excel(writer, sheet_name=sheet, header=True, index=False)
 