# Web Scraping Ticket Prices from StubHub

This Jupyter Notebook demonstrates how to scrape ticket prices from StubHub using Selenium and process the data with pandas. The workflow includes:

1. Setting up the Selenium WebDriver with custom options.
2. Reading ticket sales data from an Excel file.
3. Logging into StubHub and navigating to the search results.
4. Extracting, for each upcoming event, the listed ticket price on StubHub and the price of your own listing (if any).
5. Saving the scraped data back to an Excel file for further analysis.

Below are the detailed steps and code implementation.


In [39]:
import getpass
import os
import re
import time
from datetime import datetime
from math import nan
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait




In [40]:
# User-agent string sent to the site in place of Chrome's default one.
test_ua = 'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

# Command-line switches handed to Chrome, applied in this order.
# Add "--headless" to the tuple to run without a visible browser window.
options = Options()
for chrome_switch in (
    "--window-size=1920,1080",      # fixed viewport size
    "--user-agent=" + test_ua,      # override the default user-agent
    "--no-sandbox",                 # required for running in some environments
    "--disable-extensions",         # avoid conflicts with installed extensions
):
    options.add_argument(chrome_switch)

# Start the Chrome session that every later cell drives.
driver = webdriver.Chrome(options=options)

# Reference "now", compared against event dates further down.
current_date = datetime.today()


In [41]:
# Load the "events 2.0" sheet of the ticket-sales workbook into a pandas DataFrame.
# `path` is reused later in the notebook when results are written back to the same workbook.
path = "../../Documents/Ticket Sales.xlsx"
events = pd.read_excel(path, sheet_name ="events 2.0")
# Alternative source sheet, kept for reference:
# events = pd.read_excel(path, sheet_name ="Sheet1")


In [42]:
# Log in to StubHub through the Selenium-driven browser:
# 1. Open StubHub's homepage.
# 2. Click the 'Sign In' button.
# 3. Wait for the email input field and type the email.
# 4. Type the password and submit with ENTER.
# 5. Click a follow-up submit button if one is shown.

# Credentials are never stored in the notebook: they come from the
# STUBHUB_EMAIL / STUBHUB_PASSWORD environment variables, falling back to an
# interactive prompt (the password prompt does not echo).
stubhub_email = os.environ.get("STUBHUB_EMAIL") or input("StubHub email: ")
stubhub_password = os.environ.get("STUBHUB_PASSWORD") or getpass.getpass("StubHub password: ")

# Navigate to StubHub's homepage
driver.get("https://www.stubhub.ca")

# Click on the 'Sign In' button
driver.find_element(By.XPATH, "//*[text() ='Sign In']").click()

# Wait (up to 10 s) for the email input field; until() returns the located element
email_input = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='email']"))
)
email_input.send_keys(stubhub_email)

# Enter the password and submit the form
driver.find_element(By.CSS_SELECTOR, "input[type=password]").send_keys(stubhub_password + Keys.ENTER)
time.sleep(5)

# Best effort: a submit button may or may not be shown after ENTER, so any
# Selenium failure to find or click it is ignored. Unlike a bare `except:`,
# this does not swallow KeyboardInterrupt.
try:
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
except WebDriverException:
    pass
time.sleep(5)

## StubHub Event Scraper Functions

In [43]:
def generate_stubhub_url(artist, location="Toronto, Canada", venue=""):
    """
    Build a StubHub search URL for an artist, location and venue.

    The non-empty terms are joined with single spaces and URL-encoded
    (spaces become plus signs, reserved characters are percent-escaped)
    before being placed in the ``q`` query parameter.

    Args:
        artist: Artist name. ``None``/NaN is replaced by the placeholder "lol".
        location: City/region text. ``None``/NaN is replaced by "Toronto".
        venue: Venue name. ``None``/NaN is treated as empty and left out.

    Returns:
        str: The StubHub search URL.
    """
    # pd.isna recognises None and every NaN object. A list-membership test
    # only matches the np.nan singleton, so NaN cells coming out of a
    # DataFrame could slip through and break the string concatenation.
    if pd.isna(artist):
        artist = "lol"
    if pd.isna(location):
        location = "Toronto"
    if pd.isna(venue):
        venue = ""

    # Drop empty terms so a missing venue does not leave a trailing separator.
    terms = [str(part).strip() for part in (artist, location, venue)]
    query = quote_plus(" ".join(term for term in terms if term))
    return "https://www.stubhub.ca/secure/search?q=" + query + "&sellSearch=false&sortBy="

def get_event_link():
    """
    Build the event URL from the first element on the current page that has
    target="_blank".

    Returns:
        str: That element's href with the "betterValueTickets" and
        "estimatedFees" query flags (both set to false) appended.
    """
    query_flags = "&betterValueTickets=false" + "&estimatedFees=false"
    href = driver.find_element(By.XPATH, "//*[@target='_blank']").get_attribute("href")
    return href + query_flags


def extract_date_from_url(url):
    """
    Extract a month-day-year date (e.g. "8-24-2025") from a URL string.

    Args:
        url (str): The URL string to extract the date from.

    Returns:
        pd.Timestamp: The first "M-D-YYYY" date found in the URL, or None
        if the URL is not a string, contains no such pattern, or the matched
        numbers do not form a valid calendar date.
    """
    if not isinstance(url, str):
        return None

    date_match = re.search(r'(\d{1,2})-(\d{1,2})-(\d{4})', url)
    # re.search returns None when nothing matches; unpacking .groups() on
    # that would raise instead of giving the documented None result.
    if date_match is None:
        return None

    month, day, year = date_match.groups()
    try:
        parsed = datetime.strptime(f"{month}-{day}-{year}", '%m-%d-%Y')
    except ValueError:
        # Digits matched the pattern but are not a real date (e.g. 13-45-2025).
        return None
    return pd.Timestamp(parsed)


# Known venue keywords; get_venue_text() returns the first entry contained in the scraped venue name.
# NOTE(review): " Rogers Stadium" starts with a space, so it only matches when other text precedes it — confirm this is intended.
venues = ["Budweiser", "History", " Rogers Stadium", "Massey Hall", "DPRTMNT", "Rogers Arena", "Axis", "Noir", "Rebel", "Cabana", "Woodbine Park", "CODA", "Metropolis"]

def get_venue_text():
    """
    Read the venue name from the event-detail header button on the current page.

    The button text is cut at its first comma. If that leading part contains
    one of the keywords in the module-level `venues` list, the first such
    keyword is returned; otherwise the leading part itself is returned.

    Returns:
        str: The matched venue keyword or the scraped venue text.
    """
    header_button = driver.find_element(By.XPATH, "//*[@id='event-detail-header']//div/div/div[1]/div[2]/div/div/div[2]/button")
    scraped_name, *_ = header_button.text.split(',')
    return next((known for known in venues if known in scraped_name), scraped_name)



def get_user_ticket_price():
    """
    Retrieve the user's ticket price from the listings on the current page.

    Each element with class "sc-57jg3s-0" is checked for a child with class
    "sc-1l8fa2j-14" (presumably the marker of the user's own listing — verify
    against the page). The first listing that has the marker and a parseable
    "data-price" attribute supplies the result.

    Returns:
        float: The price of that listing, or 0 if no listing qualifies.
    """
    for listing in driver.find_elements(By.CLASS_NAME, "sc-57jg3s-0"):
        try:
            # Raises NoSuchElementException when the marker is absent.
            listing.find_element(By.CLASS_NAME, "sc-1l8fa2j-14")
            raw_price = listing.find_element(By.CLASS_NAME, "sc-1bp3ico-0").get_attribute("data-price")
            # Converted and returned in one step, so a failed conversion can
            # never leave the raw string behind as the result.
            return float(re.sub(r'[^\d.]', '', raw_price))
        except (WebDriverException, TypeError, ValueError):
            # Not this listing: lookup failed, attribute missing (None), or
            # the price text is not numeric. Try the next listing.
            continue
    return 0

def get_lowest_ticket_price():
    """
    Read the price of the first listing price element on the current page.

    The value comes from the "data-price" attribute of the first element
    with class "sc-1bp3ico-0"; every character other than digits and dots is
    removed before conversion. NOTE(review): that this first element is the
    lowest price depends on the page's sort order — confirm.

    Returns:
        float: The parsed price.
    """
    price_element = driver.find_element(By.CLASS_NAME, "sc-1bp3ico-0")
    digits_only = re.sub(r'[^\d.]', '', price_element.get_attribute("data-price"))
    return float(digits_only)


def get_location():
    """
    Read the city from the event-detail header button on the current page.

    The button text is split on commas and the second part is taken as the
    city, with surrounding whitespace removed.

    Returns:
        str: The city text, or an empty string when the button text has no
        comma-separated second part.
    """
    location_element = driver.find_element(By.XPATH, "//*[@id='event-detail-header']//div/div/div[1]/div[2]/div/div/div[2]/button")
    parts = location_element.text.split(',')
    if len(parts) < 2:
        # No "venue, city" structure: report "no city" instead of raising IndexError.
        return ""
    # The text after the comma starts with a space; strip it so the stored
    # location is clean.
    return parts[1].strip()




## Helper Functions for StubHub Automation

In [44]:
def close_prompts():
    """
    Close the modal dialog on the current page, if one is present.

    Clicks the button inside the "modal-root" container. When no such button
    exists (no modal was shown) the function does nothing.
    """
    try:
        driver.find_element(By.XPATH, '//*[@id="modal-root"]/div/div/div/div[2]/div[3]/button').click()
    except NoSuchElementException:
        # No modal appeared on this page; there is nothing to close.
        pass
   
def apply_ticket_filters():
    """
    Click the "div.sc-xrltsx-2" element, then switch off the first input in
    the popular-filters panel when its value is "true".

    The toggle is clicked after a 2-second pause; when its value is anything
    other than "true" it is left untouched.
    """
    panel_opener = driver.find_element(By.CSS_SELECTOR, "div.sc-xrltsx-2")
    panel_opener.click()

    recommended_toggle = driver.find_element(By.XPATH, "//*[@id='stubhub-event-detail-popular-filters']/div/div/div/div[2]/div/div/div/div[2]/div/input")
    if recommended_toggle.get_attribute("value") != "true":
        return
    time.sleep(2)
    recommended_toggle.click()


def filter_tickets_by_venue(venue_name):
    """
    Select a venue-specific ticket class on the current event page.

    For "Scotiabank Arena" the first option whose text contains "Lower" is
    clicked; for "Massey Hall" the first option containing "Main Floor".
    Any other venue name leaves the page untouched.

    Args:
        venue_name (str): The name of the venue to filter tickets by.
    """
    # (venue, text that identifies the ticket-class option to pick)
    preferred_sections = (
        ("Scotiabank Arena", "Lower"),
        ("Massey Hall", "Main Floor"),
    )
    for venue, section_label in preferred_sections:
        if venue_name == venue:
            # Open the ticket-class filter so its options can be read.
            driver.find_element(By.XPATH, "//*[@id='stubhub-event-detail-ticket-class-filter']").click()
            candidates = driver.find_elements(By.CLASS_NAME, "sc-vt2wmu-3")
            # Lazily scan the options and stop at the first matching one.
            chosen = next((option for option in candidates if section_label in option.text), None)
            if chosen is not None:
                chosen.click()


In [45]:

def process_artist(index, row):
    """
    Look up one event on StubHub and collect its ticket prices.

    Navigates to the search URL for the row, follows the event link, fills
    missing cells of the global `events` DataFrame at `index` (date, numeric
    defaults of 0, location, venue), applies the ticket filters on a
    best-effort basis and reads the prices.

    Args:
        index: Index label of the row inside `events`.
        row (pd.Series): The row from `events` with the artist and event
            information. It is a snapshot: values written to `events` here
            are not reflected in it.

    Returns:
        tuple: (artist name, ticket price read from StubHub, the user's own
        ticket price).
    """
    artist_search_url = generate_stubhub_url(row["Artist"], row["Location"], row["Venue"])
    driver.get(artist_search_url)
    event_link = get_event_link()

    if pd.isna(row["Date"]):
        events.at[index, "Date"] = extract_date_from_url(event_link)

    # Default missing numbers to 0. pd.isna is used for every column because a
    # membership test against [None, np.nan] relies on object identity and
    # misses the NaN values pandas hands back for numeric columns.
    for column in ("Purchased", "Sold", "Cost", "Resell", "Lowest Cost", "Max Resell"):
        if pd.isna(row[column]):
            events.at[index, column] = 0

    driver.get(event_link)
    # Close any modal that appears
    close_prompts()
    if pd.isna(row["Location"]):
        events.at[index, "Location"] = get_location()
    if pd.isna(row["Venue"]):
        events.at[index, "Venue"] = get_venue_text()
    time.sleep(1)

    # Filters are best effort: when a filter control is missing or not
    # clickable the prices are read from the unfiltered page.
    try:
        apply_ticket_filters()
        # Read the venue from `events`, not `row`, so a venue that was just
        # scraped above is taken into account.
        filter_tickets_by_venue(events.at[index, "Venue"])
        time.sleep(6)
    except WebDriverException:
        pass

    user_ticket_price = get_user_ticket_price()
    ticket_price = get_lowest_ticket_price()

    # Track the highest observed price below 200 for events with no purchases.
    # Both comparisons read the defaulted values in `events`; the `row`
    # snapshot may still hold NaN for them.
    if (
        ticket_price > events.at[index, "Max Resell"]
        and ticket_price < 200
        and events.at[index, "Purchased"] == 0
    ):
        events.at[index, "Max Resell"] = ticket_price
    return row["Artist"], ticket_price, user_ticket_price





## Processing Artists and Ticket Prices


In [46]:
# Result columns, filled in parallel (one entry per processed artist)
processed_artists = []
ticket_prices = []
user_ticket_prices = []
event_dates = []


# Process every event that is upcoming (or has no date yet), once per artist
for index, row in events.iterrows():
    event_date = row["Date"]
    # Test for a missing date first: comparing a datetime with None raises,
    # and pd.isna covers None, NaN and NaT in a single check.
    is_upcoming = pd.isna(event_date) or current_date < event_date
    if not is_upcoming or row["Artist"] in processed_artists:
        continue

    artist, ticket_price, user_ticket_price = process_artist(index, row)
    if artist:
        processed_artists.append(artist)
        ticket_prices.append(ticket_price)
        user_ticket_prices.append(user_ticket_price)
        # process_artist may have filled in a missing date; `row` is a
        # snapshot taken before that, so read the current value from `events`.
        event_dates.append(events.at[index, "Date"])
        print(artist, ticket_price, user_ticket_price)


Allan Walker 122.0 123.0
Atliens 48.0 54.0
Billie Eillish 458.0 0
Black Tiger Sex Machine 38.0 0
Chelsea Cutler and Jeremy Zucker 81.0 0
Cloone 79.0 0
Dion Timmer 67.0 67.0
Disco Lines 126.0 0
Emorfik 62.0 62.0
Frank Walker 35.0 0
Hol! 87.0 87.0
John Marr 72.0 0
Justin Timberlake 87.0 0
Kaivon 40.0 40.0
Kiss of Life 200.0 0
Lavern 57.0 57.0
Layz 67.0 67.0
Lilly Palmer 93.0 93.0
Linsey Stirling 102.0 0
Maddix 90.0 90.0
Markus Schulz 76.0 76.0
Maya Hawke 30.0 0
Megan Monrey 428.0 0
Metalica 324.0 0
MK 72.0 74.0
Myles Smith 65.0 0
Oasis 326.0 0
Pitbull 222.0 0
Polo G 59.0 0
Shawn Mendez 1250.0 0
Ship Wrek 57.0 57.0
Snow Patrol 71.0 0
Sofi Tukker 94.0 0
Sullivan King 74.0 74.0
Timmy Trumpet 73.0 73.0
Tinashe 41.0 65.0
Tinflicker 76.0 0
Trivecta 72.0 72.0
Two Friends 99.0 0
Umek 27.0 67.0
Vini Vici 35.0 0
Virtual Riot 58.0 0
Yuridia 92.0 0


In [47]:
# Per-ticket profit and the profit realised on the tickets already sold
unit_profit = events["Resell"] - events["Cost"]
events["Profit"] = unit_profit
events["Total Profit"] = unit_profit * events["Sold"]

# Percentage margin between the best observed resale price and the lowest cost
lowest_cost = events["Lowest Cost"]
events["Max Margin"] = (events["Max Resell"] - lowest_cost) / lowest_cost * 100

# Money still tied up in tickets that were bought but not yet sold
unsold_count = events["Purchased"] - events["Sold"]
events["Remaining Inventory Cost"] = unsold_count * events["Cost"]

## Combining and Saving Processed Data

In [48]:
# Combine the collected lists into one DataFrame (one row per processed artist)
stubhub = pd.DataFrame(
    {
        "Artist": processed_artists,
        "Dates": event_dates,
        "Stubhub": ticket_prices,
        "Me": user_ticket_prices,
    }
)

# Sort the DataFrame by the 'Me' column in descending order
stubhub = stubhub.sort_values(by="Me", ascending=False)

# Write both sheets in a single pass over the workbook, replacing the sheets
# if they already exist.
with pd.ExcelWriter(path, mode='a', engine="openpyxl", if_sheet_exists="replace") as writer:
    stubhub.to_excel(writer, sheet_name="stubhub", header=True)
    # index=False: this sheet is read back with pd.read_excel on the next run,
    # and a written index column would come back as an extra "Unnamed: 0"
    # data column, adding one more column on every run.
    events.to_excel(writer, sheet_name="events 2.0", header=True, index=False)
 

In [49]:
# Keep only the shows where the user's own price ('Me') exceeds the StubHub price
me_above_stubhub = stubhub["Me"] > stubhub["Stubhub"]
higher_price_shows = stubhub.loc[me_above_stubhub]

# Show the resulting rows
print(higher_price_shows)

          Artist      Dates  Stubhub     Me
0   Allan Walker 2025-02-07    122.0  123.0
24            MK 2024-11-22     72.0   74.0
39          Umek 2024-10-25     27.0   67.0
35       Tinashe 2024-11-11     41.0   65.0
1        Atliens 2024-11-16     48.0   54.0


In [50]:
# Debug check: print the href of the first element on the current page that has
# target="_blank" (the same selector get_event_link() uses).
# In the recorded run this printed https://www.mapbox.com/, i.e. on that page the
# first match was not an event link.
element = driver.find_element(By.XPATH, "//*[@target='_blank']")

href = element.get_attribute("href")
print(href)

https://www.mapbox.com/


In [51]:
# Ad-hoc check: apply the Massey Hall ticket-class filter on whichever page the driver currently has open.
filter_tickets_by_venue("Massey Hall")
# Alternative manual click, kept for reference:
# driver.find_element(By.XPATH, "//*[text()='Zones']").click()

In [52]:
# Scratch inspection: select the rows for one artist (result not displayed) ...
events.loc[events["Artist"] == "Emorfik"]

# ... and check whether the "Sold" cell of row 30 is missing.
pd.isna(events.loc[30, "Sold"])

False