# Web Scraping Ticket Prices from StubHub

This Jupyter Notebook demonstrates how to scrape ticket prices from StubHub using Selenium and process the data with pandas. The workflow includes:

1. Setting up the Selenium WebDriver with custom options.
2. Reading ticket sales data from an Excel file.
3. Logging into StubHub and navigating to the search results.
4. Extracting ticket prices for specified artists.
5. Saving the scraped data back to an Excel file for further analysis.

Below are the detailed steps and code implementation.


In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import re
from datetime import datetime

import json
import pandas as pd
import numpy as np
from math import nan
import re
import time
from bs4 import BeautifulSoup




In [10]:
# Define a test user-agent string to simulate a browser request
test_ua = 'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

# Initialize Chrome options for the WebDriver
options = Options()

# Uncomment the following line if you want to run the browser in headless mode (no GUI)
# options.add_argument("--headless")

# Set the window size for the browser
options.add_argument("--window-size=1920,1080")

# Set the user-agent to the test user-agent defined above
options.add_argument(f'--user-agent={test_ua}')

# Add additional options to improve stability and compatibility
options.add_argument('--no-sandbox')  # Required for running in some environments
options.add_argument("--disable-extensions")  # Disable extensions to avoid potential conflicts

# Initialize the Chrome WebDriver with the specified options
driver = webdriver.Chrome(options=options)

# Get today's date
current_date = datetime.today()


In [137]:
# The following code reads an Excel file and loads the data into a pandas DataFrame
path = "../../Documents/Ticket Sales.xlsx"
events = pd.read_excel(path, sheet_name ="events 2.0")
# events = pd.read_excel(path, sheet_name ="Sheet1")


In [15]:
"""
This code snippet is designed to automate the process of logging into the StubHub website using Selenium WebDriver. Here's a step-by-step breakdown of what the code does:
1. Navigates to StubHub's homepage.
2. Clicks on the 'Sign In' button.
3. Waits for the email input field to be present and enters the email.
4. Enters the password and submits the form.
5. Attempts to click the submit button if it appears.
Note: Ensure that the necessary imports for Selenium WebDriver, WebDriverWait, and expected conditions (EC) are included in your script.
"""
# Navigate to StubHub's homepage
driver.get("https://www.stubhub.ca")

# Click on the 'Sign In' button
driver.find_element(By.XPATH, "//*[text() ='Sign In']").click()

# Wait for the email input field to be present and enter the email
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='email']")))
driver.find_element(By.CSS_SELECTOR, "input[type='email']").send_keys("eric9090909090@hotmail.com")

# Enter the password and submit the form
driver.find_element(By.CSS_SELECTOR, "input[type=password]").send_keys("BlckPnk39!@!" + Keys.ENTER)
time.sleep(5)

# Attempt to click the submit button if it appears
try:
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
except:
    pass
time.sleep(5)

## StubHub Event Scraper Functions

In [139]:

def generate_stubhub_url(artist, location= "Toronto"):
    """
    Generates a StubHub search URL for a given string.
    This function takes a string input, replaces spaces with plus signs,
    and appends it to a predefined StubHub search URL. If the input is 
    None or NaN, it returns a placeholder string "lol".
    
    Args:
        search_query (str): The search query string.
    
    Returns:
        str: A formatted StubHub search URL or "lol" if the input is None or NaN.
    """

    if pd.isna(artist):
        artist = "lol"
    if pd.isna(location):
        location = "Toronto"
    return "https://www.stubhub.ca/secure/search?q=" + artist + "%20" + location

def get_event_link(event_grid, date):
    """
    Retrieves the href attribute of the event link from the specified XPath.
    
    Returns:
        str: The href attribute of the event link.
    """
    events =  event_grid["0"]["items"]
    if pd.isna(date):
        return events[0]["url"] + "&betterValueTickets=false" + "&estimatedFees=false"
    for event in events:
        if event["formattedDate"] == date.strftime("%d %b %Y"):
            return event["url"] + "&betterValueTickets=false" + "&estimatedFees=false"
    return events[0]["url"] + "&betterValueTickets=false" + "&estimatedFees=false"+"&quantity=0"

def get_date(event_grid):
    """
    Extracts a date from a given URL string using regex.
    
    Args:
        url (str): The URL string to extract the date from.
    
    Returns:
        datetime: The extracted date as a datetime object, or None if no date is found.
    """
    date_str = event_grid["0"]["items"][0]["formattedDate"]

    # Check if the date string matches the format '27 Nov' or '27 Nov 2025'
    if re.match(r'\d{2} \w{3} \d{4}', date_str):
        date = datetime.strptime(date_str, "%d %b %Y")
    elif re.match(r'\d{2} \w{3}', date_str):
        date = datetime.strptime(date_str + " " + str(current_date.year), "%d %b %Y")
    else:
        date = None

    return date


venues = ["Budweiser", "History", " Rogers Stadium", "Massey Hall", "DPRTMNT", "Rogers Arena", "Axis", "Noir", "Rebel", "Cabana", "Woodbine Park", "CODA", "Metropolis"]
def get_venue_text(event_grid):
    """
    Retrieves the text of the venue from the specified XPath from the search query page
    
    Args:
    
    Returns:
        str: The text of the venue.
    """                                                
    venue = event_grid["0"]["items"][0]["venueName"]
    for v in venues:
        if v in venue:
            return v
    return venue

def get_location(event_grid):
    """
    Retrieves the text of the location from the specified XPath.
    
    Returns:
        str: The text of the city.
    """
    return event_grid["0"]["items"][0]["venueCity"]

def get_prices(listings):
    """
    Retrieves the text of the location from the specified XPath.
    
    Returns:
        str: The text of the city.
    """
    user_price= 0
    prices = []
    for listing in listings:
        if listing["isUsersListing"]:
            user_price = listing["rawPrice"]
        prices.append(listing["rawPrice"])
    if len(prices) < 1:
        prices.append(0)

    return user_price, prices[:3]


In [146]:

def process_artist(index, row):
    """
    Processes an artist's event by navigating to the search URL, applying filters, and retrieving ticket prices.
    
    Args:
        row (pd.Series): A row from the sales DataFrame containing artist and event information.
    
    Returns:
        tuple: A tuple containing the artist's name, the ticket price from StubHub, and the user's ticket price.
    """

    artist_search_url = generate_stubhub_url(row["Artist"], row["Location"])
    driver.get("view-source:" + artist_search_url)
    event_grid = driver.page_source
    soup = BeautifulSoup(event_grid, 'html.parser')
    event_grid = soup.find('td', class_='line-content', string=lambda x: x and 'eventGrids' in x)
    json_str = event_grid.text.strip()
    data = json.loads(json_str)

    event_grid = data["eventGrids"]
    if pd.isna(row["Date"]):
        events.at[index, "Date"] = get_date(event_grid)
    if pd.isna(row["Venue"]):
        events.at[index, "Venue"] = get_venue_text(event_grid)
    if pd.isna(row["Location"]):
        events.at[index, "Location"] = get_location(event_grid)
    if pd.isna(row["Purchased"]):
        events.at[index, "Purchased"] = 0
    if pd.isna(row["Sold"]):
        events.at[index, "Sold"] = 0
    if pd.isna(row["Cost"]):
        events.at[index, "Cost"] = 0
    if pd.isna(row["Resell"]):
        events.at[index, "Resell"] = 0
    if pd.isna(row["Lowest Cost"]):
        events.at[index, "Lowest Cost"] = 0
    if pd.isna(row["Max Resell"]):
        events.at[index, "Max Resell"] = 0
    
    event_link = get_event_link(event_grid, row["Date"])
    driver.get("view-source:"+event_link)
    # Close any modal that appears
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    listings = soup.find('td', class_='line-content', string=lambda x: x and '{"appName":"viagogo-event","grid' in x)
    json_str = listings.text.strip()
    data = json.loads(json_str)

    sections = [(item.get('sectionId'), item.get('sectionMapName')) for item in data.get('grid', {}).get('items', [])]

    # # Print the section IDs and their names
    # print("Section IDs and their Names:")
    # for section_id, section_name in sections:
    #     print(f"Section ID: {section_id}, Section Name: {section_name}")
    listings = data["grid"]["items"]
    print(listings)
    user_price, prices = get_prices(listings)
    if (prices[0] > events.at[index, "Max Resell"] or pd.isna(row["Max Resell"])) and row["Purchased"] == 0:
        events.at[index, "Max Resell"] = prices[0]
    return row["Artist"], prices, user_price





## Processing Artists and Ticket Prices


In [148]:
# Initialize lists to store results
processed_artists = []
prices = []
user_prices = []
event_dates = []

# Iterate over each row in the sales DataFrame
for index, row in events.iterrows():
    # Check if the event date is in the future and the artist is not already processed
        
    if (current_date < pd.to_datetime(row["Date"]) or pd.isna(row["Date"])):
        if row["Artist"]  == "Alan Walker":
            artist, price, user_price = process_artist(index, row)
            if artist:
                processed_artists.append(artist)
                prices.append(price)
                user_prices.append(user_price)
                event_dates.append(row["Date"])
                print(artist, prices[-1], user_price)


[{'id': 8088201504, 'clientApplicationId': 312, 'eventId': 155395422, 'section': 'General Admission', 'sectionId': 588075, 'sectionMapName': 'General Admission', 'sectionType': 4, 'row': '', 'seat': '118392726_118392728', 'seatFromInternal': '118392726', 'hasSeatDetails': False, 'hasSeatDetailsUS': False, 'availableTickets': 3, 'listingPreviewPriceAndFeeDisclosure': {'hasValue': False}, 'showRecentlySold': False, 'availableQuantities': [1, 2, 3], 'ticketClass': 233, 'ticketClassName': 'General Admission', 'maxQuantity': 3, 'hasListingNotes': False, 'listingNotes': [], 'rowId': 194273, 'isUsersListing': False, 'isPreUploaded': False, 'rowContent': 'Row', 'rawPrice': 101.56, 'price': 'C$102', 'sellerNetProceeds': '', 'priceWithFees': 'C$131', 'ticketTypeId': 1, 'ticketTypeGroupId': 0, 'listingTypeId': 1, 'listingCurrencyCode': 'USD', 'buyerCurrencyCode': 'CAD', 'faceValue': 0.0, 'faceValueCurrencyCode': 'USD', 'vfsUrl': '', 'formattedActiveSince': '5 hours ago', 'isSeatedTogether': False

In [142]:
events["Profit"] = events["Resell"] - events["Cost"]
events["Total Profit"] = events["Profit"] * events["Sold"]
events["Remaining Inventory Cost"] = (events["Purchased"] - events["Sold"]) * events["Cost"]

events["Date"] = pd.to_datetime(events["Date"]).dt.date
events["Presale"] = pd.to_datetime(events["Presale"]).dt.date

# events["Sellout Date"] = pd.to_datetime(events["Sellout Date"], errors='ignore').dt.date
# 

## Combining and Saving Processed Data

In [143]:
stubhub  = pd.concat([pd.Series(processed_artists), pd.Series(event_dates), pd.Series(ticket_prices), pd.Series(user_ticket_prices)], axis=1)

# Set the column names for the DataFrame
stubhub.columns = ["Artist", "Dates", "Stubhub", "Me"]

# Sort the DataFrame by the 'Me' column in descending order
stubhub = stubhub.sort_values(by="Me", ascending=False)


NameError: name 'ticket_prices' is not defined

In [80]:
# Expand the 'Stubhub' column into separate columns for each listing
stubhub[['Stubhub_1', 'Stubhub_2', 'Stubhub_3']] = pd.DataFrame(stubhub['Stubhub'].tolist(), index=stubhub.index)


# Filter the DataFrame to find shows where 'Me' price is higher than 'Stubhub' price
higher_price_shows = stubhub[stubhub["Me"] > stubhub["Stubhub_1"]]

# Display the filtered DataFrame
print(higher_price_shows)

                              Artist      Dates                Stubhub     Me  \
6   Chelsea Cutler and Jeremy Zucker 2024-11-24     [80.0, 81.0, 83.0]  149.0   
0                        Alan Walker 2025-02-07  [114.0, 120.0, 122.0]  123.0   
3            Black Tiger Sex Machine 2024-11-23     [86.0, 86.0, 91.0]  108.0   
10                       Dion Timmer 2024-10-26     [65.0, 67.0, 67.0]   67.0   
51                              Umek 2024-10-25     [27.0, 67.0, 69.0]   67.0   
47                           Tinashe 2024-11-11     [44.0, 46.0, 46.0]   65.0   
1                            Atliens 2024-11-16     [49.0, 49.0, 49.0]   54.0   

    Stubhub_1  Stubhub_2  Stubhub_3  
6        80.0       81.0       83.0  
0       114.0      120.0      122.0  
3        86.0       86.0       91.0  
10       65.0       67.0       67.0  
51       27.0       67.0       69.0  
47       44.0       46.0       46.0  
1        49.0       49.0       49.0  


In [91]:
higher_price_shows = stubhub[(stubhub["Stubhub_2"]- stubhub["Me"] >2) & (stubhub["Me"] > 0)]
higher_price_shows

Unnamed: 0,Artist,Dates,Stubhub,Me,Stubhub_1,Stubhub_2,Stubhub_3
40,Ship Wrek,2024-11-15,"[57.0, 64.0, 69.0]",57.0,57.0,64.0,69.0


In [81]:
# Write the DataFrame to an Excel file, replacing the existing sheet if it exists
with pd.ExcelWriter(path, mode='a', engine="openpyxl",date_format= 'YYYY-MM-DD', datetime_format='YYYY-MM-DD',  if_sheet_exists="replace") as writer:
    stubhub.to_excel(writer, sheet_name="stubhub", header=True, index=False)



with pd.ExcelWriter(path, mode='a', engine="openpyxl",date_format= 'YYYY-MM-DD', datetime_format='YYYY-MM-DD', if_sheet_exists="replace") as writer:
    events.to_excel(writer, sheet_name="events 2.0", header=True, index=False)
 

In [22]:
import requests

url = "https://www.facebook.com/tr/"
params = {
    'id': '440862442988419',
    'ev': 'PageView',
    'dl': 'https://www.stubhub.ca/alan-walker-toronto-tickets-2-7-2025/event/155395422/?quantity=2&priceRange=0%2C100',
    'rl': '',
    'if': 'false',
    'ts': '1728968486025',
    'sw': '2560',
    'sh': '1440',
    'v': '2.9.171',
    'r': 'stable',
    'ec': '3',
    'o': '4125',
    'fbp': 'fb.1.1728965615906.92094318827965092',
    'ler': 'empty',
    'cdl': 'API_unavailable',
    'it': '1728968450005',
    'coo': 'false',
    'rqm': 'GET'
}

response = requests.get(url, params=params)

print(response.status_code)
print(response.text)

200

