# Web Scraping Ticket Prices from StubHub

This Jupyter Notebook demonstrates how to scrape ticket prices from StubHub using Selenium and process the data with pandas. The workflow includes:

1. Setting up the Selenium WebDriver with custom options.
2. Reading ticket sales data from an Excel file.
3. Logging into StubHub and navigating to the search results.
4. Extracting ticket prices for specified artists.
5. Saving the scraped data back to an Excel file for further analysis.

Below are the detailed steps and code implementation.


In [None]:
import os
import re
import time
from datetime import datetime
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait


# User-agent string the browser will present to the site
test_ua = 'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

# Command-line switches passed to Chrome, applied in this order.
# To run without a visible window (no GUI), add "--headless" to this tuple.
chrome_args = (
    "--window-size=1920,1080",   # browser window size
    f'--user-agent={test_ua}',   # present the user-agent defined above
    '--no-sandbox',              # required for running in some environments
    "--disable-extensions",      # avoid potential conflicts with extensions
)

# Collect the switches into a Chrome Options object
options = Options()
for chrome_arg in chrome_args:
    options.add_argument(chrome_arg)

# Start the Chrome WebDriver session with those options
driver = webdriver.Chrome(options=options)




In [2]:
# Load the "Sheet1" worksheet of the ticket-sales workbook into a pandas DataFrame
path = "../../Documents/Ticket Sales.xlsx"
sales = pd.read_excel(io=path, sheet_name="Sheet1")


## Logging into StubHub

The following cell logs into StubHub using Selenium. It opens the StubHub Canada site (stubhub.ca), clicks the "Sign In" link, waits for the email field to appear, and submits the login credentials. After a short pause it also tries to click a follow-up submit button, in case the site shows an additional confirmation step; if no such button is present, the step is skipped.

In [3]:

# Credentials are never stored in the notebook: they are read from the
# STUBHUB_EMAIL / STUBHUB_PASSWORD environment variables, falling back to an
# interactive prompt (getpass does not echo the password).
# NOTE(review): credentials were previously hard-coded in this cell and should
# be considered compromised -- rotate the password.
stubhub_email = os.environ.get("STUBHUB_EMAIL") or input("StubHub email: ")
stubhub_password = os.environ.get("STUBHUB_PASSWORD") or getpass("StubHub password: ")

driver.get("https://www.stubhub.ca")

# Open the sign-in form and wait (up to 10 s) for the email field to exist
driver.find_element(By.XPATH, "//*[text() ='Sign In']").click()
email_field = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='email']"))
)
email_field.send_keys(stubhub_email)

# Typing ENTER after the password submits the form
driver.find_element(By.CSS_SELECTOR, "input[type=password]").send_keys(stubhub_password + Keys.ENTER)
time.sleep(5)

# Best-effort follow-up click: if a submit button is present after the first
# submission, press it; if it is missing or not clickable, carry on.
# Only Selenium errors are ignored, so Ctrl-C / kernel interrupts still work.
try:
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
except WebDriverException:
    pass
time.sleep(5)

In [4]:

def stub_url(s):
    """
    Generates a StubHub search URL for a given search term.

    The term is combined with the word "Toronto" and URL-encoded as a query
    value (spaces become plus signs; characters such as "&" are
    percent-escaped so they cannot break the query string), then inserted
    into the StubHub search URL.

    Args:
        s (str): The search query string (e.g. an artist name). Non-string
            values are converted with str() before encoding.
    Returns:
        str: A formatted StubHub search URL, or the placeholder "lol" if the
        input is missing (None, NaN, or another pandas missing value).
    """
    # pd.isna recognises every missing-value flavour pandas may hand back
    # (None, any float NaN, NaT, pd.NA), not just the np.nan singleton.
    if pd.isna(s):
        return "lol"
    query = quote_plus(f"{s} Toronto")
    return "https://www.stubhub.ca/secure/search?q=" + query + "&sellSearch=false&sortBy="



In [5]:
"""
Reads ticket sales data from an Excel file.

The input Excel file contains names of artists and dates of their shows.
"""
path = "../../Documents/Ticket Sales.xlsx"

sales = pd.read_excel(path, sheet_name ="Sheet1")

In [13]:
# Scrape one StubHub price per upcoming artist in `sales`.
#
# For every row whose "Date" is in the future, search StubHub for the artist,
# open the event page, and record the first listing price on the page plus the
# price of the listing flagged as the user's own (if any). Results accumulate
# in the four parallel lists below; a failure on one artist is reported with a
# printed message and the loop moves on to the next row.

# Extract artist names from the sales DataFrame
names = sales["Artist"]

# Absolute XPath of the event link on the search results page
# (the anchor inside the second <li> of the results list).
EVENT_LINK_XPATH = "/html/body/div[1]/div[1]/div[4]/div[2]/div/div[1]/div/div[2]/ul/li[2]/a"
# Button that dismisses the modal shown on the event page
MODAL_CLOSE_XPATH = '//*[@id="modal-root"]/div/div/div/div[2]/div[3]/button'

# Parallel result lists: one entry per successfully scraped artist
remain = []     # artist names
prices = []     # first listing price found on the event page
my_prices = []  # price of the user's own listing, "0" when none was found
dates = []      # event dates taken from the sales sheet

# Get today's date
today = datetime.today()

for _, row in sales.iterrows():
    artist = row["Artist"]

    # Rows without an artist cannot be searched (stub_url would return its
    # "lol" placeholder, which is not a navigable URL).
    if pd.isna(artist):
        continue

    # Only future events, and only artists not already scraped successfully
    if not (today < row["Date"]) or artist in remain:
        continue

    driver.get(stub_url(artist))
    try:
        # Wait for the event link, then open it with both toggles set to false
        link = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, EVENT_LINK_XPATH))
        )
        event_link = link.get_attribute("href") + "&betterValueTickets=false" + "&estimatedFees=false"
        driver.get(event_link)
    except (WebDriverException, TypeError):
        # WebDriverException covers the wait timing out and navigation errors;
        # TypeError covers a link whose href attribute is missing (None + str).
        print(f"{artist} no event link")
        continue

    # Close the modal if one appears; its absence is not an error
    try:
        driver.find_element(By.XPATH, MODAL_CLOSE_XPATH).click()
    except WebDriverException:
        pass

    # Apply filters: open the filter control and untick the checkbox when its
    # value attribute reads "true".
    try:
        driver.find_element(By.CSS_SELECTOR, "div.sc-xrltsx-2").click()
        time.sleep(1)
        checkbox = driver.find_element(By.CSS_SELECTOR, "input[type=checkbox]")
        if checkbox.get_attribute("value") == "true":
            checkbox.click()
        time.sleep(4)
    except WebDriverException:
        # Skip this artist rather than aborting the whole run
        print(f"{artist} could not apply filters")
        continue

    # Price of the user's own listing; "0" means none was found
    me_price = "0"
    for listing in driver.find_elements(By.CLASS_NAME, "sc-57jg3s-0"):
        try:
            # Per the original author's note, the "sc-kj9927-14" element marks
            # a listing as the current user's -- TODO confirm the class name
            # is still valid. find_element raises when it is absent.
            listing.find_element(By.CLASS_NAME, "sc-kj9927-14")
            me_price = listing.find_element(By.CLASS_NAME, "sc-1bp3ico-0").get_attribute("data-price")
            break
        except WebDriverException:
            continue

    # Print a message if no price is found for the current user
    if me_price == "0":
        print(f"{artist} no me price")

    # First priced element on the page
    try:
        price = driver.find_element(By.CLASS_NAME, "sc-1bp3ico-0").get_attribute("data-price")
    except WebDriverException:
        print('no price')
        continue

    remain.append(artist)
    prices.append(price)
    my_prices.append(me_price)
    dates.append(row["Date"])
    print(artist, price, me_price)

Dua Lipa no me price
Dua Lipa C$143 0
Myles Smith no me price
Myles Smith C$81 0
Allan Walker no me price
Allan Walker C$106 0
Two Friends no me price
Two Friends C$78 0
Markus Schulz no me price
Markus Schulz C$76 0
Layz no me price
Layz C$67 0
Virtual Riot no me price
Virtual Riot C$49 0
Chelsea Cutter and Jeremy Zucker no me price
Chelsea Cutter and Jeremy Zucker C$59 0
Black Tiger Sex Machine no me price
Black Tiger Sex Machine C$70 0
Atliens no me price
Atliens C$46 0
Sullivan King no me price
Sullivan King C$74 0
Ship Wrek no me price
Ship Wrek C$57 0
Kiss of Life no me price
Kiss of Life C$184 0
Tinashe no me price
Tinashe C$57 0
Maddix no me price
Maddix C$937 0
Vini Vici no me price
Vini Vici C$35 0
Polo G no me price
Polo G C$64 0
Pitbull no me price
Pitbull C$212 0
Timmy Trumpet no me price
Timmy Trumpet C$73 0
Kaivon no me price
Kaivon C$50 0
Lavern no me price
Lavern C$57 0
Umek no me price
Umek C$67 0
Suicide Boys no me price
Suicide Boys C$76 0
Justin Timberlake no me pr

In [11]:
def _price_to_float(column):
    """Convert a Series of price strings (e.g. "C$143", "C$1,234.50") to floats.

    The first number in each string is taken, thousands separators are
    removed, and the result is cast to float. Entries with no digits, and
    missing entries, become NaN.
    """
    number = column.str.extract(r"(\d[\d,]*(?:\.\d+)?)", expand=False)
    return number.str.replace(",", "", regex=False).astype(float)


# Assemble the scraped lists into a DataFrame, one row per artist
stubhub = pd.DataFrame(
    {"Artist": remain, "Dates": dates, "Stubhub": prices, "Me": my_prices}
)

# Turn the scraped price strings into numbers. A plain r"(\d+)" capture would
# stop at the first comma or decimal point (reading "C$1,234" as 1), so the
# whole number is captured and the separators are stripped.
stubhub["Stubhub"] = _price_to_float(stubhub["Stubhub"])
stubhub["Me"] = _price_to_float(stubhub["Me"])

# Sort the DataFrame by the 'Me' column in descending order
stubhub = stubhub.sort_values(by="Me", ascending=False)

# Append to the existing workbook at `path`, replacing the "stubhub" sheet if
# it already exists (mode='a' requires the workbook file to exist).
with pd.ExcelWriter(path, mode='a', engine="openpyxl", if_sheet_exists="replace") as writer:
    stubhub.to_excel(writer, sheet_name="stubhub", header=True)

In [12]:
# Show the rows where the user's own price is higher than the StubHub price
stubhub.loc[stubhub["Me"] > stubhub["Stubhub"]]


Unnamed: 0,Artist,Dates,Stubhub,Me
