In [1]:
# ===== STANDARD LIBRARY IMPORTS =====
import json
import subprocess
import sys
import time
import re
from datetime import datetime
from urllib.parse import quote_plus, urlsplit, urlunsplit

# ===== THIRD-PARTY DATA & ANALYSIS =====
import pandas as pd
import numpy as np
from sympy import flatten

# ===== WEB SCRAPING & HTTP =====
import requests
from bs4 import BeautifulSoup

# ===== SELENIUM WEB AUTOMATION =====
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# ===== DATABASE CONNECTIVITY =====
import mysql.connector
from mysql.connector import Error
from sqlalchemy import create_engine

# ===== CONCURRENCY & THREADING =====
import concurrent.futures
import queue

# Get today's date and start timer
current_date = datetime.today()
script_start_time = time.time()

print("✅ All libraries imported successfully!")
print(f"📅 Current date: {current_date.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"⏱️  Script execution started at: {datetime.now().strftime('%H:%M:%S')}")


✅ All libraries imported successfully!
📅 Current date: 2025-07-22 13:03:28
⏱️  Script execution started at: 13:03:28


In [2]:
# URL utility functions (imports now in cell 1)
def remove_query_from_url(url):
    url_parts = urlsplit(url)
    url_without_query = urlunsplit((url_parts.scheme, url_parts.netloc, url_parts.path, '', ''))
    return url_without_query



In [3]:
def generate_stubhub_url(artist):
    return "https://www.stubhub.ca/secure/search?q=" + artist


### Driver Interactions

In [4]:

def click_checkboxes(driver):
    """
    Clicks each checkbox, unchecks all others, and returns the link each time.
    """
    # Find and click the "Zones" button to expand the filter options

    zones =  driver.find_element(By.ID, "stubhub-event-detail-ticket-class-filter")
    zones.click()
    checkboxes = zones.find_elements(By.CSS_SELECTOR, 'input[type="checkbox"]')
    links = []

    for checkbox in checkboxes:
        # Uncheck all checkboxes first
        for cb in checkboxes:
            if cb.is_selected():
                cb.click()
        checkbox.click()
        links.append(driver.current_url)
    if len(links) == 0:
        links.append(driver.current_url)
    return links

# Call the function and store the links




In [5]:
path = "../../Documents/Ticket Sales.xlsx"
unique_artists = pd.read_excel(path, sheet_name ="Events")["Artist"].unique()


### Cocunrrency Executors

In [6]:
# Initialize variables
event_listing = []
driver_queue = queue.Queue()
processed_titles = []
short_cols = ['id', 'eventId', 'Artist', 'Venue', 'City', 'Event Date', 'countryName', 'Event Name', 'Event Type', 'Performer Type', 'Performer', 'section', 'sectionId', 'sectionMapName', 'sectionType', 'row', 'rowId', 'seat', 'seatFrom', 'seatTo', 'faceValue', 'rawPrice', 'priceWithFees', 'price', 'ticketClass', 'ticketClassName', 'ticketTypeId', 'ticketTypeGroupId', 'listingTypeId', 'listingCurrencyCode', 'buyerCurrencyCode', 'faceValueCurrencyCode', 'Updated', 'formattedDealScore']

# Function to fetch artist data
def fetch_artist_data(artist):
    artist_search_url = generate_stubhub_url(artist)
    response = requests.get(artist_search_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    event_grid = soup.find('script', type='application/json', string=lambda x: x and 'eventGrids' in x)
    if event_grid is None:
        return None
    json_str = event_grid.text.strip()
    data = json.loads(json_str)
    event_grid = data["eventGrids"]
    events = event_grid["0"]["items"][0:7]
    for event in events:
        event["Artist"] = artist
    return events

# Function to get listing info
def get_listing_info(url):
    response = requests.get(url)
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html.parser')
    script_tag = soup.find('script', id='index-data', type='application/json')
    if script_tag is None:
        return None
    json_string = script_tag.string
    index_data = json.loads(json_string)
    grid_items = index_data['grid']['items']
    df = pd.DataFrame(grid_items)
    formatted_date = datetime.strptime(index_data["formattedEventDateTime"], '%a %b %d %Y %I:%M %p').date()
    df["Event Date"] = formatted_date
    df["Updated"] = pd.Timestamp.today().strftime('%Y-%m-%d')


    script_tag = soup.find('script', type='application/ld+json')
    if script_tag:
        event_data = json.loads(script_tag.string)
        df["Event Type"] = event_data["@type"]
        df["Performer Type"] = event_data["performer"][0]["@type"]
        df["Performer"] = event_data["performer"][0]["name"]
    
    df = df[df.columns.intersection(short_cols)]
    return df

# Function to attach event data
def attach_event_data(df, event):
    df["Venue"] = event["venueName"]
    df["Artist"] = event["Artist"]
    df["Event Name"] = event["name"]
    df["City"] = event["venueCity"]
    df["countryName"] = event['countryName']
    return df

# Function to get listing dataframe
def get_listing_df(checkbox_links, event):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        listings = list(executor.map(get_listing_info, checkbox_links))
    curr = pd.concat(listings, ignore_index=True)
    curr = attach_event_data(curr, event)
    event_listing.append(curr)

# Function to get checkbox links
def get_checkbox_links(event, listing_executor):
    driver = driver_queue.get(block=True)

    try:
        url = event["url"]
        url = remove_query_from_url(url) + "?listingQty=&quantity=0"+"&betterValueTickets=false" + "&estimatedFees=false"
        driver.get(url)
        try:
            WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Continue"]'))).click()
        except:
            pass
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//div[@aria-label="Filters"]'))).click() # open filter menu
        checkbox_links = click_checkboxes(driver)
    except:
        checkbox_links = [url]
    listing_executor.submit(get_listing_df, checkbox_links, event)
    driver_queue.put(driver)

# Function to create driver
def create_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=options)
    driver_queue.put(driver)
# Close drivers
def close_driver(driver):
    driver.quit()



ChunkedEncodingError: ('Connection broken: IncompleteRead(3266 bytes read, 6974 more expected)', IncompleteRead(3266 bytes read, 6974 more expected))

In [None]:
# unique_artists = ['black pink']
with concurrent.futures.ThreadPoolExecutor() as executor:
    events = list(executor.map(fetch_artist_data, unique_artists))
events = flatten(events)
events = list({v['eventId']: v for v in events if v is not None}.values())
events = [event for event in events if event['countryName'] in ['Canada', 'USA']]

# Get checkbox links
with concurrent.futures.ThreadPoolExecutor(10) as selenium_executor, concurrent.futures.ThreadPoolExecutor() as listing_executor, concurrent.futures.ThreadPoolExecutor() as driver_executor:
    selenium_executor.map(lambda _: create_driver(), range(8))
    selenium_executor.map(lambda x: get_checkbox_links(x, listing_executor), events)
    selenium_executor.shutdown(wait=True)
    driver_executor.map(close_driver, list(driver_queue.queue))
    listing_executor.shutdown(wait=True)

combined_df = pd.concat(event_listing, ignore_index=True)
combined_df.drop_duplicates(subset=['id', 'eventId'], keep='last', inplace=True)

combined_df


In [None]:
# Price cleaning functions (imports now in cell 1)
def clean_price_value(value):
    """
    Clean price values by removing currency symbols and converting to float.
    """
    if pd.isna(value) or value is None:
        return None
    str_value = str(value)
    cleaned = re.sub(r'[C$€£¥,\s]', '', str_value)
    try:
        return float(cleaned) if cleaned else None
    except ValueError:
        return None

print("Cleaning price columns...")



if 'price' in combined_df.columns:
    combined_df['price'] = combined_df['price'].apply(clean_price_value)
    print("✓ Cleaned price")

if 'faceValue' in combined_df.columns:
    combined_df['faceValue'] = combined_df['faceValue'].apply(clean_price_value)
    print("✓ Cleaned faceValue")

print(f"\nSample cleaned priceWithFees values:")


Cleaning price columns...
✓ Cleaned price
✓ Cleaned faceValue

Sample cleaned priceWithFees values:


In [None]:
combined_df.rename(columns={'Artist Name': 'Artist'}, inplace=True)

In [None]:
combined_df['Artist'] = combined_df['Artist'].astype(str).str.title()


## Database Integration

### Insert Scraped Data into CONCERT_SEATS Table

Now we'll insert the scraped concert seat data from `combined_df` into the database.

In [None]:
# Load database configuration (imports now in cell 1)
import json
import subprocess
import sys

# Load database configuration
try:
    with open('db.json', 'r') as f:
        db_config = json.load(f)
    
    print("\nDatabase configuration loaded:")
    print(f"Host: {db_config['host']}")
    print(f"User: {db_config['user']}")
    print(f"Database: {db_config['database']}")
    print("Password: [HIDDEN]")
except FileNotFoundError:
    print("❌ db.json file not found. Please ensure it exists in the current directory.")
    db_config = None
except Exception as e:
    print(f"❌ Error loading database configuration: {e}")
    db_config = None


Database configuration loaded:
Host: 192.168.68.74
User: root
Database: concert
Password: [HIDDEN]


In [None]:
# Prepare the combined_df for database insertion
print(f"Preparing to insert {len(combined_df)} rows into CONCERT_SEATS table...")
print(f"Combined DataFrame columns: {list(combined_df.columns)}")

# Ensure all required columns exist (add missing ones as None)
required_columns = [
    'id', 'section', 'sectionId', 'row', 'rowId', 'faceValue', 'rawPrice', 'priceWithFees',
    'eventId', 'sectionMapName', 'sectionType', 'seat', 'seatFrom', 'seatTo', 'ticketClass',
    'ticketClassName', 'price', 'ticketTypeId', 'ticketTypeGroupId', 'listingTypeId', 'City',
    'Event Date', 'countryName', 'Event Name', 'Event Type', 'Performer Type', 'Performer',
    'isFavorite', 'aggregateFavorites', 'listingCurrencyCode', 'buyerCurrencyCode',
    'faceValueCurrencyCode', 'Updated', 'isStanding', 'formattedFees', 'Artist', 'Venue',
    'formattedDealScore'
]

# Add missing columns with None values
for col in required_columns:
    if col not in combined_df.columns:
        combined_df[col] = None
        print(f"Added missing column: {col}")

# Rename 'row' to 'row_name' to match database schema
if 'row' in combined_df.columns:
    combined_df.rename(columns={'row': 'row_name'}, inplace=True)

# Ensure date columns are properly formatted for MySQL
if 'Event Date' in combined_df.columns:
    # Convert to datetime first, then to string format for MySQL
    combined_df['Event Date'] = pd.to_datetime(combined_df['Event Date'], errors='coerce')
    combined_df['Event Date'] = combined_df['Event Date'].dt.strftime('%Y-%m-%d')
    
if 'Updated' in combined_df.columns:
    # Convert to datetime first, then to string format for MySQL  
    combined_df['Updated'] = pd.to_datetime(combined_df['Updated'], errors='coerce')
    combined_df['Updated'] = combined_df['Updated'].dt.strftime('%Y-%m-%d %H:%M:%S')

print("✓ Data preparation completed")

Preparing to insert 36057 rows into CONCERT_SEATS table...
Combined DataFrame columns: ['id', 'eventId', 'section', 'sectionId', 'sectionMapName', 'sectionType', 'row', 'seat', 'ticketClass', 'ticketClassName', 'rowId', 'rawPrice', 'price', 'ticketTypeId', 'ticketTypeGroupId', 'listingTypeId', 'listingCurrencyCode', 'buyerCurrencyCode', 'faceValue', 'faceValueCurrencyCode', 'formattedDealScore', 'Event Date', 'Updated', 'Event Type', 'Performer Type', 'Performer', 'Venue', 'Artist', 'Event Name', 'City', 'countryName', 'seatFrom', 'seatTo']
Added missing column: priceWithFees
Added missing column: isFavorite
Added missing column: aggregateFavorites
Added missing column: isStanding
Added missing column: formattedFees
✓ Data preparation completed
✓ Data preparation completed


In [None]:
# Insert data into CONCERT_SEATS table using raw SQL (most reliable method)
try:
    # Build SQLAlchemy connection string
    user = db_config['user']
    password = db_config['password']
    host = db_config['host']
    database = db_config['database']
    
    # URL-encode the password to handle special characters
    encoded_password = quote_plus(password)
    connection_string = f"mysql+mysqlconnector://{user}:{encoded_password}@{host}/{database}"
    
    # Create SQLAlchemy engine
    engine = create_engine(connection_string)
    
    # Prepare final column mapping to match database schema
    db_columns = {
        'id': 'id',
        'section': 'section', 
        'sectionId': 'sectionId',
        'row_name': 'row_name',
        'rowId': 'rowId',
        'faceValue': 'faceValue',
        'rawPrice': 'rawPrice',
        'priceWithFees': 'priceWithFees',
        'eventId': 'eventId',
        'sectionMapName': 'sectionMapName',
        'sectionType': 'sectionType',
        'seat': 'seat',
        'seatFrom': 'seatFrom', 
        'seatTo': 'seatTo',
        'ticketClass': 'ticketClass',
        'ticketClassName': 'ticketClassName',
        'price': 'price',
        'ticketTypeId': 'ticketTypeId',
        'ticketTypeGroupId': 'ticketTypeGroupId',
        'listingTypeId': 'listingTypeId',
        'City': 'city',
        'Event Date': 'event_date',
        'countryName': 'countryName',
        'Event Name': 'event_name',
        'Event Type': 'event_type',
        'Performer Type': 'performer_type',
        'Performer': 'performer',
        'isFavorite': 'isFavorite',
        'aggregateFavorites': 'aggregateFavorites',
        'listingCurrencyCode': 'listingCurrencyCode',
        'buyerCurrencyCode': 'buyerCurrencyCode',
        'faceValueCurrencyCode': 'faceValueCurrencyCode',
        'Updated': 'updated_date',
        'isStanding': 'isStanding',
        'formattedFees': 'formattedFees',
        'Artist': 'artist',
        'Venue': 'venue',
        'formattedDealScore': 'formattedDealScore'
    }
    
    # Select and rename columns to match database schema
    df_for_insert = combined_df.rename(columns=db_columns)
    
    # Clean the data for MySQL compatibility
    print("🧹 Cleaning data for MySQL compatibility...")
    
    # Replace NaN/None values with appropriate defaults for string columns
    df_for_insert = df_for_insert.fillna({
        'section': '',
        'sectionMapName': '',
        'seat': '',
        'row_name': '',
        'ticketClass': '',
        'ticketClassName': '',
        'listingCurrencyCode': 'CAD',
        'buyerCurrencyCode': 'CAD', 
        'faceValueCurrencyCode': 'CAD',
        'city': '',
        'countryName': '',
        'event_name': '',
        'event_type': '',
        'performer_type': '',
        'performer': '',
        'artist': '',
        'venue': '',
        'formattedFees': '',
        'formattedDealScore': ''
    })
    
    # Fill numeric columns with 0 or appropriate defaults
    numeric_cols = ['faceValue', 'rawPrice', 'priceWithFees', 'price', 'seatFrom', 'seatTo']
    for col in numeric_cols:
        if col in df_for_insert.columns:
            df_for_insert[col] = pd.to_numeric(df_for_insert[col], errors='coerce').fillna(0)
    
    # Fill integer columns with 0 - INCLUDING sectionType which should be numeric
    int_cols = ['sectionId', 'rowId', 'ticketTypeId', 'ticketTypeGroupId', 'listingTypeId', 'eventId', 'sectionType']
    for col in int_cols:
        if col in df_for_insert.columns:
            df_for_insert[col] = pd.to_numeric(df_for_insert[col], errors='coerce').fillna(0).astype(int)
    
    # Fill boolean columns with False
    bool_cols = ['isFavorite', 'isStanding']
    for col in bool_cols:
        if col in df_for_insert.columns:
            df_for_insert[col] = df_for_insert[col].fillna(False)
    
    print(f"✓ Data cleaned. Shape: {df_for_insert.shape}")
    print(f"✓ sectionType converted to numeric: {df_for_insert['sectionType'].dtype}")
    
    # Use raw SQL INSERT with ON DUPLICATE KEY UPDATE
    batch_size = 1000
    
    # Get connection from engine
    connection_sql = engine.raw_connection()
    cursor_sql = connection_sql.cursor()
    
    try:
        # Get column names for the INSERT statement
        columns = list(df_for_insert.columns)
        placeholders = ', '.join(['%s'] * len(columns))
        columns_str = ', '.join([f'`{col}`' for col in columns])
        
        # Create UPDATE part (all columns except id)
        update_columns = [col for col in columns if col != 'id']
        update_str = ', '.join([f'`{col}` = VALUES(`{col}`)' for col in update_columns])
        
        # Build the UPSERT query
        upsert_query = f"""
        INSERT INTO CONCERT_SEATS ({columns_str})
        VALUES ({placeholders})
        ON DUPLICATE KEY UPDATE {update_str}
        """
        
        print("🔄 Performing UPSERT operation...")
        
        # Execute batch insert with upsert logic
        total_rows = len(df_for_insert)
        rows_processed = 0
        
        for i in range(0, total_rows, batch_size):
            batch_df = df_for_insert.iloc[i:i + batch_size]
            batch_data = [tuple(row) for row in batch_df.values]
            
            cursor_sql.executemany(upsert_query, batch_data)
            rows_processed += len(batch_data)
            print(f"  Processed {rows_processed}/{total_rows} rows...")
        
        connection_sql.commit()
        print(f"✅ Successfully inserted/updated {len(df_for_insert)} rows into CONCERT_SEATS table!")
        print(f"   Batch size: {batch_size}")
        print(f"   Database: {database}")
        
    finally:
        cursor_sql.close()
        connection_sql.close()
    
except Exception as e:
    print(f"❌ Error inserting data into database: {e}")
    print(f"   Make sure the CONCERT_SEATS table exists and has the correct schema.")
    
finally:
    # Close engine connection
    if 'engine' in locals():
        engine.dispose()
        print("Database connection closed.")

🧹 Cleaning data for MySQL compatibility...
✓ Data cleaned. Shape: (36057, 38)
✓ sectionType converted to numeric: int64
✓ Data cleaned. Shape: (36057, 38)
✓ sectionType converted to numeric: int64


  df_for_insert[col] = df_for_insert[col].fillna(False)


🔄 Performing UPSERT operation...
  Processed 1000/36057 rows...
  Processed 1000/36057 rows...
  Processed 2000/36057 rows...
  Processed 2000/36057 rows...
  Processed 3000/36057 rows...
  Processed 3000/36057 rows...
  Processed 4000/36057 rows...
  Processed 4000/36057 rows...
  Processed 5000/36057 rows...
  Processed 5000/36057 rows...
  Processed 6000/36057 rows...
  Processed 6000/36057 rows...
  Processed 7000/36057 rows...
  Processed 7000/36057 rows...
  Processed 8000/36057 rows...
  Processed 8000/36057 rows...
  Processed 9000/36057 rows...
  Processed 9000/36057 rows...
  Processed 10000/36057 rows...
  Processed 10000/36057 rows...
  Processed 11000/36057 rows...
  Processed 11000/36057 rows...
  Processed 12000/36057 rows...
  Processed 12000/36057 rows...
  Processed 13000/36057 rows...
  Processed 13000/36057 rows...
  Processed 14000/36057 rows...
  Processed 14000/36057 rows...
  Processed 15000/36057 rows...
  Processed 15000/36057 rows...
  Processed 16000/36057 r