# Web Scraping and Data Analysis with Selenium and Pandas

This Jupyter Notebook demonstrates how to perform web scraping using Selenium and analyze the scraped data with Pandas. The workflow includes setting up the Selenium WebDriver with custom options, solving reCAPTCHA challenges, and extracting event details from a website. The extracted data is then loaded into a Pandas DataFrame for further analysis.

In [1]:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

from googlesearch import search

from bs4 import BeautifulSoup
import pandas as pd
from math import nan
from datetime import datetime
from requests import get



In [2]:
# Define a test user-agent string to simulate a browser request
test_ua = 'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

# Initialize Chrome options for the WebDriver
options = Options()

# Uncomment the following line if you want to run the browser in headless mode (no GUI)
# options.add_argument("--headless")

# Set the window size for the browser
options.add_argument("--window-size=1920,1080")

# Set the user-agent to the test user-agent defined above
options.add_argument(f'--user-agent={test_ua}')

# Add additional options to improve stability and compatibility
options.add_argument('--no-sandbox')  # Required for running in some environments
options.add_argument("--disable-extensions")  # Disable extensions to avoid potential conflicts
# options.add_argument("user-data-dir=C:\\Users\\eric9\\AppData\\Local\\Google\\Chrome\\User Data\\Default")

# Initialize the Chrome WebDriver with the specified options
driver = webdriver.Chrome(options=options)

In [3]:
# The following code reads an Excel file and loads the data into a pandas DataFrame
path = "../../Documents/Ticket Sales.xlsx"
events = pd.read_excel(path, sheet_name ="Events")
embrace = pd.read_excel(path, sheet_name ="Embrace")


In [4]:
def get_embrace(artist):
    artist = artist.replace(" ", "+")
    pagesource = get("https://www.embracepresents.com/?s=" + artist).content
    soup = BeautifulSoup(pagesource, 'html.parser')
    result = soup.find('h2', class_='entry-title').find('a')



    if artist.split('+')[0].strip().lower() not in result.get_text().strip().lower():
        return None
    try:
        event_link = result['href']
    except:
        # raise Exception("No events found for " + artist)
        print ("No events found for " + artist)

    event_page = get(event_link).content
    event_soup = BeautifulSoup(event_page, 'html.parser')
    meta_div = event_soup.find('div', class_='meta')
    
    date_text = meta_div.find('div', class_='date').get_text(strip=True).replace('Date:', '').strip()
    date_parts = date_text.split('-')

    if len(date_parts) == 1:
        start_date = datetime.strptime(date_parts[0].strip(), '%B %d, %Y')
        end_date = start_date
    else:
        start_date = datetime.strptime(date_parts[0].strip(), '%B %d')
        end_date = datetime.strptime(date_parts[1].strip(), '%B %d')
        end_date = end_date.replace(year=start_date.year)
    
    # Extract price information
    price_div = meta_div.find('div', class_='pricing')
    if price_div:
        price_text = price_div.get_text(strip=True).replace('Tickets:', '').strip()
    else:
        price_text = meta_div.find('div', class_='gen-admission').get_text(strip=True).replace('Gen Admission:', '').strip()
    
    # Get the end of the price range
    price = price_text.split('-')[-1].split()[0].replace('$', '')

    event_details = {
        'start_date': start_date,
        'end_date': end_date,
        'Price': price,
        # 'Venue': meta_div.find('div', class_='venue').get_text(strip=True).replace('Venue:', '').strip(),
        # 'Location': meta_div.find('div', class_='location').get_text(strip=True).replace('Location:', '').strip(),
    }
    # fb_link = event_soup.find('a', class_='button rsvp popup')['href']
    # # print(fb_link)
    # driver.get(fb_link)
    # fb_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # event_guests_section = fb_soup.find('h3', string='Event Guests').find_next('section')
    # interested_count = event_guests_section.find_all('div', class_='_4g34 _3jcq')[0].find('div', class_='_52jf _52jb _52jh').get_text(strip=True)
    # went_count = event_guests_section.find_all('div', class_='_4g34 _3jcq')[1].find('div', class_='_52jf _52jb _52jh').get_text(strip=True)
    # interested_count = interested_count.replace('K', '000').replace('.', '')
    # went_count = went_count.replace('K', '000').replace('.', '')
    # event_details['Interested'] = int(interested_count)
    # event_details['Went'] = int(went_count)
    return event_details




In [5]:
# Iterate through the events DataFrame and get embrace details for each artist



embrace_details = []

for index, row in events.iterrows():
    if row["Date"] > datetime.today():
        continue
    if row["Location"] != "Toronto":
        continue
    if embrace[(embrace['Artist'] == row["Artist"]) & ((embrace['start_date'] == row["Date"]) | (embrace['end_date']  ==  row["Date"]))].size > 0:
        continue
    try:
        details = get_embrace(row["Artist"])
        if details is None:
            continue  
        details['Artist'] = row["Artist"]
        details['Venue'] = row["Venue"]
        if details['start_date'] == row['Date'] or details['end_date'] == row['Date']:
            embrace_details.append(details)

            if pd.isna(events.loc[index,'Min Cost']) or (events.loc[index,'Min Cost'] == 0) :
                events.loc[index, 'Min Cost'] = details['Price']
                print(f"Price for {row['Artist']} updated successfully")
    except Exception as e:
        print(f"Error fetching details for {row["Artist"]}: {e}")

# Convert the list of dictionaries to a DataFrame
new_embrace = pd.DataFrame(embrace_details)



Error fetching details for seven lions: 'NoneType' object has no attribute 'find'
Error fetching details for gryffin: 'NoneType' object has no attribute 'get_text'
Error fetching details for Inquisitive: 'NoneType' object has no attribute 'find'
Error fetching details for veld 3 day 2023: 'NoneType' object has no attribute 'find'
Error fetching details for veld 1 day 2023: 'NoneType' object has no attribute 'find'
Error fetching details for Inquisitive: 'NoneType' object has no attribute 'find'
Error fetching details for loud luxury: 'NoneType' object has no attribute 'find'
Error fetching details for armin van buuren: 'NoneType' object has no attribute 'find'
Error fetching details for deadmau: 'NoneType' object has no attribute 'find'
Error fetching details for Gareth Emery: 'NoneType' object has no attribute 'find'
Error fetching details for Gareth Emery: 'NoneType' object has no attribute 'find'
Error fetching details for Sara Landry: 'NoneType' object has no attribute 'get_text'
E

In [6]:
new_embrace

In [7]:
# Concatenate the new embrace_df with the existing embrace DataFrame
embrace = pd.concat([embrace, new_embrace], ignore_index=True)


In [8]:
embrace

Unnamed: 0,Artist,Venue,start_date,end_date,Price,Interested,Went
0,Artemas,,2024-10-03,2024-10-03,20.00,20.0,7.0
1,Atarashi,History,2024-10-06,2024-10-06,60.00,59.0,21.0
2,Boris Brejcha,Casino,2024-09-27,2024-09-27,100.00,975.0,356.0
3,Bunt,CODA,2024-10-16,2024-10-16,TBD,0.0,4.0
4,Dom Dolla,Budweiser,2024-09-06,2024-09-06,125.00,11000.0,285.0
5,Excision,Casino,2024-03-15,2024-03-15,79.50,15000.0,587.0
6,Friday Pilots Club,The Drake Underground,2024-10-15,2024-10-15,22.00,0.0,1.0
7,Jason Ross,History,2024-04-26,2024-04-26,60.00,102.0,35.0
8,Kaytrana,Budweiser,2024-09-27,2024-09-27,45.00,366.0,74.0
9,Knocked Loose,History,2024-06-05,2024-06-05,63.50,469.0,195.0


In [9]:
embrace["start_date"] = pd.to_datetime(embrace["start_date"]).dt.date
embrace["end_date"] = pd.to_datetime(embrace["end_date"]).dt.date
events["Date"] = pd.to_datetime(events["Date"]).dt.date
events["Presale"] = pd.to_datetime(events["Presale"]).dt.date

In [10]:
# Save the embrace details to a new sheet called 'embrace'

with pd.ExcelWriter(path, mode='a', engine='openpyxl',date_format= 'YYYY-MM-DD', datetime_format='YYYY-MM-DD', if_sheet_exists='replace') as writer:
    embrace.to_excel(writer, sheet_name='Embrace', index=False)


# Save the events DataFrame to the 'events' sheet
with pd.ExcelWriter(path, mode='a', engine='openpyxl', date_format= 'YYYY-MM-DD', datetime_format='YYYY-MM-DD', if_sheet_exists='replace') as writer:
    events.to_excel(writer, sheet_name='Events', index=False)