In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict
import time

# --- Configuration ---
# Entry-point listing pages for the two sources. Each constant targets a single
# page (e.g. one city's archive); iterating over many pages/cities is not
# implemented yet and has to be built separately.
CONCERTARCHIVES_URL = "https://www.concertarchives.org/cities/atlanta"
TOURDATESEARCH_URL = "https://www.tourdatesearch.com/tour-dates/us"

# Request headers shared by both scrapers: a desktop-Chrome User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

In [None]:
def scrape_concertarchives(url: str) -> List[Dict]:
    """Download a Concert Archives listing page and extract its events.

    Args:
        url: Page to fetch. Its last path segment is stored as the ``city``
            of every event.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Entries missing an
        artist, date or venue are dropped. Any exception is printed rather
        than raised, and whatever was collected so far is returned.
    """
    print(f"-> Scraping Concert Archives: {url}")
    events = []

    def _text_or_none(node):
        # Stripped text of a matched tag; None when the lookup found nothing.
        return node.text.strip() if node else None

    try:
        page = requests.get(url, headers=HEADERS, timeout=10)
        page.raise_for_status()
        parsed = BeautifulSoup(page.content, 'html.parser')

        # NOTE: every selector below is a placeholder and has to be verified
        # against the live page markup.
        for item in parsed.find_all('div', class_='event-list-item'):
            when = _text_or_none(item.find('time', class_='event-date'))
            who = _text_or_none(item.find('h4', class_='artist-name'))
            where = _text_or_none(item.find('a', class_='venue-name'))

            # Only complete records are kept.
            if not (who and when and where):
                continue

            events.append({
                "source": "concertarchives",
                "artist_name": who,
                "event_date": when,
                "venue_name": where,
                "city": url.split('/')[-1],  # last URL segment doubles as the city
            })

        print(f"-> Found {len(events)} events on Concert Archives.")
    except Exception as e:
        print(f"!!! Error scraping Concert Archives: {e}")

    return events

In [None]:
def scrape_tourdatesearch(url: str) -> List[Dict]:
    """Scrapes event metadata from Tour Date Search.

    Args:
        url: Listing page to fetch.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Listings missing an
        artist, date or location are skipped. Any exception is printed rather
        than raised, and the events collected so far are returned.
    """
    print(f"-> Scraping Tour Date Search: {url}")
    events = []
    
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # NOTE: This selector is a general placeholder and MUST BE VERIFIED.
        tour_containers = soup.find_all('div', class_='tour-listing') # Placeholder Selector
        
        for container in tour_containers:
            # Extract Date
            date_element = container.find('span', class_='date') # Placeholder Selector
            date = date_element.text.strip() if date_element else None
            
            # Extract Artist
            artist_element = container.find('h3', class_='tour-artist') # Placeholder Selector
            artist = artist_element.text.strip() if artist_element else None

            # Extract Venue/City
            location_element = container.find('span', class_='tour-location') # Placeholder Selector
            location = location_element.text.strip() if location_element else None
            
            if artist and date and location:
                # Location is expected as "<city> at <venue>". Split on the FIRST
                # " at " only, so a venue whose own name contains " at " is kept
                # whole instead of being truncated.
                city_part, separator, venue_part = location.partition(' at ')
                if separator:
                    city = city_part.strip()
                    venue = venue_part.strip()
                else:
                    # No separator: the whole string is used for both fields.
                    city = venue = location
                
                events.append({
                    "source": "tourdatesearch",
                    "artist_name": artist,
                    "event_date": date,
                    "venue_name": venue,
                    "city": city
                })

        print(f"-> Found {len(events)} events on Tour Date Search.")
    except Exception as e:
        print(f"!!! Error scraping Tour Date Search: {e}")
        
    return events

In [None]:
if __name__ == "__main__":
    # Driver: scrape both sources once, merge the results, derive a join key
    # and persist the combined table as Parquet.

    # 1. Execute scrapes (each site is requested exactly once)
    all_concert_archives = scrape_concertarchives(CONCERTARCHIVES_URL)
    time.sleep(15) # Be respectful: wait before hitting the next site
    all_tour_dates = scrape_tourdatesearch(TOURDATESEARCH_URL)

    # 2. Consolidate and Clean
    all_events_raw = all_concert_archives + all_tour_dates

    if not all_events_raw:
        print("WARNING: all_events_raw is empty. Check your URLs and selectors.")
    else:
        # Inspect the keys of the first dictionary
        print("Keys found in first event dictionary:", all_events_raw[0].keys())
        df_features = pd.DataFrame(all_events_raw)

        # 3. Create the unique identifier (Join Key)
        # The join key must be consistent! It is built from artist + date only
        # (the venue is NOT part of the key).
        # NOTE(review): the event_date pattern runs on the un-lowercased string,
        # so uppercase letters are stripped too (e.g. "Mar 10, 2027" ->
        # "ar102027"). Left unchanged because other datasets may already rely
        # on this key format - confirm before normalising.
        df_features['join_key'] = (
            df_features['artist_name'].str.lower().str.replace('[^a-z0-9]', '', regex=True) + '_' +
            df_features['event_date'].str.replace('[^a-z0-9]', '', regex=True)
        )

        # 4. Save to Parquet (the data/processed directory must already exist)
        df_features.to_parquet("data/processed/event_features_raw.parquet", index=False)

        print("\n--- Feature Data Consolidation Complete ---")
        print(f"Total unique events found: {df_features['join_key'].nunique()}")
        print("Data saved to data/processed/event_features_raw.parquet")
        print(df_features.head().to_markdown())

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict
import time

# --- Configuration ---
CONCERTARCHIVES_URL = "https://www.concertarchives.org/cities/atlanta"
TOURDATESEARCH_URL = "https://www.tourdatesearch.com/tour-dates/us"

# Fuller, browser-like header set, introduced to get past a 403 Forbidden response.
HEADERS = {
    # Desktop Chrome on Windows
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/99.0.4844.84 Safari/537.36"
    ),
    # Content types the client claims to accept (HTML first)
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    # Makes the request look as if it was reached from a search engine
    "Referer": "https://www.google.com/",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
}

In [None]:
def scrape_concertarchives(url: str, delay_seconds: float = 10) -> List[Dict]:
    """Scrapes event metadata from Concert Archives.

    Args:
        url: Listing page to fetch. Its last path segment (ignoring a trailing
            slash) is stored as the ``city`` of every event.
        delay_seconds: Pause before the request is sent, to reduce the chance
            of being rate-limited. Defaults to the previously hard-coded 10.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Entries missing an
        artist, date or venue are dropped. Errors are printed rather than
        raised, and the events collected so far are returned.
    """
    print(f"-> Scraping Concert Archives: {url}")
    events = []
    # Computed once; rstrip keeps ".../atlanta/" from yielding an empty city.
    city = url.rstrip('/').split('/')[-1]
    
    try:
        # Delay to prevent immediate rate-limiting/banning
        time.sleep(delay_seconds)
        
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status() # Raises HTTPError on 4xx/5xx (e.g. a persisting 403)
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # NOTE: Selectors MUST BE VERIFIED against the live HTML.
        event_containers = soup.find_all('div', class_='event-list-item') # Placeholder Selector
        
        for container in event_containers:
            date_element = container.find('time', class_='event-date') # Placeholder Selector
            date = date_element.text.strip() if date_element else None
            
            artist_element = container.find('h4', class_='artist-name') # Placeholder Selector
            artist = artist_element.text.strip() if artist_element else None

            venue_element = container.find('a', class_='venue-name') # Placeholder Selector
            venue = venue_element.text.strip() if venue_element else None
            
            if artist and date and venue:
                events.append({
                    "source": "concertarchives",
                    "artist_name": artist,
                    "event_date": date,
                    "venue_name": venue,
                    "city": city
                })
        
        print(f"-> Found {len(events)} events on Concert Archives.")
    except requests.exceptions.RequestException as e:
        # Covers HTTP status errors as well as timeouts and connection
        # failures, so network problems are no longer reported as parse errors.
        print(f"!!! Error scraping Concert Archives: Failed to access URL ({e})")
    except Exception as e:
        print(f"!!! Error scraping Concert Archives: Parsing failed ({e})")
        
    return events

In [None]:
def scrape_tourdatesearch(url: str) -> List[Dict]:
    """Scrapes event metadata from Tour Date Search.

    Args:
        url: Listing page to fetch.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Listings missing an
        artist, date or location are skipped. Any exception is printed rather
        than raised, and the events collected so far are returned.
    """
    print(f"-> Scraping Tour Date Search: {url}")
    events = []
    
    try:
        # Same header set as the Concert Archives scraper
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # NOTE: Selectors MUST BE VERIFIED against the live HTML.
        tour_containers = soup.find_all('div', class_='tour-listing') # Placeholder Selector
        
        for container in tour_containers:
            date_element = container.find('span', class_='date') # Placeholder Selector
            date = date_element.text.strip() if date_element else None
            
            artist_element = container.find('h3', class_='tour-artist') # Placeholder Selector
            artist = artist_element.text.strip() if artist_element else None

            location_element = container.find('span', class_='tour-location') # Placeholder Selector
            location = location_element.text.strip() if location_element else None
            
            if artist and date and location:
                # Location is expected as "<city> at <venue>". Split on the FIRST
                # " at " only, so a venue whose own name contains " at " is kept
                # whole instead of being truncated.
                city_part, separator, venue_part = location.partition(' at ')
                if separator:
                    city = city_part.strip()
                    venue = venue_part.strip()
                else:
                    # No separator: the whole string is used for both fields.
                    city = venue = location
                
                events.append({
                    "source": "tourdatesearch",
                    "artist_name": artist,
                    "event_date": date,
                    "venue_name": venue,
                    "city": city
                })

        print(f"-> Found {len(events)} events on Tour Date Search.")
    except Exception as e:
        print(f"!!! Error scraping Tour Date Search: {e}")
        
    return events

In [None]:
if __name__ == "__main__":
    # Driver: scrape both sources, merge the results, derive a join key and
    # persist the combined table as Parquet.
    
    # 1. Execute scrapes
    # If the 403 persists, your IP may be temporarily blocked. Wait 15-30 minutes and try again.
    all_concert_archives = scrape_concertarchives(CONCERTARCHIVES_URL)
    time.sleep(15) # Pause before hitting the next site
    all_tour_dates = scrape_tourdatesearch(TOURDATESEARCH_URL)

    # 2. Consolidate and Clean
    all_events_raw = all_concert_archives + all_tour_dates
    df_features = pd.DataFrame(all_events_raw)
    
    # An empty frame has no 'artist_name'/'event_date' columns, so building the
    # join key would raise KeyError. Branch instead of calling exit(): inside a
    # notebook exit() asks the kernel to shut down rather than just ending the cell.
    if df_features.empty:
        print("\n--- WARNING: No data collected from either source. Cannot create join_key. ---")
    else:
        # 3. Create the unique identifier (Join Key)
        # This key is vital for linking to your StubHub price data.
        # NOTE(review): the event_date pattern runs on the un-lowercased string,
        # so uppercase letters are stripped too (e.g. "Mar 10, 2027" ->
        # "ar102027"). Left unchanged because other datasets may already rely
        # on this key format - confirm before normalising.
        df_features['join_key'] = (
            df_features['artist_name'].str.lower().str.replace('[^a-z0-9]', '', regex=True) + '_' +
            df_features['event_date'].str.replace('[^a-z0-9]', '', regex=True)
        )

        # 4. Save to Parquet (the data/processed directory must already exist)
        df_features.to_parquet("data/processed/event_features_raw.parquet", index=False)
        
        print("\n--- Feature Data Consolidation Complete ---")
        print(f"Total unique events found: {df_features['join_key'].nunique()}")
        print("Data saved to data/processed/event_features_raw.parquet")
        print(df_features.head().to_markdown())

In [2]:
import asyncio
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict
import time

# --- Playwright Configuration ---
from playwright.async_api import async_playwright

# --- Global Configuration ---
CONCERTARCHIVES_URL = "https://www.concertarchives.org/cities/atlanta"
TOURDATESEARCH_URL = "https://www.tourdatesearch.com/tour-dates/us"

# Shared by both scrapers: sent with the requests-based fetch and passed to
# Playwright as extra HTTP headers when the page is created.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/99.0.4844.84 Safari/537.36"
    ),
}

# ----------------------------------------------------------------------
# ASYNC SCRAPING FUNCTION (Concert Archives - Uses Playwright)
# ----------------------------------------------------------------------

async def scrape_concertarchives_async(url: str) -> List[Dict]:
    """
    Load a Concert Archives page in headless Chromium (Playwright) and parse
    the rendered HTML for events.

    Args:
        url: Page to open. Its last path segment is stored as each event's ``city``.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Exceptions raised while
        navigating or parsing are printed, not raised; the browser is closed
        in either case.
    """
    print(f"-> Scraping Concert Archives (ASYNC Playwright): {url}")
    events = []

    def _text_or_none(node):
        # Stripped text of a matched tag; None when the lookup found nothing.
        return node.text.strip() if node else None

    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=True)
        tab = await chromium.new_page(extra_http_headers=HEADERS)

        try:
            # Wait for the network to go idle, then pause a little longer so
            # client-side rendering / JS checks can finish.
            await tab.goto(url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(5)

            rendered_html = await tab.content()
            dom = BeautifulSoup(rendered_html, 'html.parser')

            # NOTE: every selector below is a placeholder and has to be
            # verified against the live page markup.
            for item in dom.find_all('div', class_='event-list-item'):
                when = _text_or_none(item.find('time', class_='event-date'))
                who = _text_or_none(item.find('h4', class_='artist-name'))
                where = _text_or_none(item.find('a', class_='venue-name'))

                # Only complete records are kept.
                if not (who and when and where):
                    continue

                events.append({
                    "source": "concertarchives",
                    "artist_name": who,
                    "event_date": when,
                    "venue_name": where,
                    "city": url.split('/')[-1],
                })

            print(f"-> Found {len(events)} events on Concert Archives.")

        except Exception as e:
            print(f"!!! Error scraping Concert Archives (Playwright): {e}")

        finally:
            await chromium.close()

    return events

# ----------------------------------------------------------------------
# SYNCHRONOUS SCRAPING FUNCTION (Tour Date Search - Uses Requests)
# ----------------------------------------------------------------------

def scrape_tourdatesearch_sync(url: str) -> List[Dict]:
    """Scrapes event metadata from Tour Date Search using simple requests.

    Args:
        url: Listing page to fetch.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Listings missing an
        artist, date or location are skipped. Any exception is printed rather
        than raised, and the events collected so far are returned.
    """
    print(f"-> Scraping Tour Date Search (SYNC Requests): {url}")
    events = []
    
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # NOTE: Selectors MUST BE VERIFIED against the live HTML.
        tour_containers = soup.find_all('div', class_='tour-listing') # Placeholder Selector
        
        for container in tour_containers:
            date_element = container.find('span', class_='date') # Placeholder Selector
            date = date_element.text.strip() if date_element else None
            
            artist_element = container.find('h3', class_='tour-artist') # Placeholder Selector
            artist = artist_element.text.strip() if artist_element else None

            location_element = container.find('span', class_='tour-location') # Placeholder Selector
            location = location_element.text.strip() if location_element else None
            
            if artist and date and location:
                # Location is expected as "<city> at <venue>". Split on the FIRST
                # " at " only, so a venue whose own name contains " at " is kept
                # whole instead of being truncated.
                city_part, separator, venue_part = location.partition(' at ')
                if separator:
                    city = city_part.strip()
                    venue = venue_part.strip()
                else:
                    # No separator: the whole string is used for both fields.
                    city = venue = location
                
                events.append({
                    "source": "tourdatesearch",
                    "artist_name": artist,
                    "event_date": date,
                    "venue_name": venue,
                    "city": city
                })

        print(f"-> Found {len(events)} events on Tour Date Search.")
    except Exception as e:
        print(f"!!! Error scraping Tour Date Search: {e}")
        
    return events

# ----------------------------------------------------------------------
# MAIN EXECUTION BLOCK (Uses asyncio.run to call the async function)
# ----------------------------------------------------------------------

async def main_scraper_task():
    """Run both scrapers, merge their events, add a join key and save to Parquet.

    Returns:
        The combined DataFrame including the ``join_key`` column, or an empty
        DataFrame (nothing is written) when neither source yielded events.
    """
    # 1. Execute async scrape for Concert Archives
    all_concert_archives = await scrape_concertarchives_async(CONCERTARCHIVES_URL)
    
    # 2. Execute sync scrape for Tour Date Search
    # Pause before hitting the next site. asyncio.sleep yields to the event
    # loop; time.sleep would freeze it for the whole pause.
    await asyncio.sleep(5)
    # NOTE: this call is synchronous and still blocks the loop while it runs.
    all_tour_dates = scrape_tourdatesearch_sync(TOURDATESEARCH_URL)

    # 3. Consolidate and Clean
    all_events_raw = all_concert_archives + all_tour_dates
    df_features = pd.DataFrame(all_events_raw)
    
    # Check for empty data before creating join key (prevents KeyError)
    if df_features.empty:
        print("\n--- WARNING: No data collected from either source. Aborting save. ---")
        return pd.DataFrame() 
        
    # 4. Create the unique identifier (Join Key)
    # This key is vital for linking to your StubHub price data.
    # NOTE(review): the event_date pattern runs on the un-lowercased string, so
    # uppercase letters are stripped too (e.g. "Mar 10, 2027" -> "ar102027").
    # Left unchanged because other datasets may already rely on this key
    # format - confirm before normalising.
    df_features['join_key'] = (
        df_features['artist_name'].str.lower().str.replace('[^a-z0-9]', '', regex=True) + '_' +
        df_features['event_date'].str.replace('[^a-z0-9]', '', regex=True)
    )

    # 5. Save to Parquet (the data/processed directory must already exist)
    df_features.to_parquet("data/processed/event_features_raw.parquet", index=False)
    
    print("\n--- Feature Data Consolidation Complete ---")
    print(f"Total unique events found: {df_features['join_key'].nunique()}")
    print("Data saved to data/processed/event_features_raw.parquet")
    return df_features

In [None]:
# --- EXECUTE IN A NEW CELL ---

# Call the function directly using await
df_final = await main_scraper_task()

if not df_final.empty:
    print("\n--- FINAL FEATURE DATASET SAMPLE ---")
    print(df_final.head().to_markdown())
else:
    print("\n--- ASYNC PIPELINE RAN, BUT RETURNED NO DATA ---")

In [None]:
import asyncio
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict
import time

# --- Playwright Configuration ---
from playwright.async_api import async_playwright

# --- Global Configuration (Define your target URLs) ---
# Concert Archives: corrected URL (location page)
CONCERTARCHIVES_URL = "https://www.concertarchives.org/locations/atlanta-ga"
# Tour Date Search: corrected URL (base domain, to avoid a 404)
TOURDATESEARCH_URL = "https://www.tourdatesearch.com/"

# Desktop-Chrome User-Agent used by both scrapers.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/99.0.4844.84 Safari/537.36"
    ),
}

# ----------------------------------------------------------------------
# ASYNC SCRAPER 1: Concert Archives (Bypasses 403 using Playwright and Corrected Selectors)
# ----------------------------------------------------------------------

async def scrape_concertarchives_async(url: str) -> List[Dict]:
    """
    Scrapes event metadata from Concert Archives using Playwright, reading the
    rows of the ``table#band-show-table`` listing table.

    Args:
        url: Page to open. Its last path segment (ignoring a trailing slash)
            is stored as each event's ``city``.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Rows missing an artist,
        date or venue are dropped. Exceptions raised while navigating or
        parsing are printed, not raised; the browser is closed in either case.
    """
    print(f"-> Scraping Concert Archives: {url}")
    events = []
    # Computed once; e.g. 'atlanta-ga'. rstrip guards against a trailing slash.
    city = url.rstrip('/').split('/')[-1]
    # One known cancelled listing that still appears in the table and is skipped.
    cancelled_date_prefix = 'Mar 10, 2027'
    
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page(extra_http_headers=HEADERS)

        try:
            # Render in a real browser to get past 403/JS checks
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(5) 
            content = await page.content()
            
            soup = BeautifulSoup(content, 'html.parser')
            
            # Table rows inside the concert listing table
            event_rows = soup.select('table#band-show-table tbody tr') 
            
            if not event_rows:
                # Distinguish "table missing" from "table present but empty" so
                # the warning points at the right problem.
                if soup.select_one('table#band-show-table') is None:
                    print("-> WARNING: Concert Archives table #band-show-table was not found on the page.")
                else:
                    print("-> WARNING: Concert Archives table found but contains 0 event rows.")
                
            for row in event_rows:
                # Date: span in the first column
                date_element = row.select_one('td:nth-of-type(1) span')
                date = date_element.text.strip() if date_element else None
                
                # Artist: link inside <strong> in the second column
                artist_element = row.select_one('td:nth-of-type(2) strong a') 
                artist = artist_element.text.strip() if artist_element else None

                # Venue: link in the third column
                venue_element = row.select_one('td:nth-of-type(3) a')
                venue = venue_element.text.strip() if venue_element else None
                
                if artist and date and venue:
                    # Skip the first cancelled event which still shows up
                    if date.startswith(cancelled_date_prefix) and 'Cancelled' in row.text:
                        continue
                         
                    events.append({
                        "source": "concertarchives",
                        "artist_name": artist,
                        "event_date": date,
                        "venue_name": venue,
                        "city": city
                    })
            
            print(f"-> Found {len(events)} events on Concert Archives.")

        except Exception as e:
            print(f"!!! Fatal Error scraping Concert Archives: {e}")

        finally:
            await browser.close()
            
    return events

# ----------------------------------------------------------------------
# SYNCHRONOUS SCRAPER 2: Tour Date Search (Corrected Selectors for div structure)
# ----------------------------------------------------------------------

def scrape_tourdatesearch_sync(url: str) -> List[Dict]:
    """Scrapes event metadata from Tour Date Search using simple requests.

    Each ``div.searchrow`` inside ``div#results`` is expected to hold a
    ``div.datewrapper`` (with ``dow``/``day``/``year`` children) and a
    ``div.vcol > div.venue`` whose text reads "<b>Artist</b> at Venue in City, ST".

    Args:
        url: Page to fetch.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Rows lacking any required
        element are skipped individually. When the "at ... in ..." text cannot
        be parsed, venue and city are both ``"Unknown"``. Any exception is
        printed rather than raised, and the events collected so far are returned.
    """
    print(f"-> Scraping Tour Date Search: {url}")
    events = []
    
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # 1. Target the main results container
        results_container = soup.find('div', id='results')
        if not results_container:
            print("-> WARNING: Could not find main results container #results.")
            return events

        event_containers = results_container.find_all('div', class_='searchrow') 
        
        for container in event_containers:
            # 2. Extract Date components
            date_wrapper = container.find('div', class_='datewrapper')
            if not date_wrapper:
                continue

            # A row with an incomplete date is skipped on its own instead of
            # raising AttributeError and abandoning every remaining row.
            date_nodes = [date_wrapper.find('div', class_=part) for part in ('dow', 'day', 'year')]
            if any(node is None for node in date_nodes):
                continue
            dow, day, year = (node.text.strip() for node in date_nodes)
            date = f"{dow}, {day} {year}"
            
            # 3. Extract Artist and Venue/Location
            venue_col = container.find('div', class_='vcol')
            if not venue_col:
                continue
            venue_div = venue_col.find('div', class_='venue')
            if venue_div is None:
                continue
            
            # Artist name is the <b> inside div.venue
            artist_bold = venue_div.find('b')
            artist = artist_bold.text.strip() if artist_bold else None

            if not artist:
                continue
            
            # Full text structure is "<b>Artist</b> at Venue in City, ST".
            # Drop only the first occurrence of the artist so a venue that
            # repeats the artist's name is left intact.
            venue_text = venue_div.text.strip()
            remainder = venue_text.replace(artist, '', 1).strip()
            
            # After stripping, the text normally begins with "at " (no leading
            # space), so searching for ' at ' alone would never match there.
            if remainder.startswith('at '):
                venue_city_raw = remainder[len('at '):]
            else:
                _, _, venue_city_raw = remainder.partition(' at ')

            venue_part, in_separator, city_part = venue_city_raw.partition(' in ')
            if in_separator:
                venue_name = venue_part.strip()
                location_full = city_part.strip()
            else:
                venue_name = "Unknown"
                location_full = "Unknown"

            events.append({
                "source": "tourdatesearch",
                "artist_name": artist,
                "event_date": date,
                "venue_name": venue_name,
                "city": location_full
            })

        print(f"-> Found {len(events)} events on Tour Date Search.")
    except Exception as e:
        print(f"!!! Error scraping Tour Date Search: {e}")
        
    return events


In [None]:

# ----------------------------------------------------------------------
# MAIN EXECUTION BLOCK (Entry point for Notebook execution)
# ----------------------------------------------------------------------

async def main_scraper_task():
    """Run both scrapers, merge their events, add a join key and save to Parquet.

    Returns:
        The combined DataFrame including the ``join_key`` column, or an empty
        DataFrame (nothing is written) when neither source yielded events.
    """
    print("\n--- STARTING ASYNC FEATURE SCRAPING PIPELINE ---")
    
    # 1. Execute async scrape for Concert Archives
    all_concert_archives = await scrape_concertarchives_async(CONCERTARCHIVES_URL)
    
    # 2. Execute sync scrape for Tour Date Search
    # Pause between sites. asyncio.sleep yields to the event loop; time.sleep
    # would freeze it for the whole pause.
    await asyncio.sleep(5)
    # NOTE: this call is synchronous and still blocks the loop while it runs.
    all_tour_dates = scrape_tourdatesearch_sync(TOURDATESEARCH_URL)

    # 3. Consolidate and Clean
    all_events_raw = all_concert_archives + all_tour_dates
    df_features = pd.DataFrame(all_events_raw)
    
    # An empty frame has no columns to build the key from, so stop here.
    if df_features.empty:
        print("\n--- WARNING: No data collected. Aborting save. ---")
        return pd.DataFrame() 
        
    # 4. Create the unique identifier (Join Key)
    # NOTE(review): the event_date pattern runs on the un-lowercased string, so
    # uppercase letters are stripped too (e.g. "Mar 10, 2027" -> "ar102027").
    # Left unchanged because other datasets may already rely on this key
    # format - confirm before normalising.
    df_features['join_key'] = (
        df_features['artist_name'].str.lower().str.replace('[^a-z0-9]', '', regex=True) + '_' +
        df_features['event_date'].str.replace('[^a-z0-9]', '', regex=True)
    )

    # 5. Save to Parquet (the data/processed directory must already exist)
    df_features.to_parquet("data/processed/event_features_raw.parquet", index=False)
    
    print("\n--- Feature Data Consolidation Complete ---")
    print(f"Total unique events found: {df_features['join_key'].nunique()}")
    print("Data saved to data/processed/event_features_raw.parquet")
    return df_features


In [None]:

# ----------------------------------------------------------------------
# EXECUTION
# 
# Runs the entire pipeline. main_scraper_task() is a coroutine, so it is
# awaited directly at the top level of the cell; this relies on the notebook's
# support for top-level await and is not valid in a plain Python script.
# df_final receives the DataFrame returned by main_scraper_task(), which is
# empty when no events were collected.

df_final = await main_scraper_task()
