In [None]:
import time
from pathlib import Path
from typing import List, Dict
from urllib.parse import urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup

# --- Configuration ---
# Entry pages for each source. Each URL points at a single listing page
# (one city archive / one country feed); iterating over further pages or
# cities still has to be built on top of these.
CONCERTARCHIVES_URL = "https://www.concertarchives.org/cities/atlanta"
TOURDATESEARCH_URL = "https://www.tourdatesearch.com/tour-dates/us"

# HTTP headers passed to requests.get() by both scrapers: a desktop-browser
# User-Agent string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

In [None]:
def scrape_concertarchives(url: str) -> List[Dict]:
    """Scrape event metadata from a Concert Archives city page.

    Args:
        url: Address of the listing page. The last segment of its path is
            used as the city for every event found on the page.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Listings missing an
        artist, date or venue are skipped. Any error while fetching or
        parsing is printed and whatever was collected so far (possibly an
        empty list) is returned; the function does not raise.
    """
    print(f"-> Scraping Concert Archives: {url}")
    events = []

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Derive the city once from the URL *path* only, so a trailing slash
        # no longer yields an empty city and a query string or fragment
        # cannot leak into it.
        city = urlsplit(url).path.rstrip('/').rsplit('/', 1)[-1]

        # NOTE: This selector is based on common event listing practices and MUST BE VERIFIED.
        # It assumes a common class wraps each event listing.
        event_containers = soup.find_all('div', class_='event-list-item')  # Placeholder Selector

        for container in event_containers:
            # Extract Date
            date_element = container.find('time', class_='event-date')  # Placeholder Selector
            date = date_element.text.strip() if date_element else None

            # Extract Headliners/Artists
            artist_element = container.find('h4', class_='artist-name')  # Placeholder Selector
            artist = artist_element.text.strip() if artist_element else None

            # Extract Venue
            venue_element = container.find('a', class_='venue-name')  # Placeholder Selector
            venue = venue_element.text.strip() if venue_element else None

            # Only keep listings where all three fields were found and non-empty.
            if not (artist and date and venue):
                continue

            events.append({
                "source": "concertarchives",
                "artist_name": artist,
                "event_date": date,
                "venue_name": venue,
                "city": city,
            })

        print(f"-> Found {len(events)} events on Concert Archives.")
    except Exception as e:
        # Deliberate best-effort boundary: a failed scrape of one source is
        # reported and must not abort the caller.
        print(f"!!! Error scraping Concert Archives: {e}")

    return events

In [None]:
def scrape_tourdatesearch(url: str) -> List[Dict]:
    """Scrape event metadata from a Tour Date Search listing page.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts with the keys ``source``, ``artist_name``,
        ``event_date``, ``venue_name`` and ``city``. Listings missing an
        artist, date or location are skipped. Any error while fetching or
        parsing is printed and whatever was collected so far (possibly an
        empty list) is returned; the function does not raise.
    """
    print(f"-> Scraping Tour Date Search: {url}")
    events = []

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # NOTE: This selector is a general placeholder and MUST BE VERIFIED.
        tour_containers = soup.find_all('div', class_='tour-listing')  # Placeholder Selector

        for container in tour_containers:
            # Extract Date
            date_element = container.find('span', class_='date')  # Placeholder Selector
            date = date_element.text.strip() if date_element else None

            # Extract Artist
            artist_element = container.find('h3', class_='tour-artist')  # Placeholder Selector
            artist = artist_element.text.strip() if artist_element else None

            # Extract Venue/City
            location_element = container.find('span', class_='tour-location')  # Placeholder Selector
            location = location_element.text.strip() if location_element else None

            # Only keep listings where all three fields were found and non-empty.
            if not (artist and date and location):
                continue

            # The location text is treated as "<city> at <venue>". Split on
            # the FIRST " at " only, so a venue whose own name contains
            # " at " is kept whole instead of being truncated.
            city, separator, venue = location.partition(' at ')
            if separator:
                city, venue = city.strip(), venue.strip()
            else:
                # No separator: the whole text is used for both fields.
                city = venue = location

            events.append({
                "source": "tourdatesearch",
                "artist_name": artist,
                "event_date": date,
                "venue_name": venue,
                "city": city,
            })

        print(f"-> Found {len(events)} events on Tour Date Search.")
    except Exception as e:
        # Deliberate best-effort boundary: a failed scrape of one source is
        # reported and must not abort the caller.
        print(f"!!! Error scraping Tour Date Search: {e}")

    return events

In [None]:
if __name__ == "__main__":
    # Script entry point: scrape both sources, merge the results into one
    # table, add a normalized join key and write the table to Parquet.

    # 1. Execute scrapes
    all_concert_archives = scrape_concertarchives(CONCERTARCHIVES_URL)
    time.sleep(15)  # Be respectful: wait before hitting the next site
    all_tour_dates = scrape_tourdatesearch(TOURDATESEARCH_URL)

    # 2. Consolidate and Clean
    all_events_raw = all_concert_archives + all_tour_dates
    # Name the columns explicitly so the frame keeps its schema even when both
    # scrapers return nothing; without this the column lookups below raise
    # KeyError on an empty result.
    event_columns = ["source", "artist_name", "event_date", "venue_name", "city"]
    df_features = pd.DataFrame(all_events_raw, columns=event_columns)
    if df_features.empty:
        print("!!! No events were scraped; the feature table will be empty.")

    # 3. Create the unique identifier (Join Key)
    # The join key must be consistent! It is built from artist + date, each
    # lower-cased and reduced to [a-z0-9]. The date is lower-cased too, so that
    # capital letters (e.g. the "J" in "Jan") are kept rather than stripped.
    # NOTE(review): an earlier comment called for artist + date + venue, but
    # venue has never been part of the key - confirm what the downstream join
    # expects before adding venue_name.
    df_features['join_key'] = (
        df_features['artist_name'].str.lower().str.replace(r'[^a-z0-9]', '', regex=True) + '_' +
        df_features['event_date'].str.lower().str.replace(r'[^a-z0-9]', '', regex=True)
    )

    # 4. Save to Parquet
    output_file = "data/processed/event_features_raw.parquet"
    # Create the target directory first; to_parquet does not create it.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    df_features.to_parquet(output_file, index=False)

    print("\n--- Feature Data Consolidation Complete ---")
    print(f"Total unique events found: {df_features['join_key'].nunique()}")
    print("Data saved to data/processed/event_features_raw.parquet")
    print(df_features.head().to_markdown())