In [1]:
import requests
from bs4 import BeautifulSoup
import time
import random
import json
import re
from urllib.parse import urljoin, urlparse, parse_qs

In [8]:
# CONFIG
# Module-level settings read by the scraper functions defined in the next cell.

# HTTP headers sent with every request issued by get_soup().
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; setlist-scraper/1.0; +https://example.com/)",
    # You can add Accept-Language etc. if you prefer.
}
REQUEST_TIMEOUT = 15  # passed as the `timeout` argument of session.get() in get_soup()
SLEEP_RANGE = (1.0, 2.5)  # (min, max) bounds for the random polite delay between requests
MAX_RETRIES = 3  # number of attempts get_soup() makes before raising RuntimeError

# Example artist listing (replace as needed)
# NOTE(review): this URL points at Billy Strings, while the output filename in the
# main block and the recorded cell output refer to The String Cheese Incident.
# Confirm which artist is intended before re-running.
ARTIST_LISTING_BASE = "https://www.setlist.fm/setlists/billy-strings-73c3fe21.html"
# To restrict the crawl to listing pages 1..N, pass pages=N to crawl_artist().


In [10]:
def get_soup(url, session=None):
    """GET ``url`` with retries and return the body parsed as BeautifulSoup.

    Args:
        url: Address to fetch.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            temporary session is created for this call and closed afterwards.

    Returns:
        A ``BeautifulSoup`` document built from the response text with the
        ``lxml`` parser.

    Raises:
        RuntimeError: If all ``MAX_RETRIES`` attempts fail with a
            ``requests.RequestException``. The last underlying error is
            attached as ``__cause__``.
    """
    owns_session = not session
    if owns_session:
        session = requests.Session()

    last_error = None
    try:
        for attempt in range(MAX_RETRIES):
            try:
                r = session.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                r.raise_for_status()
                return BeautifulSoup(r.text, "lxml")
            except requests.RequestException as e:
                last_error = e
                if attempt + 1 >= MAX_RETRIES:
                    # No attempt follows, so do not announce a retry or
                    # waste time sleeping before reporting the failure.
                    print(f"Request failed ({e}), giving up.")
                    break
                wait = 1 + attempt * 2  # linear back-off: 1s, 3s, 5s, ...
                print(f"Request failed ({e}), retrying in {wait}s...")
                time.sleep(wait)
    finally:
        # Only close sessions created here; a caller-supplied session stays open.
        if owns_session:
            session.close()

    raise RuntimeError(f"Failed to GET {url} after {MAX_RETRIES} attempts") from last_error

def polite_sleep():
    """Block for a random interval drawn uniformly from ``SLEEP_RANGE``."""
    delay = random.uniform(*SLEEP_RANGE)
    time.sleep(delay)

def extract_show_links(listing_soup, base_url):
    """
    Collect the show-page links found on an artist listing page.

    Containers whose class attribute mentions both 'setlistPreview' and
    'vevent' are searched first, taking the first <a href> inside each one.
    If that yields nothing, every link matched by '.setlistPreview a[href]'
    is used instead.

    Returns a list of absolute URLs (resolved against base_url) with
    duplicates removed and first-seen order preserved.
    """
    def _is_preview(css_class):
        # Accept only class values that mention both marker tokens.
        return bool(css_class and 'setlistPreview' in css_class and 'vevent' in css_class)

    first_anchors = (
        box.find("a", href=True)
        for box in listing_soup.find_all(class_=_is_preview)
    )
    candidates = [urljoin(base_url, anchor["href"]) for anchor in first_anchors if anchor]

    # Fallback: any link nested under a .setlistPreview element.
    if not candidates:
        candidates = [
            urljoin(base_url, anchor["href"])
            for anchor in listing_soup.select(".setlistPreview a[href]")
        ]

    # dict keys keep insertion order, which drops repeats but keeps the first hit.
    return list(dict.fromkeys(candidates))

import re
from datetime import datetime

def parse_date(soup):
    """
    Locate the event date on a show page and return it as YYYY/MM/DD.

    Lookup order:
      1. the ``datetime`` attribute of ``<time itemprop="startDate">``, or of
         the first ``<time>`` carrying a ``datetime`` attribute;
      2. the text of the first element matching
         ``.eventDate, .date, .setlistDate``.

    Returns:
        The date formatted as ``YYYY/MM/DD``; the raw string that was found if
        it matches none of the known formats; or ``""`` if nothing was found.
    """
    date_str = ""
    # 1) <time itemprop="startDate" datetime="..."> or generic <time datetime="...">
    time_tag = soup.find("time", {"itemprop": "startDate"}) or soup.find("time", datetime=True)
    if time_tag and time_tag.has_attr("datetime"):
        date_str = time_tag["datetime"]

    # 2) element with a date-like class
    if not date_str:
        date_el = soup.select_one(".eventDate, .date, .setlistDate")
        if date_el:
            date_str = date_el.get_text(strip=True)

    if not date_str:
        return ""

    candidate = date_str.strip()

    # A datetime attribute may carry a full ISO timestamp
    # (e.g. 2025-10-23T19:30:00); only the leading calendar date matters.
    iso_prefix = re.match(r"(\d{4}-\d{2}-\d{2})[T ]", candidate)
    if iso_prefix:
        candidate = iso_prefix.group(1)

    known_formats = (
        "%Y-%m-%d",   # 2025-10-23
        "%B %d, %Y",  # November 15, 2025
        "%b %d, %Y",  # Nov 15, 2025
        "%b %d %Y",   # Oct 23 2025
        "%b%d%Y",     # Oct232025 (month/day/year blocks joined without spaces)
    )
    for fmt in known_formats:
        try:
            return datetime.strptime(candidate, fmt).strftime("%Y/%m/%d")
        except ValueError:
            continue

    # Unrecognised format: hand back what was found rather than losing it.
    return date_str

def expand_location_abbr(abbr):
    """
    Translate a location abbreviation into a ``(name, region)`` pair.

    - A known US state code yields ``(full state name, "US")``.
    - A known country code yields ``(abbr, continent name)``.
    - Anything else yields ``(abbr, "Unknown")``.

    The state table is consulted first, so "CA" always resolves to
    California; the "CA" entry in the country table is never reached.
    """
    us_states = {
        "NJ": "New Jersey", "NY": "New York", "CA": "California", "TX": "Texas",
        "IL": "Illinois", "PA": "Pennsylvania", "MA": "Massachusetts",  # Add more as needed
    }
    continent_by_country = {
        "USA": "North America", "US": "North America", "CA": "North America",  # US, Canada
        "UK": "Europe", "DE": "Europe", "FR": "Europe",
        "AU": "Australia",
    }

    state_name = us_states.get(abbr)
    if state_name is not None:
        # A state abbreviation is taken to imply the US.
        return state_name, "US"

    return abbr, continent_by_country.get(abbr, "Unknown")


def parse_venue_and_location(soup):
    """
    Extract venue and location details from a show page.

    The primary source is the ``title`` attribute of the link
    ``<a title="More setlists from Venue, City, State, Country">``. The title
    is split on commas and read from the right, so a venue name that itself
    contains commas stays intact instead of shifting the city/state/country
    fields.

    Returns:
        A 4-tuple ``(venue_name, city, state, country)``. ``state`` is the
        expanded state name when ``expand_location_abbr`` knows the code,
        otherwise the code as found. The last element is the trailing
        country string from the page (callers store it as "eventContinent").
        Any field that cannot be determined is ``""``.
    """
    venue_name = ""
    event_city = ""
    event_state = ""
    event_continent = ""

    # 1. Structured title: "More setlists from Prudential Center, Newark, NJ, USA"
    venue_link = soup.select_one('a[title^="More setlists from"]')
    if venue_link:
        title_text = (venue_link.get("title") or "").strip()
        prefix = "More setlists from"
        if title_text.startswith(prefix):
            title_text = title_text[len(prefix):]
        parts = [p.strip() for p in title_text.strip().split(",")]

        if len(parts) >= 4:
            # Anchor on the right: the last three parts are city, state and
            # country; everything before them belongs to the venue name.
            *venue_parts, event_city, state_abbr, event_continent = parts
            venue_name = ", ".join(venue_parts)
            event_state, _ = expand_location_abbr(state_abbr)
        elif len(parts) == 3:
            # Venue, City, Country (e.g. "The Venue, London, UK"); no state.
            venue_name, event_city, event_continent = parts

    # 2. Prefer the venue text captured from an <h1> of the form
    #    "... Setlist at <venue> — ..." when such a header is present.
    header = soup.select_one("h1")
    if venue_name and header:
        header_text = header.get_text(" ", strip=True)
        match = re.search(r"Setlist at\s*(.*?)\s*—", header_text, re.I)
        if match:
            venue_name = match.group(1).strip()

    # 3. Fallback: class-based lookup when the structured title was missing
    #    or too short to provide both a venue and a city.
    if not venue_name or not event_city:
        venue_el = soup.select_one(".venue, .venueName, .setlistVenue")
        if venue_el and not venue_name:
            venue_name = venue_el.get_text(" ", strip=True)

        loc_el = soup.select_one(".venueLocation, .city, .location")
        if loc_el and not event_city:
            location_str = loc_el.get_text(" ", strip=True)
            parts = [p.strip() for p in location_str.split(",")]
            if len(parts) >= 1:
                event_city = parts[0]
            if len(parts) >= 2:
                # Only the trailing component is kept; the state is not
                # recovered on this path.
                event_continent = parts[-1]

    return venue_name, event_city, event_state, event_continent

def parse_sets(soup):
    """
    Parse the song list of a show page into a list of sets.

    The direct ``<li>`` children of ``.setlistContent .songsList`` are walked
    in order. Items whose class includes ``section`` or ``encore`` start a new
    set; items whose class includes ``song`` contribute the text of their
    ``.songLabel`` element to the current set. Everything else (ads, etc.) is
    ignored. Songs that appear before any header are collected into an
    implicit first set instead of being dropped.

    Returns:
        A list of ``{"setNumber": int, "encore": bool, "songs": [str, ...]}``
        dicts in page order. Sets without songs are omitted. Sets with no
        explicit "Set N" number are numbered sequentially after the highest
        number seen so far. Returns ``[]`` (after printing a notice) when the
        song list container is missing.
    """
    songs_list = soup.select_one(".setlistContent .songsList")
    if songs_list is None:
        print("    [!] Error: Could not find the main '.songsList' container.")
        return []

    sets = []
    current_set = None

    for li in songs_list.find_all("li", recursive=False):
        classes = li.get("class", [])

        # 1. Set / encore header: close the running set and start a new one.
        if "section" in classes or "encore" in classes:
            if current_set and current_set["songs"]:
                sets.append(current_set)

            header_text = li.get_text(" ", strip=True)
            # Pull the number out of headers such as "Set 1:", "Set 2:".
            m = re.search(r"Set\s*(\d+)", header_text, re.I)
            current_set = {
                "setNumber": int(m.group(1)) if m else None,
                "encore": "encore" in classes,
                "songs": [],
            }

        # 2. Song item: the clean title lives in the .songLabel element.
        elif "song" in classes:
            label = li.select_one(".songLabel")
            song_text = label.get_text(strip=True) if label else ""
            if not song_text:
                continue
            if current_set is None:
                # No header preceded this song: open an implicit,
                # unnumbered set so the song is not lost.
                current_set = {"setNumber": None, "encore": False, "songs": []}
            current_set["songs"].append(song_text)

        # 3. Anything else (advertisements, notes) is not part of the setlist.

    if current_set and current_set["songs"]:
        sets.append(current_set)

    # Fill in missing set numbers, continuing after the highest seen so far.
    next_set_idx = 1
    for s in sets:
        if s["setNumber"] is None:
            s["setNumber"] = next_set_idx
        next_set_idx = max(next_set_idx, s["setNumber"] + 1)

    return sets

def parse_show_page(url, session=None):
    """
    Download one show page and return its details as a dict.

    The result has the keys ``date``, ``venue``, ``eventCity``,
    ``eventState``, ``eventContinent``, ``setlist`` and ``source`` (the URL
    that was fetched). A new ``requests.Session`` is created when none is
    supplied.
    """
    if not session:
        session = requests.Session()

    page = get_soup(url, session=session)

    show = {"date": parse_date(page)}  # formatted by parse_date
    venue_name, event_city, event_state, event_continent = parse_venue_and_location(page)
    show["venue"] = venue_name
    show["eventCity"] = event_city
    show["eventState"] = event_state
    show["eventContinent"] = event_continent
    show["setlist"] = parse_sets(page)
    show["source"] = url
    return show

def crawl_artist(listing_url, pages=3, session=None):
    """
    Crawl an artist's listing pages and parse every show they link to.

    Listing pages ``?page=1`` .. ``?page=pages`` are fetched in order. Each
    show link found is fetched and parsed with ``parse_show_page``; a show
    that fails to parse is reported and skipped. The crawl stops early, and
    keeps what has been collected, when a listing page cannot be fetched or
    contains no show links (for example when ``pages`` exceeds the number of
    pages that actually exist).

    Args:
        listing_url: URL of the artist listing; any query string is dropped.
        pages: Highest listing page number to request.
        session: Optional ``requests.Session`` to reuse for all requests.

    Returns:
        A list of show dicts, in the order they were encountered.
    """
    session = session or requests.Session()
    results = []
    listing_base = listing_url.split("?")[0]
    for p in range(1, pages + 1):
        # The first page works with or without '?page=1'; always including
        # the parameter keeps the URL format uniform.
        page_url = f"{listing_base}?page={p}"
        print(f"[+] Fetching listing page: {page_url}")
        try:
            listing_soup = get_soup(page_url, session=session)
        except RuntimeError as e:
            # get_soup has already retried; give up on further pages but
            # return the shows gathered so far instead of losing them.
            print(f"    ! Failed to fetch listing page {p}: {e}")
            print(f"    ! Stopping crawl with {len(results)} shows collected")
            break

        show_links = extract_show_links(listing_soup, listing_url)
        print(f"    -> found {len(show_links)} show links on page {p}")
        if not show_links:
            # An empty listing is treated as the end of the artist's pages.
            print("    -> no show links found; stopping crawl")
            break

        for show_url in show_links:
            print(f"    [*] Parsing show: {show_url}")
            try:
                show_obj = parse_show_page(show_url, session=session)
                results.append(show_obj)
            except Exception as e:
                print(f"      ! Failed to parse {show_url}: {e}")
            polite_sleep()
        polite_sleep()
    return results

if __name__ == "__main__":
    # Crawl listing pages 1..97 of ARTIST_LISTING_BASE. The session is used as
    # a context manager so its connections are released when the crawl ends.
    with requests.Session() as s:
        try:
            data = crawl_artist(ARTIST_LISTING_BASE, pages=97, session=s)
        except Exception as e:
            print("Fatal error during crawl:", e)
            data = []

    # Save the collected shows as pretty-printed JSON.
    # NOTE(review): the filename names The String Cheese Incident regardless of
    # which artist ARTIST_LISTING_BASE points at — confirm it matches the crawl.
    out_file = "string_cheese_incident_setlists.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\nDone. Extracted {len(data)} shows. Output written to: {out_file}")

[+] Fetching listing page: https://www.setlist.fm/setlists/the-string-cheese-incident-13d6ade1.html?page=1
    -> found 10 show links on page 1
    [*] Parsing show: https://www.setlist.fm/setlist/the-string-cheese-incident/2025/spirit-of-the-suwannee-music-park-live-oak-fl-b415922.html
    [*] Parsing show: https://www.setlist.fm/setlist/the-string-cheese-incident/2025/spirit-of-the-suwannee-music-park-live-oak-fl-3415923.html
    [!] Error: Could not find the main '.songsList' container.
    [*] Parsing show: https://www.setlist.fm/setlist/the-string-cheese-incident/2025/spirit-of-the-suwannee-music-park-live-oak-fl-3415927.html
    [*] Parsing show: https://www.setlist.fm/setlist/the-string-cheese-incident/2025/spirit-of-the-suwannee-music-park-live-oak-fl-1341592d.html
    [!] Error: Could not find the main '.songsList' container.
    [*] Parsing show: https://www.setlist.fm/setlist/the-string-cheese-incident/2025/spirit-of-the-suwannee-music-park-live-oak-fl-13415929.html
    [*] 