In [1]:
import requests
import pandas as pd
import time
from datetime import datetime
from rich.console import Console
from rich.panel import Panel

import os

console = Console()

# -------------------------------------------------
# CONFIG
# -------------------------------------------------
# The API key is read from the environment so the secret is never stored in
# the notebook or its version history.
# NOTE(review): a key was previously committed here in clear text; it should
# be revoked and replaced.
API_KEY = os.environ.get("SETLISTFM_API_KEY", "")
if not API_KEY:
    raise RuntimeError(
        "SETLISTFM_API_KEY is not set; export your Setlist.fm API key "
        "in the environment before running this notebook."
    )

BASE = "https://api.setlist.fm/rest/1.0"

# Headers sent with every request: the key identifies the caller and the
# Accept header asks for a JSON response body.
HEADERS = {
    "x-api-key": API_KEY,
    "Accept": "application/json",
    "User-Agent": "ticket-hero-us-concert-harvester"
}

MAX_CALLS_PER_DAY = 1440       # daily request budget checked by safe_get (free limit)
CALLS_MADE = 0                 # running count of requests issued so far
SLEEP_SECONDS = 0.6            # pause after each request (at most ~1.6 requests/second)


# -------------------------------------------------
# RATE-LIMITED GET
# -------------------------------------------------
def safe_get(url, timeout=30):
    """Issue one throttled GET request and count it against the daily budget.

    Args:
        url: Fully-formed request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the harvest indefinitely.

    Returns:
        The response object returned by ``requests.get`` (any status code).

    Raises:
        RuntimeError: If ``CALLS_MADE`` has already reached ``MAX_CALLS_PER_DAY``.
        requests.RequestException: On network failures, including timeouts.
    """
    global CALLS_MADE

    # Refuse to spend more than the daily budget.
    if CALLS_MADE >= MAX_CALLS_PER_DAY:
        raise RuntimeError(f"Daily limit reached ({CALLS_MADE}/{MAX_CALLS_PER_DAY})")

    r = requests.get(url, headers=HEADERS, timeout=timeout)
    CALLS_MADE += 1

    # Progress heartbeat
    if CALLS_MADE % 50 == 0:
        console.print(f"[yellow]{CALLS_MADE} API calls used so far[/yellow]")

    # Respect API limits: pause after every call so consecutive requests are spaced out.
    time.sleep(SLEEP_SECONDS)
    return r


# -------------------------------------------------
# HARVEST ALL US CONCERTS
# -------------------------------------------------
def fetch_us_concerts(max_pages=2000):
    """Page through the US setlist search and flatten each concert into a row.

    Pages are requested through ``safe_get`` starting at page 1. The harvest
    stops at the first empty page, non-200 response, undecodable body,
    exhausted daily budget, network failure, or once ``max_pages`` pages have
    been fetched. Whatever was collected up to that point is returned, so a
    late failure no longer discards the earlier pages.

    Args:
        max_pages: Safety cap on the number of pages requested.

    Returns:
        A ``pandas.DataFrame`` with one row per setlist (event, artist, tour,
        venue, coordinates, raw ``sets``, ``info`` and ``url``). Empty if
        nothing could be fetched.
    """
    console.print(
        Panel(
            "[cyan]Fetching ALL US concerts from Setlist.fm[/cyan]\n"
            "Rate-limited to 1440 requests/day",
            border_style="cyan",
        )
    )

    all_rows = []
    page = 1

    while True:
        url = f"{BASE}/search/setlists?countryCode=US&p={page}"

        # safe_get raises once the daily budget is spent, and requests raises
        # on network trouble; in both cases keep the rows gathered so far.
        try:
            r = safe_get(url)
        except RuntimeError as exc:
            console.print(f"Stopping harvest. {exc}", style="red", markup=False)
            break
        except requests.RequestException as exc:
            console.print(f"Stopping harvest. Request failed: {exc}", style="red", markup=False)
            break

        if r.status_code != 200:
            console.print(f"[red]Error {r.status_code}: {r.text}[/red]")
            break

        # A 200 response with a non-JSON body must not crash the harvest.
        try:
            payload = r.json()
        except ValueError:
            console.print("[red]Stopping harvest. Failed to decode JSON response.[/red]")
            break

        setlists = payload.get("setlist") or []

        console.print(f"[green]Page {page}: {len(setlists)} concerts[/green]")

        # Stop when no more pages
        if not setlists:
            break

        # Extract structured concert metadata. `or {}` also covers keys that
        # are present but null, which `.get(key, {})` would pass through.
        for s in setlists:
            venue = s.get("venue") or {}
            city = venue.get("city") or {}
            coords = city.get("coords") or {}
            country = city.get("country") or {}
            artist = s.get("artist") or {}
            tour = s.get("tour") or {}

            all_rows.append({
                "event_id": s.get("id"),
                "event_date": s.get("eventDate"),
                "event_last_updated": s.get("lastUpdated"),

                # Artist info
                "artist_name": artist.get("name"),
                "artist_mbid": artist.get("mbid"),

                # Tour
                "tour_name": tour.get("name"),

                # Venue info
                "venue_name": venue.get("name"),
                "venue_id": venue.get("id"),
                "venue_city": city.get("name"),
                "venue_state": city.get("state"),
                "venue_country": country.get("code"),
                "lat": coords.get("lat"),
                "lng": coords.get("long"),

                # Setlist & extras
                "sets": s.get("sets"),
                "info": s.get("info"),
                "url": s.get("url"),
            })

        page += 1

        # Absolute safety stop in case pagination never returns an empty page.
        if page > max_pages:
            console.print("[red]Stopping due to unusually large page count.[/red]")
            break

    return pd.DataFrame(all_rows)


# -------------------------------------------------
# RUN HARVEST
# -------------------------------------------------
if __name__ == "__main__":
    # Harvest first, then persist the frame under a timestamped file name.
    df = fetch_us_concerts()

    # Local wall-clock stamp keeps successive runs from overwriting each other.
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    path = f"setlistfm_us_concerts_{timestamp}.parquet"

    df.to_parquet(path, index=False)

    # Summary panel: row count, API budget used and where the file went.
    console.print(
        Panel(
            "".join(
                (
                    f"[bold green]US Concert Harvest Complete[/bold green]\n\n",
                    f"Concerts collected: {len(df):,}\n",
                    f"API calls used: {CALLS_MADE}/{MAX_CALLS_PER_DAY}\n",
                    f"Saved → {path}",
                )
            ),
            border_style="green",
        )
    )


In [None]:
import requests
import pandas as pd
import time
import random
from datetime import datetime
from rich.console import Console
from rich.panel import Panel
from typing import Dict, List, Any

import os

console = Console()

# -------------------------------------------------
# CONFIG
# -------------------------------------------------
# The API key is read from the environment so the secret is never stored in
# the notebook or its version history.
# NOTE(review): a key was previously committed here in clear text; it should
# be revoked and replaced.
API_KEY = os.environ.get("SETLISTFM_API_KEY", "")
if not API_KEY:
    raise RuntimeError(
        "SETLISTFM_API_KEY is not set; export your Setlist.fm API key "
        "in the environment before running this notebook."
    )

BASE = "https://api.setlist.fm/rest/1.0"

# Headers sent with every request: the key identifies the caller and the
# Accept header asks for a JSON response body.
HEADERS = {
    "x-api-key": API_KEY,
    "Accept": "application/json",
    "User-Agent": "ticket-hero-us-concert-harvester"
}

MAX_CALLS_PER_DAY = 1440       # Free tier limit
CALLS_MADE = 0                 # running count of requests issued so far
# THROTTLING: safe_get sleeps SAFE_SLEEP_BASE +/- SLEEP_JITTER before each
# request, i.e. between 0.7 s and 1.5 s (1.1 s on average). The shortest
# delay is below one second, so this is not a strict 1 request/second cap.
SAFE_SLEEP_BASE = 1.1
# JITTER: Add or subtract up to 0.4 seconds randomly
SLEEP_JITTER = 0.4


# -------------------------------------------------
# RATE-LIMITED GET (429 FIX)
# -------------------------------------------------
def safe_get(url: str, timeout: float = 30.0) -> "requests.Response":
    """Issue one throttled GET request, retrying once after a 429.

    A jittered pause precedes the request. If the server answers 429, the
    function waits five minutes and tries exactly once more, unless the
    daily budget is already spent, in which case the 429 is returned as is.

    Args:
        url: Fully-formed request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the harvest indefinitely.

    Returns:
        The last response received (any status code).

    Raises:
        RuntimeError: If ``CALLS_MADE`` has already reached ``MAX_CALLS_PER_DAY``.
        requests.RequestException: On network failures, including timeouts.
    """
    global CALLS_MADE

    if CALLS_MADE >= MAX_CALLS_PER_DAY:
        raise RuntimeError(f"Daily limit reached ({CALLS_MADE}/{MAX_CALLS_PER_DAY})")

    calls_at_entry = CALLS_MADE

    # 1. Determine random sleep time (Jitter applied)
    sleep_time = SAFE_SLEEP_BASE + random.uniform(-SLEEP_JITTER, SLEEP_JITTER)

    # 2. WAIT BEFORE THE REQUEST. Clamp at zero: time.sleep rejects negative
    #    values, which would occur if the jitter were configured above the base.
    time.sleep(max(0.0, sleep_time))

    # 3. Make the request
    r = requests.get(url, headers=HEADERS, timeout=timeout)
    CALLS_MADE += 1

    # 4. Critical: Check for immediate 429
    if r.status_code == 429:
        if CALLS_MADE >= MAX_CALLS_PER_DAY:
            # The retry would be one call over budget; hand the 429 back instead.
            console.print("[bold red]429 received and daily limit reached. Not retrying.[/bold red]")
        else:
            # If we hit 429, log and pause execution for 5 minutes (300 seconds)
            console.print(f"[bold red]CRITICAL 429 BURST LIMIT HIT. Pausing for 5 minutes to reset limit...[/bold red]")
            time.sleep(300)
            # Attempt the request one more time after the long pause
            r = requests.get(url, headers=HEADERS, timeout=timeout)
            CALLS_MADE += 1

    # Progress heartbeat: fire whenever a multiple of 50 was reached or crossed
    # during this call, so a retry's double increment cannot skip it.
    if CALLS_MADE // 50 > calls_at_entry // 50:
        console.print(f"[yellow]{CALLS_MADE} API calls used so far[/yellow]")

    return r


# -------------------------------------------------
# HARVEST ALL US CONCERTS
# -------------------------------------------------
def fetch_us_concerts(max_pages: int = 100) -> pd.DataFrame:
    """Page through the US setlist search and flatten each concert into a row.

    Pages are requested through ``safe_get`` starting at page 1. The harvest
    stops at the first empty page, non-200 response, undecodable body,
    exhausted daily budget, network failure, or once ``max_pages`` pages have
    been fetched. Whatever was collected up to that point is returned, so a
    late failure does not discard the earlier pages.

    Args:
        max_pages: Safety cap on the number of pages requested.

    Returns:
        A DataFrame with one row per setlist: event, artist, tour and venue
        fields plus ``setlist_length``, the number of songs summed over all
        sets of the show. Empty if nothing could be fetched.
    """
    console.print(
        Panel(
            "[cyan]Fetching ALL US concerts from Setlist.fm[/cyan]\n"
            "Rate-limited to 1440 requests/day",
            border_style="cyan",
        )
    )

    all_rows: List[Dict[str, Any]] = []
    page = 1

    while True:
        url = f"{BASE}/search/setlists?countryCode=US&p={page}"

        # safe_get raises once the daily budget is spent, and requests raises
        # on network trouble; in both cases keep the rows gathered so far.
        try:
            r = safe_get(url)
        except RuntimeError as exc:
            console.print(f"Stopping harvest. {exc}", style="red", markup=False)
            break
        except requests.RequestException as exc:
            console.print(f"Stopping harvest. Request failed: {exc}", style="red", markup=False)
            break

        if r.status_code != 200:
            console.print(f"[red]Stopping harvest. Error {r.status_code}: {r.text}[/red]")
            break

        # ValueError is the common base of the JSON decode errors, so this
        # does not depend on requests exposing its own JSONDecodeError class.
        try:
            payload = r.json()
        except ValueError:
            console.print("[red]Stopping harvest. Failed to decode JSON response.[/red]")
            break

        setlists = payload.get("setlist") or []

        console.print(f"[green]Page {page}: {len(setlists)} concerts[/green]")

        # Stop when no more pages
        if not setlists:
            break

        # Extract structured concert metadata. `or {}` also covers keys that
        # are present but null, which `.get(key, {})` would pass through.
        for s in setlists:
            venue = s.get("venue") or {}
            city = venue.get("city") or {}
            coords = city.get("coords") or {}
            country = city.get("country") or {}
            artist = s.get("artist") or {}
            tour = s.get("tour") or {}

            # Setlist length: songs summed over every set of the show.
            # A distinct loop name avoids shadowing the setlist `s`.
            sets = s.get("sets") or {}
            total_songs = sum(
                len(one_set.get("song") or []) for one_set in (sets.get("set") or [])
            )

            all_rows.append({
                "event_id": s.get("id"),
                "event_date": s.get("eventDate"),
                "event_last_updated": s.get("lastUpdated"),

                # FEATURE: Artist info
                "artist_name": artist.get("name"),
                "artist_mbid": artist.get("mbid"),

                # FEATURE: Tour
                "tour_name": tour.get("name"),

                # FEATURE: Setlist length (quantitative demand feature)
                "setlist_length": total_songs,

                # FEATURE: Venue info
                "venue_name": venue.get("name"),
                "venue_id": venue.get("id"),
                "venue_city": city.get("name"),
                "venue_state": city.get("state"),
                "venue_country": country.get("code"),
                "lat": coords.get("lat"),
                "lng": coords.get("long"),
                "url": s.get("url"),
            })

        page += 1

        # Hard stop so a pagination problem cannot burn the whole daily budget.
        if page > max_pages:
            console.print("[red]Stopping due to page count limit.[/red]")
            break

    return pd.DataFrame(all_rows)


# -------------------------------------------------
# RUN HARVEST
# -------------------------------------------------
if __name__ == "__main__":
    import os

    # Run the harvest, then persist the frame under a timestamped file name.
    df = fetch_us_concerts()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    path = f"data/processed/setlistfm_us_concerts_{timestamp}.parquet"

    # Create the output folder first: writing into a missing directory fails,
    # and failing at this point would throw away the whole harvest.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    df.to_parquet(path, index=False)

    # Summary panel: row count, API budget used and where the file went.
    console.print(
        Panel(
            f"[bold green]US Concert Harvest Complete[/bold green]\n\n"
            f"Concerts collected: {len(df):,}\n"
            f"API calls used: {CALLS_MADE}/{MAX_CALLS_PER_DAY}\n"
            f"Saved → {path}",
            border_style="green",
        )
    )

In [None]:
import requests
import pandas as pd
import time
import random
from datetime import datetime
from rich.console import Console
from rich.panel import Panel
from typing import Dict, List, Any

import os

console = Console()

# -------------------------------------------------
# CONFIG
# -------------------------------------------------
# The API key is read from the environment so the secret is never stored in
# the notebook or its version history.
# NOTE(review): a key was previously committed here in clear text; it should
# be revoked and replaced.
API_KEY = os.environ.get("SETLISTFM_API_KEY", "")
if not API_KEY:
    raise RuntimeError(
        "SETLISTFM_API_KEY is not set; export your Setlist.fm API key "
        "in the environment before running this notebook."
    )

BASE = "https://api.setlist.fm/rest/1.0"

# Headers sent with every request: the key identifies the caller and the
# Accept header asks for a JSON response body.
HEADERS = {
    "x-api-key": API_KEY,
    "Accept": "application/json",
    "User-Agent": "ticket-hero-us-concert-harvester"
}

MAX_CALLS_PER_DAY = 1440       # Free tier limit
CALLS_MADE = 0                 # running count of requests issued so far

# THROTTLING CONFIG: safe_get sleeps SAFE_SLEEP_BASE +/- SLEEP_JITTER before
# each request, i.e. between 0.7 s and 1.5 s (1.1 s on average). The shortest
# delay is below one second, so this is not a strict 1 request/second cap.
SAFE_SLEEP_BASE = 1.1          # Base delay in seconds
SLEEP_JITTER = 0.4             # Randomness factor (+/- 0.4 seconds)

# BACKOFF CONFIG
MAX_RETRIES = 5                # Maximum number of times to retry on a 429
INITIAL_BACKOFF_SECONDS = 2    # Starting delay before first retry (2s, 4s, 8s, 16s, ...)


# -------------------------------------------------
# RATE-LIMITED GET (FIXED WITH EXPONENTIAL BACKOFF)
# -------------------------------------------------
def safe_get(url: str, timeout: float = 30.0) -> "requests.Response":
    """Issue one throttled GET request with exponential backoff on 429.

    A jittered pause precedes the request. On a 429 the call is retried up to
    ``MAX_RETRIES`` times, doubling the delay each time from
    ``INITIAL_BACKOFF_SECONDS``. Retries stop early once the daily budget is
    spent, so backoff can never push ``CALLS_MADE`` past ``MAX_CALLS_PER_DAY``.

    Args:
        url: Fully-formed request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the harvest indefinitely.

    Returns:
        The last response received; still a 429 if every retry failed or the
        budget ran out during backoff.

    Raises:
        RuntimeError: If ``CALLS_MADE`` has already reached ``MAX_CALLS_PER_DAY``.
        requests.RequestException: On network failures, including timeouts.
    """
    global CALLS_MADE

    if CALLS_MADE >= MAX_CALLS_PER_DAY:
        raise RuntimeError(f"Daily limit reached ({CALLS_MADE}/{MAX_CALLS_PER_DAY})")

    calls_at_entry = CALLS_MADE

    # 1. Base Throttling (Wait before the request). Clamp at zero: time.sleep
    #    rejects negative values, which would occur if jitter exceeded the base.
    sleep_time = SAFE_SLEEP_BASE + random.uniform(-SLEEP_JITTER, SLEEP_JITTER)
    time.sleep(max(0.0, sleep_time))

    # 2. Make the initial request
    r = requests.get(url, headers=HEADERS, timeout=timeout)
    CALLS_MADE += 1

    # 3. Check for failure and initiate backoff
    if r.status_code == 429:

        # --- Exponential Backoff Logic ---
        console.print(f"[bold red]WARNING: 429 BURST LIMIT HIT. Initiating backoff...[/bold red]")
        current_delay = INITIAL_BACKOFF_SECONDS

        for attempt in range(MAX_RETRIES):
            # Every retry is a billable call; never exceed the daily budget.
            if CALLS_MADE >= MAX_CALLS_PER_DAY:
                console.print("[bold red]Daily limit reached during backoff. Not retrying.[/bold red]")
                break

            console.print(f"[yellow]  -> Retrying in {current_delay:.2f} seconds (Attempt {attempt + 1}/{MAX_RETRIES})[/yellow]")
            time.sleep(current_delay)

            # Retry the request
            r = requests.get(url, headers=HEADERS, timeout=timeout)
            CALLS_MADE += 1

            if r.status_code != 429:
                console.print("[green]  -> Retry successful. Resuming harvest.[/green]")
                break  # Exit backoff loop

            # Double the delay for the next attempt (Exponential)
            current_delay *= 2
        else:
            # Loop ran out without a break: every retry was answered with 429.
            console.print("[bold red]FATAL: Max retries exceeded. API harvest aborted.[/bold red]")

    # Progress heartbeat: fire whenever a multiple of 50 was reached or crossed
    # during this call, so the extra increments from retries cannot skip it.
    if CALLS_MADE // 50 > calls_at_entry // 50:
        console.print(f"[yellow]{CALLS_MADE} API calls used so far[/yellow]")

    return r


# -------------------------------------------------
# HARVEST ALL US CONCERTS
# -------------------------------------------------
def fetch_us_concerts(max_pages: int = 200) -> pd.DataFrame:
    """Page through the US setlist search and flatten each concert into a row.

    Pages are requested through ``safe_get`` starting at page 1. The harvest
    stops at the first empty page, non-200 response, undecodable body,
    exhausted daily budget, network failure, or once ``max_pages`` pages have
    been fetched. Whatever was collected up to that point is returned, so a
    late failure does not discard the earlier pages.

    Args:
        max_pages: Safety cap on the number of pages requested.

    Returns:
        A DataFrame with one row per setlist: event, artist, tour and venue
        fields plus ``setlist_length``, the number of songs summed over all
        sets of the show. Empty if nothing could be fetched.
    """
    console.print(
        Panel(
            "[cyan]Fetching ALL US concerts from Setlist.fm[/cyan]\n"
            "Rate-limited to 1440 requests/day",
            border_style="cyan",
        )
    )

    all_rows: List[Dict[str, Any]] = []
    page = 1

    while True:
        url = f"{BASE}/search/setlists?countryCode=US&p={page}"

        # safe_get raises once the daily budget is spent, and requests raises
        # on network trouble; in both cases keep the rows gathered so far.
        try:
            r = safe_get(url)
        except RuntimeError as exc:
            console.print(f"Stopping harvest. {exc}", style="red", markup=False)
            break
        except requests.RequestException as exc:
            console.print(f"Stopping harvest. Request failed: {exc}", style="red", markup=False)
            break

        if r.status_code != 200:
            console.print(f"[red]Stopping harvest. Error {r.status_code}: {r.text}[/red]")
            break

        # ValueError is the common base of the JSON decode errors, so this
        # does not depend on requests exposing its own JSONDecodeError class.
        try:
            payload = r.json()
        except ValueError:
            console.print("[red]Stopping harvest. Failed to decode JSON response.[/red]")
            break

        setlists = payload.get("setlist") or []

        console.print(f"[green]Page {page}: {len(setlists)} concerts[/green]")

        # Stop when no more pages
        if not setlists:
            break

        # Extract structured concert metadata. `or {}` also covers keys that
        # are present but null, which `.get(key, {})` would pass through.
        for s in setlists:
            venue = s.get("venue") or {}
            city = venue.get("city") or {}
            coords = city.get("coords") or {}
            country = city.get("country") or {}
            artist = s.get("artist") or {}
            tour = s.get("tour") or {}

            # Setlist length: songs summed over every set of the show.
            # A distinct loop name avoids shadowing the setlist `s`.
            sets = s.get("sets") or {}
            total_songs = sum(
                len(one_set.get("song") or []) for one_set in (sets.get("set") or [])
            )

            all_rows.append({
                "event_id": s.get("id"),
                "event_date": s.get("eventDate"),
                "event_last_updated": s.get("lastUpdated"),

                # FEATURE: Artist info
                "artist_name": artist.get("name"),
                "artist_mbid": artist.get("mbid"),

                # FEATURE: Tour
                "tour_name": tour.get("name"),

                # FEATURE: Setlist length (quantitative demand feature)
                "setlist_length": total_songs,

                # FEATURE: Venue info
                "venue_name": venue.get("name"),
                "venue_id": venue.get("id"),
                "venue_city": city.get("name"),
                "venue_state": city.get("state"),
                "venue_country": country.get("code"),
                "lat": coords.get("lat"),
                "lng": coords.get("long"),
                "url": s.get("url"),
            })

        page += 1

        # Hard stop if the loop runs excessively (e.g., if pagination breaks)
        if page > max_pages:
            console.print("[red]Stopping due to page count limit.[/red]")
            break

    return pd.DataFrame(all_rows)


# -------------------------------------------------
# RUN HARVEST
# -------------------------------------------------
if __name__ == "__main__":
    import os

    # Run the harvest, then persist the frame under a timestamped file name.
    df = fetch_us_concerts()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    path = f"data/processed/setlistfm_us_concerts_{timestamp}.parquet"

    # Create the output folder first: writing into a missing directory fails,
    # and failing at this point would throw away the whole harvest.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    df.to_parquet(path, index=False)

    # Summary panel: row count, API budget used and where the file went.
    console.print(
        Panel(
            f"[bold green]US Concert Harvest Complete[/bold green]\n\n"
            f"Concerts collected: {len(df):,}\n"
            f"API calls used: {CALLS_MADE}/{MAX_CALLS_PER_DAY}\n"
            f"Saved → {path}",
            border_style="green",
        )
    )