In [2]:
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone
from rich.console import Console
from pathlib import Path
import time

console = Console()

PHQ_TOKEN = "T5cY1FP0nY8-VBXC_GG3QGk1oxDPXmcHNAhXTs5Y"

HEADERS = {
    "Authorization": f"Bearer {PHQ_TOKEN}",
    "Accept": "application/json",
}

# According to your use case — these 3 categories matter.
CATEGORIES = "concerts,festivals,sports"

BASE_URL = "https://api.predicthq.com/v1/events/"
LIMIT = 100

# Free-tier allowed window: 90 days past, 90 days future
today = datetime.now(timezone.utc).date()
WINDOW_START = today - timedelta(days=90)
WINDOW_END = today + timedelta(days=90)

OUTPUT_DIR = Path("data/predicthq")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


def do_phq_request(params):
    """Robust GET with rate-limit handling, per PredictHQ docs."""
    backoff = 1
    while True:
        r = requests.get(BASE_URL, headers=HEADERS, params=params)

        # Rate limit (docs specify exponential backoff)
        if r.status_code == 429:
            console.print(f"[yellow]Rate limit hit — sleeping {backoff}s[/yellow]")
            time.sleep(backoff)
            backoff = min(backoff * 2, 30)
            continue

        if r.status_code != 200:
            console.print(f"[red]Error {r.status_code}: {r.text}[/red]")
            return None

        return r.json()


def fetch_phq_events():
    console.print("[bold cyan]Fetching PredictHQ events (concerts, festivals, sports)...[/bold cyan]")

    offset = 0
    all_results = []

    while True:
        params = {
            # Category format per docs
            "category": CATEGORIES,

            # Correct date filters (docs)
            "active.gte": WINDOW_START.isoformat(),
            "active.lte": WINDOW_END.isoformat(),

            # You want U.S. only
            "country": "US",

            # Max allowed per request
            "limit": LIMIT,
            "offset": offset,

            # Correct sort field per PredictHQ docs
            "sort": "start",
        }

        data = do_phq_request(params)
        if not data:
            break

        results = data.get("results", [])
        if not results:
            break

        all_results.extend(results)
        console.print(f"Fetched batch ({offset} → +{len(results)})")

        if len(results) < LIMIT:
            break

        offset += LIMIT

    console.print(f"[green]Total events fetched: {len(all_results)}[/green]")
    return all_results


def flatten_events(events):
    rows = []
    for e in events:
        rows.append({
            "id": e.get("id"),
            "title": e.get("title"),
            "start": e.get("start"),
            "end": e.get("end"),
            "category": e.get("category"),
            "labels": e.get("labels"),
            "rank": e.get("rank"),
            "local_rank": e.get("local_rank"),
            "phq_attendance": e.get("phq_attendance"),
            "location": e.get("location"),
            "duration": e.get("duration"),
            "place_hierarchies": e.get("place_hierarchies"),
            "updated": e.get("updated"),
        })
    return pd.DataFrame(rows)


def collect_all():
    raw = fetch_phq_events()
    df = flatten_events(raw)

    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = OUTPUT_DIR / f"phq_events_{ts}.parquet"
    df.to_parquet(out_path, index=False)

    console.print(f"[bold green]Saved → {out_path}[/bold green]")
    console.print(df.head())

    return df


if __name__ == "__main__":
    collect_all()


In [3]:
import requests
import pandas as pd
from rich.console import Console
from datetime import datetime, timedelta
from pathlib import Path

console = Console()

PHQ_TOKEN = "T5cY1FP0nY8-VBXC_GG3QGk1oxDPXmcHNAhXTs5Y"

headers = {
    "Authorization": f"Bearer {PHQ_TOKEN}",
    "Accept": "application/json"
}

# PredictHQ max limit is 100 per page
LIMIT = 100

CATEGORIES = ["concerts", "festivals", "sports"]

def fetch_phq_category(cat: str) -> list:
    """Fetch all events for a category with full pagination."""
    
    console.print(f"[cyan]Fetching PredictHQ events: {cat}[/cyan]")
    
    url = "https://api.predicthq.com/v1/events/"
    offset = 0
    all_events = []

    # Free plan: 90 days backward + 90 days forward
    today = datetime.utcnow().date()
    start = today - timedelta(days=90)
    end   = today + timedelta(days=90)

    while True:
        params = {
            "category": cat,
            "country": "US",
            "limit": LIMIT,
            "offset": offset,
            "start.gte": start.isoformat(),
            "start.lte": end.isoformat(),
            "sort": "start",
        }

        r = requests.get(url, headers=headers, params=params)
        
        if r.status_code != 200:
            console.print(f"[red]Error {r.status_code}: {r.text}[/red]")
            break

        data = r.json()
        results = data.get("results", [])

        console.print(f"[green]Fetched batch {offset} → +{len(results)}[/green]")

        all_events.extend(results)

        if len(results) < LIMIT:
            # Last page
            break

        offset += LIMIT

    console.print(f"[bold green]Total {cat} events: {len(all_events)}[/bold green]")

    return all_events


def flatten_event(e):
    """Flatten PredictHQ event into a clean row."""
    return {
        "id": e["id"],
        "title": e.get("title"),
        "category": e.get("category"),
        "start": e.get("start"),
        "end": e.get("end"),
        "rank": e.get("rank"),
        "local_rank": e.get("local_rank"),
        "phq_attendance": e.get("phq_attendance"),
        "location": e.get("location", []),
        "labels": e.get("labels"),
        "entities": e.get("entities"),
        "duration": e.get("duration"),
        "updated": e.get("updated"),
    }


def harvest_all():
    console.print("[bold cyan]Starting full PredictHQ harvest...[/bold cyan]")

    all_results = []

    for cat in CATEGORIES:
        results = fetch_phq_category(cat)
        all_results.extend(results)

    console.print(f"[bold green]Grand total events: {len(all_results)}[/bold green]")

    df = pd.DataFrame([flatten_event(e) for e in all_results])

    # Save
    out_dir = Path("data/predicthq")
    out_dir.mkdir(parents=True, exist_ok=True)
    
    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    out_path = out_dir / f"phq_events_{timestamp}.parquet"

    df.to_parquet(out_path, index=False)
    console.print(f"[bold magenta]Saved → {out_path}[/bold magenta]")

    return df


if __name__ == "__main__":
    df = harvest_all()
    console.print(df.head())


  today = datetime.utcnow().date()


  timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


In [4]:
import requests
import pandas as pd
from rich.console import Console
from datetime import datetime, timedelta
import time

console = Console()

PHQ_TOKEN = "T5cY1FP0nY8-VBXC_GG3QGk1oxDPXmcHNAhXTs5Y"

headers = {
    "Authorization": f"Bearer {PHQ_TOKEN}",
    "Accept": "application/json",
}

CATEGORIES = ["concerts", "festivals", "sports"]


def fetch_category(category: str, start: str, end: str, country="US") -> list:
    """
    Fetches ALL pages (up to 100) for a category.
    Returns a list of event dicts.
    """

    url = "https://api.predicthq.com/v1/events/"
    params = {
        "category": category,
        "country": country,
        "limit": 50,
        "start.gte": start,
        "start.lte": end,
        "sort": "start",   # valid
    }

    events = []

    r = requests.get(url, headers=headers, params=params)
    data = r.json()

    # First page
    events.extend(data.get("results", []))
    console.print(f"Fetched page 1 ({len(data.get('results', []))} events)")

    # Follow pagination
    page = 1
    next_url = data.get("next")

    while next_url and page < 100:
        page += 1
        console.print(f"Fetching page {page}...")

        r = requests.get(next_url, headers=headers)
        data = r.json()
        events.extend(data.get("results", []))

        next_url = data.get("next")
        time.sleep(0.2)

    console.print(f"[green]Finished category '{category}' with {len(events)} total events.[/green]")
    return events


# ---------------------------------------------------------
# MAIN HARVEST
# ---------------------------------------------------------

console.print("[bold cyan]Starting full PredictHQ harvest...[/bold cyan]")

today = datetime.utcnow().date()
start_date = str(today)
end_date = str(today + timedelta(days=90))   # trial future visibility

all_records = []

for cat in CATEGORIES:
    console.print(f"\n[bold yellow]Fetching category: {cat}[/bold yellow]")
    records = fetch_category(cat, start_date, end_date)
    all_records.extend(records)


# Convert to DataFrame
df = pd.DataFrame([
    {
        "id": e["id"],
        "title": e.get("title"),
        "start": e.get("start"),
        "end": e.get("end"),
        "category": e.get("category"),
        "labels": e.get("labels"),
        "rank": e.get("rank"),
        "local_rank": e.get("local_rank"),
        "phq_attendance": e.get("phq_attendance"),
        "location": e.get("location"),
        "updated": e.get("updated"),
    }
    for e in all_records
])

console.print(df.head())
console.print(f"[bold green]Total events fetched: {len(df)}[/bold green]")


  today = datetime.utcnow().date()


In [None]:
import requests
import pandas as pd
from rich.console import Console
from datetime import datetime, timedelta
import time

console = Console()

PHQ_TOKEN = "T5cY1FP0nY8-VBXC_GG3QGk1oxDPXmcHNAhXTs5Y"

headers = {
    "Authorization": f"Bearer {PHQ_TOKEN}",
    "Accept": "application/json",
}

CATEGORIES = ["concerts", "festivals", "sports"]


def fetch_all_pages(initial_url: str) -> list:
    """Fetches all pages via PredictHQ's next-link pagination."""
    events = []

    r = requests.get(initial_url, headers=headers)
    data = r.json()

    # First page
    events.extend(data.get("results", []))
    next_url = data.get("next")
    page = 1

    console.print(f"Fetched page {page}: {len(data.get('results', []))} events")

    # Additional pages — capped at 100 by plan limits
    while next_url and page < 100:
        page += 1
        console.print(f"Fetching page {page}...")

        r = requests.get(next_url, headers=headers)
        data = r.json()
        events.extend(data.get("results", []))

        next_url = data.get("next")
        time.sleep(0.2)

    return events


def fetch_category_window(category: str, start_date: str, end_date: str, country="US") -> list:
    """
    Builds an initial request URL and passes it into fetch_all_pages().
    """

    base_url = "https://api.predicthq.com/v1/events/"
    params = {
        "category": category,
        "country": country,
        "start.gte": start_date,
        "start.lte": end_date,
        "limit": 50,
        "sort": "start"  # this IS allowed
    }

    r = requests.get(base_url, headers=headers, params=params)
    data = r.json()

    # If no results or error, bail early
    if "results" not in data:
        console.print(f"[red]Error for category {category}: {data}[/red]")
        return []

    # Build the initial URL including encoded params
    initial_url = r.url

    console.print(f"[blue]Fetching {category} from {start_date} → {end_date}[/blue]")
    return fetch_all_pages(initial_url)


# ---------------------------------------------------
# MAIN HARVEST: 90 days backward + 90 days forward
# ---------------------------------------------------

console.print("[bold cyan]Starting PredictHQ (backward + forward) harvest...[/bold cyan]")

today = datetime.utcnow().date()

# trial-limited windows
back_start = str(today - timedelta(days=90))
back_end = str(today)

forward_start = str(today)
forward_end = str(today + timedelta(days=90))

console.print(f"[yellow]Backward window:[/yellow] {back_start} → {back_end}")
console.print(f"[yellow]Forward window:[/yellow]  {forward_start} → {forward_end}")

all_records = []

for cat in CATEGORIES:
    # Backward
    back_records = fetch_category_window(cat, back_start, back_end)
    console.print(f"[green]{cat} backward events: {len(back_records)}[/green]")

    # Forward
    forward_records = fetch_category_window(cat, forward_start, forward_end)
    console.print(f"[green]{cat} forward events: {len(forward_records)}[/green]")

    all_records.extend(back_records)
    all_records.extend(forward_records)

# ---------------------------------------------------
# Convert to DataFrame
# ---------------------------------------------------

df = pd.DataFrame([
    {
        "id": e["id"],
        "title": e.get("title"),
        "start": e.get("start"),
        "end": e.get("end"),
        "category": e.get("category"),
        "labels": e.get("labels"),
        "rank": e.get("rank"),
        "local_rank": e.get("local_rank"),
        "phq_attendance": e.get("phq_attendance"),
        "location": e.get("location"),
        "updated": e.get("updated"),
    }
    for e in all_records
])

console.print(df.head())
console.print(f"[bold green]TOTAL events fetched: {len(df):,}[/bold green]")

# Save it
ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
out_path = f"data/predicthq/phq_events_full_{ts}.parquet"
df.to_parquet(out_path, index=False)

console.print(f"[bold cyan]Saved → {out_path}[/bold cyan]")


In [None]:
import os
from pathlib import Path

list(Path("data/predicthq").glob("*.parquet"))


In [1]:
# scripts/phq_harvester.py

import requests
import pandas as pd
from rich.console import Console
from rich.status import Status
from rich.progress import Progress, SpinnerColumn, TextColumn
from datetime import datetime, timedelta
import time

console = Console()

PHQ_TOKEN = "T5cY1FP0nY8-VBXC_GG3QGk1oxDPXmcHNAhXTs5Y"

headers = {
    "Authorization": f"Bearer {PHQ_TOKEN}",
    "Accept": "application/json",
}

CATEGORIES = ["concerts", "festivals", "sports"]


# ---------------------------------------------------
# INTERNAL: SAFE REQUEST FUNCTION WITH RETRIES
# ---------------------------------------------------
def safe_get(url, params=None, retries=3):
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers=headers, params=params, timeout=10)
            if r.status_code == 429:
                time.sleep(1.0)
                continue
            r.raise_for_status()
            return r.json(), r.url
        except Exception as e:
            if attempt == retries:
                console.print(f"[red]FAILED after retries → {url}[/red]")
                return None, None
            time.sleep(0.5)
    return None, None


# ---------------------------------------------------
# PAGINATION CLEAN VERSION
# ---------------------------------------------------
def fetch_all_pages(initial_url: str) -> list:
    events = []
    page = 0

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:

        task = progress.add_task("Fetching pages...", total=None)

        next_url = initial_url

        while next_url and page < 100:
            page += 1
            data, real_url = safe_get(next_url)

            if not data or "results" not in data:
                break

            batch = data.get("results", [])
            events.extend(batch)

            # throttle safely
            time.sleep(0.15)

            next_url = data.get("next")

            # Light progress update (not spam)
            if page % 3 == 0:
                progress.update(task, description=f"Fetched {len(events)} events...")

        progress.update(task, description=f"Done → {len(events)} events")

    return events


# ---------------------------------------------------
# CATEGORY WINDOW WRAPPER
# ---------------------------------------------------
def fetch_category_window(category: str, start_date: str, end_date: str, country="US") -> list:
    base_url = "https://api.predicthq.com/v1/events/"
    params = {
        "category": category,
        "country": country,
        "start.gte": start_date,
        "start.lte": end_date,
        "limit": 50,
        "sort": "start",
    }

    data, url_used = safe_get(base_url, params=params)

    if not data or "results" not in data:
        console.print(f"[red]Error fetching initial page for {category}[/red]")
        return []

    console.print(f"[cyan]→ {category}: {start_date} → {end_date}[/cyan]")
    return fetch_all_pages(url_used)


# ---------------------------------------------------
# MAIN HARVEST LOGIC
# ---------------------------------------------------
def run_harvest():
    console.print("[bold magenta]Starting PredictHQ Harvest (clean mode)...[/bold magenta]")

    today = datetime.utcnow().date()

    back_start = str(today - timedelta(days=90))
    back_end = str(today)

    forward_start = str(today)
    forward_end = str(today + timedelta(days=90))

    console.print(f"[yellow]Backward:[/yellow] {back_start} → {back_end}")
    console.print(f"[yellow]Forward:[/yellow]  {forward_start} → {forward_end}")

    all_records = []

    for cat in CATEGORIES:
        # backward
        with Status(f"[green]Fetching {cat} backward...[/green]"):
            b = fetch_category_window(cat, back_start, back_end)
        console.print(f"[green]{cat} backward: {len(b)} events[/green]")

        # forward
        with Status(f"[green]Fetching {cat} forward...[/green]"):
            f = fetch_category_window(cat, forward_start, forward_end)
        console.print(f"[green]{cat} forward: {len(f)} events[/green]")

        all_records.extend(b)
        all_records.extend(f)

    # Convert to DataFrame
    df = pd.DataFrame([
        {
            "id": e.get("id"),
            "title": e.get("title"),
            "start": e.get("start"),
            "end": e.get("end"),
            "category": e.get("category"),
            "labels": e.get("labels"),
            "rank": e.get("rank"),
            "local_rank": e.get("local_rank"),
            "phq_attendance": e.get("phq_attendance"),
            "location": e.get("location"),
            "updated": e.get("updated"),
        }
        for e in all_records
    ])

    console.print(df.head())
    console.print(f"[bold green]TOTAL events fetched: {len(df):,}[/bold green]")

    ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    out_path = f"data/predicthq/phq_events_full_{ts}.parquet"

    df.to_parquet(out_path, index=False)

    console.print(f"[bold cyan]Saved → {out_path}[/bold cyan]")
    return df


if __name__ == "__main__":
    run_harvest()


  today = datetime.utcnow().date()


Output()

Output()

Output()

Output()

Output()

Output()

  ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
