In [None]:
from pathlib import Path

# Single source of truth for dashboard outputs: save_csv() writes every file under this directory.
OUTPUT_DIR = Path("docs/data/derived")
# Create the directory (and any missing parents) when this cell runs; an existing directory is fine.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def save_csv(df, name):
    """Write `df` as CSV (index omitted) to OUTPUT_DIR / name and print the destination."""
    destination = OUTPUT_DIR / name
    df.to_csv(destination, index=False)
    print(f"✔ saved: {destination}")

# …then everywhere you save:
# save_csv(provider_scores_df, "provider_scores_latest.csv")
# save_csv(roi_df,              "roi_comparison.csv")


In [1]:
#scrape provider
# --- Async helper that works in notebooks and GitHub Actions ---
import asyncio

def await_safe(coro):
    """
    Run an async coroutine to completion from synchronous code and return its result.

    - If an event loop is already running in this thread (Jupyter/nbconvert),
      patch it with nest_asyncio and drive the coroutine with run_until_complete.
    - Otherwise (plain script, GitHub Actions), use asyncio.run.

    Exceptions raised by the coroutine itself propagate unchanged.

    Raises:
        RuntimeError: a loop is already running but nest_asyncio is not installed.
    """
    # Only the loop lookup is guarded: a RuntimeError raised *by the coroutine*
    # must not be mistaken for "no running loop" and trigger a second run of an
    # already-awaited coroutine.
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None

    if loop is None:
        return asyncio.run(coro)

    try:
        import nest_asyncio
    except ImportError as exc:
        raise RuntimeError(
            "An event loop is already running; install nest_asyncio to run "
            "coroutines synchronously from a notebook."
        ) from exc
    nest_asyncio.apply()
    return loop.run_until_complete(coro)


In [5]:
# Lambda Labs 

import re, requests, pandas as pd
from typing import Optional
from bs4 import BeautifulSoup
from datetime import datetime, timezone

# Request headers sent with every requests.get() call in this cell.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Canonical ("slim") column set and order; _ensure_slim() adds any missing
# column and returns frames with exactly these columns.
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]

def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` restricted to SLIM_COLS, in that order.

    Missing slim columns are added filled with None, prices are coerced to
    numbers (unparseable -> NaN) and timestamps to tz-naive UTC datetimes
    (unparseable -> NaT). The input frame is not modified.
    """
    slim = df.copy()
    absent = [col for col in SLIM_COLS if col not in slim.columns]
    for col in absent:
        slim[col] = None
    prices = pd.to_numeric(slim["price_hourly_usd"], errors="coerce")
    stamps = pd.to_datetime(slim["fetched_at_utc"], errors="coerce", utc=True)
    slim["price_hourly_usd"] = prices
    slim["fetched_at_utc"] = stamps.dt.tz_convert(None)
    return slim[SLIM_COLS]

def _norm_gpu(s: str) -> str:
    s = re.sub(r"\bon[-\s]?demand\b", "", s, flags=re.I)
    s = s.replace("NVIDIA", "").strip()
    s = re.sub(r"\s+", " ", s)
    s = s.upper().replace("GH200", "H200")  # treat GH200 as H200
    return s.strip()

def _gpu_count(s: str) -> Optional[int]:
    if not isinstance(s, str): 
        return None
    m = re.search(r"(\d+)x", s, flags=re.I)
    return int(m.group(1)) if m else None

def _price_in(text: str) -> Optional[float]:
    if not isinstance(text, str): 
        return None
    m = re.search(r"\$\s*([0-9]+(?:\.[0-9]+)?)", text.replace(",", ""))
    return float(m.group(1)) if m else None

def _infer_region(table) -> str:
    hdr = table.find_previous(["h2","h3","h4","p"])
    if hdr:
        t = hdr.get_text(" ", strip=True).lower()
        if "europe" in t or "eu" in t: return "EU"
        if "united states" in t or "us" in t or "usa" in t: return "US"
    return "US"

def scrape_lambda_labs(region: Optional[str] = None) -> pd.DataFrame:
    """
    Scrapes https://cloud.lambdalabs.com/pricing and returns SLIM rows
    for H100/H200 (On-Demand, 1h). If `region` provided, overrides detected region.

    Every <tr> of every <table> is inspected; a row is kept when its cell text
    mentions H100, H200 or GH200 and contains a "$" price. All rows of one call
    share a single `fetched_at_utc` value.

    Returns:
        DataFrame with exactly SLIM_COLS (empty if nothing matched).

    Raises:
        requests.HTTPError: the pricing page answered with an error status.
    """
    url = "https://cloud.lambdalabs.com/pricing"
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    # One timestamp per fetch: stamping each row separately can straddle a
    # second boundary and split one scrape across several timestamps.
    fetched_at = _now_iso()
    soup = BeautifulSoup(r.text, "html.parser")

    wanted_gpu = re.compile(r"\b(?:H100|H200|GH200)\b", re.I)
    rows_out = []
    for table in soup.find_all("table"):
        tbl_region = region or _infer_region(table)
        for tr in table.find_all("tr"):
            tds = [td.get_text(" ", strip=True) for td in tr.find_all("td")]
            if not tds:
                continue  # header rows (<th> only) and empty rows
            row_text = " | ".join(tds)
            if not wanted_gpu.search(row_text):
                continue

            price = _price_in(row_text)
            if price is None:
                continue

            # First cell naming the GPU ("GH200" contains "H200", so two tests suffice).
            gpu_cell = next((c for c in tds if "H100" in c.upper() or "H200" in c.upper()), None)
            gpu_model = _norm_gpu(gpu_cell or ("H100" if "H100" in row_text.upper() else "H200"))

            rows_out.append({
                "provider": "Lambda Labs",
                "region": tbl_region,
                "gpu_model": gpu_model,
                "type": "On-Demand",
                "duration": "1h",
                "gpu_count": _gpu_count(gpu_model),
                "price_hourly_usd": price,
                "source_url": url,
                "fetched_at_utc": fetched_at,
            })

    df = pd.DataFrame(rows_out)
    if df.empty:
        return _ensure_slim(df)
    # After normalization only H100/H200 labels should remain; drop anything else.
    keep = df["gpu_model"].str.contains(r"\bH100\b|\bH200\b", regex=True, na=False)
    df = df[keep].reset_index(drop=True)
    return _ensure_slim(df)

# Example: scrape the live pricing page (network call), forcing region "US"
# instead of inferring it per table, then preview the first rows.
df_lambda = scrape_lambda_labs(region="US")
display(df_lambda.head())



Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,price_hourly_usd,source_url,fetched_at_utc
0,Lambda Labs,US,8X H100 SXM,On-Demand,1h,8,2.99,https://cloud.lambdalabs.com/pricing,2025-09-04 11:15:09
1,Lambda Labs,US,4X H100 SXM,On-Demand,1h,4,3.09,https://cloud.lambdalabs.com/pricing,2025-09-04 11:15:09
2,Lambda Labs,US,2X H100 SXM,On-Demand,1h,2,3.19,https://cloud.lambdalabs.com/pricing,2025-09-04 11:15:09
3,Lambda Labs,US,1X H200,On-Demand,1h,1,1.49,https://cloud.lambdalabs.com/pricing,2025-09-04 11:15:09
4,Lambda Labs,US,1X H100 SXM,On-Demand,1h,1,3.29,https://cloud.lambdalabs.com/pricing,2025-09-04 11:15:09


In [7]:
# ===== Lambda Labs (static) + RunPod (async) with per-provider history =====
# Slim schema: provider, region, gpu_model, type, duration, gpu_count,
#              price_hourly_usd, source_url, fetched_at_utc
# Py 3.8 compatible

import re, os, asyncio, pandas as pd, tempfile
from typing import Optional, Dict, Any
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from pathlib import Path
import requests

# Request headers sent with the requests.get() call in scrape_lambda_labs().
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Canonical ("slim") column set and order enforced by _ensure_slim().
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]

# -------- storage (per-provider history/snapshots) --------
BASE = Path("docs/data")
HIST_DIR = BASE / "history"
SNAP_DIR = BASE / "snapshots"
LATEST_DIR = BASE / "latest"
for d in (HIST_DIR, SNAP_DIR, LATEST_DIR):
    try:
        d.mkdir(parents=True, exist_ok=True)
    except Exception:
        # fallback to tmp if workspace is read-only
        tmp = Path(tempfile.gettempdir()) / "gpu_data"
        d = tmp / d.name
        d.mkdir(parents=True, exist_ok=True)

def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """
    Project `df` onto the slim schema without modifying the input.

    Adds any missing SLIM_COLS column (filled with None), coerces
    `price_hourly_usd` to numeric and `fetched_at_utc` to tz-naive UTC
    datetimes (bad values become NaN/NaT), and returns the columns in
    SLIM_COLS order.
    """
    result = df.copy()
    for name in SLIM_COLS:
        if name in result.columns:
            continue
        result[name] = None
    result["price_hourly_usd"] = pd.to_numeric(result["price_hourly_usd"], errors="coerce")
    aware = pd.to_datetime(result["fetched_at_utc"], errors="coerce", utc=True)
    result["fetched_at_utc"] = aware.dt.tz_convert(None)
    return result[SLIM_COLS]

def _save_provider(df: pd.DataFrame, provider_slug: str):
    """
    Persist one provider's scrape as snapshot, history and latest CSV files.

    - snapshot: this run's rows, written to SNAP_DIR/<UTC timestamp>_<slug>.csv
    - history:  existing history plus this run, de-duplicated and sorted by time
    - latest:   newest history row per (gpu_model, type, region, duration)

    Each file falls back to the system temp directory (same file name) when the
    primary location cannot be written. History is read from the primary
    location and from that temp fallback, so rows saved by a run that had to
    fall back are not lost on the next run.

    Args:
        df: scraped rows; normalized with _ensure_slim().
        provider_slug: file-name stem, e.g. "lambda_labs".

    Returns:
        The "latest" DataFrame.
    """
    tmp_dir = Path(tempfile.gettempdir())

    def _write(frame: pd.DataFrame, path: Path) -> Path:
        """Write `frame` to `path`, or to tmp_dir/<name> if that fails; return the path used."""
        try:
            frame.to_csv(path, index=False)
            return path
        except Exception:
            alt = tmp_dir / path.name
            frame.to_csv(alt, index=False)
            return alt

    df = _ensure_slim(df)
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # snapshot
    snap_path = _write(df, SNAP_DIR / f"{ts}_{provider_slug}.csv")

    # history (append + dedupe)
    hist_path = HIST_DIR / f"{provider_slug}_history.csv"
    frames = []
    # dict.fromkeys de-duplicates the candidates should both resolve to the same path.
    for src in dict.fromkeys((hist_path, tmp_dir / hist_path.name)):
        if not src.exists():
            continue
        old = pd.read_csv(src, low_memory=False)
        old["fetched_at_utc"] = pd.to_datetime(old["fetched_at_utc"], errors="coerce", utc=True).dt.tz_convert(None)
        frames.append(old)
    all_df = pd.concat(frames + [df], ignore_index=True) if frames else df.copy()
    all_df = (all_df
              .dropna(subset=["gpu_model","price_hourly_usd"])
              .drop_duplicates(subset=["provider","region","gpu_model","type","duration",
                                       "fetched_at_utc","price_hourly_usd"], keep="last")
              .sort_values("fetched_at_utc"))
    hist_path = _write(all_df, hist_path)

    # latest (newest rows only per gpu/type/region/duration)
    key = ["gpu_model","type","region","duration"]
    latest = all_df.sort_values("fetched_at_utc").drop_duplicates(subset=key, keep="last")
    latest_path = _write(latest, LATEST_DIR / f"{provider_slug}_latest.csv")

    print(f"[{provider_slug}] snapshot -> {snap_path}\n[{provider_slug}] history  -> {hist_path}\n[{provider_slug}] latest   -> {latest_path}")
    return latest

# ------------------------- Lambda Labs (static) -------------------------
def _norm_gpu_lambda(s: str) -> str:
    s = re.sub(r"\bon[-\s]?demand\b", "", s, flags=re.I)
    s = s.replace("NVIDIA", "").strip()
    s = re.sub(r"\s+", " ", s)
    s = s.upper().replace("GH200", "H200")
    return s.strip()

def _gpu_count(text: str) -> Optional[int]:
    if not isinstance(text, str): return None
    m = re.search(r"(\d+)\s*x", text, flags=re.I)
    return int(m.group(1)) if m else None

def _price_dollar(text: str) -> Optional[float]:
    if not isinstance(text, str): return None
    m = re.search(r"\$\s*([0-9]+(?:\.[0-9]+)?)", text.replace(",", ""))
    return float(m.group(1)) if m else None

def _infer_region_lambda(table) -> str:
    hdr = table.find_previous(["h2","h3","h4","p"])
    if hdr:
        t = hdr.get_text(" ", strip=True).lower()
        if "europe" in t or "eu" in t: return "EU"
        if "united states" in t or "us" in t or "usa" in t: return "US"
    return "US"

def scrape_lambda_labs(region: Optional[str] = None) -> pd.DataFrame:
    """
    Scrape the Lambda Labs pricing page for H100/H200 (incl. GH200) table rows.

    Args:
        region: when given, used for every row instead of the region inferred
            from the heading that precedes each table.

    Returns:
        DataFrame with exactly SLIM_COLS; all rows are tagged On-Demand / 1h and
        share one `fetched_at_utc` value. Empty if nothing matched.

    Raises:
        requests.HTTPError: the pricing page answered with an error status.
    """
    url = "https://cloud.lambdalabs.com/pricing"
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    # Stamp the scrape once so every row of this run carries the same timestamp.
    fetched_at = _now_iso()
    soup = BeautifulSoup(r.text, "html.parser")

    wanted_gpu = re.compile(r"\b(?:H100|H200|GH200)\b", re.I)
    out = []
    for table in soup.find_all("table"):
        tbl_region = region or _infer_region_lambda(table)
        for tr in table.find_all("tr"):
            tds = [td.get_text(" ", strip=True) for td in tr.find_all("td")]
            if not tds:
                continue  # rows without <td> cells
            row_text = " | ".join(tds)
            if not wanted_gpu.search(row_text):
                continue
            price = _price_dollar(row_text)
            if price is None:
                continue
            # "GH200" contains "H200", so checking H100/H200 covers all three names.
            gpu_cell = next((c for c in tds if "H100" in c.upper() or "H200" in c.upper()), None)
            model = _norm_gpu_lambda(gpu_cell or ("H100" if "H100" in row_text.upper() else "H200"))
            out.append({
                "provider": "Lambda Labs",
                "region": tbl_region,
                "gpu_model": model,
                "type": "On-Demand",
                "duration": "1h",
                "gpu_count": _gpu_count(model),
                "price_hourly_usd": price,
                "source_url": url,
                "fetched_at_utc": fetched_at,
            })
    df = pd.DataFrame(out)
    if df.empty:
        return _ensure_slim(df)
    keep = df["gpu_model"].str.contains(r"\bH100\b|\bH200\b", regex=True, na=False)
    return _ensure_slim(df[keep].reset_index(drop=True))

# --------------------------- RunPod (async) ---------------------------
def _extract_gpu_model_runpod(text: str) -> Optional[str]:
    if not isinstance(text, str): return None
    text_up = re.sub(r"\s+", " ", text.upper())
    m = re.search(r"(H(?:100|200)(?:\s*(?:SXM|PCIE|NVL))?(?:\s*\d{2,3}\s*GB)?)", text_up)
    return m.group(1).strip() if m else None

def _price_hourly_runpod(text: str) -> Optional[float]:
    if not isinstance(text, str): return None
    t = text.replace(",", "")
    m = re.search(r"\$\s*([0-9]+(?:\.[0-9]+)?)\s*(?:/|\s*(?:per|an)\s*)?(?:h|hr|hour)\b", t, flags=re.I)
    return float(m.group(1)) if m else None

async def scrape_runpod_async() -> pd.DataFrame:
    """
    Render the RunPod pricing page in headless Chromium and extract H100/H200 prices.

    Candidate elements are those whose class mentions price/pricing/card/grid/table
    (or, if none exist, all common container/text tags). An element yields a row
    when its text names H100 or H200, contains an hourly "$" price and a
    recognizable model label. Prices outside (0, 200) are discarded.

    Returns:
        DataFrame with exactly SLIM_COLS; rows are tagged region "Global",
        On-Demand / 1h, and share one `fetched_at_utc` value.
    """
    from playwright.async_api import async_playwright
    url = "https://www.runpod.io/pricing"; region = "Global"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            # Scroll to the bottom and pause briefly so lazily rendered content can load.
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(1200)
            html = await page.content()
        finally:
            # Close the browser even when navigation or rendering fails.
            await browser.close()
    # Stamp the scrape once so every row of this run carries the same timestamp.
    fetched_at = _now_iso()

    soup = BeautifulSoup(html, "html.parser")
    nodes = soup.find_all(["section","div","article","li","tr"], class_=re.compile(r"(price|pricing|card|grid|table)", re.I))
    if not nodes:
        nodes = soup.find_all(["section","div","article","li","tr","p","span"])

    out = []
    for n in nodes:
        text = n.get_text(" ", strip=True)
        if "H100" not in text and "H200" not in text:
            continue
        price = _price_hourly_runpod(text)
        if price is None:
            continue
        model = _extract_gpu_model_runpod(text)
        if model is None:
            continue
        out.append({
            "provider": "RunPod",
            "region": region,
            "gpu_model": model,
            "type": "On-Demand",
            "duration": "1h",
            "gpu_count": _gpu_count(text),
            "price_hourly_usd": price,
            "source_url": url,
            "fetched_at_utc": fetched_at,
        })
    df = pd.DataFrame(out)
    if df.empty:
        return _ensure_slim(df)
    df = df[df["gpu_model"].str.contains(r"\bH100\b|\bH200\b", na=False)]
    # sanity: plausible $/hr range
    df = df[(df["price_hourly_usd"] > 0) & (df["price_hourly_usd"] < 200)].reset_index(drop=True)
    return _ensure_slim(df)

# --------------- Runner that works in scripts & notebooks ---------------
def arun(coro):
    """
    Run `coro` to completion and return its result.

    With no event loop running in this thread, delegates to asyncio.run.
    Inside a running loop (notebook), applies nest_asyncio and drives the
    coroutine with the loop's run_until_complete.
    """
    try:
        active_loop = asyncio.get_running_loop()
    except RuntimeError:
        return asyncio.run(coro)

    import nest_asyncio
    nest_asyncio.apply()
    return active_loop.run_until_complete(coro)

# ------------------------------ RUN --------------------------------
# Lambda Labs: scrape with the region forced to "US", then write the
# snapshot/history/latest CSVs under the "lambda_labs" slug.
df_lambda = scrape_lambda_labs(region="US")
_save_provider(df_lambda, "lambda_labs")

# RunPod: the scraper is a coroutine, so it is driven through arun(), which
# handles both a plain script and an already-running notebook loop.
df_runpod = arun(scrape_runpod_async())
_save_provider(df_runpod, "runpod")


[lambda_labs] snapshot -> /var/folders/2_/9wdv7zh56p95l_j0dkkc12zw0000gn/T/20250904_112436_lambda_labs.csv
[lambda_labs] history  -> /var/folders/2_/9wdv7zh56p95l_j0dkkc12zw0000gn/T/lambda_labs_history.csv
[lambda_labs] latest   -> /var/folders/2_/9wdv7zh56p95l_j0dkkc12zw0000gn/T/lambda_labs_latest.csv
[runpod] snapshot -> /var/folders/2_/9wdv7zh56p95l_j0dkkc12zw0000gn/T/20250904_112441_runpod.csv
[runpod] history  -> /var/folders/2_/9wdv7zh56p95l_j0dkkc12zw0000gn/T/runpod_history.csv
[runpod] latest   -> /var/folders/2_/9wdv7zh56p95l_j0dkkc12zw0000gn/T/runpod_latest.csv


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,price_hourly_usd,source_url,fetched_at_utc
4,RunPod,Global,H200 141 GB,On-Demand,1h,,3.59,https://www.runpod.io/pricing,2025-09-04 11:24:41
6,RunPod,Global,H100 PCIE 80 GB,On-Demand,1h,,1.99,https://www.runpod.io/pricing,2025-09-04 11:24:41


In [5]:
# --- Nebius H100/H200 scraper (uses YOUR parsing + per-provider history) ---

import re, time, tempfile, requests, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
import pytz

# ---------- your original config ----------
# Page that is fetched (requests first, Playwright as fallback) and parsed for prices.
url = "https://nebius.com/prices"
# Browser-like User-Agent header sent with the requests.get() call.
UA = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36"}
# Timezone used for the snapshot file-name timestamp and for fetched_at_utc.
TZ = pytz.utc
# parse_nebius_from_html() ignores any price outside this inclusive range.
MIN_PRICE, MAX_PRICE = 0.3, 20.0   # sanity for $/GPU/hr

# ---------- storage dirs (snapshot/history/latest) ----------
# Canonical ("slim") column set and order enforced by _ensure_slim().
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]
BASE = Path("docs/data")
HIST_DIR = BASE / "history"      # <slug>_history.csv files
SNAP_DIR = BASE / "snapshots"    # <timestamp>_<slug>.csv files
LATEST_DIR = BASE / "latest"     # <slug>_latest.csv files
# Best-effort creation: a failure here is ignored on purpose because
# _safe_to_csv() retries the mkdir and falls back to the temp directory.
for d in (HIST_DIR, SNAP_DIR, LATEST_DIR):
    try:
        d.mkdir(parents=True, exist_ok=True)
    except Exception:
        pass  # we'll fall back to tmp if write fails later

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a slim-schema copy of `df`: exactly SLIM_COLS, in order.

    Columns missing from `df` are created with None; `price_hourly_usd` is
    coerced to numeric and `fetched_at_utc` to tz-naive UTC datetimes, with
    unparseable values becoming NaN/NaT.
    """
    frame = df.copy()
    to_add = [c for c in SLIM_COLS if c not in frame.columns]
    for c in to_add:
        frame[c] = None
    numeric_price = pd.to_numeric(frame["price_hourly_usd"], errors="coerce")
    utc_stamp = pd.to_datetime(frame["fetched_at_utc"], errors="coerce", utc=True)
    frame["price_hourly_usd"] = numeric_price
    frame["fetched_at_utc"] = utc_stamp.dt.tz_convert(None)
    return frame[SLIM_COLS]

def _safe_to_csv(df: pd.DataFrame, path: Path):
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(path, index=False)
        return path
    except Exception:
        tmp = Path(tempfile.gettempdir()) / path.name
        df.to_csv(tmp, index=False)
        return tmp

def _save_provider(df: pd.DataFrame, provider_slug: str):
    """
    Persist one provider's scrape as snapshot, history and latest CSV files.

    - snapshot: this run's rows -> SNAP_DIR/<timestamp>_<slug>.csv
    - history:  existing history plus this run, de-duplicated, sorted by time
    - latest:   newest history row per (gpu_model, type, region, duration)

    Files are written with _safe_to_csv(), which may redirect them to the system
    temp directory. History is therefore read from HIST_DIR and from that temp
    fallback, so rows saved by a run that had to fall back are not lost.

    Returns:
        The "latest" DataFrame.
    """
    df = _ensure_slim(df)
    ts = datetime.now(TZ).strftime("%Y%m%d_%H%M%S")

    # snapshot
    snap_path = _safe_to_csv(df, SNAP_DIR / f"{ts}_{provider_slug}.csv")

    # history (append + dedupe)
    hist_path = HIST_DIR / f"{provider_slug}_history.csv"
    fallback_hist = Path(tempfile.gettempdir()) / hist_path.name
    frames = []
    # dict.fromkeys de-duplicates the candidates should both resolve to the same path.
    for src in dict.fromkeys((hist_path, fallback_hist)):
        if not src.exists():
            continue
        old = pd.read_csv(src, low_memory=False)
        old["fetched_at_utc"] = pd.to_datetime(old["fetched_at_utc"], errors="coerce", utc=True).dt.tz_convert(None)
        frames.append(old)
    all_df = pd.concat(frames + [df], ignore_index=True) if frames else df.copy()
    all_df = (all_df
              .dropna(subset=["gpu_model","price_hourly_usd"])
              .drop_duplicates(subset=["provider","region","gpu_model","type","duration",
                                       "fetched_at_utc","price_hourly_usd"], keep="last")
              .sort_values("fetched_at_utc"))
    hist_path = _safe_to_csv(all_df, hist_path)

    # latest (newest per gpu/type/region/duration)
    key = ["gpu_model","type","region","duration"]
    latest = all_df.sort_values("fetched_at_utc").drop_duplicates(subset=key, keep="last")
    latest_path = _safe_to_csv(latest, LATEST_DIR / f"{provider_slug}_latest.csv")

    print(f"[{provider_slug}] snapshot -> {snap_path}\n[{provider_slug}] history  -> {hist_path}\n[{provider_slug}] latest   -> {latest_path}")
    return latest

# ---------- your original parsing (unchanged) ----------
def find_price_strict(text: str):
    """
    Return the first hourly dollar price in `text` as a float, or None.

    Recognizes "$X/hr", "$X per hour", "$X/hour" and the bare "h" unit,
    case-insensitively. The number must follow the "$" directly. Empty or
    missing text yields None.
    """
    if text:
        hit = re.search(r"\$([0-9]+(?:\.[0-9]+)?)\s*(?:/|\s*per\s*)?\s*(?:h|hr|hour)\b", text, flags=re.I)
        if hit:
            return float(hit.group(1))
    return None

def parse_nebius_from_html(html: str) -> dict:
    """
    Return {'H100': price, 'H200': price} for the models found in `html`.

    Candidate blocks are, in order of preference: all <table> elements; else
    section/div elements whose class mentions price/pricing/compute; else all
    div/tr/li/p/span elements. The first valid price found for a model wins;
    prices outside [MIN_PRICE, MAX_PRICE] are ignored.

    A block that names only one model contributes the first hourly price in its
    text. A block that names both models is ambiguous, so each model only gets
    the price found in the text between its mention and the next model mention;
    if none is found there, the model is left for a later block.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Focus on plausible pricing containers first (tables / pricing sections)
    blocks = list(soup.find_all("table"))
    if not blocks:
        blocks = list(soup.find_all(["section","div"], class_=re.compile("price|pricing|compute", re.I)))
    if not blocks:
        blocks = soup.find_all(["div","tr","li","p","span"])

    def _in_range(price) -> bool:
        """True when `price` is a number inside the configured sanity range."""
        return price is not None and MIN_PRICE <= price <= MAX_PRICE

    results = {}
    for blk in blocks:
        t = blk.get_text(" ", strip=True)
        if not t:
            continue

        mentions = list(re.finditer(r"\b(H100|H200)\b", t, flags=re.I))
        models = {m.group(1).upper() for m in mentions}
        if not models:
            continue

        if len(models) == 1:
            price = find_price_strict(t)
            if _in_range(price):
                results.setdefault(next(iter(models)), price)
        else:
            # Both models in one block: taking the block's first price for both
            # would report the same number twice, so pair each mention with the
            # price that follows it.
            for i, m in enumerate(mentions):
                stop = mentions[i + 1].start() if i + 1 < len(mentions) else len(t)
                price = find_price_strict(t[m.end():stop])
                if _in_range(price):
                    results.setdefault(m.group(1).upper(), price)

        if len(results) == 2:
            break

    return results

# ---------- run (your flow) ----------
# Step 1: plain HTTP fetch. `html` stays None on any exception, on a non-200
# status or on an empty body; none of these cases is reported.
html = None
try:
    r = requests.get(url, headers=UA, timeout=30)
    if r.status_code == 200 and r.text:
        html = r.text
except Exception:
    html = None

# Step 2: parse the static HTML if we got any.
results = {}
if html:
    results = parse_nebius_from_html(html)

# Optional Playwright fallback if nothing found
# (renders the page in headless Chromium and parses the rendered DOM instead).
# NOTE(review): every failure in this fallback (Playwright not installed,
# timeout, ...) is swallowed silently, leaving `results` empty.
if not results:
    try:
        from playwright.sync_api import sync_playwright
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(url, wait_until="networkidle", timeout=60000)
            page.wait_for_timeout(2000)  # allow dynamic content
            html_pw = page.content()
            browser.close()
        results = parse_nebius_from_html(html_pw)
    except Exception:
        pass  # proceed with whatever we have

# ---------- map YOUR results -> SLIM schema & save ----------
# Build one slim-schema row per model in `results` ({model: price}).
rows = []
# Single timestamp shared by every row of this run (ISO-8601, in TZ).
ts = datetime.now(TZ).isoformat()
for gpu, price in results.items():
    rows.append({
        "provider": "Nebius",
        "region": "Global",            # keep simple; refine if you later detect regions
        "gpu_model": gpu,              # map gpu_type -> gpu_model
        "type": "On-Demand",
        "duration": "1h",
        "gpu_count": None,
        "price_hourly_usd": price,     # map on_demand_price -> price_hourly_usd
        "source_url": url,
        "fetched_at_utc": ts,          # map scraped_at -> fetched_at_utc
    })

# columns=SLIM_COLS keeps the schema even when `rows` is empty.
df_nebius = pd.DataFrame(rows, columns=SLIM_COLS)
# Writes the snapshot/history/latest CSVs and returns the "latest" frame.
latest = _save_provider(df_nebius, "nebius")
print(f"Nebius rows this run: {len(df_nebius)}")
display(df_nebius.head())


[nebius] snapshot -> docs/data/snapshots/20250904_114258_nebius.csv
[nebius] history  -> docs/data/history/nebius_history.csv
[nebius] latest   -> docs/data/latest/nebius_latest.csv
Nebius rows this run: 2


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,price_hourly_usd,source_url,fetched_at_utc
0,Nebius,Global,H200,On-Demand,1h,,2.3,https://nebius.com/prices,2025-09-04T11:42:58.443649+00:00
1,Nebius,Global,H100,On-Demand,1h,,2.0,https://nebius.com/prices,2025-09-04T11:42:58.443649+00:00


In [6]:
# ================= VoltagePark (async) — slim schema + per-provider history =================
# Slim schema: provider, region, gpu_model, type, duration, gpu_count,
#              price_hourly_usd, source_url, fetched_at_utc
# Py 3.8 compatible

import re, asyncio, tempfile, pandas as pd
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from playwright.async_api import async_playwright

# ---- storage + schema helpers (same as other providers) ----
# Canonical ("slim") column set and order enforced by _ensure_slim().
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]
BASE = Path("docs/data")
HIST_DIR = BASE / "history"      # <slug>_history.csv files
SNAP_DIR = BASE / "snapshots"    # <timestamp>_<slug>.csv files
LATEST_DIR = BASE / "latest"     # <slug>_latest.csv files
# Best-effort creation: errors are ignored here because _safe_to_csv()
# retries the mkdir and falls back to the temp directory on failure.
for d in (HIST_DIR, SNAP_DIR, LATEST_DIR):
    try:
        d.mkdir(parents=True, exist_ok=True)
    except Exception:
        pass  # fallback handled in _safe_to_csv

def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` with exactly the SLIM_COLS columns, in order.

    Missing columns are added as None; `price_hourly_usd` becomes numeric and
    `fetched_at_utc` becomes a tz-naive UTC datetime (invalid values -> NaN/NaT).
    """
    shaped = df.copy()
    for column in (c for c in SLIM_COLS if c not in df.columns):
        shaped[column] = None
    shaped["price_hourly_usd"] = pd.to_numeric(shaped["price_hourly_usd"], errors="coerce")
    parsed = pd.to_datetime(shaped["fetched_at_utc"], errors="coerce", utc=True)
    shaped["fetched_at_utc"] = parsed.dt.tz_convert(None)
    return shaped[SLIM_COLS]

def _safe_to_csv(df: pd.DataFrame, path: Path):
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(path, index=False)
        return path
    except Exception:
        tmp = Path(tempfile.gettempdir()) / path.name
        df.to_csv(tmp, index=False)
        return tmp

def _save_provider(df: pd.DataFrame, provider_slug: str):
    """
    Persist one provider's scrape as snapshot, history and latest CSV files.

    - snapshot: this run's rows -> SNAP_DIR/<UTC timestamp>_<slug>.csv
    - history:  existing history plus this run, de-duplicated, sorted by time
    - latest:   newest history row per (gpu_model, type, region, duration)

    _safe_to_csv() may redirect a file to the system temp directory, so history
    is read from HIST_DIR and from that temp fallback. This keeps it
    accumulating after a run that could not write to the workspace.

    Returns:
        The "latest" DataFrame.
    """
    df = _ensure_slim(df)
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # snapshot
    snap_path = _safe_to_csv(df, SNAP_DIR / f"{ts}_{provider_slug}.csv")

    # history (append + dedupe)
    hist_path = HIST_DIR / f"{provider_slug}_history.csv"
    fallback_hist = Path(tempfile.gettempdir()) / hist_path.name
    frames = []
    # dict.fromkeys de-duplicates the candidates should both resolve to the same path.
    for src in dict.fromkeys((hist_path, fallback_hist)):
        if not src.exists():
            continue
        old = pd.read_csv(src, low_memory=False)
        old["fetched_at_utc"] = pd.to_datetime(old["fetched_at_utc"], errors="coerce", utc=True).dt.tz_convert(None)
        frames.append(old)
    all_df = pd.concat(frames + [df], ignore_index=True) if frames else df.copy()
    all_df = (
        all_df.dropna(subset=["gpu_model","price_hourly_usd"])
              .drop_duplicates(subset=["provider","region","gpu_model","type","duration",
                                       "fetched_at_utc","price_hourly_usd"], keep="last")
              .sort_values("fetched_at_utc")
    )
    hist_path = _safe_to_csv(all_df, hist_path)

    # latest (newest per gpu/type/region/duration)
    key = ["gpu_model","type","region","duration"]
    latest = all_df.sort_values("fetched_at_utc").drop_duplicates(subset=key, keep="last")
    latest_path = _safe_to_csv(latest, LATEST_DIR / f"{provider_slug}_latest.csv")

    print(f"[{provider_slug}] snapshot -> {snap_path}\n[{provider_slug}] history  -> {hist_path}\n[{provider_slug}] latest   -> {latest_path}")
    return latest

# ---- YOUR scraping logic, adapted to slim schema ----
async def scrape_voltagepark() -> pd.DataFrame:
    """
    Render the VoltagePark deployment page in headless Chromium and extract H100/H200 prices.

    The rendered HTML is scanned line by line. A line yields a row when it
    contains "H100" or "H200", a "$", and a number immediately followed by
    "/GPU/hour". A line mentioning both models is attributed to H100. Rows are
    de-duplicated by (gpu_model, price) and limited to prices in (0, 200).

    Returns:
        DataFrame with exactly SLIM_COLS; rows are tagged region "US",
        On-Demand / 1h, and share one `fetched_at_utc` value.
    """
    url = "https://dashboard.voltagepark.com/order/configure-deployment"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, timeout=60000)
            await page.wait_for_timeout(5000)  # give the page time to render its content
            html = await page.content()
        finally:
            # Close the browser even when navigation fails.
            await browser.close()
    # Stamp the scrape once so every row of this run carries the same timestamp.
    fetched_at = _now_iso()

    # A number directly followed by "/GPU/hour"; the leading "$" is optional.
    price_re = re.compile(r"\$?(\d+(?:\.\d+)?)(?=/GPU/hour)")
    rows = []
    for line in html.splitlines():
        if not (("H100" in line or "H200" in line) and "$" in line):
            continue
        m = price_re.search(line)
        if m is None:
            continue
        # The pattern only captures digits with an optional fraction, so float()
        # cannot fail here and no exception handling is needed.
        rows.append({
            "provider": "VoltagePark",
            "region": "US",
            "gpu_model": "H100" if "H100" in line else "H200",
            "type": "On-Demand",
            "duration": "1h",
            "gpu_count": None,
            "price_hourly_usd": float(m.group(1)),
            "source_url": url,
            "fetched_at_utc": fetched_at,
        })

    df = pd.DataFrame(rows, columns=SLIM_COLS)
    if df.empty:
        return _ensure_slim(df)

    # dedupe by (gpu_model, price)
    df = (df.sort_values(["gpu_model","price_hourly_usd","fetched_at_utc"])
            .drop_duplicates(subset=["gpu_model","price_hourly_usd"], keep="last")
            .reset_index(drop=True))

    # sanity: plausible $/hr range
    df = df[(df["price_hourly_usd"] > 0) & (df["price_hourly_usd"] < 200)]
    return _ensure_slim(df)

# ---- runner that works in notebooks & scripts ----
def arun(coro):
    """
    Execute `coro` synchronously and return its result.

    Uses asyncio.run when no loop is running in this thread. Otherwise applies
    nest_asyncio and calls run_until_complete on the running loop.
    """
    try:
        current_loop = asyncio.get_running_loop()
    except RuntimeError:
        return asyncio.run(coro)

    import nest_asyncio
    nest_asyncio.apply()
    return current_loop.run_until_complete(coro)

# ------------------------------ RUN --------------------------------
# Drive the async scraper through arun(), write the snapshot/history/latest
# CSVs under the "voltagepark" slug, then preview this run's rows.
df_voltage = arun(scrape_voltagepark())
_save_provider(df_voltage, "voltagepark")
display(df_voltage.head())


[voltagepark] snapshot -> docs/data/snapshots/20250904_123626_voltagepark.csv
[voltagepark] history  -> docs/data/history/voltagepark_history.csv
[voltagepark] latest   -> docs/data/latest/voltagepark_latest.csv


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,price_hourly_usd,source_url,fetched_at_utc
0,VoltagePark,US,H100,On-Demand,1h,,1.99,https://dashboard.voltagepark.com/order/config...,2025-09-04 12:36:26


In [8]:
# ================= Vast.ai (async) — slim schema + per-provider history =================
# Slim schema: provider, region, gpu_model, type, duration, gpu_count,
#              price_hourly_usd, source_url, fetched_at_utc
# Py 3.8 compatible

import re, asyncio, pandas as pd, tempfile
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright

# -------- storage + schema helpers (same as other providers) --------
# Canonical ("slim") column set and order enforced by _ensure_slim().
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]
BASE = Path("docs/data")
HIST_DIR = BASE / "history"      # <slug>_history.csv files
SNAP_DIR = BASE / "snapshots"    # <timestamp>_<slug>.csv files
LATEST_DIR = BASE / "latest"     # <slug>_latest.csv files
# Best-effort creation: errors are ignored here because _safe_to_csv()
# retries the mkdir and falls back to the temp directory on failure.
for d in (HIST_DIR, SNAP_DIR, LATEST_DIR):
    try:
        d.mkdir(parents=True, exist_ok=True)
    except Exception:
        pass  # fallback handled below

def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize `df` to the slim schema and return the result as a new frame.

    Any SLIM_COLS column that is absent is added with None; prices are coerced
    to numeric and timestamps to tz-naive UTC datetimes (errors -> NaN/NaT).
    Only SLIM_COLS are returned, in their declared order.
    """
    normalized = df.copy()
    for wanted in SLIM_COLS:
        if wanted in normalized.columns:
            continue
        normalized[wanted] = None
    price_series = pd.to_numeric(normalized["price_hourly_usd"], errors="coerce")
    normalized["price_hourly_usd"] = price_series
    time_series = pd.to_datetime(normalized["fetched_at_utc"], errors="coerce", utc=True)
    normalized["fetched_at_utc"] = time_series.dt.tz_convert(None)
    return normalized[SLIM_COLS]

def _safe_to_csv(df: pd.DataFrame, path: Path):
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(path, index=False)
        return path
    except Exception:
        tmp = Path(tempfile.gettempdir()) / path.name
        df.to_csv(tmp, index=False)
        return tmp

def _save_provider(df: pd.DataFrame, provider_slug: str):
    """
    Persist one provider's scrape as snapshot, history and latest CSV files.

    - snapshot: this run's rows -> SNAP_DIR/<UTC timestamp>_<slug>.csv
    - history:  existing history plus this run, de-duplicated, sorted by time
    - latest:   newest history row per (gpu_model, type, region, duration)

    Because _safe_to_csv() can redirect a file to the system temp directory,
    history is read from HIST_DIR and from that temp fallback. Rows written by
    a run that had to fall back are therefore kept on later runs.

    Returns:
        The "latest" DataFrame.
    """
    df = _ensure_slim(df)
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # snapshot
    snap_path = _safe_to_csv(df, SNAP_DIR / f"{ts}_{provider_slug}.csv")

    # history (append + dedupe)
    hist_path = HIST_DIR / f"{provider_slug}_history.csv"
    fallback_hist = Path(tempfile.gettempdir()) / hist_path.name
    frames = []
    # dict.fromkeys de-duplicates the candidates should both resolve to the same path.
    for src in dict.fromkeys((hist_path, fallback_hist)):
        if not src.exists():
            continue
        old = pd.read_csv(src, low_memory=False)
        old["fetched_at_utc"] = pd.to_datetime(old["fetched_at_utc"], errors="coerce", utc=True).dt.tz_convert(None)
        frames.append(old)
    all_df = pd.concat(frames + [df], ignore_index=True) if frames else df.copy()
    all_df = (
        all_df.dropna(subset=["gpu_model","price_hourly_usd"])
              .drop_duplicates(subset=["provider","region","gpu_model","type","duration",
                                       "fetched_at_utc","price_hourly_usd"], keep="last")
              .sort_values("fetched_at_utc")
    )
    hist_path = _safe_to_csv(all_df, hist_path)

    # latest (newest per gpu/type/region/duration)
    key = ["gpu_model","type","region","duration"]
    latest = all_df.sort_values("fetched_at_utc").drop_duplicates(subset=key, keep="last")
    latest_path = _safe_to_csv(latest, LATEST_DIR / f"{provider_slug}_latest.csv")

    print(f"[{provider_slug}] snapshot -> {snap_path}\n[{provider_slug}] history  -> {hist_path}\n[{provider_slug}] latest   -> {latest_path}")
    return latest

# -------- YOUR scraping logic, adapted to slim schema --------
async def scrape_vast_products() -> pd.DataFrame:
    """Scrape H100/H200 hourly prices from Vast.ai's GPU cloud product page.

    The page is rendered with headless Chromium. Its HTML is then scanned line
    by line: a line that mentions H100 or H200 and contains a ``$`` contributes
    the first dollar amount on that line that lies strictly between 0.1 and 100.

    Returns:
        A slim-schema frame (via ``_ensure_slim``) with one row per distinct
        (gpu_model, price) pair; empty if nothing matched.
    """
    url = "https://vast.ai/products/gpu-cloud"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, timeout=60000)
        # Scroll to the bottom and back to the top to reveal lazy content.
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(1500)
        await page.evaluate("window.scrollTo(0, 0)")
        await page.wait_for_timeout(500)

        content = await page.content()
        await browser.close()

    # Match the number that directly follows "$" (thousands separators allowed).
    # Filtering a whole whitespace-delimited HTML token down to its digits would
    # also pick up digits from markup around the price (class names, attribute
    # values, the "100" in "H100") and corrupt the value.
    dollar_amount = re.compile(r"\$\s*([0-9][0-9,]*(?:\.[0-9]+)?)")

    rows = []
    for line in content.splitlines():
        if "$" not in line or not ("H100" in line or "H200" in line):
            continue
        # A line mentioning both models is attributed to H100.
        gpu_model = "H100" if "H100" in line else "H200"

        price_val = None
        for m in dollar_amount.finditer(line):
            price = float(m.group(1).replace(",", ""))
            if 0.1 < price < 100:  # sanity filter for a per-hour price
                price_val = price
                break
        if price_val is None:
            continue

        rows.append({
            "provider": "Vast.ai",
            "region": "Global",
            "gpu_model": gpu_model,
            "type": "On-Demand",
            "duration": "1h",
            "gpu_count": None,
            "price_hourly_usd": price_val,
            "source_url": url,
            "fetched_at_utc": _now_iso(),
        })

    df = pd.DataFrame(rows, columns=SLIM_COLS)
    if df.empty:
        return _ensure_slim(df)

    # Deduplicate by (gpu_model, price). Prices are already bounded by the
    # sanity filter above, so no further clamp is needed.
    df = (df.sort_values(["gpu_model","price_hourly_usd","fetched_at_utc"])
            .drop_duplicates(subset=["gpu_model","price_hourly_usd"], keep="last")
            .reset_index(drop=True))
    return _ensure_slim(df)

def arun(coro):
    """Run ``coro`` to completion from synchronous code and return its result.

    With no event loop running, this is ``asyncio.run``. Inside a running loop
    (for example a notebook kernel) the loop is patched with ``nest_asyncio``
    and the coroutine is driven on that loop.
    """
    try:
        running = asyncio.get_running_loop()
    except RuntimeError:
        running = None

    if running is None:
        return asyncio.run(coro)

    import nest_asyncio
    nest_asyncio.apply()
    return running.run_until_complete(coro)

# Run the Vast.ai scraper, write its snapshot/history/latest CSVs, and preview
# the first rows that this run scraped.
df_vastp = arun(scrape_vast_products())
_save_provider(df_vastp, "vastai")
display(df_vastp.head())


[vastai] snapshot -> docs/data/snapshots/20250904_124214_vastai.csv
[vastai] history  -> docs/data/history/vastai_history.csv
[vastai] latest   -> docs/data/latest/vastai_latest.csv


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,price_hourly_usd,source_url,fetched_at_utc
0,Vast.ai,Global,H100,On-Demand,1h,,1.25,https://vast.ai/products/gpu-cloud,2025-09-04 12:42:14


In [10]:
# ==== Shadeform: precise matcher (nearest-price + hourly hint) ====

import re, asyncio, pandas as pd, tempfile
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright

# Slim schema: the columns, in order, that every provider frame is reduced to.
# _ensure_slim adds any that are missing and returns exactly this selection.
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]

def _now_iso():
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` restricted to ``SLIM_COLS`` with coerced dtypes.

    Missing slim columns are added filled with ``None``. ``price_hourly_usd``
    is coerced to numeric and ``fetched_at_utc`` to timezone-naive datetimes
    (parsed as UTC); unparseable values become NaN/NaT. The input frame is
    not modified.
    """
    slim = df.copy()
    absent = [col for col in SLIM_COLS if col not in slim.columns]
    for col in absent:
        slim[col] = None

    prices = pd.to_numeric(slim["price_hourly_usd"], errors="coerce")
    stamps = pd.to_datetime(slim["fetched_at_utc"], errors="coerce", utc=True)
    slim["price_hourly_usd"] = prices
    slim["fetched_at_utc"] = stamps.dt.tz_convert(None)
    return slim[SLIM_COLS]

# --------- robust extractors ----------
# A dollar amount that must be followed by an hourly unit (h / hr / hour),
# optionally preceded by "/GPU" and by "/", "per" or "an" — e.g. "$2.35/hr",
# "$2.35 /GPU/hour", "$2.35 per hour". Group 1 is the numeric amount.
PRICE_RE = re.compile(
    r"\$\s*([0-9]+(?:\.[0-9]+)?)\s*(?:/GPU)?\s*(?:/|\s*(?:per|an)\s*)?(?:h|hr|hour)\b",
    re.I
)

# Whole-word, case-insensitive matchers for GPU names. "H100" and "H200" are
# the models being scraped; "_OTHER" lists models that are not scraped but
# whose positions are still recorded.
GPU_TOKENS = {
    "H100": re.compile(r"\bH100\b", re.I),
    "H200": re.compile(r"\bH200\b", re.I),
    # include B200 so we don't steal its prices
    "_OTHER": re.compile(r"\b(?:B200|H800|A100|A800)\b", re.I),
}

def _find_token_positions(text: str):
    """Map each ``GPU_TOKENS`` key to the start offsets of its matches in ``text``.

    Every key is present in the result; keys with no match map to an empty list.
    """
    return {
        name: [hit.start() for hit in pattern.finditer(text)]
        for name, pattern in GPU_TOKENS.items()
    }

def _find_price_positions(text: str):
    """Return ``(price, start_offset)`` for every ``PRICE_RE`` match in ``text``."""
    found = []
    for hit in PRICE_RE.finditer(text):
        amount = float(hit.group(1))
        found.append((amount, hit.start()))
    return found

def _nearest_price_to_token(text: str, token: str, window: int = 220):
    """Pair each occurrence of ``token`` with its nearest hourly price.

    For every position where ``token`` occurs, the closest ``PRICE_RE`` match
    within ``window`` characters is chosen. The pair is dropped when that
    price sits closer to some other GPU token than to this occurrence.

    Args:
        text: Text to scan.
        token: A key of ``GPU_TOKENS`` (e.g. "H100").
        window: Maximum character distance between token and price.

    Returns:
        A list of ``(token, price)`` tuples, one per accepted occurrence.
    """
    positions_by_name = _find_token_positions(text)
    priced = _find_price_positions(text)
    if not positions_by_name.get(token) or not priced:
        return []

    # Positions of every other GPU token; these compete for the price.
    rivals = [
        pos
        for name, pos_list in positions_by_name.items()
        if name != token
        for pos in pos_list
    ]

    pairs = []
    for gpu_at in positions_by_name[token]:
        nearby = [
            (amount, at, abs(at - gpu_at))
            for amount, at in priced
            if abs(at - gpu_at) <= window
        ]
        if not nearby:
            continue
        amount, price_at, gap = min(nearby, key=lambda cand: cand[2])

        # Skip when the price is nearer to another GPU mention (e.g. B200).
        if rivals and min(abs(price_at - other) for other in rivals) < gap:
            continue

        pairs.append((token, amount))
    return pairs

# --------- scraper ----------
async def scrape_shadeform_rich() -> pd.DataFrame:
    """Scrape H100/H200 hourly prices from the Shadeform home page.

    The rendered body text is whitespace-normalised, then each GPU mention is
    paired with its nearest hourly price by ``_nearest_price_to_token``.
    Prices outside 0.25–8.0 are discarded.

    Returns:
        A slim-schema frame with one row per distinct (gpu_model, price).
    """
    url = "https://www.shadeform.ai/"
    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto(url, timeout=60000, wait_until="domcontentloaded")
        await tab.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await tab.wait_for_timeout(900)
        raw_text = await tab.inner_text("body")
        await chromium.close()

    flat_text = re.sub(r"\s+", " ", raw_text)

    records = [
        {
            "provider": "Shadeform",
            "region": "Global",
            "gpu_model": model,
            "type": "On-Demand",
            "duration": "1h",
            "gpu_count": None,
            "price_hourly_usd": price,
            "source_url": url,
            "fetched_at_utc": _now_iso(),
        }
        for gpu in ("H100", "H200")
        for model, price in _nearest_price_to_token(flat_text, gpu, window=220)
        # sanity clamp to avoid accidental captures (tune if needed)
        if 0.25 <= price <= 8.0
    ]

    frame = pd.DataFrame(records, columns=SLIM_COLS)
    if frame.empty:
        return _ensure_slim(frame)

    # Collapse repeated (gpu_model, price) pairs, keeping the last after sorting.
    ordered = frame.sort_values(["gpu_model","price_hourly_usd","fetched_at_utc"])
    unique = ordered.drop_duplicates(subset=["gpu_model","price_hourly_usd"], keep="last")
    return _ensure_slim(unique.reset_index(drop=True))

# ---- runner ----
def arun(coro):
    """Drive ``coro`` to completion from synchronous code; return its result.

    Uses ``asyncio.run`` when no loop is running. When one is (notebook
    kernels), applies ``nest_asyncio`` and runs the coroutine on that loop.
    """
    try:
        active_loop = asyncio.get_running_loop()
    except RuntimeError:
        active_loop = None

    if active_loop is not None:
        import nest_asyncio
        nest_asyncio.apply()
        return active_loop.run_until_complete(coro)
    return asyncio.run(coro)

# Run the Shadeform scraper and display every scraped row. This cell does not
# call _save_provider, so the result is only shown, not written to disk.
df_shade = arun(scrape_shadeform_rich())
display(df_shade)


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,price_hourly_usd,source_url,fetched_at_utc
0,Shadeform,Global,H100,On-Demand,1h,,1.99,https://www.shadeform.ai/,2025-09-04 12:49:12
1,Shadeform,Global,H100,On-Demand,1h,,2.35,https://www.shadeform.ai/,2025-09-04 12:49:12


In [12]:
# ================= CoreWeave (async) — slim schema + per-provider history =================
# Slim schema: provider, region, gpu_model, type, duration, gpu_count,
#              price_hourly_usd, source_url, fetched_at_utc
# Py 3.8 compatible

import re, asyncio, pandas as pd, tempfile
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright

# -------- storage + schema helpers --------
# Slim schema: the columns, in order, that every provider frame is reduced to.
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]
# Output layout under docs/data: one folder each for the per-provider history,
# the timestamped snapshots, and the "latest" files.
BASE = Path("docs/data")
HIST_DIR = BASE / "history"
SNAP_DIR = BASE / "snapshots"
LATEST_DIR = BASE / "latest"
# Create the folders up front; a failure here is ignored on purpose because
# _safe_to_csv retries the mkdir and falls back to the temp directory.
for d in (HIST_DIR, SNAP_DIR, LATEST_DIR):
    try:
        d.mkdir(parents=True, exist_ok=True)
    except Exception:
        pass  # fallback handled in _safe_to_csv

def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` holding exactly ``SLIM_COLS`` with coerced dtypes.

    Slim columns missing from ``df`` are added as ``None``. Prices are coerced
    to numeric and fetch times to timezone-naive datetimes (parsed as UTC);
    values that cannot be parsed become NaN/NaT.
    """
    normalised = df.copy()
    for name in (c for c in SLIM_COLS if c not in normalised.columns):
        normalised[name] = None

    normalised["price_hourly_usd"] = pd.to_numeric(
        normalised["price_hourly_usd"], errors="coerce"
    )
    fetched = pd.to_datetime(normalised["fetched_at_utc"], errors="coerce", utc=True)
    normalised["fetched_at_utc"] = fetched.dt.tz_convert(None)
    return normalised[SLIM_COLS]

def _safe_to_csv(df: pd.DataFrame, path: Path):
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(path, index=False)
        return path
    except Exception:
        tmp = Path(tempfile.gettempdir()) / path.name
        df.to_csv(tmp, index=False)
        return tmp

def _save_provider(df: pd.DataFrame, provider_slug: str):
    """Write snapshot, history and latest CSVs for one provider scrape.

    Args:
        df: Scraped rows; passed through ``_ensure_slim`` first.
        provider_slug: Name used in the three output file names.

    Returns:
        The "latest" frame: newest history row per
        (gpu_model, type, region, duration).
    """
    scraped = _ensure_slim(df)
    run_stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # Snapshot of this run only.
    snap_path = _safe_to_csv(scraped, SNAP_DIR / f"{run_stamp}_{provider_slug}.csv")

    # History: append this run to whatever is already on disk, then de-dupe.
    hist_path = HIST_DIR / f"{provider_slug}_history.csv"
    if not hist_path.exists():
        history = scraped.copy()
    else:
        on_disk = pd.read_csv(hist_path, low_memory=False)
        on_disk_times = pd.to_datetime(on_disk["fetched_at_utc"], errors="coerce", utc=True)
        on_disk["fetched_at_utc"] = on_disk_times.dt.tz_convert(None)
        history = pd.concat([on_disk, scraped], ignore_index=True)

    row_identity = ["provider","region","gpu_model","type","duration",
                    "fetched_at_utc","price_hourly_usd"]
    history = history.dropna(subset=["gpu_model","price_hourly_usd"])
    history = history.drop_duplicates(subset=row_identity, keep="last")
    history = history.sort_values("fetched_at_utc")
    hist_path = _safe_to_csv(history, hist_path)

    # Latest: newest row per gpu/type/region/duration.
    key = ["gpu_model","type","region","duration"]
    latest = history.sort_values("fetched_at_utc")
    latest = latest.drop_duplicates(subset=key, keep="last")
    latest_path = _safe_to_csv(latest, LATEST_DIR / f"{provider_slug}_latest.csv")

    print(f"[{provider_slug}] snapshot -> {snap_path}\n"
          f"[{provider_slug}] history  -> {hist_path}\n"
          f"[{provider_slug}] latest   -> {latest_path}")
    return latest

# -------- robust token/price matching --------
# Dollar amount followed by an hourly unit (h / hr / hour), optionally via
# "/", "per" or "an" and an optional "GPU /" — e.g. "$4.25/hr",
# "$4.25 per GPU/hour". Group 1 is the numeric amount.
PRICE_HOURLY_RE = re.compile(
    r"\$\s*([0-9]+(?:\.[0-9]+)?)\s*(?:/|\s*(?:per|an)\s*)?(?:GPU\s*/\s*)?(?:h|hr|hour)\b",
    re.I
)
# Fallback if the site omits 'hr' text; avoid monthly and memory suffixes
# (the lookahead rejects amounts followed by k/m/b, "/mo", "per month",
# "/month" or "GB").
PRICE_DOLLAR_RE = re.compile(
    r"\$\s*([0-9]+(?:\.[0-9]+)?)\b(?!\s*(?:k|m|b|/mo|per\s*month|/month|,?\s*GB))",
    re.I
)

# Whole-word, case-insensitive GPU name matchers. "_OTHER" covers models that
# are not scraped; _nearest_prices treats their positions as competitors.
GPU_PATS = {
    "H100": re.compile(r"\bH100\b", re.I),
    "H200": re.compile(r"\bH200\b", re.I),
    "_OTHER": re.compile(r"\b(?:B200|A100|A800|H800)\b", re.I),
}

def _find_positions(text: str, pat: re.Pattern):
    return [m.start() for m in pat.finditer(text)]

def _find_prices(text: str, prefer_hourly: bool = True):
    """Return ``(price, position)`` pairs for dollar amounts found in ``text``.

    Amounts with an hourly hint (``PRICE_HOURLY_RE``) are used whenever at
    least one exists; only when there are none does the looser
    ``PRICE_DOLLAR_RE`` fallback run.

    Args:
        text: Text to scan.
        prefer_hourly: Accepted for backward compatibility.
            NOTE(review): this flag has never influenced the result — hourly
            matches are always tried first. Confirm whether a non-hourly mode
            is actually wanted before giving it a meaning.

    Returns:
        A list of ``(price, start_offset)`` tuples in order of appearance,
        where ``start_offset`` is where the regex match begins.
    """
    prices = [(float(m.group(1)), m.start()) for m in PRICE_HOURLY_RE.finditer(text)]
    if not prices:
        # No hourly-marked amount anywhere: fall back to bare dollar amounts.
        prices = [(float(m.group(1)), m.start()) for m in PRICE_DOLLAR_RE.finditer(text)]
    return prices

def _nearest_prices(text: str, token: str, window: int = 240):
    """Pair each occurrence of ``token`` with its nearest price in ``text``.

    For every match of ``GPU_PATS[token]`` the closest price from
    ``_find_prices`` within ``window`` characters is chosen. The pair is
    dropped when that price is nearer to a different GPU token.

    Returns:
        A list of ``(token, price)`` tuples, one per accepted occurrence.
    """
    own_positions = _find_positions(text, GPU_PATS[token])
    if not own_positions:
        return []

    # Positions of all other GPU tokens, which compete for the same prices.
    rivals = [
        pos
        for name, pattern in GPU_PATS.items()
        if name != token
        for pos in _find_positions(text, pattern)
    ]
    priced = _find_prices(text)

    matched = []
    for gpu_at in own_positions:
        nearby = [
            (amount, at, abs(at - gpu_at))
            for amount, at in priced
            if abs(at - gpu_at) <= window
        ]
        if not nearby:
            continue
        amount, price_at, gap = min(nearby, key=lambda cand: cand[2])
        if rivals and min(abs(price_at - other) for other in rivals) < gap:
            continue
        matched.append((token, amount))
    return matched

# -------- CoreWeave scraper --------
async def scrape_coreweave_async() -> pd.DataFrame:
    """Scrape H100/H200 hourly prices from CoreWeave's pricing page.

    The rendered body text is whitespace-normalised and each GPU mention is
    paired with its nearest price by ``_nearest_prices``. Prices outside
    0.25–25.0 are discarded.

    Returns:
        A slim-schema frame with one row per distinct (gpu_model, price).
    """
    url = "https://www.coreweave.com/pricing"
    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto(url, timeout=90000, wait_until="domcontentloaded")
        # Scroll three times, pausing after each, to help lazy content load.
        scrolls_left = 3
        while scrolls_left > 0:
            await tab.evaluate("window.scrollBy(0, document.body.scrollHeight)")
            await tab.wait_for_timeout(800)
            scrolls_left -= 1
        # Best-effort wait for a GPU name to show up; a timeout is not fatal.
        try:
            await tab.wait_for_selector("text=/H100|H200/", timeout=5000)
        except Exception:
            pass
        raw_text = await tab.inner_text("body")
        await chromium.close()

    flat_text = re.sub(r"\s+", " ", raw_text)

    records = [
        {
            "provider": "CoreWeave",
            "region": "US",
            "gpu_model": model,
            "type": "On-Demand",
            "duration": "1h",
            "gpu_count": None,
            "price_hourly_usd": price,
            "source_url": url,
            "fetched_at_utc": _now_iso(),
        }
        for gpu in ("H100", "H200")
        for model, price in _nearest_prices(flat_text, gpu, window=240)
        # reasonable hourly range; widen if CoreWeave posts higher tiers
        if 0.25 <= price <= 25.0
    ]

    frame = pd.DataFrame(records, columns=SLIM_COLS)
    if frame.empty:
        return _ensure_slim(frame)

    # Collapse repeated (gpu_model, price) pairs.
    ordered = frame.sort_values(["gpu_model","price_hourly_usd","fetched_at_utc"])
    unique = ordered.drop_duplicates(subset=["gpu_model","price_hourly_usd"], keep="last")
    return _ensure_slim(unique.reset_index(drop=True))

# -------- runner (works in scripts & notebooks) --------
def arun(coro):
    """Run ``coro`` to completion from synchronous code and return its result.

    Scripts (no running loop) go through ``asyncio.run``. Notebooks (a loop
    is already running) get ``nest_asyncio`` applied and reuse that loop.
    """
    try:
        loop_in_use = asyncio.get_running_loop()
    except RuntimeError:
        loop_in_use = None

    if loop_in_use is None:
        return asyncio.run(coro)

    import nest_asyncio
    nest_asyncio.apply()
    return loop_in_use.run_until_complete(coro)

# ------------------------------ RUN --------------------------------
# Scrape CoreWeave, write snapshot/history/latest CSVs, and preview the rows
# scraped in this run (latest_coreweave holds the per-key newest rows).
df_coreweave = arun(scrape_coreweave_async())
latest_coreweave = _save_provider(df_coreweave, "coreweave")
display(df_coreweave.head())


[coreweave] snapshot -> docs/data/snapshots/20250904_125500_coreweave.csv
[coreweave] history  -> docs/data/history/coreweave_history.csv
[coreweave] latest   -> docs/data/latest/coreweave_latest.csv


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,price_hourly_usd,source_url,fetched_at_utc
0,CoreWeave,US,H100,On-Demand,1h,,20.0,https://www.coreweave.com/pricing,2025-09-04 12:55:00


In [17]:
# ================= Paperspace (async) — use your approach, keep correct rows =================
# Output schema (slim): provider, region, gpu_model, type, duration, gpu_count,
#                       price_hourly_usd, source_url, fetched_at_utc

import re, asyncio, pandas as pd, tempfile
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright

# ---- storage helpers (same as other providers) ----
# Slim schema: the columns, in order, that every provider frame is reduced to.
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]
# Output folders under docs/data: history, timestamped snapshots, latest.
BASE = Path("docs/data"); HIST_DIR = BASE/"history"; SNAP_DIR = BASE/"snapshots"; LATEST_DIR = BASE/"latest"
# Create the folders up front; errors are ignored here because _safe_to_csv
# retries the mkdir and falls back to the temp directory.
for d in (HIST_DIR, SNAP_DIR, LATEST_DIR):
    try: d.mkdir(parents=True, exist_ok=True)
    except Exception: pass

def _now_iso(): return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` limited to ``SLIM_COLS`` with coerced dtypes.

    Adds any missing slim column as ``None``, coerces ``price_hourly_usd`` to
    numeric and ``fetched_at_utc`` to timezone-naive datetimes parsed as UTC.
    Unparseable values become NaN/NaT.
    """
    result = df.copy()
    to_add = [col for col in SLIM_COLS if col not in result.columns]
    for col in to_add:
        result[col] = None

    numeric_price = pd.to_numeric(result["price_hourly_usd"], errors="coerce")
    utc_times = pd.to_datetime(result["fetched_at_utc"], errors="coerce", utc=True)
    result["price_hourly_usd"] = numeric_price
    result["fetched_at_utc"] = utc_times.dt.tz_convert(None)
    return result[SLIM_COLS]

def _safe_to_csv(df: pd.DataFrame, path: Path):
    try:
        path.parent.mkdir(parents=True, exist_ok=True); df.to_csv(path, index=False); return path
    except Exception:
        tmp = Path(tempfile.gettempdir()) / path.name; df.to_csv(tmp, index=False); return tmp

def _save_provider(df: pd.DataFrame, slug: str):
    """Persist one provider scrape as snapshot, history and latest CSVs.

    Args:
        df: Scraped rows; normalised through ``_ensure_slim`` before saving.
        slug: Name used in the three output file names.

    Returns:
        The "latest" frame: newest history row per
        (gpu_model, type, region, duration).
    """
    scraped = _ensure_slim(df)
    run_stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # Snapshot of this run only.
    snap = _safe_to_csv(scraped, SNAP_DIR/f"{run_stamp}_{slug}.csv")

    # History: existing file (if any) plus this run, de-duplicated.
    hist = HIST_DIR/f"{slug}_history.csv"
    if not hist.exists():
        history = scraped.copy()
    else:
        earlier = pd.read_csv(hist, low_memory=False)
        earlier_times = pd.to_datetime(earlier["fetched_at_utc"], errors="coerce", utc=True)
        earlier["fetched_at_utc"] = earlier_times.dt.tz_convert(None)
        history = pd.concat([earlier, scraped], ignore_index=True)

    row_identity = ["provider","region","gpu_model","type","duration",
                    "fetched_at_utc","price_hourly_usd"]
    history = history.dropna(subset=["gpu_model","price_hourly_usd"])
    history = history.drop_duplicates(subset=row_identity, keep="last")
    history = history.sort_values("fetched_at_utc")
    hist = _safe_to_csv(history, hist)

    # Latest: newest row per gpu/type/region/duration.
    key = ["gpu_model","type","region","duration"]
    newest = history.sort_values("fetched_at_utc").drop_duplicates(subset=key, keep="last")
    latest_path = _safe_to_csv(newest, LATEST_DIR/f"{slug}_latest.csv")

    print(f"[{slug}] snapshot -> {snap}\n"
          f"[{slug}] history  -> {hist}\n"
          f"[{slug}] latest   -> {latest_path}")
    return newest

# ---- strict extractors (but still tolerant to site markup) ----
# GPU model name: "H100" or "H200", optionally followed by a form factor
# (SXM / PCIE / NVL) and a memory size such as "80GB". Case-insensitive;
# group 1 is the whole model string.
GPU_PAT = re.compile(r"(H(?:100|200)(?:\s*(?:SXM|PCIE|NVL))?(?:\s*\d{2,3}\s*GB)?)", re.I)
# require an hourly hint somewhere in the same block to avoid platform prices, etc.
# Group 1 is the numeric amount; the unit may be h / hr / hour, optionally
# introduced by "/", "per" or "an" and an optional "GPU /".
PRICE_HOURLY = re.compile(r"\$\s*([0-9]+(?:\.[0-9]+)?)\s*(?:/|\s*(?:per|an)\s*)?(?:GPU\s*/\s*)?(?:h|hr|hour)\b", re.I)

async def scrape_paperspace() -> pd.DataFrame:
    """Scrape H100/H200 hourly prices from Paperspace's pricing page.

    The rendered HTML is scanned block by block (tr/div/section/article/li).
    A block qualifies when its text mentions H100 or H200, contains the word
    "hour", and matches both ``GPU_PAT`` and ``PRICE_HOURLY``. Only the
    innermost qualifying blocks are kept: a wrapper that contains another
    qualifying block would pair the first GPU name with the first price found
    anywhere inside it, which can belong to different products.

    Returns:
        A slim-schema frame with one row per distinct (gpu_model, price);
        empty if nothing matched.
    """
    url = "https://www.paperspace.com/pricing"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, timeout=90000)
        # Fixed wait to let client-side rendering finish.
        await page.wait_for_timeout(8000)
        html = await page.content()
        await browser.close()

    soup = BeautifulSoup(html, "html.parser")

    # Pass 1: collect every block that yields a (model, price) reading.
    hits = []  # (block, model, price), in document order
    for blk in soup.find_all(["tr","div","section","article","li"], recursive=True):
        txt = blk.get_text(" ", strip=True)
        if not txt:
            continue
        # must mention H100/H200 AND 'hour' to qualify
        if ("H100" not in txt and "H200" not in txt) or ("hour" not in txt.lower()):
            continue
        mm = GPU_PAT.search(txt)   # first explicit H100/H200-ish token
        pm = PRICE_HOURLY.search(txt)  # first $ amount with an hourly hint
        if not mm or not pm:
            continue
        hits.append((blk, mm.group(1).upper(), float(pm.group(1))))

    # Pass 2: mark every ancestor of a qualifying block. Such wrappers are
    # skipped below so each reading comes from the tightest enclosing block.
    shadowed = set()
    for blk, _model, _price in hits:
        for ancestor in blk.parents:
            shadowed.add(id(ancestor))

    rows = []
    for blk, model, price in hits:
        if id(blk) in shadowed:
            continue
        # sanity band to drop weird captures
        if not (0.2 <= price <= 50.0):
            continue
        rows.append({
            "provider": "Paperspace",
            "region": "Global",
            "gpu_model": model,          # "H100", "H100 PCIE 80GB", etc.
            "type": "On-Demand",
            "duration": "1h",
            "gpu_count": None,
            "price_hourly_usd": price,
            "source_url": url,
            "fetched_at_utc": _now_iso(),
        })

    df = pd.DataFrame(rows, columns=SLIM_COLS)
    if df.empty:
        return _ensure_slim(df)

    # de-dupe (gpu_model, price)
    df = (df.sort_values(["gpu_model","price_hourly_usd","fetched_at_utc"])
            .drop_duplicates(subset=["gpu_model","price_hourly_usd"], keep="last")
            .reset_index(drop=True))
    return _ensure_slim(df)

# ---- runner that works in both scripts & notebooks ----
def arun(coro):
    """Run ``coro`` to completion from synchronous code; return its result.

    ``asyncio.run`` is used when no event loop is running. If one is running,
    ``nest_asyncio`` is applied and the coroutine is run on that loop.
    """
    try:
        current_loop = asyncio.get_running_loop()
    except RuntimeError:
        current_loop = None

    if current_loop is not None:
        import nest_asyncio
        nest_asyncio.apply()
        return current_loop.run_until_complete(coro)
    return asyncio.run(coro)

# ------------------------------ RUN --------------------------------
# Scrape Paperspace, write snapshot/history/latest CSVs, and preview up to 20
# of the rows scraped in this run.
df_paperspace = arun(scrape_paperspace())
latest_paperspace = _save_provider(df_paperspace, "paperspace")
display(df_paperspace.head(20))


[paperspace] snapshot -> docs/data/snapshots/20250904_130549_paperspace.csv
[paperspace] history  -> docs/data/history/paperspace_history.csv
[paperspace] latest   -> docs/data/latest/paperspace_latest.csv


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,price_hourly_usd,source_url,fetched_at_utc
0,Paperspace,Global,H100,On-Demand,1h,,2.24,https://www.paperspace.com/pricing,2025-09-04 13:05:49


In [20]:
# ================= TensorDock H100 (static) — slim schema + per-provider history =================
# Output schema: provider, region, gpu_model, type, duration, gpu_count,
#                price_hourly_usd, source_url, fetched_at_utc
# Py 3.8 compatible

import re, requests, pandas as pd, tempfile
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from pathlib import Path

# -------- pages & patterns (from your code) --------
# Public TensorDock pages that are fetched and searched for an H100 price.
PAGES = [
    "https://tensordock.com/gpu-h100",
    "https://tensordock.com/cloud-gpus",
    "https://tensordock.com/comparison-gcp",
]
# Tried in order against each page's text; the first pattern that matches
# supplies the price (group 1). All are case-insensitive and let "." span
# newlines:
#   1. "H100" followed later by "$X/hr"
#   2. "from $X/hr" followed later by "H100"
#   3. "$X/hour" followed later by "H100"
PATTERNS = [
    re.compile(r"H100.*?\$([0-9]+(?:\.[0-9]+)?)\s*/?\s*hr", re.I|re.S),
    re.compile(r"from\s*\$([0-9]+(?:\.[0-9]+)?)\s*/?\s*hr.*?H100", re.I|re.S),
    re.compile(r"\$([0-9]+(?:\.[0-9]+)?)\s*/?\s*hour.*?H100", re.I|re.S),
]
# Request headers sent with every page fetch.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# -------- storage + schema helpers (same as other providers) --------
# Slim schema: the columns, in order, that every provider frame is reduced to.
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]
# Output layout under docs/data: history, timestamped snapshots, latest.
BASE = Path("docs/data")
HIST_DIR = BASE / "history"
SNAP_DIR = BASE / "snapshots"
LATEST_DIR = BASE / "latest"
# Create the folders up front; a failure here is ignored on purpose because
# _safe_to_csv retries the mkdir and falls back to the temp directory.
for d in (HIST_DIR, SNAP_DIR, LATEST_DIR):
    try:
        d.mkdir(parents=True, exist_ok=True)
    except Exception:
        pass  # fall back handled in _safe_to_csv

def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` holding exactly ``SLIM_COLS`` with coerced dtypes.

    Missing slim columns are added as ``None``; prices become numeric and
    fetch times become timezone-naive datetimes (parsed as UTC). Values that
    cannot be parsed turn into NaN/NaT.
    """
    frame = df.copy()
    missing_cols = [name for name in SLIM_COLS if name not in frame.columns]
    for name in missing_cols:
        frame[name] = None

    frame["price_hourly_usd"] = pd.to_numeric(frame["price_hourly_usd"], errors="coerce")
    as_utc = pd.to_datetime(frame["fetched_at_utc"], errors="coerce", utc=True)
    frame["fetched_at_utc"] = as_utc.dt.tz_convert(None)
    return frame[SLIM_COLS]

def _safe_to_csv(df: pd.DataFrame, path: Path):
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(path, index=False)
        return path
    except Exception:
        tmp = Path(tempfile.gettempdir()) / path.name
        df.to_csv(tmp, index=False)
        return tmp

def _save_provider(df: pd.DataFrame, provider_slug: str):
    """Write snapshot, history and latest CSVs for one provider scrape.

    Args:
        df: Scraped rows; passed through ``_ensure_slim`` first.
        provider_slug: Name used in the three output file names.

    Returns:
        The "latest" frame: newest history row per
        (gpu_model, type, region, duration).
    """
    current = _ensure_slim(df)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # Snapshot: this run's rows under a timestamped file name.
    snapshot_file = SNAP_DIR / f"{stamp}_{provider_slug}.csv"
    snap_path = _safe_to_csv(current, snapshot_file)

    # History: append to the existing file when there is one, then de-dupe.
    hist_path = HIST_DIR / f"{provider_slug}_history.csv"
    if not hist_path.exists():
        merged = current.copy()
    else:
        stored = pd.read_csv(hist_path, low_memory=False)
        stored_times = pd.to_datetime(stored["fetched_at_utc"], errors="coerce", utc=True)
        stored["fetched_at_utc"] = stored_times.dt.tz_convert(None)
        merged = pd.concat([stored, current], ignore_index=True)

    same_row = ["provider","region","gpu_model","type","duration",
                "fetched_at_utc","price_hourly_usd"]
    merged = merged.dropna(subset=["gpu_model","price_hourly_usd"])
    merged = merged.drop_duplicates(subset=same_row, keep="last")
    merged = merged.sort_values("fetched_at_utc")
    hist_path = _safe_to_csv(merged, hist_path)

    # Latest: newest row per gpu/type/region/duration.
    key = ["gpu_model","type","region","duration"]
    chronological = merged.sort_values("fetched_at_utc")
    latest = chronological.drop_duplicates(subset=key, keep="last")
    latest_path = _safe_to_csv(latest, LATEST_DIR / f"{provider_slug}_latest.csv")

    print(f"[{provider_slug}] snapshot -> {snap_path}\n"
          f"[{provider_slug}] history  -> {hist_path}\n"
          f"[{provider_slug}] latest   -> {latest_path}")
    return latest

# -------- scraper (uses your logic, mapped to slim schema) --------
def scrape_tensordock_public_h100() -> pd.DataFrame:
    """Scrape an H100 hourly price from TensorDock's public pages.

    Each URL in ``PAGES`` is fetched and reduced to plain text. The first
    entry of ``PATTERNS`` that matches supplies that page's price, which is
    kept only if it lies within 0.2–50.0. A page that fails to load or parse
    is reported with ``print`` and skipped.

    Returns:
        A slim-schema frame with at most one row: the lowest price found
        across all pages. Empty if no page produced a price.
    """
    records = []
    for url in PAGES:
        try:
            resp = requests.get(url, headers=HEADERS, timeout=30)
            resp.raise_for_status()
            page_text = BeautifulSoup(resp.text, "html.parser").get_text(" ", strip=True)

            # First pattern (in order) that matches wins.
            first_hit = next(
                (m for m in (pat.search(page_text) for pat in PATTERNS) if m),
                None,
            )
            price = float(first_hit.group(1)) if first_hit else None

            if price and (0.2 <= price <= 50.0):  # sanity band for $/GPU/hr
                records.append({
                    "provider": "TensorDock",
                    "region": "Global",
                    "gpu_model": "H100",
                    "type": "On-Demand",
                    "duration": "1h",
                    "gpu_count": 1,
                    "price_hourly_usd": price,
                    "source_url": url,
                    "fetched_at_utc": _now_iso(),
                })
        except Exception as e:
            print(f"[TensorDock] {url} -> {e}")

    if not records:
        return _ensure_slim(pd.DataFrame(columns=SLIM_COLS))

    # Keep the lowest "from" price across pages.
    by_price = pd.DataFrame(records).sort_values("price_hourly_usd")
    cheapest = by_price.drop_duplicates(subset=["provider","gpu_model"], keep="first")
    return _ensure_slim(cheapest.reset_index(drop=True))

# ------------------------------ RUN --------------------------------
# Scrape TensorDock, write snapshot/history/latest CSVs, and print the row
# scraped in this run (plain-text output rather than a rich display).
df_tensordock = scrape_tensordock_public_h100()
latest_tensordock = _save_provider(df_tensordock, "tensordock")
print(df_tensordock)


[tensordock] snapshot -> docs/data/snapshots/20250904_131631_tensordock.csv
[tensordock] history  -> docs/data/history/tensordock_history.csv
[tensordock] latest   -> docs/data/latest/tensordock_latest.csv
     provider  region gpu_model       type duration  gpu_count  \
0  TensorDock  Global      H100  On-Demand       1h          1   

   price_hourly_usd                       source_url      fetched_at_utc  
0              2.25  https://tensordock.com/gpu-h100 2025-09-04 13:16:31  


In [21]:
# ============== Hydra Host (Brokkr) — slim schema + per-provider history ==============
# Output schema: provider, region, gpu_model, type, duration, gpu_count,
#                price_hourly_usd, source_url, fetched_at_utc
# Py 3.8 compatible

import re, asyncio, pandas as pd, tempfile
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright

# -------- storage + schema helpers (same pattern as other providers) --------
# Slim schema: the columns, in order, that every provider frame is reduced to.
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]
# Output layout under docs/data: history, timestamped snapshots, latest.
BASE = Path("docs/data")
HIST_DIR = BASE / "history"
SNAP_DIR = BASE / "snapshots"
LATEST_DIR = BASE / "latest"
# Create the folders up front; errors are ignored here because _safe_to_csv
# retries the mkdir and falls back to the temp directory.
for d in (HIST_DIR, SNAP_DIR, LATEST_DIR):
    try: d.mkdir(parents=True, exist_ok=True)
    except Exception: pass

def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` restricted to ``SLIM_COLS`` with coerced dtypes.

    Any slim column missing from ``df`` is added as ``None``.
    ``price_hourly_usd`` is coerced to numeric and ``fetched_at_utc`` to
    timezone-naive datetimes parsed as UTC; bad values become NaN/NaT.
    """
    shaped = df.copy()
    for col in [c for c in SLIM_COLS if c not in shaped.columns]:
        shaped[col] = None

    price_col = pd.to_numeric(shaped["price_hourly_usd"], errors="coerce")
    time_col = pd.to_datetime(shaped["fetched_at_utc"], errors="coerce", utc=True)
    shaped["price_hourly_usd"] = price_col
    shaped["fetched_at_utc"] = time_col.dt.tz_convert(None)
    return shaped[SLIM_COLS]

def _safe_to_csv(df: pd.DataFrame, path: Path):
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(path, index=False)
        return path
    except Exception:
        tmp = Path(tempfile.gettempdir()) / path.name
        df.to_csv(tmp, index=False)
        return tmp

def _save_provider(df: pd.DataFrame, provider_slug: str):
    """Persist one provider scrape as snapshot, history and latest CSVs.

    Args:
        df: Scraped rows; normalised through ``_ensure_slim`` before saving.
        provider_slug: Name used in the three output file names.

    Returns:
        The "latest" frame: newest history row per
        (gpu_model, type, region, duration).
    """
    this_run = _ensure_slim(df)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # Snapshot of exactly what was scraped now.
    snap_path = _safe_to_csv(this_run, SNAP_DIR / f"{stamp}_{provider_slug}.csv")

    # History: prior rows (if the file exists) plus this run, de-duplicated.
    hist_path = HIST_DIR / f"{provider_slug}_history.csv"
    if not hist_path.exists():
        full_history = this_run.copy()
    else:
        prior = pd.read_csv(hist_path, low_memory=False)
        prior_times = pd.to_datetime(prior["fetched_at_utc"], errors="coerce", utc=True)
        prior["fetched_at_utc"] = prior_times.dt.tz_convert(None)
        full_history = pd.concat([prior, this_run], ignore_index=True)

    identity = ["provider","region","gpu_model","type","duration",
                "fetched_at_utc","price_hourly_usd"]
    full_history = full_history.dropna(subset=["gpu_model","price_hourly_usd"])
    full_history = full_history.drop_duplicates(subset=identity, keep="last")
    full_history = full_history.sort_values("fetched_at_utc")
    hist_path = _safe_to_csv(full_history, hist_path)

    # Latest: newest row per gpu/type/region/duration.
    key = ["gpu_model","type","region","duration"]
    latest = full_history.sort_values("fetched_at_utc")
    latest = latest.drop_duplicates(subset=key, keep="last")
    latest_path = _safe_to_csv(latest, LATEST_DIR / f"{provider_slug}_latest.csv")

    print(f"[{provider_slug}] snapshot -> {snap_path}\n"
          f"[{provider_slug}] history  -> {hist_path}\n"
          f"[{provider_slug}] latest   -> {latest_path}")
    return latest

# -------- your Brokkr scraper, tightened to only accept "per card-hour" prices --------
# Regex fragment with one capturing group: the GPU model name.
GPU_RE = r"(H100|H200)"
# Regex fragment with one capturing group: a dollar amount that is explicitly
# priced "per card-hour" / "/card-hour" (hyphen, space or nothing between
# "card" and "hour").
PRICE_PER_CARDHR = r"\$\s*([0-9]+(?:\.[0-9]+)?)\s*(?:per\s*card[-\s]?hour|/card[-\s]?hour)\b"
# GPU name and price within 220 characters of each other, in either order.
# Both patterns have exactly two capturing groups, but in opposite order:
# the first yields (gpu, price), the second yields (price, gpu).
PATS = [
    re.compile(rf"{GPU_RE}.{{0,220}}?{PRICE_PER_CARDHR}", re.I | re.S),
    re.compile(rf"{PRICE_PER_CARDHR}.{{0,220}}?{GPU_RE}", re.I | re.S),
]

async def scrape_brokkr() -> pd.DataFrame:
    url = "https://brokkr.hydrahost.com/inventory"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, timeout=90000, wait_until="domcontentloaded")
        # help hydrate lazy content
        for _ in range(2):
            await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
            await page.wait_for_timeout(700)
        body_text = await page.inner_text("body")
        await browser.close()

    text = re.sub(r"\s+", " ", body_text)
    rows = []

    for pat in PATS:
        for m in pat.finditer(text):
            # Depending on which pattern matched, group order differs
            groups = m.groups()
            # Normalize extraction: model + price are always present
            if len(groups) == 2:
                # pattern 1: (GPU, price)
                gpu_model, price_str = groups
            elif len(groups) == 3:
                # pattern 2 returns (price, GPU) because of nested groups; pick numeric+gpu
                # groups could be ('12.34', 'H100') or ('12.34', 'card-hour', 'H100') depending on regex engine
                nums = [g for g in groups if g and re.fullmatch(r"[0-9]+(?:\.[0-9]+)?", g)]
                gpus = [g for g in groups if g and re.fullmatch(r"H100|H200", g, flags=re.I)]
                if not nums or not gpus:
                    continue
                price_str, gpu_model = nums[0], gpus[0]
            else:
                # Safe fallback: find first number and first GPU token in the match
                seg = m.group(0)
                pm = re.search(r"[0-9]+(?:\.[0-9]+)?", seg)
                gm = re.search(r"H100|H200", seg, flags=re.I)
                if not (pm and gm):
                    continue
                price_str, gpu_model = pm.group(0), gm.group(0)

            try:
                price = float(price_str)
            except Exception:
                continue

            # sanity band for per-card hour pricing
            if not (0.2 <= price <= 50.0):
                continue

            rows.append({
                "provider": "Hydra Host (Brokkr)",
                "region": "Global",
                "gpu_model": gpu_model.upper(),
                "type": "On-Demand",
                "duration": "1h",
                "gpu_count": 1,
                "price_hourly_usd": price,   # per card-hour = per-GPU hourly
                "source_url": url,
                "fetched_at_utc": _now_iso(),
            })

    df = pd.DataFrame(rows, columns=SLIM_COLS)
    if df.empty:
        return _ensure_slim(df)

    # de-dupe (gpu_model, price)
    df = (df.sort_values(["gpu_model","price_hourly_usd","fetched_at_utc"])
            .drop_duplicates(subset=["gpu_model","price_hourly_usd"], keep="last")
            .reset_index(drop=True))
    return _ensure_slim(df)

# -------- runner that works in both notebooks & scripts --------
def arun(coro):
    """Run a coroutine to completion and return its result.

    Without a running event loop (plain script) this delegates to
    ``asyncio.run``. Inside a running loop (notebook kernel) it patches the
    loop with ``nest_asyncio`` and drives the coroutine on that loop.
    """
    try:
        active_loop = asyncio.get_running_loop()
    except RuntimeError:
        active_loop = None

    if active_loop is None:
        return asyncio.run(coro)

    # re-entrant run_until_complete needs the nest_asyncio patch
    import nest_asyncio
    nest_asyncio.apply()
    return active_loop.run_until_complete(coro)

# ------------------------------ RUN --------------------------------
# Scrape Brokkr (launches headless Chromium), then persist the result as
# snapshot / history / latest CSVs; `_save_provider` returns the "latest" frame.
df_brokkr = arun(scrape_brokkr())
latest_brokkr = _save_provider(df_brokkr, "brokkr")
# Plain-text dump of every scraped row, followed by the rich notebook preview.
# `display` is the IPython builtin, so this cell only runs inside a notebook kernel.
print(df_brokkr)
display(df_brokkr.head())


[brokkr] snapshot -> docs/data/snapshots/20250904_132203_brokkr.csv
[brokkr] history  -> docs/data/history/brokkr_history.csv
[brokkr] latest   -> docs/data/latest/brokkr_latest.csv
              provider  region gpu_model       type duration  gpu_count  \
0  Hydra Host (Brokkr)  Global      H100  On-Demand       1h          1   
1  Hydra Host (Brokkr)  Global      H200  On-Demand       1h          1   

   price_hourly_usd                              source_url  \
0               2.3  https://brokkr.hydrahost.com/inventory   
1               2.5  https://brokkr.hydrahost.com/inventory   

       fetched_at_utc  
0 2025-09-04 13:22:03  
1 2025-09-04 13:22:03  


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,price_hourly_usd,source_url,fetched_at_utc
0,Hydra Host (Brokkr),Global,H100,On-Demand,1h,1,2.3,https://brokkr.hydrahost.com/inventory,2025-09-04 13:22:03
1,Hydra Host (Brokkr),Global,H200,On-Demand,1h,1,2.5,https://brokkr.hydrahost.com/inventory,2025-09-04 13:22:03


In [22]:
# ============ Crusoe Cloud (async) — table scrape → slim schema + history ============

import re, asyncio, pandas as pd, tempfile
from datetime import datetime, timezone
from pathlib import Path
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

# ---------- slim schema + storage helpers (same as other providers) ----------
# Canonical column order shared by every provider frame written to disk.
SLIM_COLS = [
    "provider","region","gpu_model","type","duration","gpu_count",
    "price_hourly_usd","source_url","fetched_at_utc"
]
# Output layout, relative to the notebook's working directory.
BASE = Path("docs/data")
HIST_DIR = BASE / "history"
SNAP_DIR = BASE / "snapshots"
LATEST_DIR = BASE / "latest"
# Best-effort directory creation: failures are ignored here because
# `_safe_to_csv` retries the mkdir and falls back to the system temp dir.
for d in (HIST_DIR, SNAP_DIR, LATEST_DIR):
    try: d.mkdir(parents=True, exist_ok=True)
    except Exception: pass

def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _ensure_slim(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` restricted to SLIM_COLS, in that order.

    Columns missing from the input are added filled with None. The price
    column is coerced to numeric and ``fetched_at_utc`` to naive (tz-stripped)
    datetimes; unparseable values become NaN / NaT.
    """
    slim = df.copy()
    absent = [col for col in SLIM_COLS if col not in slim.columns]
    for col in absent:
        slim[col] = None

    slim["price_hourly_usd"] = pd.to_numeric(slim["price_hourly_usd"], errors="coerce")
    # parse as UTC first, then drop the tz so values are naive datetimes
    fetched = pd.to_datetime(slim["fetched_at_utc"], errors="coerce", utc=True)
    slim["fetched_at_utc"] = fetched.dt.tz_convert(None)
    return slim[SLIM_COLS]

def _safe_to_csv(df: pd.DataFrame, path: Path):
    try:
        path.parent.mkdir(parents=True, exist_ok=True); df.to_csv(path, index=False); return path
    except Exception:
        tmp = Path(tempfile.gettempdir()) / path.name; df.to_csv(tmp, index=False); return tmp

def _save_provider(df: pd.DataFrame, slug: str):
    """Write snapshot, history and latest CSVs for one provider scrape.

    ``df`` is coerced to the slim schema, saved as a timestamped snapshot,
    merged into ``<slug>_history.csv`` (dropping rows without a GPU model or
    price and exact duplicates), and reduced to the newest row per
    gpu_model/type/region/duration for ``<slug>_latest.csv``. The written
    paths are printed.

    Returns the "latest" DataFrame.
    """
    frame = _ensure_slim(df)
    run_tag = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    snapshot_written = _safe_to_csv(frame, SNAP_DIR / f"{run_tag}_{slug}.csv")

    # history append+dedupe
    history_target = HIST_DIR / f"{slug}_history.csv"
    if not history_target.exists():
        merged = frame.copy()
    else:
        earlier = pd.read_csv(history_target, low_memory=False)
        # timestamps come back from CSV as text; normalise to naive datetimes
        stamps = pd.to_datetime(earlier["fetched_at_utc"], errors="coerce", utc=True)
        earlier["fetched_at_utc"] = stamps.dt.tz_convert(None)
        merged = pd.concat([earlier, frame], ignore_index=True)

    merged = merged.dropna(subset=["gpu_model","price_hourly_usd"])
    merged = merged.drop_duplicates(
        subset=["provider","region","gpu_model","type","duration",
                "fetched_at_utc","price_hourly_usd"],
        keep="last",
    )
    merged = merged.sort_values("fetched_at_utc")
    history_written = _safe_to_csv(merged, history_target)

    # latest per gpu/type/region/duration
    newest = merged.sort_values("fetched_at_utc").drop_duplicates(
        subset=["gpu_model","type","region","duration"], keep="last"
    )
    latest_written = _safe_to_csv(newest, LATEST_DIR / f"{slug}_latest.csv")

    print(f"[{slug}] snapshot -> {snapshot_written}\n[{slug}] history  -> {history_written}\n[{slug}] latest   -> {latest_written}")
    return newest

# ---------- parsing helpers ----------
PRICE_RE = re.compile(r"\$?\s*([0-9]+(?:\.[0-9]+)?)\s*(?:/|\s*per\s*)?\s*(?:h|hr|hour)\b", re.I)

def _parse_price(cell_text: str):
    if not cell_text: return None
    m = PRICE_RE.search(cell_text.replace(",", ""))
    if not m:
        # fallback: plain $N.NN without explicit /hr
        m2 = re.search(r"\$?\s*([0-9]+(?:\.[0-9]+)?)\b", cell_text.replace(",", ""))
        return float(m2.group(1)) if m2 else None
    return float(m.group(1))

def _is_h_model(text: str) -> bool:
    t = text.upper()
    return ("H100" in t) or ("H200" in t)

def _model_from(text: str) -> str:
    return "H100" if "H100" in text.upper() else "H200"

# ---------- scraper ----------
async def scrape_crusoe_table() -> pd.DataFrame:
    """Scrape H100/H200 prices from the Crusoe pricing tables.

    Renders the page with headless Chromium, then walks every ``<tr>`` that
    mentions H100/H200 and reads prices purely by cell position:
    index 1 = On-Demand, 3 = Reserved-6m, 4 = Reserved-1y, 5 = Reserved-3y.
    Cells that are missing or hold no number produce no row.

    Returns a slim-schema frame, de-duplicated on (gpu_model, type, price) and
    limited to prices in the open interval (0, 200). Empty if nothing matched.
    """
    url = "https://www.crusoe.ai/cloud/pricing"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, timeout=90000, wait_until="domcontentloaded")
            # help render
            for _ in range(2):
                await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
                await page.wait_for_timeout(800)
            html = await page.content()
        finally:
            # close the browser even if navigation or rendering raised
            await browser.close()

    soup = BeautifulSoup(html, "html.parser")
    # one timestamp for the whole scrape so all rows of a run share it
    fetched_at = _now_iso()

    # (pricing type, cell index) — assumed layout:
    # [Model, On-Demand, Spot?, Reserved 6m, Reserved 1y, Reserved 3y, ...]
    # NOTE(review): mapping is positional only; header text is not consulted.
    price_columns = (
        ("On-Demand", 1),
        ("Reserved-6m", 3),
        ("Reserved-1y", 4),
        ("Reserved-3y", 5),
    )

    rows_out = []
    # Find all table rows; filter to those that mention H100/H200
    for tr in soup.find_all("tr"):
        tds = [c.get_text(" ", strip=True) for c in tr.find_all("td")]
        if not tds or not any(_is_h_model(c) for c in tds):
            continue

        model = _model_from(" ".join(tds))

        for price_type, idx in price_columns:
            price = _parse_price(tds[idx]) if len(tds) > idx else None
            if price is None:
                continue
            rows_out.append({
                "provider": "CrusoeCloud",
                "region": "Global",
                "gpu_model": model,
                "type": price_type,
                "duration": "1h",
                "gpu_count": 1,
                "price_hourly_usd": price,
                "source_url": url,
                "fetched_at_utc": fetched_at,
            })

    df = pd.DataFrame(rows_out, columns=SLIM_COLS)
    if df.empty:
        return _ensure_slim(df)

    # de-dupe & sanity
    df = (df.sort_values(["gpu_model","type","price_hourly_usd","fetched_at_utc"])
            .drop_duplicates(subset=["gpu_model","type","price_hourly_usd"], keep="last")
            .reset_index(drop=True))
    # plausible hourly band
    df = df[(df["price_hourly_usd"] > 0) & (df["price_hourly_usd"] < 200)]
    return _ensure_slim(df)

# ---------- runner that works in scripts & notebooks ----------
def arun(coro):
    """Synchronously execute ``coro`` and return its result.

    Uses ``asyncio.run`` when no event loop is running; otherwise applies
    ``nest_asyncio`` and runs the coroutine on the already-running loop.
    """
    loop_is_running = True
    try:
        running = asyncio.get_running_loop()
    except RuntimeError:
        loop_is_running = False

    if loop_is_running:
        # notebook case: allow nested run_until_complete on the live loop
        import nest_asyncio
        nest_asyncio.apply()
        return running.run_until_complete(coro)
    return asyncio.run(coro)

# ------------------------------ RUN --------------------------------
# Scrape Crusoe (launches headless Chromium), persist snapshot / history /
# latest CSVs, then print up to 20 of the scraped rows as plain text.
df_crusoe = arun(scrape_crusoe_table())
latest_crusoe = _save_provider(df_crusoe, "crusoecloud")
print(df_crusoe.head(20))


[crusoecloud] snapshot -> docs/data/snapshots/20250904_132724_crusoecloud.csv
[crusoecloud] history  -> docs/data/history/crusoecloud_history.csv
[crusoecloud] latest   -> docs/data/latest/crusoecloud_latest.csv
      provider  region gpu_model         type duration  gpu_count  \
0  CrusoeCloud  Global      H100    On-Demand       1h          1   
1  CrusoeCloud  Global      H100  Reserved-1y       1h          1   
2  CrusoeCloud  Global      H100  Reserved-3y       1h          1   
3  CrusoeCloud  Global      H100  Reserved-6m       1h          1   
4  CrusoeCloud  Global      H200    On-Demand       1h          1   
5  CrusoeCloud  Global      H200  Reserved-1y       1h          1   
6  CrusoeCloud  Global      H200  Reserved-3y       1h          1   
7  CrusoeCloud  Global      H200  Reserved-6m       1h          1   

   price_hourly_usd                           source_url      fetched_at_utc  
0              3.90  https://www.crusoe.ai/cloud/pricing 2025-09-04 13:27:24  
1       

In [32]:
# OVHcloud H100/H200 — get the *correct per-GPU hourly price* from the public prices table
# - Ties each $…/hour to the same row as H100/H200
# - Extracts the GPU count from the row (1×/2×/4×/8× or “… GPUs”)
# - per_gpu = instance_price / parsed_gpu_count  (NO 8× assumption)
# - Returns both instance price and per-GPU price

import re, requests, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

def timestamp():
    """Return the current UTC time as a naive ISO-8601 string.

    Uses an aware ``now()`` and strips the tzinfo instead of the deprecated
    ``datetime.utcnow()``; the returned format (no UTC offset suffix) is
    unchanged.
    """
    from datetime import timezone  # local import: this cell imports only `datetime`
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

USD_HOURLY = re.compile(r"\$\s*([0-9]+(?:\.[0-9]+)?)\s*(?:/|\s*(?:per|an)\s*)?(?:h|hr|hour)\b", re.I)

# GPU count detectors (row-level + cell/variant-level)
COUNT_PATS = [
    re.compile(r"(\d+)\s*[×x]\s*(?:NVIDIA\s*)?(H100|H200)\b", re.I),  # "8× H100"
    re.compile(r"(H100|H200)\s*[×x]\s*(\d+)\b", re.I),               # "H100 × 8"
    re.compile(r"(\d+)\s*(?:GPU|GPUs)\b", re.I),                     # "8 GPUs"
    re.compile(r"\b(\d+)\s*[×x]\b", re.I),                           # "4x"
]

def _parse_count(text: str):
    for pat in COUNT_PATS:
        m = pat.search(text)
        if not m: 
            continue
        for g in m.groups():
            if g and g.isdigit():
                n = int(g)
                if 1 <= n <= 16:
                    return n
    return None

def scrape_ovhcloud_correct(url="https://www.ovhcloud.com/en/public-cloud/prices/") -> pd.DataFrame:
    """Scrape per-instance and per-GPU hourly H100/H200 prices from OVHcloud.

    Every table row mentioning H100/H200 is scanned cell by cell for a
    "$N/hour"-style price. The GPU count is taken from the price cell, then
    the column header, then the whole row; when no count can be found the
    price is skipped rather than divided by a guess. Per-GPU prices outside
    0.25..20.0 are dropped.

    Returns a frame de-duplicated on (gpu_model, instance_type, instance
    price); empty when nothing matched.
    """
    response = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=60)
    soup = BeautifulSoup(response.text, "html.parser")

    records = []
    for table in soup.select("table"):
        # column labels, used to tag which column a price came from
        column_labels = [th.get_text(" ", strip=True) for th in table.select("thead th")]
        if not column_labels:
            first_row = table.find("tr")
            if first_row:
                column_labels = [c.get_text(" ", strip=True) for c in first_row.find_all(["th","td"])]

        for tr in (table.select("tbody tr") or table.select("tr")):
            data_cells = tr.find_all("td")
            if not data_cells:
                continue

            texts = [td.get_text(" ", strip=True) for td in data_cells]
            joined = " ".join(texts)
            joined_upper = joined.upper()
            mentions_h100 = "H100" in joined_upper
            if not mentions_h100 and "H200" not in joined_upper:
                continue

            model = "H100" if mentions_h100 else "H200"
            fallback_count = _parse_count(joined)

            for position, text in enumerate(texts):
                price_hit = USD_HOURLY.search(text)
                if not price_hit:
                    continue
                instance_price = float(price_hit.group(1))

                # label for this column: header text when available, else col_N
                if position < len(column_labels):
                    variant = column_labels[position]
                else:
                    variant = f"col_{position+1}"

                # most specific source first: cell, then header, then whole row
                gpu_total = _parse_count(text) or _parse_count(variant) or fallback_count
                if gpu_total is None:
                    # node size unproven: skip instead of dividing by a guess
                    continue

                per_gpu = instance_price / gpu_total
                if not (0.25 <= per_gpu <= 20.0):
                    continue

                records.append({
                    "provider": "OVHcloud",
                    "region": "Global",
                    "gpu_model": model,
                    "instance_type": f"public-cloud/{variant}",
                    "gpu_count": int(gpu_total),
                    "price_hourly_usd_instance": round(instance_price, 4),
                    "price_hourly_usd_per_gpu": round(per_gpu, 4),
                    "price_reserved_usd": None,
                    "reserved_duration": None,
                    "timestamp": timestamp(),
                })

    result = pd.DataFrame(records)
    if result.empty:
        return result
    result = result.sort_values(["gpu_model","price_hourly_usd_instance"])
    result = result.drop_duplicates(
        subset=["gpu_model","instance_type","price_hourly_usd_instance"], keep="last"
    )
    return result.reset_index(drop=True)

# Example
# Live HTTP request to the OVHcloud price page; prints up to 20 parsed rows.
# The `df_` prefix matters: the unify/score cell collects every global
# DataFrame whose name starts with `df_` or `latest_`.
df_ovh = scrape_ovhcloud_correct()
print(df_ovh.head(20))

   provider  region gpu_model       instance_type  gpu_count  \
0  OVHcloud  Global      H100  public-cloud/Price          2   
1  OVHcloud  Global      H100  public-cloud/Price          4   

   price_hourly_usd_instance  price_hourly_usd_per_gpu price_reserved_usd  \
0                       5.98                    2.9900               None   
1                      11.97                    2.9925               None   

  reserved_duration                   timestamp  
0              None  2025-09-04T15:18:12.093698  
1              None  2025-09-04T15:18:12.093773  


In [45]:
# === SAFETY HARDENER (drop-in) ===============================================
import pandas as pd, numpy as np

NUM_COLS = [
    "price_hourly_usd","price_hourly_usd_instance","price_reserved_usd",
    "effective_price_usd_per_gpu_hr","gpu_count",
    "market_med","market_mean","market_median","market_p25","market_p75","market_cnt","market_iqr",
    "price_score","premium_vs_median","premium_vs_mkt"
]
DATE_COLS = ["fetched_at_utc","timestamp","asof_utc","ts_utc","ts_iso","last_obs_date","date_target"]

def _sanitize(df: pd.DataFrame) -> pd.DataFrame:
    if not isinstance(df, pd.DataFrame): return df

    # 1) datetimes
    for c in DATE_COLS:
        if c in df.columns:
            s = pd.to_datetime(df[c], errors="coerce", utc=True)
            df[c] = s.dt.tz_convert(None)

    # keep a pure pandas datetime64 day column (not python date objects)
    if "date" in df.columns:
        d = pd.to_datetime(df["date"], errors="coerce")
        # if it was tz-aware, drop tz; always floor to day
        if getattr(d.dtype, "tz", None) is not None:
            d = d.dt.tz_convert(None)
        df["date"] = d.dt.floor("D")

    # 2) numerics (coerce before any clip/ratios)
    for c in NUM_COLS:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    return df

# Sanitize whichever of these well-known frames currently exist as globals in
# the kernel. Which ones exist depends on the cells executed before this one.
for _name in ("raw","df","qa","market","scored","od","norm","hist","model_df","last_rows"):
    if _name in globals() and isinstance(globals()[_name], pd.DataFrame):
        globals()[_name] = _sanitize(globals()[_name])

# Numeric coercion for the columns used in `last_rows` ratios (ML baseline).
# Of these, only "market_med" is not also coerced by the loop above when it
# is absent there; the rest repeat that coercion and are harmless.
if "last_rows" in globals() and isinstance(last_rows, pd.DataFrame):
    for c in ["market_p25","market_p75","market_med","price_hourly_usd"]:
        if c in last_rows.columns:
            last_rows[c] = pd.to_numeric(last_rows[c], errors="coerce")
# ============================================================================ 


In [47]:
# === ONE-CELL: unify → score ("Silicon-style") → ROI → write CSVs =============
import re, math, json, numpy as np, pandas as pd
from pathlib import Path
from datetime import datetime, timezone

# minimal hardening (drop-in before scoring/ROI)
import pandas as pd, numpy as np

# NOTE(review): `df` is not defined in this cell — it must already exist in the
# kernel from an earlier cell, so this fails with NameError on a fresh
# Restart & Run All unless that cell ran first. `df` is not used again below.
df["fetched_at_utc"] = pd.to_datetime(df["fetched_at_utc"], errors="coerce", utc=True)
# naive calendar day (UTC) of each observation
df["date"] = df["fetched_at_utc"].dt.tz_convert(None).dt.floor("D")

# coerce whichever of these columns exist to numeric; bad values become NaN
for c in ["price_hourly_usd","effective_price_usd_per_gpu_hr",
          "market_median","market_mean","market_p25","market_p75","market_cnt"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")


# ---------- config ----------
# Paths are relative to the notebook's working directory. These assignments
# rebind the BASE / LATEST_DIR globals set by the scraper cells (same values).
BASE = Path("docs/data")
LATEST_DIR  = BASE / "latest"
DERIVED_DIR = BASE / "derived"
for d in (LATEST_DIR, DERIVED_DIR): d.mkdir(parents=True, exist_ok=True)

def _now_iso(): return datetime.now(timezone.utc).isoformat(timespec="seconds")

# ---------- helpers ----------
def to_hours(s):
    """Convert a duration string such as "1h", "2 days" or "1 month" to hours.

    Matching is case-insensitive and a trailing plural "s" is ignored.
    Recognised units: h/hr/hour, d/day, w/wk/week, mo/month (month = 720 h).
    Returns NaN for non-strings, blank strings, unparseable text and
    unknown units.
    """
    if not isinstance(s, str) or not s.strip():
        return np.nan

    normalised = s.strip().lower().rstrip("s")
    parsed = re.match(r"^(\d+(?:\.\d+)?)\s*([a-z]+)$", normalised)
    if parsed is None:
        return np.nan

    quantity = float(parsed.group(1))
    unit = parsed.group(2)
    hours_per_unit = {"h":1,"hr":1,"hour":1,"d":24,"day":24,"w":168,"wk":168,"week":168,"mo":720,"month":720}
    return quantity * hours_per_unit.get(unit, np.nan)

def _to_num(x):
    try:
        if x is None or (isinstance(x,str) and not x.strip()): return np.nan
        return float(re.sub(r"[^\d.\-]", "", str(x)))
    except: return np.nan

# Canonical column -> accepted source column names, in priority order.
EXPECTED = {
    "provider":            ["provider","Provider","name"],
    "region":              ["region","Region","location","geo","cloud_region","area"],
    "gpu_model":           ["gpu_model","gpu","model","GPU"],
    "gpu_count":           ["gpu_count","count","gpus","GPU_count"],
    "type":                ["type","price_type"],
    "duration":            ["duration","reserved_duration","term"],
    "price_hourly_usd":    ["price_hourly_usd","price","usd_per_gpu_hr","price_hourly_usd_per_gpu"],
    "price_reserved_usd":  ["price_reserved_usd","reserved_usd_per_gpu_hr"],
    "price_hourly_usd_instance": ["price_hourly_usd_instance","instance_price"],
    "timestamp":           ["timestamp","fetched_at_utc","ts_utc","ts_iso","time","start_time_iso"],
    "source_url":          ["source_url","url","link"]
}

def normalise_any(df: pd.DataFrame) -> pd.DataFrame:
    """Map any provider frame into a canonical schema and compute effective per-GPU price.

    Synonym columns (see EXPECTED) are coalesced row by row in priority order,
    so a frame built by concatenating providers that use different column
    names keeps each row's value. Missing text fields get defaults
    (region "Global", type "On-Demand", duration "1h").

    ``effective_price_usd_per_gpu_hr`` is the per-GPU hourly price; when that
    is missing it falls back to the instance price divided by ``gpu_count``,
    then to the reserved price (divided by the duration in hours when the
    reserved price is > 10 and the duration parses).

    Only H100/H200 rows with an effective price in 0.05..200 are returned.
    ``timestamp`` is returned as a "%Y-%m-%dT%H:%M:%SZ" string (NaN when
    unknown). Canonical columns come first, followed by any other columns.
    """
    df = df.copy()

    # map synonyms → columns, filling gaps from lower-priority synonyms
    for tgt, cands in EXPECTED.items():
        present = [c for c in cands if c in df.columns]
        if not present:
            df[tgt] = pd.NA
            continue
        merged = df[present[0]]
        for c in present[1:]:
            merged = merged.where(merged.notna(), df[c])
        df[tgt] = merged

    def _text_or_default(col: str, default: str) -> pd.Series:
        # fill real missing values BEFORE astype(str), which would otherwise
        # turn them into the literal strings "nan" / "None" / "<NA>"
        s = df[col]
        s = s.where(s.notna(), default).astype(str).str.strip()
        return s.where(~s.isin(["", "nan", "None", "<NA>"]), default)

    # types & defaults
    df["gpu_model"] = df["gpu_model"].astype(str).str.upper().str.strip()
    df["region"]    = _text_or_default("region", "Global")
    df["type"]      = _text_or_default("type", "On-Demand")
    df["duration"]  = _text_or_default("duration", "1h")
    df["gpu_count"] = pd.to_numeric(df["gpu_count"], errors="coerce")
    # parse element-wise: the coalesced column can mix strings in different
    # formats with Timestamp objects, which a single inferred format would reject
    ts_parsed = df["timestamp"].map(lambda v: pd.to_datetime(v, errors="coerce", utc=True))
    df["timestamp"] = pd.to_datetime(ts_parsed, errors="coerce", utc=True)
    df["price_hourly_usd"]    = pd.to_numeric(df["price_hourly_usd"].map(_to_num))
    df["price_reserved_usd"]  = pd.to_numeric(df["price_reserved_usd"].map(_to_num))
    df["price_hourly_usd_instance"] = pd.to_numeric(df["price_hourly_usd_instance"].map(_to_num))

    # if an instance price exists but per-GPU doesn't, divide by proven gpu_count
    mask_need_split = df["price_hourly_usd"].isna() & df["price_hourly_usd_instance"].notna() & df["gpu_count"].notna()
    df.loc[mask_need_split, "price_hourly_usd"] = df.loc[mask_need_split, "price_hourly_usd_instance"] / df.loc[mask_need_split, "gpu_count"]

    # effective per-GPU hourly (prefer on-demand; else reserved; if reserved looks "total for term", divide by hours)
    eff = df["price_hourly_usd"].copy()
    dur_h = df["duration"].map(to_hours)
    use_reserved = eff.isna() & df["price_reserved_usd"].notna()
    total_like   = (df["price_reserved_usd"] > 10) & dur_h.notna()
    eff[use_reserved & total_like] = df["price_reserved_usd"] / dur_h
    eff[use_reserved & ~total_like] = df["price_reserved_usd"]
    df["effective_price_usd_per_gpu_hr"] = eff

    # keep only sane rows; .copy() so the assignment below targets a real frame
    keep = df["gpu_model"].isin(["H100","H200"]) & df["effective_price_usd_per_gpu_hr"].between(0.05, 200)
    df = df[keep].copy()
    # final timestamp ISO
    df["timestamp"] = df["timestamp"].dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    cols = ["provider","region","gpu_model","type","duration","gpu_count",
            "price_hourly_usd","price_reserved_usd","effective_price_usd_per_gpu_hr",
            "timestamp","source_url"]
    extra = [c for c in df.columns if c not in cols]
    return df[cols+extra]

def percentile_rev(s: pd.Series) -> pd.Series:
    """Rescale percentile ranks of ``s`` so the lowest value scores 1 and the highest ~0.

    A single-element (or all-equal) series scores 1.0 throughout; the 1e-12
    term only guards the division against a zero range.
    """
    # lower price → higher score percentile
    r = s.rank(pct=True, method="average")
    return 1.0 - (r - r.min())/(r.max()-r.min()+1e-12)

def add_priceiq_score(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a 0..100 ``priceiq_score`` column.

    score = 75 * price percentile (cheaper is better, ranked within
            gpu_model, and also within region when more than one region exists)
          + capacity bonus 0..15 (log-scaled gpu_count, missing counts as 1)
          + recency bonus 0..10 (exponential decay over 72 h of age)

    Rows whose ``timestamp`` is missing or unparseable get no recency bonus.
    """
    df = df.copy()
    grp = ["gpu_model","region"] if df["region"].nunique() > 1 else ["gpu_model"]
    df["_pct"] = df.groupby(grp)["effective_price_usd_per_gpu_hr"].transform(percentile_rev)
    # capacity bonus (0..15)
    cap = np.log1p(df["gpu_count"].fillna(1))/np.log1p(256)
    df["_cap"] = 15.0*cap.clip(0,1)

    # recency bonus (0..10); one reference "now" so all rows are scored alike
    now = pd.Timestamp.now(tz="UTC")
    def rec(ts):
        try:
            t = pd.to_datetime(ts, errors="coerce", utc=True)
            if pd.isna(t):
                # unknown age must not look brand new: max(0.0, nan) would be 0.0
                return 0.0
            age_h = max(0.0, (now - t).total_seconds()/3600)
            return 10.0*math.exp(-age_h/72.0)
        except (TypeError, ValueError, OverflowError):
            return 0.0
    df["_rec"] = df["timestamp"].map(rec)
    df["priceiq_score"] = (df["_pct"]*75.0 + df["_cap"] + df["_rec"]).clip(0,100)
    return df.drop(columns=["_pct","_cap","_rec"], errors="ignore")

# ---------- 1) gather frames: in-memory first, else docs/data/latest/*.csv ----------
frames = []
# Collect every non-empty global DataFrame named df_* or latest_*.
# NOTE(review): a provider that has both df_<x> and latest_<x> in memory is
# collected twice; the dedupe below only removes rows whose key columns match.
for name, obj in list(globals().items()):
    if isinstance(obj, pd.DataFrame) and (name.startswith("df_") or name.startswith("latest_")):
        if not obj.empty:
            frames.append(obj.copy())

# Nothing in memory: fall back to the per-provider "latest" CSVs on disk.
if not frames:
    for p in sorted(LATEST_DIR.glob("*_latest.csv")):
        try:
            frames.append(pd.read_csv(p))
        except Exception as e:
            print(f"[WARN] failed to load {p.name}: {e}")

if not frames:
    raise RuntimeError("No provider frames found (neither in-memory nor docs/data/latest/*.csv). Run scrapers first.")

raw = pd.concat(frames, ignore_index=True)
norm = normalise_any(raw)

# dedupe: latest by (provider, region, gpu_model, type, duration, gpu_count, effective price)
key = ["provider","region","gpu_model","type","duration","gpu_count","effective_price_usd_per_gpu_hr"]
norm = (norm.sort_values("timestamp").drop_duplicates(subset=key, keep="last").reset_index(drop=True))

# ---------- 2) build market baselines IN-PLACE (no merge needed) ----------
# Baselines are computed per (gpu_model, region) over ALL rows in `norm`;
# no filter on `type` is applied here, so reserved rows are included.
scored = norm.copy()
gcols = ["gpu_model", "region"]
g = scored.groupby(gcols)["effective_price_usd_per_gpu_hr"]

# NOTE(review): the count column is named "market_count" here, while the
# hardener cells coerce a column called "market_cnt" — confirm which is intended.
scored["market_count"]  = g.transform("size")
scored["market_median"] = g.transform("median")
scored["market_mean"]   = g.transform("mean")
scored["market_p25"]    = g.transform(lambda s: s.quantile(0.25))
scored["market_p75"]    = g.transform(lambda s: s.quantile(0.75))
scored["market_iqr"]    = scored["market_p75"] - scored["market_p25"]
scored["asof_utc"]      = _now_iso()

# safe premium vs median: a zero median becomes NaN to avoid division by zero
mm = scored["market_median"].replace(0, np.nan)
scored["premium_vs_median"] = (scored["effective_price_usd_per_gpu_hr"] - mm) / mm

# (optional) also keep a compact market table for debugging/export
market = (scored[[*gcols, "market_count","market_median","market_mean","market_p25","market_p75","market_iqr"]]
          .drop_duplicates().reset_index(drop=True))

# ---------- 3) add Silicon-style score & write provider_scores_latest.csv ----------
scored = add_priceiq_score(scored)
scores_path = DERIVED_DIR / "provider_scores_latest.csv"
scored.to_csv(scores_path, index=False)
print("✔ wrote", scores_path, f"({len(scored)} rows)")


# ---------- 4) ROI table (cheapest total cost per scenario) ----------
def roi_table(df: pd.DataFrame) -> pd.DataFrame:
    """Build the cheapest-offer table for a fixed grid of rental scenarios.

    For each GPU model (H100/H200 when present, otherwise every model found),
    each GPU count in 8/16/32/64 and each duration in 1 hour/day/week/month,
    the row with the lowest ``effective_price_usd_per_gpu_hr * count * hours``
    is selected. Rows without an effective price are ignored.

    Returns one row per scenario, or an empty DataFrame when no row is priced.
    """
    priced = df[df["effective_price_usd_per_gpu_hr"].notna()].copy()
    if priced.empty:
        return pd.DataFrame()

    available = set(priced["gpu_model"])
    models = [m for m in ["H100","H200"] if m in available] or sorted(priced["gpu_model"].unique())
    gpu_counts = [8,16,32,64]
    # (label, hours) for every rental term that parses to a usable hour count
    terms = []
    for label in ["1 hour","1 day","1 week","1 month"]:
        hours = to_hours(label)
        if not hours or np.isnan(hours):
            continue
        terms.append((label, hours))

    records = []
    for model in models:
        offers = priced[priced["gpu_model"].eq(model)]
        if offers.empty:
            continue
        for n_gpus in gpu_counts:
            for label, hours in terms:
                costed = offers.copy()
                costed["total_cost"] = costed["effective_price_usd_per_gpu_hr"] * n_gpus * hours
                cheapest = costed.sort_values(["total_cost","effective_price_usd_per_gpu_hr"]).head(1)
                if cheapest.empty:
                    continue
                winner = cheapest.iloc[0]
                records.append({
                    "gpu_model": model,
                    "gpu_count": int(n_gpus),
                    "duration": label,
                    "best_provider": winner["provider"],
                    "best_region": winner["region"],
                    "price_per_gpu_hr": round(float(winner["effective_price_usd_per_gpu_hr"]), 4),
                    "total_cost_usd": round(float(winner["total_cost"]), 2),
                    "timestamp": winner["timestamp"]
                })
    return pd.DataFrame(records)

# Build the scenario table from the scored frame and write it for the dashboard.
roi_df = roi_table(scored)
roi_path = DERIVED_DIR / "roi_comparison.csv"
roi_df.to_csv(roi_path, index=False)
print("✔ wrote", roi_path, f"({len(roi_df)} rows)")

# ---------- 5) (optional) market index for debugging ----------
# One row per (gpu_model, region) with the baseline statistics computed above.
market_path = DERIVED_DIR / "market_index.csv"
market.to_csv(market_path, index=False)
print("✔ wrote", market_path)

# quick peek (`display` is the IPython builtin; notebook-only)
display(scored.head(10)[["provider","region","gpu_model","type","duration","gpu_count","effective_price_usd_per_gpu_hr","priceiq_score"]])
display(roi_df.head(8))


✔ wrote docs/data/derived/provider_scores_latest.csv (19 rows)
✔ wrote docs/data/derived/roi_comparison.csv (32 rows)
✔ wrote docs/data/derived/market_index.csv


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,effective_price_usd_per_gpu_hr,priceiq_score
0,Nebius,US,H200,On-Demand,1h,,2.3,79.238604
1,Nebius,Global,H100,On-Demand,1h,,2.0,69.864799
2,Nebius,Global,H200,On-Demand,1h,,2.3,79.239799
3,VoltagePark,US,H100,On-Demand,1h,,1.99,79.269266
4,Vast.ai,Global,H100,On-Demand,1h,,1.25,79.272484
5,Shadeform,Global,H200,On-Demand,1h,,2.45,66.774049
6,Paperspace,Global,H100,On-Demand,1h,,2.24,60.535615
7,CoreWeave,US,H100,On-Demand,1h,1.0,20.0,4.290226
8,TensorDock,Global,H100,On-Demand,1h,1.0,2.25,51.166597
9,Hydra Host (Brokkr),Global,H200,On-Demand,1h,1.0,2.5,54.294696


Unnamed: 0,gpu_model,gpu_count,duration,best_provider,best_region,price_per_gpu_hr,total_cost_usd,timestamp
0,H100,8,1 hour,Vast.ai,Global,1.25,10.0,2025-09-04T12:42:14Z
1,H100,8,1 day,Vast.ai,Global,1.25,240.0,2025-09-04T12:42:14Z
2,H100,8,1 week,Vast.ai,Global,1.25,1680.0,2025-09-04T12:42:14Z
3,H100,8,1 month,Vast.ai,Global,1.25,7200.0,2025-09-04T12:42:14Z
4,H100,16,1 hour,Vast.ai,Global,1.25,20.0,2025-09-04T12:42:14Z
5,H100,16,1 day,Vast.ai,Global,1.25,480.0,2025-09-04T12:42:14Z
6,H100,16,1 week,Vast.ai,Global,1.25,3360.0,2025-09-04T12:42:14Z
7,H100,16,1 month,Vast.ai,Global,1.25,14400.0,2025-09-04T12:42:14Z


In [44]:
# === SAFETY HARDENER (drop-in) ===============================================
import pandas as pd, numpy as np

NUM_COLS = [
    "price_hourly_usd","price_hourly_usd_instance","price_reserved_usd",
    "effective_price_usd_per_gpu_hr","gpu_count",
    "market_med","market_mean","market_median","market_p25","market_p75","market_cnt","market_iqr",
    "price_score","premium_vs_median","premium_vs_mkt"
]
DATE_COLS = ["fetched_at_utc","timestamp","asof_utc","ts_utc","ts_iso","last_obs_date","date_target"]

def _sanitize(df: pd.DataFrame) -> pd.DataFrame:
    if not isinstance(df, pd.DataFrame): return df

    # 1) datetimes
    for c in DATE_COLS:
        if c in df.columns:
            s = pd.to_datetime(df[c], errors="coerce", utc=True)
            df[c] = s.dt.tz_convert(None)

    # keep a pure pandas datetime64 day column (not python date objects)
    if "date" in df.columns:
        d = pd.to_datetime(df["date"], errors="coerce")
        # if it was tz-aware, drop tz; always floor to day
        if getattr(d.dtype, "tz", None) is not None:
            d = d.dt.tz_convert(None)
        df["date"] = d.dt.floor("D")

    # 2) numerics (coerce before any clip/ratios)
    for c in NUM_COLS:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    return df

# Sanitise every known frame name that is currently bound to a DataFrame in the
# notebook namespace; names that are unbound or hold something else are skipped.
for _name in ("raw","df","qa","market","scored","od","norm","hist","model_df","last_rows"):
    if isinstance(globals().get(_name), pd.DataFrame):
        globals()[_name] = _sanitize(globals()[_name])

# last_rows feeds ratio calculations (ML baseline), so make sure the ratio
# inputs it carries are numeric.
if isinstance(globals().get("last_rows"), pd.DataFrame):
    for c in (col for col in ("market_p25","market_p75","market_med","price_hourly_usd")
              if col in last_rows.columns):
        last_rows[c] = pd.to_numeric(last_rows[c], errors="coerce")
# ============================================================================ 


In [46]:
# ==== ML (robust): Next-day per-GPU price with safe fallback ==================
# Works on old sklearn. Trains only if enough history; else baseline fallback.
import re, numpy as np, pandas as pd
from pathlib import Path
from datetime import timedelta
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

# ---- helpers -----------------------------------------------------------------
def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not x100).

    Positions where the true value is exactly zero are left out of the
    average. Returns NaN when no position has a non-zero true value.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    usable = actual != 0
    if not np.any(usable):
        return np.nan
    relative_error = np.abs((actual[usable] - forecast[usable]) / np.abs(actual[usable]))
    return np.mean(relative_error)

def make_ohe():
    """Build a OneHotEncoder that ignores unknown categories and emits dense output.

    The dense-output keyword is tried as ``sparse_output`` first; if the
    constructor rejects it with TypeError, the ``sparse`` keyword is used
    instead (the keyword name differs between scikit-learn versions).
    """
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    return encoder

def _ensure_day_datetime(s: pd.Series) -> pd.Series:
    """datetime64[ns] day-floor; never python date; NaT on failure."""
    s = pd.to_datetime(s, errors="coerce")
    try: s = s.dt.tz_convert(None)
    except Exception: pass
    return s.dt.floor("D")

def _num(df: pd.DataFrame, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

# ---- paths -------------------------------------------------------------------
# Data layout relative to the working directory: per-provider history CSVs are
# read from HIST_DIR and the prediction file is written under DERIVED, which is
# created here if it does not exist yet.
BASE = Path("docs/data")
HIST_DIR, DERIVED = BASE.joinpath("history"), BASE.joinpath("derived")
DERIVED.mkdir(exist_ok=True, parents=True)

# ---- small utils reused from your aggregator ---------------------------------
def to_hours(s):
    """Convert a duration label such as ``"1h"``, ``"2 days"`` or ``"1 month"`` to hours.

    The label is trimmed, lower-cased and stripped of trailing ``s`` before
    being matched as ``<number><optional spaces><unit>``. Recognised units are
    h/hr/hour (1), d/day (24), w/wk/week (168) and mo/month (720). Returns NaN
    for non-string input, blank strings, labels that do not match the pattern
    and unrecognised units.
    """
    if not isinstance(s, str):
        return np.nan
    label = s.strip()
    if not label:
        return np.nan

    parsed = re.match(r"^(\d+(?:\.\d+)?)\s*([a-z]+)$", label.lower().rstrip("s"))
    if parsed is None:
        return np.nan

    quantity_text, unit = parsed.groups()
    hours_per_unit = {"h":1,"hr":1,"hour":1,"d":24,"day":24,"w":168,"wk":168,"week":168,"mo":720,"month":720}
    if unit not in hours_per_unit:
        return np.nan
    return float(quantity_text) * hours_per_unit[unit]

def _to_num(x):
    try:
        if x is None or (isinstance(x,str) and not x.strip()): return np.nan
        return float(re.sub(r"[^\d.\-]", "", str(x)))
    except: return np.nan

def normalise_history(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a concatenated price-history frame.

    Steps, in order:

    1. Add any missing required column (provider, region, gpu_model, type,
       duration, gpu_count, price_hourly_usd, fetched_at_utc) filled with
       ``pd.NA``.
    2. Upper-case and trim ``gpu_model``.
    3. Trim ``region``, ``type`` and ``duration`` and replace missing entries
       with "Global", "On-Demand" and "1h" respectively. An entry counts as
       missing if it is null (None/NaN/pd.NA), blank, or the literal text
       "nan". Nulls are detected before the values are turned into text, so
       they cannot leak through as literal strings.
    4. Coerce ``gpu_count`` to numeric, parse ``price_hourly_usd`` with
       ``_to_num`` and parse ``fetched_at_utc`` as UTC datetimes.
    5. Keep only rows whose gpu_model is H100 or H200 and whose price lies in
       [0.05, 200].
    6. Add ``hours`` (duration converted via ``to_hours``, defaulting to 1.0)
       and ``date`` (day-floored ``fetched_at_utc`` as a pandas datetime).

    The input frame is not modified; the original row index is preserved on
    the rows that survive the filter.
    """
    out = df.copy()
    req = ["provider","region","gpu_model","type","duration","gpu_count","price_hourly_usd","fetched_at_utc"]
    for c in req:
        if c not in out.columns:
            out[c] = pd.NA

    def _text_with_default(col: str, default: str) -> pd.Series:
        # Decide what is missing from the raw values first: astype(str) would
        # otherwise turn nulls into ordinary-looking strings.
        raw = out[col]
        text = raw.astype(str).str.strip()
        missing = raw.isna() | text.isin(["", "nan"])
        return text.mask(missing, default)

    out["gpu_model"] = out["gpu_model"].astype(str).str.upper().str.strip()
    out["region"]    = _text_with_default("region", "Global")
    out["type"]      = _text_with_default("type", "On-Demand")
    out["duration"]  = _text_with_default("duration", "1h")
    out["gpu_count"] = pd.to_numeric(out["gpu_count"], errors="coerce")
    out["price_hourly_usd"] = pd.to_numeric(out["price_hourly_usd"].map(_to_num))
    out["fetched_at_utc"]   = pd.to_datetime(out["fetched_at_utc"], errors="coerce", utc=True)

    keep = out["gpu_model"].isin(["H100","H200"]) & out["price_hourly_usd"].between(0.05, 200)
    # Copy so the column assignments below write to an independent frame
    # rather than to a filtered view of `out`.
    out = out.loc[keep].copy()

    out["hours"] = out["duration"].map(to_hours).fillna(1.0)
    # IMPORTANT: keep as a pandas datetime column, not python date objects
    out["date"]  = _ensure_day_datetime(out["fetched_at_utc"])
    return out

# 1) Load & unify history
# Every "*_history.csv" under HIST_DIR is read in sorted filename order; a file
# that fails to load is reported with a warning and skipped.
frames = []
for p in sorted(HIST_DIR.glob("*_history.csv")):
    try:
        _loaded = pd.read_csv(p, low_memory=False)
    except Exception as e:
        print(f"[WARN] read {p.name}: {e}")
    else:
        frames.append(_loaded)
if len(frames) == 0:
    raise RuntimeError("No history CSVs in docs/data/history/ — run scrapers that save history first.")
hist = normalise_history(pd.concat(frames, ignore_index=True))

# Keep last obs per (provider,region,gpu,type,duration) per day
keys = ["provider","region","gpu_model","type","duration"]
hist = hist.sort_values("fetched_at_utc")
hist = hist.drop_duplicates(subset=[*keys, "date"], keep="last")
hist = hist.reset_index(drop=True)

# 2) Daily market features
# One row per (date, gpu_model, region): observation count plus median and
# quartiles of the hourly price.
g = hist.groupby(["date","gpu_model","region"])["price_hourly_usd"]
_market_stats = {"market_cnt": g.size(), "market_med": g.median()}
for _label, _q in (("market_p25", 0.25), ("market_p75", 0.75)):
    _market_stats[_label] = g.quantile(_q)
market = pd.DataFrame(_market_stats).reset_index()

df = hist.merge(market, on=["date","gpu_model","region"], how="left")
# ensure numeric before ratios/clip
_num(df, ["price_hourly_usd","market_med","market_p25","market_p75","market_cnt"])
# relative distance of each price from its market median
df["premium_vs_mkt"] = df["price_hourly_usd"].sub(df["market_med"]).div(df["market_med"])

# 3) Supervised set with lags (t → t+1)
# Within each key group (ordered by date) every feature is shifted by one
# observation, so a row carries the values of that group's previous row.
df = df.sort_values(keys+["date"]).reset_index(drop=True)
_lag_sources = ["price_hourly_usd","premium_vs_mkt","market_med","market_p25","market_p75","market_cnt"]
_lagged = df.groupby(keys)[_lag_sources].shift(1)
for col in _lag_sources:
    df[f"{col}_lag1"] = _lagged[col]

# rows without a previous price or previous market median cannot be trained on
_has_lags = df[["price_hourly_usd_lag1","market_med_lag1"]].notna().all(axis=1)
model_df = df[_has_lags].copy()

# ---- SAFETY GATES ----
# The ML path is attempted only with at least 60 training rows spread over at
# least 12 distinct dates.
enough_rows   = model_df.shape[0] >= 60
enough_dates  = model_df["date"].nunique() >= 12
can_train_ml  = True if (enough_rows and enough_dates) else False

def build_next_features(df_all: pd.DataFrame, model_df: pd.DataFrame):
    """Select the rows used to forecast the day after the last training date.

    ``df_all`` is not modified; when it has a ``date`` column a copy is made
    and that column is re-normalised with ``_ensure_day_datetime``.

    Returns a tuple ``(last_per_group, last_date, next_date)`` where
    ``last_date`` is the maximum ``date`` in ``model_df``, ``next_date`` is one
    day later, and ``last_per_group`` is a copy of the ``df_all`` rows dated
    ``last_date`` that have both ``price_hourly_usd`` and ``market_med``.
    """
    frame = df_all
    # normalize date column defensively
    if "date" in frame.columns:
        frame = frame.copy()
        frame["date"] = _ensure_day_datetime(frame["date"])

    last_date = model_df["date"].max()
    on_last_day = frame[frame["date"] == last_date]
    last_per_group = on_last_day.dropna(subset=["price_hourly_usd","market_med"]).copy()
    return last_per_group, last_date, last_date + timedelta(days=1)

# 4A) ML path (only if enough data)
# Fits a point model and three quantile models (10/50/90%) on log1p(price),
# using each group's previous-observation features, then scores the latest day
# to forecast the next one. `pred` stays None whenever this path cannot produce
# predictions, which hands over to the baseline fallback.
pred = None
if can_train_ml:
    cat_cols = ["gpu_model","region","type","duration"]
    num_cols = [
        "gpu_count",
        "market_med_lag1","market_p25_lag1","market_p75_lag1","market_cnt_lag1",
        "price_hourly_usd_lag1","premium_vs_mkt_lag1",
    ]
    feature_cols = cat_cols + num_cols
    ML_SEED = 42  # fixed seed so re-running on the same history reproduces the models

    # TimeSeriesSplit cuts folds by row position, so the rows must be in
    # chronological order. model_df is ordered by group keys first; sorting a
    # copy by date stops later dates from leaking into training folds.
    train_df = model_df.sort_values(["date"] + keys).reset_index(drop=True)

    y = np.log1p(train_df["price_hourly_usd"])
    X = train_df[feature_cols].copy()

    # Numeric features can contain NaN (gpu_count is frequently missing) and
    # GradientBoostingRegressor raises on NaN input in the older scikit-learn
    # releases this cell targets. Fill with training-set medians (0.0 for a
    # column that is entirely NaN); the same values are reused at predict time.
    X[num_cols] = X[num_cols].apply(pd.to_numeric, errors="coerce")
    num_fill = X[num_cols].median().fillna(0.0)
    X[num_cols] = X[num_cols].fillna(num_fill)

    pre = ColumnTransformer([
        ("cat", make_ohe(), cat_cols),
        ("num", "passthrough", num_cols),
    ])

    # `loss` is left at its default (least squares): the literal "ls" was
    # removed in scikit-learn 1.2 and "squared_error" is unknown to old
    # releases, whereas the default means the same thing in both.
    point = Pipeline([("pre", pre),
                      ("model", GradientBoostingRegressor(n_estimators=600, learning_rate=0.05,
                                                          max_depth=3, random_state=ML_SEED))])

    # guarded CV
    n_samples = len(train_df)
    n_dates = train_df["date"].nunique()
    n_splits = min(5, max(2, n_dates - 1))
    if n_samples <= n_splits:
        n_splits = 2 if n_samples > 2 else 0

    if n_splits >= 2:
        tscv = TimeSeriesSplit(n_splits=n_splits)
        maes, mapes = [], []
        for tr, va in tscv.split(X):
            point.fit(X.iloc[tr], y.iloc[tr])
            # the model is trained on log1p(price); undo that before scoring
            pred_cv = np.expm1(point.predict(X.iloc[va]))
            truth   = train_df.iloc[va]["price_hourly_usd"].values
            maes.append(mean_absolute_error(truth, pred_cv))
            mapes.append(mape(truth, pred_cv))
        print(f"[CV] MAE={np.mean(maes):.3f}, MAPE={np.mean(mapes)*100:.1f}% over {len(maes)} folds")
    else:
        print("[CV] skipped (not enough samples)")

    # final fit on all available rows
    point.fit(X, y)

    def q_pipe(alpha):
        """Pipeline that estimates the `alpha` quantile of log1p(price)."""
        return Pipeline([
            ("pre", pre),
            ("model", GradientBoostingRegressor(loss="quantile", alpha=alpha,
                                                n_estimators=700, learning_rate=0.05, max_depth=3,
                                                random_state=ML_SEED))
        ])
    q10 = q_pipe(0.10).fit(X, y)
    q50 = q_pipe(0.50).fit(X, y)
    q90 = q_pipe(0.90).fit(X, y)

    last_per_group, last_date, target_date = build_next_features(df, model_df)
    if last_per_group.empty:
        can_train_ml = False
    else:
        # ensure numeric for ratios if you add any later
        _num(last_per_group, ["market_med","market_p25","market_p75","price_hourly_usd"])
        # Today's values play the role of the "lag1" features for tomorrow.
        nextX = last_per_group[[
            "gpu_model","region","type","duration","gpu_count",
            "market_med","market_p25","market_p75","market_cnt",
            "price_hourly_usd","premium_vs_mkt",
        ]].rename(columns={
            "market_med":"market_med_lag1",
            "market_p25":"market_p25_lag1",
            "market_p75":"market_p75_lag1",
            "market_cnt":"market_cnt_lag1",
            "price_hourly_usd":"price_hourly_usd_lag1",
            "premium_vs_mkt":"premium_vs_mkt_lag1",
        })
        # same NaN treatment as the training matrix, using the training medians
        nextX[num_cols] = nextX[num_cols].apply(pd.to_numeric, errors="coerce").fillna(num_fill)
        nextX = nextX[feature_cols]

        yhat   = np.expm1(point.predict(nextX))
        yhat10 = np.expm1(q10.predict(nextX))
        yhat50 = np.expm1(q50.predict(nextX))
        yhat90 = np.expm1(q90.predict(nextX))

        # The three quantile models are fitted independently and can cross;
        # sorting per row guarantees p10 <= p50 <= p90 in the published bands.
        yhat10, yhat50, yhat90 = np.sort(np.vstack([yhat10, yhat50, yhat90]), axis=0)

        pred = last_per_group[keys + ["gpu_count"]].copy()
        pred["date_target"]   = pd.to_datetime(str(target_date))
        pred["yhat"]          = yhat.round(4)
        pred["p10"]           = yhat10.round(4)
        pred["p50"]           = yhat50.round(4)
        pred["p90"]           = yhat90.round(4)
        pred["last_obs_date"] = pd.to_datetime(str(last_date))

# 4B) Baseline fallback (no crash, uses last price + market dispersion)
# Runs whenever the ML path left `pred` unset. The forecast is each group's
# latest price; the lower/upper bands scale that price by the market's
# p25/median and p75/median ratios, clipped to [0.6, 1.0] and [1.0, 1.6].
if pred is None:
    print("[ML] Not enough data — using baseline: yhat = last price; bands from market dispersion.")
    # normalize date, then pick latest valid day
    if "date" in df.columns:
        df = df.assign(date=_ensure_day_datetime(df["date"]))
    last_date = df["date"].dropna().max()
    target_date = last_date + timedelta(days=1)

    _band_inputs = ["price_hourly_usd","market_med","market_p25","market_p75"]
    last_per_group = df[df["date"]==last_date].dropna(subset=_band_inputs).copy()
    # numeric before ratios/clip
    _num(last_per_group, ["market_p25","market_p75","market_med","price_hourly_usd"])

    # bands via market dispersion (0.9 / 1.1 when a ratio is undefined)
    _median = last_per_group["market_med"]
    ratio25 = last_per_group["market_p25"].div(_median).clip(0.6, 1.0).fillna(0.9)
    ratio75 = last_per_group["market_p75"].div(_median).clip(1.0, 1.6).fillna(1.1)
    y_last  = last_per_group["price_hourly_usd"].to_numpy()
    p10     = (y_last * ratio25.to_numpy()).round(4)
    p50     = y_last.round(4)
    p90     = (y_last * ratio75.to_numpy()).round(4)

    pred = last_per_group[keys + ["gpu_count"]].assign(
        date_target=pd.to_datetime(str(target_date)),
        yhat=p50,
        p10=p10,
        p50=p50,
        p90=p90,
        last_obs_date=pd.to_datetime(str(last_date)),
    )

# 5) Save predictions
# Rows are ordered by GPU model, region, provider, type and duration before
# being written, so the CSV layout does not depend on which path built `pred`.
OUT = DERIVED / "price_predictions.csv"
pred = (
    pred.sort_values(by=["gpu_model","region","provider","type","duration"])
        .reset_index(drop=True)
)
pred.to_csv(OUT, index=False)
print(f"✔ wrote {OUT} ({pred.shape[0]} rows)")
display(pred.head(12))


[ML] Not enough data — using baseline: yhat = last price; bands from market dispersion.
✔ wrote docs/data/derived/price_predictions.csv (21 rows)


Unnamed: 0,provider,region,gpu_model,type,duration,gpu_count,date_target,yhat,p10,p50,p90,last_obs_date
0,CrusoeCloud,Global,H100,On-Demand,1h,1.0,2025-09-05,3.9,3.5948,3.9,5.1293,2025-09-04
1,CrusoeCloud,Global,H100,Reserved-1y,1h,1.0,2025-09-05,2.93,2.7007,2.93,3.8536,2025-09-04
2,CrusoeCloud,Global,H100,Reserved-3y,1h,1.0,2025-09-05,2.54,2.3412,2.54,3.3407,2025-09-04
3,CrusoeCloud,Global,H100,Reserved-6m,1h,1.0,2025-09-05,3.12,2.8758,3.12,4.1035,2025-09-04
4,Hydra Host (Brokkr),Global,H100,On-Demand,1h,1.0,2025-09-05,2.3,2.12,2.3,3.025,2025-09-04
5,Nebius,Global,H100,On-Demand,1h,,2025-09-05,2.0,1.8435,2.0,2.6304,2025-09-04
6,OVHcloud,Global,H100,On-Demand,1h,8.0,2025-09-05,0.225,0.2074,0.225,0.2959,2025-09-04
7,Paperspace,Global,H100,On-Demand,1h,,2025-09-05,2.24,2.0647,2.24,2.9461,2025-09-04
8,Shadeform,Global,H100,On-Demand,1h,,2025-09-05,10.0,9.2174,10.0,13.1522,2025-09-04
9,TensorDock,Global,H100,On-Demand,1h,1.0,2025-09-05,2.25,2.0739,2.25,2.9592,2025-09-04
