In [1]:
!pip install playwright pandas pytz
!playwright install




In [2]:
#import librarie
import asyncio
import pandas as pd
import datetime as dt
from pytz import timezone
from playwright.async_api import async_playwright


In [4]:
import nest_asyncio
import asyncio
nest_asyncio.apply()


In [1]:
# Scraper for SF Compute price grid
import re, nest_asyncio, asyncio, datetime as dt, pandas as pd
from pathlib import Path
from playwright.async_api import async_playwright

nest_asyncio.apply()

GPU_TYPES = ["H100", "H200"]            
DURATIONS_EXPECTED = ["1 hour","1 day","1 week","1 month"]

async def scrape_sfcompute_grid(headless=True, slow_mo=0):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless, slow_mo=slow_mo)
        page = await browser.new_page()
        await page.goto("https://sfcompute.com", wait_until="networkidle")

        all_rows = []
        ts = dt.datetime.utcnow().isoformat(timespec="seconds")

        for gpu in GPU_TYPES:
            # Try clicking GPU tab
            try:
                await page.get_by_role("button", name=re.compile(gpu, re.I)).first.click(timeout=1500)
            except:
                try:
                    await page.get_by_text(re.compile(gpu, re.I)).first.click(timeout=1500)
                except:
                    pass  

            # Get table or fallback container
            table = page.locator("table").first
            if await page.locator("table").count() == 0:
                table = page.locator("section, div").filter(has_text=re.compile("1 hour|1 day|1 week|1 month", re.I)).first

            # Parse GPU counts from header
            header_text = await table.inner_text()
            counts = re.findall(r"(?:^|\s)(8|16|32|64|128|256)(?:\s|$)", header_text)
            counts = list(dict.fromkeys(counts))
            counts_int = [int(c) for c in counts]

            # Parse each duration row
            for dur in DURATIONS_EXPECTED:
                try:
                    row_el = table.locator(f"text={dur}").first
                    await row_el.scroll_into_view_if_needed()
                except:
                    continue

                grid_text = await table.inner_text()

                # extract line
                pat = re.compile(rf"{re.escape(dur)}[^\n]*\n?(.*)", re.I)
                mline = pat.search(grid_text)
                line = mline.group(0) if mline else grid_text

                # find $ prices
                prices = re.findall(r"\$\s*(\d+(?:\.\d{1,2})?)", line)
                if len(prices) < len(counts_int):
                    after = grid_text[grid_text.lower().find(dur):]
                    prices = re.findall(r"\$\s*(\d+(?:\.\d{1,2})?)", after)

                for idx, c in enumerate(counts_int):
                    val = float(prices[idx]) if idx < len(prices) else None
                    all_rows.append({
                        "ts_utc": ts,
                        "gpu_type": gpu,
                        "gpu_count": c,
                        "duration": dur,
                        "usd_per_gpu_hr": val
                    })

        await browser.close()
        return pd.DataFrame(all_rows)

# ---- Run & Save ----
df = await scrape_sfcompute_grid(headless=True)
print(df)

# Save outputs (so GitHub Actions can collect them)
out_dir = Path("sfcompute_grid")
out_dir.mkdir(exist_ok=True)
today = dt.datetime.utcnow().strftime("%Y%m%d")
daily_file = out_dir / f"grid_{today}.csv"
hist_file  = out_dir / "grid_history.csv"

df.to_csv(daily_file, index=False)
if hist_file.exists():
    hist = pd.read_csv(hist_file)
    hist = pd.concat([hist, df], ignore_index=True).drop_duplicates()
    hist.to_csv(hist_file, index=False)
else:
    df.to_csv(hist_file, index=False)

print("Saved:", daily_file)
print("History file updated:", hist_file)


                 ts_utc gpu_type  gpu_count duration  usd_per_gpu_hr
0   2025-08-29T09:01:54     H100          8   1 hour            1.40
1   2025-08-29T09:01:54     H100         32   1 hour            1.40
2   2025-08-29T09:01:54     H100        128   1 hour            1.39
3   2025-08-29T09:01:54     H100        256   1 hour            1.40
4   2025-08-29T09:01:54     H100         16   1 hour            1.44
5   2025-08-29T09:01:54     H100          8    1 day            1.40
6   2025-08-29T09:01:54     H100         32    1 day            1.40
7   2025-08-29T09:01:54     H100        128    1 day            1.40
8   2025-08-29T09:01:54     H100        256    1 day            1.40
9   2025-08-29T09:01:54     H100         16    1 day            1.40
10  2025-08-29T09:01:54     H100          8  1 month            1.40
11  2025-08-29T09:01:54     H100         32  1 month            1.40
12  2025-08-29T09:01:54     H100        128  1 month            1.40
13  2025-08-29T09:01:54     H100  