In [1]:
!pip install playwright pandas pytz
!playwright install




In [2]:
#import librarie
import asyncio
import pandas as pd
import datetime as dt
from pytz import timezone
from playwright.async_api import async_playwright


In [4]:
import nest_asyncio
import asyncio
nest_asyncio.apply()


In [11]:
# Install once per kernel
!pip install -q playwright nest_asyncio pandas pytz
!python -m playwright install

import nest_asyncio, asyncio, datetime as dt, re
import pandas as pd
from pytz import timezone
from playwright.async_api import async_playwright

nest_asyncio.apply()
EU_DUBLIN = timezone("Europe/Dublin")

async def get_sfcompute_rate(
    gpu_match="H100",           # which GPU card to click (fuzzy text match)
    gpu_count="8",              # "8" GPUs per your screenshot
    duration_label="4 hours",   # pick one of the preset durations
    start_hour_local="09:00",   # local start time; the UI may show a slightly different span
    headless=True, slow_mo=120
):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless, slow_mo=slow_mo)
        page = await browser.new_page()
        await page.goto("https://sfcompute.com/buy", wait_until="networkidle")

        # Dismiss cookie banners if any
        for t in ["Accept", "Accept all", "I agree", "Got it", "Allow all"]:
            try:
                await page.get_by_role("button", name=re.compile(t, re.I)).click(timeout=1500)
            except:
                pass

        # Select GPU (any element containing "H100")
        await page.mouse.wheel(0, 800)
        try:
            await page.get_by_text(re.compile(gpu_match, re.I)).first.click(timeout=4000)
        except:
            # fallback
            await page.locator("xpath=//*[contains(normalize-space(.), 'H100')]").first.click(timeout=4000)

        # Select GPU count = 8
        await page.get_by_role("button", name=gpu_count).first.click()

        # Select duration
        await page.get_by_role("button", name=duration_label).first.click()

        # Set start date/time (today at start_hour_local)
        start_dt = dt.datetime.now(EU_DUBLIN).replace(minute=0, second=0, microsecond=0)
        hh, mm = map(int, start_hour_local.split(":"))
        start_dt = start_dt.replace(hour=hh, minute=mm)
        iso_val = start_dt.strftime("%Y-%m-%dT%H:%M")
        try:
            # If there is a scheduler toggle, click it then fill the datetime input
            try:
                await page.get_by_text(re.compile("Schedule a start date", re.I)).first.click(timeout=1000)
            except:
                pass
            await page.locator("input[type='datetime-local']").first.fill(iso_val)
        except:
            # not fatal; UI may default to ASAP
            pass

        # Click "Get Instant Quote"
        for lab in ["Get Instant Quote", "Instant Quote", "Get Quote"]:
            try:
                await page.get_by_role("button", name=re.compile(lab, re.I)).first.click(timeout=3000)
                break
            except:
                pass

        # Wait briefly for the quote panel to render
        await page.wait_for_timeout(1500)

        # ---- Parse the quote panel for "Rate" and "Total cost" ----
        # Strategy: read the quote box text, then regex the lines we want.
        body_txt = await page.locator("body").inner_text()

        # Find a "Rate" like: Rate  $1.40/gpu/hr
        m_rate = re.search(r"Rate\s*[\:\-]?\s*\$\s*(\d+(?:\.\d+)?)\s*/?\s*gpu\s*/?\s*hr", body_txt, re.I)
        rate = float(m_rate.group(1)) if m_rate else None

        # Optional: also grab Total cost like: Total cost  $54.51
        m_total = re.search(r"Total\s*cost\s*[\:\-]?\s*\$\s*(\d+(?:\.\d+)?)", body_txt, re.I)
        total_cost = float(m_total.group(1)) if m_total else None

        await browser.close()

        if rate is None:
            raise RuntimeError("Could not find the Rate ($/gpu/hr) in the quote panel. "
                               "If the page layout changed, re-run and check what's displayed.")

        return {
            "ts_utc": dt.datetime.utcnow().isoformat(timespec="seconds"),
            "gpu_type": "H100",
            "gpu_count": 8,
            "duration": duration_label,
            "start_date_local": start_dt.strftime("%Y-%m-%d %H:%M"),
            "usd_per_gpu_hr": rate,
            "total_cost_usd": total_cost
        }

# Run in Jupyter with `await`
row = await get_sfcompute_rate(
    gpu_match="H100",
    gpu_count="8",
    duration_label="4 hours",   # you can try "1 day", "1 week", etc.
    start_hour_local="09:00",
    headless=True, slow_mo=120
)
pd.DataFrame([row])



Unnamed: 0,ts_utc,gpu_type,gpu_count,duration,start_date_local,usd_per_gpu_hr,total_cost_usd
0,2025-08-29T08:10:16,H100,8,4 hours,2025-08-29 09:00,1.4,53.95


In [1]:
#explore price page
# install once per kernel
!pip install -q playwright nest_asyncio pandas
!python -m playwright install

import re, nest_asyncio, asyncio, datetime as dt, pandas as pd
from playwright.async_api import async_playwright

nest_asyncio.apply()

GPU_TYPES = ["H100", "H200"]            # tabs to click
DURATIONS_EXPECTED = ["1 hour","1 day","1 week","1 month"]

async def scrape_sfcompute_grid(headless=True, slow_mo=0):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless, slow_mo=slow_mo)
        page = await browser.new_page()
        await page.goto("https://sfcompute.com", wait_until="networkidle")

        all_rows = []
        ts = dt.datetime.utcnow().isoformat(timespec="seconds")

        for gpu in GPU_TYPES:
            # If there are tabs or toggles for GPU model, click the one that contains H100/H200
            # If the site auto-shows H100 grid, this just won’t find anything and that’s fine.
            try:
                await page.get_by_role("button", name=re.compile(gpu, re.I)).first.click(timeout=1500)
            except:
                try:
                    await page.get_by_text(re.compile(gpu, re.I)).first.click(timeout=1500)
                except:
                    pass  # grid may already be on the desired GPU

            # Find the grid (assume the first table-like element with $ cells)
            # Try real <table> first
            table = None
            tables = page.locator("table")
            if await tables.count() > 0:
                table = tables.first
            else:
                # fallback: a div-based grid – pick the section that contains all duration labels
                container = page.locator("section, div").filter(has_text=re.compile("1 hour|1 day|1 week|1 month", re.I)).first
                table = container

            # read header: GPU counts (top row numbers)
            header_text = await table.inner_text()
            # parse counts that appear as standalone numbers across the header line
            counts = re.findall(r"(?:^|\s)(8|16|32|64|128|256)(?:\s|$)", header_text)
            counts = list(dict.fromkeys(counts))  # unique, keep order
            counts_int = [int(c) for c in counts]

            # parse each duration row; for each count, capture a $x.xx or mark as None
            for dur in DURATIONS_EXPECTED:
                # locate the row by its label
                try:
                    row_el = table.locator(f"text={dur}").first
                    await row_el.scroll_into_view_if_needed()
                except:
                    continue

                # grab the whole table/container text and then pull the price cells near the duration line
                # more robust: find all $ amounts in the table and then map their order to counts
                grid_text = await table.inner_text()

                # narrow to the line starting with the duration
                pat = re.compile(rf"{re.escape(dur)}[^\n]*\n?(.*)", re.I)
                mline = pat.search(grid_text)
                line = mline.group(0) if mline else grid_text

                # find $ numbers in that line; if not enough, search the whole grid after the duration occurs
                prices = re.findall(r"\$\s*(\d+(?:\.\d{1,2})?)", line)
                if len(prices) < len(counts_int):
                    # fallback: search in the full grid, will still map left-to-right
                    after = grid_text[grid_text.lower().find(dur):]
                    prices = re.findall(r"\$\s*(\d+(?:\.\d{1,2})?)", after)

                # cells may include blanks (—). Build row list mapping by position; if price list shorter, fill with None
                for idx, c in enumerate(counts_int):
                    val = None
                    if idx < len(prices):
                        try:
                            val = float(prices[idx])
                        except:
                            val = None
                    all_rows.append({
                        "ts_utc": ts,
                        "gpu_type": gpu,
                        "gpu_count": c,
                        "duration": dur,
                        "usd_per_gpu_hr": val
                    })

        await browser.close()
        return pd.DataFrame(all_rows)

# run it
df = await scrape_sfcompute_grid(headless=True)
df


Unnamed: 0,ts_utc,gpu_type,gpu_count,duration,usd_per_gpu_hr
0,2025-08-29T08:26:06,H100,8,1 hour,1.4
1,2025-08-29T08:26:06,H100,32,1 hour,1.4
2,2025-08-29T08:26:06,H100,128,1 hour,1.39
3,2025-08-29T08:26:06,H100,256,1 hour,1.4
4,2025-08-29T08:26:06,H100,16,1 hour,1.44
5,2025-08-29T08:26:06,H100,8,1 day,1.4
6,2025-08-29T08:26:06,H100,32,1 day,1.4
7,2025-08-29T08:26:06,H100,128,1 day,1.4
8,2025-08-29T08:26:06,H100,256,1 day,1.4
9,2025-08-29T08:26:06,H100,16,1 day,1.4


In [3]:
#ave the df to a cv
import datetime as dt
from pathlib import Path

# folder for storing results
out_dir = Path("sfcompute_grid")
out_dir.mkdir(exist_ok=True)

# filenames
today = dt.datetime.utcnow().strftime("%Y%m%d")
daily_file = out_dir / f"grid_{today}.csv"
hist_file  = out_dir / "grid_history.csv"

# save today's scrape
df.to_csv(daily_file, index=False)

# update growing history file
if hist_file.exists():
    hist = pd.read_csv(hist_file)
    hist = pd.concat([hist, df], ignore_index=True).drop_duplicates()
    hist.to_csv(hist_file, index=False)
else:
    df.to_csv(hist_file, index=False)

print("Saved:", daily_file)
print("History file updated:", hist_file)


Saved: sfcompute_grid/grid_20250829.csv
History file updated: sfcompute_grid/grid_history.csv


In [4]:
#check
pd.read_csv("sfcompute_grid/grid_history.csv").tail()

Unnamed: 0,ts_utc,gpu_type,gpu_count,duration,usd_per_gpu_hr
25,2025-08-29T08:26:06,H200,8,1 month,1.4
26,2025-08-29T08:26:06,H200,32,1 month,1.4
27,2025-08-29T08:26:06,H200,128,1 month,1.4
28,2025-08-29T08:26:06,H200,256,1 month,1.4
29,2025-08-29T08:26:06,H200,16,1 month,1.4
