In [1]:
!pip install playwright pandas pytz
!playwright install



In [2]:
import asyncio
import pandas as pd
import datetime as dt
from pytz import timezone
from playwright.async_api import async_playwright

In [3]:
import nest_asyncio
import asyncio
nest_asyncio.apply()

In [59]:
# SF Compute price grid scraper: per-GPU-hour prices for every duration row ("1 hour" .. "1 month") and GPU count
import re, nest_asyncio, asyncio, datetime as dt, pandas as pd
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

# Patch asyncio so the coroutines below can be driven from inside the
# notebook's event loop (presumably already running — see nest_asyncio docs).
nest_asyncio.apply()

# GPU models passed, one at a time, to _select_gpu() by the scraper.
GPU_TYPES = ["H100", "H200"]
# Row labels looked up in the price grid, in the order they are parsed.
DURATIONS_EXPECTED = ["1 hour", "1 day", "1 week", "1 month"]

# Strict $-anchored price AND a decimal fallback (used only when $ not in text)
# PRICE_RE: a number (optional 1-3 decimals) that directly follows a "$",
# allowing whitespace between the sign and the digits; group 1 is the number.
PRICE_RE = re.compile(r"(?<=\$)\s*([0-9]+(?:\.[0-9]{1,3})?)")
# DECIMAL_FALLBACK_RE: a number that MUST contain a decimal point, so a bare
# integer such as the "1" in "1 week" never matches; group 1 is the number.
DECIMAL_FALLBACK_RE = re.compile(r"([0-9]+(?:\.[0-9]{1,3}))")


async def _select_explore_prices_tab(page):
    try:
        await page.get_by_role("tab", name=re.compile(r"Explore Prices", re.I)).click(timeout=1500)
    except:
        try:
            await page.get_by_text(re.compile(r"Explore Prices", re.I)).first.click(timeout=1500)
        except:
            pass

async def _select_gpu(page, gpu_label: str):
    """Robustly select GPU type (H100/H200)."""
    # 1) Native <select>
    try:
        sel = page.get_by_role("combobox").first
        if await sel.count() > 0:
            await sel.select_option(label=gpu_label)
            return True
    except:
        pass
    # 2) Custom dropdown
    try:
        dd = page.get_by_role("button", name=re.compile(r"H100|H200", re.I)).first
        if await dd.count() == 0:
            dd = page.locator("div,button,[role='button']").filter(has_text=re.compile(r"H100|H200")).first
        await dd.click(timeout=1500)
        await page.get_by_text(re.compile(rf"^{gpu_label}$", re.I)).first.click(timeout=1500)
        return True
    except:
        pass
    # 3) Button toggle
    try:
        await page.get_by_role("button", name=re.compile(rf"^{gpu_label}$", re.I)).first.click(timeout=1500)
        return True
    except:
        pass
    # 4) Fallback
    try:
        await page.get_by_text(re.compile(rf"^{gpu_label}$", re.I)).first.click(timeout=1500)
        return True
    except:
        return False

async def _wait_table_update(page, expected_gpu: str):
    tbl = page.locator("table").first
    if await page.locator("table").count() == 0:
        tbl = page.locator("section, div").filter(
            has_text=re.compile("1 hour|1 day|1 week|1 month", re.I)
        ).first
    try:
        await tbl.wait_for(state="visible", timeout=6000)
    except:
        pass
    # settle after GPU switch so the row texts are fresh
    await page.wait_for_timeout(400)
    return tbl

def _slice_one_row_text(full_text: str, dur: str) -> str:
    """Take only the text segment from this duration label up to the next duration label."""
    lo = full_text.lower()
    start = lo.find(dur)
    if start == -1:
        return ""
    next_pos = len(full_text)
    for other in DURATIONS_EXPECTED:
        if other == dur:
            continue
        p = lo.find(other, start + 1)
        if p != -1:
            next_pos = min(next_pos, p)
    return full_text[start:next_pos]

# ---------- DOM fallback for rows where '$' is CSS-injected (e.g., '1 week') ----------
async def _extract_row_prices_via_dom(table_locator, dur: str, ncols: int):
    """
    Find the visual row that contains `dur` and pull texts/attributes from its cells.
    Returns a list of length ncols with floats or None.
    """
    # locate the label element first
    label_loc = table_locator.get_by_text(re.compile(rf"\b{re.escape(dur)}\b", re.I)).first
    if await label_loc.count() == 0:
        return None

    # Grab candidate cell texts/attributes from the same row container
    cell_texts = await label_loc.evaluate("""
      (el) => {
        function findRowRoot(node){
          while (node && node !== document.body) {
            if (node.tagName === 'TR') return node;
            const role = node.getAttribute && node.getAttribute('role');
            if (role && role.toLowerCase() === 'row') return node;
            const cls = (node.className || '').toString();
            if (/\\brow\\b/i.test(cls)) return node;
            node = node.parentElement;
          }
          return null;
        }
        const row = findRowRoot(el) || el.parentElement;
        if (!row) return [];

        // Prefer direct siblings after the label cell; otherwise, collect all leaf cells
        const out = [];
        const children = Array.from(row.children);
        let startIdx = children.findIndex(ch => ch.contains(el));
        if (startIdx >= 0) {
          for (let i = startIdx + 1; i < children.length; i++) {
            const c = children[i];
            const t = (c.innerText || c.textContent || '').replace(/\\s+/g,' ').trim();
            const aria = c.getAttribute && (c.getAttribute('aria-label') || '');
            const data = c.getAttribute && (c.getAttribute('data-price') || '');
            out.push([t, aria, data].filter(Boolean).join(' '));
          }
        }
        if (out.length === 0) {
          const cells = Array.from(row.querySelectorAll('td,th,a,button,div,span'));
          for (const c of cells) {
            if (c.contains(el) || el.contains(c)) continue;
            const t = (c.innerText || c.textContent || '').replace(/\\s+/g,' ').trim();
            const aria = c.getAttribute && (c.getAttribute('aria-label') || '');
            const data = c.getAttribute && (c.getAttribute('data-price') || '');
            out.push([t, aria, data].filter(Boolean).join(' '));
          }
        }
        return out;
      }
    """)

    # Parse decimals; require a decimal point to avoid '1' from '1 week'
    vals = []
    for txt in cell_texts:
        m = DECIMAL_FALLBACK_RE.search(txt)
        vals.append(float(m.group(1)) if m else None)
        if len(vals) >= ncols:
            break

    if not any(v is not None for v in vals):
        return None

    # pad to ncols
    return vals + [None] * (ncols - len(vals))

# ---------- Grid parser: prefers $-anchored prices, then falls back to bare decimals ----------
async def _parse_grid_from_table(table_locator):
    """
    Parse the price grid into one list of prices per duration row.

    - Columns are forced to the full set [8,16,32,64,128,256].
    - For each label in DURATIONS_EXPECTED, the element whose text is exactly
      that label is located (a <tr> first, then any role="row"/div/section),
      and the cells that follow the label are read left-to-right in the page.
    - Each cell prefers a $-anchored number; if missing (e.g., CSS-injected $
      in 1-week links), it falls back to a number containing a decimal point.
    - A duration whose row cannot be found yields a row of None instead of
      blocking on the missing element and aborting the whole scrape.

    Returns:
        (counts_int, rows): the column GPU counts and a dict mapping each
        duration label to a list of len(counts_int) floats/None.
    """
    counts_int = [8, 16, 32, 64, 128, 256]

    rows = {}
    for dur in DURATIONS_EXPECTED:
        label_re = re.compile(rf"^\s*{re.escape(dur)}\s*$", re.I)

        # Locate the row that contains exactly this duration label
        row = table_locator.locator("tr", has_text=label_re).first
        if await row.count() == 0:
            # Sometimes it's not a semantic <tr>; look for any row-like container
            row = table_locator.locator("*[role='row'], div, section").filter(
                has_text=label_re
            ).first
            if await row.count() == 0:
                # evaluate() on a locator that matches nothing waits and then
                # raises; record the row as unknown and keep going instead.
                rows[dur] = [None] * len(counts_int)
                continue

        # Pull texts from the next N cells after the label cell
        # We read both plain cell text and any nested link/button text/attributes.
        # The duration is passed as the second callback argument (`dur`).
        cell_texts = await row.evaluate("""
          (el, dur) => {
            const wanted = (dur || '').replace(/\\s+/g,' ').trim().toLowerCase();
            const textMatches = (node) =>
              (node.innerText || node.textContent || '').replace(/\\s+/g,' ').trim().toLowerCase() === wanted;

            // The matched element is usually the label itself; only search its
            // descendants when it is a larger container.
            function findLabelCell(root){
              if (textMatches(root)) return root;
              const all = Array.from(root.querySelectorAll('th,td,div,span'));
              return all.find(n => textMatches(n)) || root;
            }

            function collectCellStrings(nodes){
              const out = [];
              for(const n of nodes){
                const base = (n.innerText || n.textContent || '').replace(/\\s+/g,' ').trim();
                const aria = n.getAttribute?.('aria-label') || '';
                const title = n.getAttribute?.('title') || '';
                const data  = n.getAttribute?.('data-price') || '';
                // Include nested anchors/buttons in case the number lives there
                const nested = Array.from(n.querySelectorAll('a,button,span,div')).map(x =>
                  (x.innerText || x.textContent || '').replace(/\\s+/g,' ').trim()
                ).filter(Boolean).join(' ');
                out.push([base, aria, title, data, nested].filter(Boolean).join(' '));
              }
              return out;
            }

            const label = findLabelCell(el);
            // Children after label within the same row/container
            const siblings = Array.from(label.parentElement?.children || []);
            const idx = siblings.indexOf(label);
            const after = idx >= 0 ? siblings.slice(idx + 1) : [];

            // If that yields nothing (non-tabular markup), fall back to all cells under row
            const cells = after.length ? after : Array.from(el.querySelectorAll('td,th,div,span'));
            return collectCellStrings(cells);
          }
        """, arg=dur)

        # Now extract numbers from those cell strings
        vals = []
        for txt in cell_texts:
            # 1) Try $-anchored prices first
            m = PRICE_RE.search(txt)
            if m:
                vals.append(float(m.group(1)))
            else:
                # 2) Fallback to decimals (handles the 1-week link cells)
                m2 = DECIMAL_FALLBACK_RE.search(txt)
                vals.append(float(m2.group(1)) if m2 else None)
            if len(vals) >= len(counts_int):
                break

        # Pad/trim to the expected 6 columns
        vals = (vals + [None] * len(counts_int))[:len(counts_int)]
        rows[dur] = vals

    return counts_int, rows


async def scrape_sfcompute_grid(headless=True, slow_mo=0):
    """Scrape the SF Compute price grid for every GPU type in GPU_TYPES.

    Opens https://sfcompute.com/buy in Chromium, switches to the price
    explorer, and for each GPU type parses the grid into long-format rows.

    Args:
        headless: Passed to ``chromium.launch``.
        slow_mo: Passed to ``chromium.launch``.

    Returns:
        pandas.DataFrame with columns ``ts_utc``, ``gpu_type``, ``gpu_count``,
        ``duration`` and ``usd_per_gpu_hr`` (None where no price was parsed).
        All rows of one call share the same ``ts_utc``.
    """
    import warnings

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless, slow_mo=slow_mo)
        try:
            page = await browser.new_page()
            await page.goto("https://sfcompute.com/buy", wait_until="networkidle")
            await _select_explore_prices_tab(page)

            all_rows = []
            # Same "YYYY-MM-DDTHH:MM:SS" text the deprecated utcnow().isoformat()
            # produced, so rows stay comparable with previously saved history.
            ts = dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")

            for gpu in GPU_TYPES:
                if not await _select_gpu(page, gpu):
                    # The grid may still show another GPU type; rows are kept
                    # (as before) but the reader is told they may be mislabelled.
                    warnings.warn(
                        f"Could not select GPU type {gpu!r}; prices recorded for it may belong to another type."
                    )
                table = await _wait_table_update(page, gpu)
                counts_int, rows = await _parse_grid_from_table(table)

                for dur, vals in rows.items():
                    for c, val in zip(counts_int, vals):
                        all_rows.append({
                            "ts_utc": ts,
                            "gpu_type": gpu,
                            "gpu_count": c,
                            "duration": dur,
                            "usd_per_gpu_hr": val
                        })
        finally:
            # Close the browser even when navigation or parsing raises.
            await browser.close()
        return pd.DataFrame(all_rows)

# ---- Run & Save ----
df = await scrape_sfcompute_grid(headless=True)
print(df)

out_dir = Path("sfcompute_grid")
out_dir.mkdir(exist_ok=True)
today = dt.datetime.utcnow().strftime("%Y%m%d")
daily_file = out_dir / f"grid_{today}.csv"
hist_file  = out_dir / "grid_history.csv"

df.to_csv(daily_file, index=False)
if hist_file.exists():
    hist = pd.read_csv(hist_file)
    hist = pd.concat([hist, df], ignore_index=True).drop_duplicates()
    hist.to_csv(hist_file, index=False)
else:
    df.to_csv(hist_file, index=False)

print("Saved:", daily_file)
print("History file updated:", hist_file)
#CORRECT 

                 ts_utc gpu_type  gpu_count duration  usd_per_gpu_hr
0   2025-09-03T16:02:08     H100          8   1 hour            1.40
1   2025-09-03T16:02:08     H100         16   1 hour            1.40
2   2025-09-03T16:02:08     H100         32   1 hour            1.40
3   2025-09-03T16:02:08     H100         64   1 hour            1.40
4   2025-09-03T16:02:08     H100        128   1 hour            1.44
5   2025-09-03T16:02:08     H100        256   1 hour            1.55
6   2025-09-03T16:02:08     H100          8    1 day            1.40
7   2025-09-03T16:02:08     H100         16    1 day            1.40
8   2025-09-03T16:02:08     H100         32    1 day            1.40
9   2025-09-03T16:02:08     H100         64    1 day            1.40
10  2025-09-03T16:02:08     H100        128    1 day            1.40
11  2025-09-03T16:02:08     H100        256    1 day            1.77
12  2025-09-03T16:02:08     H100          8   1 week            1.40
13  2025-09-03T16:02:08     H100  