From e6e173a4c685371df663f6c6099e42846d2019df Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 15:20:15 -0800 Subject: [PATCH 01/27] Add Buildkite infrastructure for GPU job isolation - Add deployment/buildkite/ with setup-node.sh, pipeline.yml, Dockerfile - Add BuildkiteLauncher class for submitting jobs to Buildkite - Add buildkite-runner.py for job execution in containers - Add BuildkiteGPU enum (B200_BK, H100_BK, MI300_BK) - Add e2e test script for verifying Buildkite integration This enables vendors to onboard GPU resources with proper per-GPU isolation using a single setup script that creates Buildkite agents. --- deployment/buildkite/Dockerfile | 42 +++ deployment/buildkite/pipeline.yml | 49 ++++ deployment/buildkite/setup-node.sh | 222 +++++++++++++++ src/libkernelbot/consts.py | 13 +- src/libkernelbot/launchers/__init__.py | 3 +- src/libkernelbot/launchers/buildkite.py | 359 ++++++++++++++++++++++++ src/runners/buildkite-runner.py | 68 +++++ tests/e2e_buildkite_test.py | 115 ++++++++ 8 files changed, 869 insertions(+), 2 deletions(-) create mode 100644 deployment/buildkite/Dockerfile create mode 100644 deployment/buildkite/pipeline.yml create mode 100755 deployment/buildkite/setup-node.sh create mode 100644 src/libkernelbot/launchers/buildkite.py create mode 100644 src/runners/buildkite-runner.py create mode 100644 tests/e2e_buildkite_test.py diff --git a/deployment/buildkite/Dockerfile b/deployment/buildkite/Dockerfile new file mode 100644 index 00000000..3127a3a5 --- /dev/null +++ b/deployment/buildkite/Dockerfile @@ -0,0 +1,42 @@ +# Kernelbot evaluation image +FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 + +# System packages +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.11 \ + python3.11-dev \ + python3.11-venv \ + python3-pip \ + git \ + wget \ + curl \ + build-essential \ + ninja-build \ + cmake \ + && rm -rf /var/lib/apt/lists/* + +# Set 
Python 3.11 as default +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 + +# Upgrade pip +RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel + +# PyTorch + CUDA +RUN pip install --no-cache-dir \ + torch==2.4.0 \ + triton \ + numpy \ + scipy + +# Copy kernelbot +WORKDIR /app +COPY pyproject.toml . +COPY src/ src/ +RUN pip install --no-cache-dir -e . + +# Default command +CMD ["python", "/app/src/runners/buildkite-runner.py"] diff --git a/deployment/buildkite/pipeline.yml b/deployment/buildkite/pipeline.yml new file mode 100644 index 00000000..13826a39 --- /dev/null +++ b/deployment/buildkite/pipeline.yml @@ -0,0 +1,49 @@ +# Kernelbot Evaluation Pipeline +# Jobs target specific GPU queue, Buildkite routes to idle agent + +steps: + - label: ":rocket: Kernel Evaluation" + command: "python /app/src/runners/buildkite-runner.py" + + # Queue is set dynamically via KERNELBOT_QUEUE env var + agents: + queue: "${KERNELBOT_QUEUE}" + + plugins: + - docker#v5.11.0: + image: "${KERNELBOT_IMAGE:-ghcr.io/gpu-mode/kernelbot:latest}" + always-pull: true + runtime: nvidia + # GPU isolation - agent exports NVIDIA_VISIBLE_DEVICES + propagate-environment: true + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + - KERNELBOT_GPU_INDEX + - KERNELBOT_CPUSET + - KERNELBOT_MEMORY + # Resource constraints + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + # Mount for caching + volumes: + - "/var/lib/buildkite-agent/cache:/cache:rw" + # Cleanup + leave-container: false + + timeout_in_minutes: 15 + + # Artifacts + artifact_paths: + - "result.json" + - "profile_data/**/*" + + # Retry on infrastructure failures only + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: 255 + limit: 1 diff --git a/deployment/buildkite/setup-node.sh b/deployment/buildkite/setup-node.sh new 
file mode 100755 index 00000000..70186499 --- /dev/null +++ b/deployment/buildkite/setup-node.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# Buildkite GPU Node Setup +# Usage: BUILDKITE_AGENT_TOKEN=xxx GPU_TYPE=b200 ./setup-node.sh + +set -euo pipefail + +# === CONFIGURATION === +BUILDKITE_TOKEN="${BUILDKITE_AGENT_TOKEN:?Must set BUILDKITE_AGENT_TOKEN}" +GPU_TYPE="${GPU_TYPE:?Must set GPU_TYPE (e.g., b200, mi300, h100)}" +NODE_NAME="${NODE_NAME:-$(hostname)}" + +# Auto-detect GPU count +detect_gpu_count() { + if command -v nvidia-smi &> /dev/null; then + nvidia-smi --query-gpu=count --format=csv,noheader | head -1 + elif command -v rocm-smi &> /dev/null; then + rocm-smi --showid | grep -c "GPU" + else + echo "8" # Default + fi +} + +GPU_COUNT="${GPU_COUNT:-$(detect_gpu_count)}" +CPUS_PER_GPU="${CPUS_PER_GPU:-8}" +RAM_PER_GPU="${RAM_PER_GPU:-64g}" + +# Queue name - same for all agents on this node +QUEUE_NAME="${GPU_TYPE}" + +echo "=== Buildkite GPU Node Setup ===" +echo "Node: ${NODE_NAME}" +echo "GPU Type: ${GPU_TYPE}" +echo "GPU Count: ${GPU_COUNT}" +echo "Queue: ${QUEUE_NAME}" +echo "CPUs per GPU: ${CPUS_PER_GPU}" +echo "RAM per GPU: ${RAM_PER_GPU}" +echo "" + +# === INSTALL DEPENDENCIES === + +install_docker_nvidia() { + echo "Installing Docker and NVIDIA Container Toolkit..." + + # Docker + if ! command -v docker &> /dev/null; then + curl -fsSL https://get.docker.com | sh + usermod -aG docker ubuntu 2>/dev/null || true + fi + + # NVIDIA Container Toolkit + if ! 
dpkg -l | grep -q nvidia-container-toolkit; then + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + apt-get update + apt-get install -y nvidia-container-toolkit + nvidia-ctk runtime configure --runtime=docker + systemctl restart docker + fi + + echo "Docker + NVIDIA toolkit installed." +} + +install_buildkite_agent() { + echo "Installing Buildkite Agent..." + + if ! command -v buildkite-agent &> /dev/null; then + apt-get install -y apt-transport-https gnupg + curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | \ + gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg + echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | \ + tee /etc/apt/sources.list.d/buildkite-agent.list + apt-get update + apt-get install -y buildkite-agent + fi + + echo "Buildkite Agent installed." +} + +# === CREATE PER-GPU AGENTS === + +setup_agents() { + echo "Configuring ${GPU_COUNT} agents..." 
+ + # Create base directories + mkdir -p /etc/buildkite-agent/hooks + mkdir -p /var/lib/buildkite-agent + + # Create shared hooks + cat > /etc/buildkite-agent/hooks/environment << 'HOOKEOF' +#!/bin/bash +# GPU isolation hook - runs before each job +set -euo pipefail + +# GPU index is set per-agent via environment +echo "GPU ${BUILDKITE_AGENT_META_DATA_GPU_INDEX} allocated for this job" +echo "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES}" +HOOKEOF + chmod +x /etc/buildkite-agent/hooks/environment + + # Create pre-exit hook for cleanup + cat > /etc/buildkite-agent/hooks/pre-exit << 'HOOKEOF' +#!/bin/bash +# Cleanup after job +docker system prune -f --filter "until=1h" 2>/dev/null || true +HOOKEOF + chmod +x /etc/buildkite-agent/hooks/pre-exit + + # Stop any existing agents + systemctl stop 'buildkite-agent-gpu*' 2>/dev/null || true + + # Create agent for each GPU + for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + local cpu_start=$((gpu_idx * CPUS_PER_GPU)) + local cpu_end=$((cpu_start + CPUS_PER_GPU - 1)) + local agent_name="${NODE_NAME}-gpu${gpu_idx}" + local config_dir="/etc/buildkite-agent/agent-${gpu_idx}" + local build_dir="/var/lib/buildkite-agent/gpu-${gpu_idx}/builds" + + mkdir -p "${config_dir}" + mkdir -p "${build_dir}" + + # Agent configuration + cat > "${config_dir}/buildkite-agent.cfg" << CFGEOF +# Buildkite Agent Configuration - GPU ${gpu_idx} +token="${BUILDKITE_TOKEN}" +name="${agent_name}" +tags="queue=${QUEUE_NAME},gpu=${GPU_TYPE},gpu-index=${gpu_idx},node=${NODE_NAME}" +build-path="${build_dir}" +hooks-path="/etc/buildkite-agent/hooks" +plugins-path="/var/lib/buildkite-agent/plugins" +disconnect-after-job=false +disconnect-after-idle-timeout=0 +CFGEOF + + # Agent environment file (for GPU isolation) + cat > "${config_dir}/environment" << ENVEOF +NVIDIA_VISIBLE_DEVICES=${gpu_idx} +CUDA_VISIBLE_DEVICES=${gpu_idx} +KERNELBOT_GPU_INDEX=${gpu_idx} +KERNELBOT_CPU_START=${cpu_start} +KERNELBOT_CPU_END=${cpu_end} 
+KERNELBOT_CPUSET=${cpu_start}-${cpu_end} +KERNELBOT_MEMORY=${RAM_PER_GPU} +ENVEOF + + # Systemd service + cat > "/etc/systemd/system/buildkite-agent-gpu${gpu_idx}.service" << SVCEOF +[Unit] +Description=Buildkite Agent (GPU ${gpu_idx}) +Documentation=https://buildkite.com/docs/agent/v3 +After=network.target docker.service +Requires=docker.service + +[Service] +Type=simple +User=buildkite-agent +EnvironmentFile=${config_dir}/environment +ExecStart=/usr/bin/buildkite-agent start --config ${config_dir}/buildkite-agent.cfg +RestartSec=5 +Restart=on-failure +RestartForceExitStatus=SIGPIPE +TimeoutStartSec=10 +TimeoutStopSec=60 +KillMode=process + +[Install] +WantedBy=multi-user.target +SVCEOF + + echo " Agent ${gpu_idx}: GPU=${gpu_idx}, CPUs=${cpu_start}-${cpu_end}" + done + + # Fix permissions + chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent + chown -R buildkite-agent:buildkite-agent /etc/buildkite-agent + + # Add buildkite-agent to docker group + usermod -aG docker buildkite-agent +} + +# === START AGENTS === + +start_agents() { + echo "Starting agents..." 
+ systemctl daemon-reload + + for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + systemctl enable "buildkite-agent-gpu${gpu_idx}" + systemctl start "buildkite-agent-gpu${gpu_idx}" + done + + sleep 3 + + echo "" + echo "=== Agent Status ===" + for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + status=$(systemctl is-active "buildkite-agent-gpu${gpu_idx}" 2>/dev/null || echo "unknown") + echo " GPU ${gpu_idx}: ${status}" + done +} + +# === MAIN === + +if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root" + exit 1 +fi + +install_docker_nvidia +install_buildkite_agent +setup_agents +start_agents + +echo "" +echo "=== Setup Complete ===" +echo "Agents should appear at: https://buildkite.com/organizations/YOUR_ORG/agents" +echo "Queue: ${QUEUE_NAME}" +echo "" +echo "Test with: buildkite-agent start --help" diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index f60764de..b9f30d0e 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -33,6 +33,13 @@ class ModalGPU(Enum): L4x4 = "L4x4" +class BuildkiteGPU(Enum): + """GPUs available via Buildkite-managed infrastructure.""" + B200_BK = "B200_BK" + H100_BK = "H100_BK" + MI300_BK = "MI300_BK" + + @dataclasses.dataclass class GPU: name: str @@ -50,7 +57,7 @@ def _make_gpu_lookup(runner_map: dict[str, Type[Enum]]): return lookup -_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU}) +_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU, "Buildkite": BuildkiteGPU}) def get_gpu_by_name(name: str) -> GPU: @@ -121,6 +128,10 @@ class RankCriterion(Enum): "MI300": None, "MI300x8": None, "MI250": None, + # Buildkite-managed GPUs + "B200_BK": "100", + "H100_BK": "90a", + "MI300_BK": None, } diff --git a/src/libkernelbot/launchers/__init__.py b/src/libkernelbot/launchers/__init__.py index df47476f..1a7a8a39 100644 --- a/src/libkernelbot/launchers/__init__.py +++ b/src/libkernelbot/launchers/__init__.py @@ -1,5 +1,6 @@ +from .buildkite import 
BuildkiteLauncher from .github import GitHubLauncher from .launcher import Launcher from .modal import ModalLauncher -__all__ = [Launcher, GitHubLauncher, ModalLauncher] +__all__ = [Launcher, GitHubLauncher, ModalLauncher, BuildkiteLauncher] diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py new file mode 100644 index 00000000..88c476a2 --- /dev/null +++ b/src/libkernelbot/launchers/buildkite.py @@ -0,0 +1,359 @@ +"""Buildkite launcher for kernel evaluation jobs. + +Uses single-queue model where all agents on a node share the same queue. +Buildkite automatically routes jobs to idle agents. +""" + +from __future__ import annotations + +import asyncio +import base64 +import datetime +import json +import os +import zlib +from dataclasses import dataclass, field +from typing import Any + +import httpx + +from libkernelbot.consts import GPU, BuildkiteGPU +from libkernelbot.report import RunProgressReporter +from libkernelbot.run_eval import ( + CompileResult, + EvalResult, + FullResult, + ProfileResult, + RunResult, + SystemInfo, +) +from libkernelbot.utils import setup_logging + +from .launcher import Launcher + +logger = setup_logging(__name__) + +BUILDKITE_API = "https://api.buildkite.com/v2" + + +@dataclass +class BuildkiteConfig: + """Buildkite launcher configuration.""" + + org_slug: str = "gpu-mode" + pipeline_slug: str = "kernelbot" + api_token: str = field(default_factory=lambda: os.environ.get("BUILDKITE_API_TOKEN", "")) + + # Docker image for jobs + image: str = "ghcr.io/gpu-mode/kernelbot:latest" + + # Timeouts + poll_interval_seconds: int = 10 + max_wait_seconds: int = 900 # 15 minutes + + # Resource defaults + cpus: int = 8 + memory: str = "64g" + + +@dataclass +class BuildkiteResult: + """Result from a Buildkite job.""" + + success: bool + error: str | None + result: dict[str, Any] | None + build_url: str | None = None + build_number: int | None = None + + +class BuildkiteLauncher(Launcher): + """Launcher that 
submits jobs to Buildkite.""" + + def __init__(self, config: BuildkiteConfig | None = None): + super().__init__(name="Buildkite", gpus=BuildkiteGPU) + self.config = config or BuildkiteConfig() + self._client: httpx.AsyncClient | None = None + + async def _get_client(self) -> httpx.AsyncClient: + if self._client is None: + self._client = httpx.AsyncClient( + headers={ + "Authorization": f"Bearer {self.config.api_token}", + "Content-Type": "application/json", + }, + timeout=30.0, + ) + return self._client + + def _encode_payload(self, config: dict[str, Any]) -> str: + """Compress and base64-encode config.""" + json_bytes = json.dumps(config).encode("utf-8") + compressed = zlib.compress(json_bytes) + return base64.b64encode(compressed).decode("ascii") + + def _get_queue_for_gpu(self, gpu_type: GPU) -> str: + """Map GPU type to Buildkite queue name.""" + queue_map = { + "B200_BK": "b200", + "H100_BK": "h100", + "MI300_BK": "mi300", + } + return queue_map.get(gpu_type.name, gpu_type.name.lower().replace("_bk", "")) + + async def run_submission( + self, config: dict, gpu_type: GPU, status: RunProgressReporter + ) -> FullResult: + """ + Launch a kernel evaluation job on Buildkite. 
+ + Args: + config: Evaluation configuration dict + gpu_type: Which GPU to run on + status: Progress reporter for status updates + + Returns: + FullResult with success status and results + """ + queue = self._get_queue_for_gpu(gpu_type) + run_id = f"sub-{config.get('submission_id', 'unknown')}-{gpu_type.name}" + + await status.push(f"Submitting to Buildkite queue: {queue}") + logger.info(f"Submitting job {run_id} to Buildkite queue {queue}") + + result = await self._launch( + run_id=run_id, + config=config, + queue=queue, + status=status, + ) + + if not result.success: + return FullResult( + success=False, + error=result.error or "Buildkite job failed", + runs={}, + system=SystemInfo(), + ) + + if result.result is None: + return FullResult( + success=False, + error="No result returned from Buildkite job", + runs={}, + system=SystemInfo(), + ) + + # Parse the result + return self._parse_result(result.result) + + async def _launch( + self, + run_id: str, + config: dict[str, Any], + queue: str, + status: RunProgressReporter, + ) -> BuildkiteResult: + """ + Launch a kernel evaluation job. 
+ + Args: + run_id: Unique identifier for this run + config: Evaluation configuration dict + queue: GPU queue name (e.g., "b200", "mi300") + status: Progress reporter + + Returns: + BuildkiteResult with success status and results + """ + client = await self._get_client() + payload = self._encode_payload(config) + + # Create build + url = ( + f"{BUILDKITE_API}/organizations/{self.config.org_slug}" + f"/pipelines/{self.config.pipeline_slug}/builds" + ) + + build_data = { + "commit": "HEAD", + "branch": "main", + "message": f"Kernel eval: {run_id}", + "env": { + "KERNELBOT_RUN_ID": run_id, + "KERNELBOT_PAYLOAD": payload, + "KERNELBOT_QUEUE": queue, + "KERNELBOT_IMAGE": self.config.image, + "KERNELBOT_CPUS": str(self.config.cpus), + "KERNELBOT_MEMORY": self.config.memory, + }, + "meta_data": { + "run_id": run_id, + "queue": queue, + }, + } + + try: + response = await client.post(url, json=build_data) + response.raise_for_status() + build = response.json() + except httpx.HTTPError as e: + logger.error(f"Failed to create build: {e}") + return BuildkiteResult( + success=False, + error=f"Failed to create build: {e}", + result=None, + ) + + build_url = build.get("web_url") + build_number = build.get("number") + logger.info(f"Build created: {build_url}") + await status.update(f"Build created: [{build_number}](<{build_url}>)") + + # Wait for completion + return await self._wait_for_build(build, run_id, status) + + async def _wait_for_build( + self, build: dict, run_id: str, status: RunProgressReporter + ) -> BuildkiteResult: + """Poll until build completes and download artifacts.""" + client = await self._get_client() + build_url = build.get("url") + web_url = build.get("web_url") + start = asyncio.get_event_loop().time() + + while asyncio.get_event_loop().time() - start < self.config.max_wait_seconds: + try: + response = await client.get(build_url) + response.raise_for_status() + build = response.json() + except httpx.HTTPError as e: + logger.warning(f"Error polling build: 
{e}") + await asyncio.sleep(self.config.poll_interval_seconds) + continue + + state = build.get("state") + elapsed = asyncio.get_event_loop().time() - start + + if state == "passed": + await status.update(f"Build completed: [{build.get('number')}](<{web_url}>)") + result = await self._download_result(build) + return BuildkiteResult( + success=True, + error=None, + result=result, + build_url=web_url, + build_number=build.get("number"), + ) + + if state in ("failed", "canceled", "blocked"): + return BuildkiteResult( + success=False, + error=f"Build {state}", + result=None, + build_url=web_url, + build_number=build.get("number"), + ) + + await status.update( + f"⏳ Build [{build.get('number')}](<{web_url}>): {state} ({elapsed:.1f}s)" + ) + await asyncio.sleep(self.config.poll_interval_seconds) + + return BuildkiteResult( + success=False, + error="Build timed out", + result=None, + build_url=web_url, + build_number=build.get("number"), + ) + + async def _download_result(self, build: dict) -> dict[str, Any] | None: + """Download result.json artifact.""" + client = await self._get_client() + + # Get artifacts from first job + jobs = build.get("jobs", []) + if not jobs: + return None + + job = jobs[0] + artifacts_url = job.get("artifacts_url") + if not artifacts_url: + return None + + try: + response = await client.get(artifacts_url) + response.raise_for_status() + artifacts = response.json() + + for artifact in artifacts: + if artifact.get("filename") == "result.json": + download_url = artifact.get("download_url") + result_resp = await client.get(download_url) + result_resp.raise_for_status() + return result_resp.json() + except Exception as e: + logger.error(f"Failed to download artifacts: {e}") + + return None + + def _parse_result(self, data: dict[str, Any]) -> FullResult: + """Parse result.json into FullResult.""" + runs = {} + + for k, v in data.get("runs", {}).items(): + comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"]) + run_res 
= None if v.get("run") is None else RunResult(**v["run"]) + profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"]) + + res = EvalResult( + start=datetime.datetime.fromisoformat(v["start"]), + end=datetime.datetime.fromisoformat(v["end"]), + compilation=comp_res, + run=run_res, + profile=profile_res, + ) + runs[k] = res + + system = SystemInfo(**data.get("system", {})) + return FullResult(success=True, error="", runs=runs, system=system) + + async def get_queue_status(self, queue: str) -> dict[str, Any]: + """Get status of agents in a queue.""" + client = await self._get_client() + url = f"{BUILDKITE_API}/organizations/{self.config.org_slug}/agents" + + try: + response = await client.get(url) + response.raise_for_status() + agents = response.json() + except httpx.HTTPError as e: + return {"error": str(e), "agents": []} + + queue_agents = [] + for agent in agents: + agent_queue = None + for meta in agent.get("metadata", []): + if meta.startswith("queue="): + agent_queue = meta.split("=", 1)[1] + break + + if agent_queue == queue: + queue_agents.append({ + "name": agent.get("name"), + "state": agent.get("connection_state"), + "busy": agent.get("job") is not None, + "gpu_index": next( + (m.split("=")[1] for m in agent.get("metadata", []) + if m.startswith("gpu-index=")), + None + ), + }) + + return { + "queue": queue, + "total": len(queue_agents), + "idle": sum(1 for a in queue_agents if not a["busy"]), + "agents": queue_agents, + } diff --git a/src/runners/buildkite-runner.py b/src/runners/buildkite-runner.py new file mode 100644 index 00000000..d865bf2c --- /dev/null +++ b/src/runners/buildkite-runner.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +"""Buildkite job runner for kernel evaluation.""" + +import base64 +import json +import os +import sys +import zlib +from dataclasses import asdict +from datetime import datetime +from pathlib import Path + + +def serialize(obj: object): + """Serialize datetime objects for JSON.""" + if 
isinstance(obj, datetime): + return obj.isoformat() + raise TypeError(f"Type {type(obj)} not serializable") + + +def main(): + run_id = os.environ.get("KERNELBOT_RUN_ID", "unknown") + payload_b64 = os.environ.get("KERNELBOT_PAYLOAD") + + print("=== Kernelbot Evaluation ===") + print(f"Run ID: {run_id}") + print(f"GPU: {os.environ.get('NVIDIA_VISIBLE_DEVICES', 'not set')}") + print(f"GPU Index: {os.environ.get('KERNELBOT_GPU_INDEX', 'not set')}") + print() + + if not payload_b64: + print("ERROR: KERNELBOT_PAYLOAD not set", file=sys.stderr) + sys.exit(1) + + # Decode payload + try: + compressed = base64.b64decode(payload_b64) + config_json = zlib.decompress(compressed).decode("utf-8") + config = json.loads(config_json) + except Exception as e: + print(f"ERROR: Failed to decode payload: {e}", file=sys.stderr) + sys.exit(1) + + # Import here to catch import errors clearly + from libkernelbot.run_eval import run_config + + # Run evaluation + print("Starting evaluation...") + result = run_config(config) + + # Write result + result_dict = asdict(result) + result_json = json.dumps(result_dict, default=serialize, indent=2) + Path("result.json").write_text(result_json) + print("Result written to result.json") + + # Print summary + print() + print("=== Result ===") + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + + sys.exit(0 if result.success else 1) + + +if __name__ == "__main__": + main() diff --git a/tests/e2e_buildkite_test.py b/tests/e2e_buildkite_test.py new file mode 100644 index 00000000..d29bda8f --- /dev/null +++ b/tests/e2e_buildkite_test.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""End-to-end test for Buildkite integration. + +Usage: + BUILDKITE_API_TOKEN=xxx python tests/e2e_buildkite_test.py [--queue QUEUE] + +This script: +1. Creates a simple test job +2. Submits it to Buildkite +3. Waits for completion +4. 
Prints the result +""" + +import argparse +import asyncio +import os +import sys + +# Add src to path for local testing +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + + +async def main(): + parser = argparse.ArgumentParser(description="E2E test for Buildkite integration") + parser.add_argument("--queue", default="test", help="Buildkite queue name (default: test)") + parser.add_argument("--org", default="gpu-mode", help="Buildkite org slug") + parser.add_argument("--pipeline", default="kernelbot", help="Buildkite pipeline slug") + parser.add_argument("--dry-run", action="store_true", help="Just print config, don't submit") + args = parser.parse_args() + + token = os.environ.get("BUILDKITE_API_TOKEN") + if not token: + print("ERROR: BUILDKITE_API_TOKEN environment variable not set") + sys.exit(1) + + from libkernelbot.launchers.buildkite import BuildkiteConfig, BuildkiteLauncher + + config = BuildkiteConfig( + org_slug=args.org, + pipeline_slug=args.pipeline, + api_token=token, + ) + + print("=== Buildkite E2E Test ===") + print(f"Organization: {config.org_slug}") + print(f"Pipeline: {config.pipeline_slug}") + print(f"Queue: {args.queue}") + print() + + # Simple test config - just print GPU info + test_config = { + "lang": "py", + "mode": "test", + "sources": { + "submission.py": """ +import torch +print(f"CUDA available: {torch.cuda.is_available()}") +if torch.cuda.is_available(): + print(f"GPU: {torch.cuda.get_device_name()}") + print(f"Device count: {torch.cuda.device_count()}") +""", + }, + "main": "submission.py", + "tests": [], + "benchmarks": [], + } + + if args.dry_run: + print("Dry run - config would be:") + import json + print(json.dumps(test_config, indent=2)) + return + + launcher = BuildkiteLauncher(config) + + # Create a simple status reporter + class SimpleReporter: + async def push(self, msg): + print(f"[STATUS] {msg}") + + async def update(self, msg): + print(f"[UPDATE] {msg}") + + print("Submitting test job...") + result 
= await launcher._launch( + run_id="e2e-test", + config=test_config, + queue=args.queue, + status=SimpleReporter(), + ) + + print() + print("=== Result ===") + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + if result.build_url: + print(f"Build URL: {result.build_url}") + if result.result: + import json + print(f"Result: {json.dumps(result.result, indent=2)}") + + # Also test queue status + print() + print("=== Queue Status ===") + status = await launcher.get_queue_status(args.queue) + print(f"Queue: {status.get('queue')}") + print(f"Total agents: {status.get('total')}") + print(f"Idle agents: {status.get('idle')}") + for agent in status.get("agents", []): + print(f" - {agent['name']}: {agent['state']} (busy={agent['busy']})") + + +if __name__ == "__main__": + asyncio.run(main()) From ac465231b0052dc2f310e5ca0288b604f2ec63a8 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 15:31:50 -0800 Subject: [PATCH 02/27] Add Buildkite infrastructure documentation --- SKILLS/buildkite.md | 274 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 SKILLS/buildkite.md diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md new file mode 100644 index 00000000..e9b57c75 --- /dev/null +++ b/SKILLS/buildkite.md @@ -0,0 +1,274 @@ +# Buildkite GPU Infrastructure Guide + +This document describes how to set up and use the Buildkite infrastructure for GPU job isolation. + +## Overview + +Buildkite provides a parallel infrastructure for onboarding arbitrary GPU vendors with proper isolation. 
It runs alongside the existing GitHub Actions system, providing: + +- Per-GPU job isolation via `NVIDIA_VISIBLE_DEVICES` +- Resource constraints (CPU, RAM, disk) via Docker cgroups +- Clear, reproducible Docker environment +- Automatic queue management + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ VENDOR 8-GPU NODE │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ Agent GPU-0 │ │ Agent GPU-1 │ ... │ Agent GPU-7 │ │ +│ │ NVIDIA_VIS │ │ NVIDIA_VIS │ │ NVIDIA_VIS │ │ +│ │ IBLE_DEV=0 │ │ IBLE_DEV=1 │ │ IBLE_DEV=7 │ │ +│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ +│ └───────────────┴───────────────────┘ │ +│ │ │ +│ ┌────────────▼────────────┐ │ +│ │ queue = "nvidia-b200" │ ← All agents same queue│ +│ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ + ┌───────────────────────┐ + │ BUILDKITE CLOUD │ + │ Routes to idle agent │ + └───────────────────────┘ +``` + +## Prerequisites + +### Buildkite Account Setup + +1. Create/access Buildkite organization at https://buildkite.com +2. Create a pipeline named `kernelbot` +3. 
Generate two tokens: + - **Agent Token**: For nodes to connect (Agents → Agent Tokens) + - **API Token**: For submitting jobs (Personal Settings → API Access Tokens) + +### API Token Permissions + +The API token needs these scopes: +- `read_builds` +- `write_builds` +- `read_agents` (optional, for queue status) + +## Vendor Node Setup + +### Automated Setup (Full) + +For a fresh Ubuntu node with NVIDIA GPUs: + +```bash +git clone https://github.com/gpu-mode/kernelbot.git +cd kernelbot +git checkout buildkite-infrastructure + +sudo BUILDKITE_AGENT_TOKEN=<agent-token> GPU_TYPE=<gpu-type> ./deployment/buildkite/setup-node.sh +``` + +Environment variables: +- `BUILDKITE_AGENT_TOKEN` (required): Agent token from Buildkite +- `GPU_TYPE` (required): Queue name, e.g., `b200`, `h100`, `mi300`, `test` +- `GPU_COUNT` (optional): Number of GPUs (auto-detected) +- `CPUS_PER_GPU` (optional): CPUs per agent (default: 8) +- `RAM_PER_GPU` (optional): RAM per agent (default: 64g) +- `NODE_NAME` (optional): Node identifier (default: hostname) + +### Manual Setup (Existing Docker/NVIDIA) + +If Docker and nvidia-container-toolkit are already installed: + +```bash +# Install Buildkite agent +sudo apt-get install -y apt-transport-https gnupg +curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | \ + sudo gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg +echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | \ + sudo tee /etc/apt/sources.list.d/buildkite-agent.list +sudo apt-get update +sudo apt-get install -y buildkite-agent + +# Configure agent +export BUILDKITE_TOKEN="<agent-token>" +export GPU_TYPE="test" +export NODE_NAME=$(hostname) + +echo "token=\"${BUILDKITE_TOKEN}\" +name=\"${NODE_NAME}-gpu0\" +tags=\"queue=${GPU_TYPE},gpu=${GPU_TYPE},gpu-index=0,node=${NODE_NAME}\"" | \ + sudo tee /etc/buildkite-agent/buildkite-agent.cfg + +# Add to docker group and start +sudo 
usermod -aG docker buildkite-agent +sudo systemctl enable buildkite-agent +sudo systemctl start buildkite-agent +``` + +### Verify Agent Connection + +Check the Buildkite dashboard: +``` +https://buildkite.com/organizations/<your-org>/agents +``` + +Or via API: +```bash +curl -H "Authorization: Bearer <api-token>" \ + https://api.buildkite.com/v2/organizations/<your-org>/agents +``` + +## Pipeline Configuration + +### Create Pipeline in Buildkite + +1. Go to Pipelines → New Pipeline +2. Name: `kernelbot` +3. Repository: `https://github.com/gpu-mode/kernelbot` +4. Steps: Either upload from repo or paste directly + +### Pipeline YAML + +The pipeline is at `deployment/buildkite/pipeline.yml`: + +```yaml +steps: + - label: ":rocket: Kernel Evaluation" + command: "python /app/src/runners/buildkite-runner.py" + agents: + queue: "${KERNELBOT_QUEUE}" + plugins: + - docker#v5.11.0: + image: "${KERNELBOT_IMAGE:-ghcr.io/gpu-mode/kernelbot:latest}" + runtime: nvidia + propagate-environment: true + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + timeout_in_minutes: 15 +``` + +## Testing + +### End-to-End Test + +Run from your local machine: + +```bash +cd kernelbot +BUILDKITE_API_TOKEN=<api-token> uv run python tests/e2e_buildkite_test.py --queue test +``` + +Options: +- `--queue <name>`: Target queue (default: test) +- `--org <slug>`: Buildkite org (default: gpu-mode) +- `--pipeline <slug>`: Pipeline name (default: kernelbot) +- `--dry-run`: Print config without submitting + +### Check Queue Status + +```bash +BUILDKITE_API_TOKEN=<api-token> uv run python -c " +import asyncio +from libkernelbot.launchers.buildkite import BuildkiteLauncher, BuildkiteConfig + +async def main(): + launcher = BuildkiteLauncher(BuildkiteConfig(api_token='<api-token>')) + status = await launcher.get_queue_status('test') + print(f'Queue: {status[\"queue\"]}') + print(f'Total agents: {status[\"total\"]}') + print(f'Idle agents: {status[\"idle\"]}') + for agent in status['agents']: + print(f' - {agent[\"name\"]}: 
busy={agent[\"busy\"]}') + +asyncio.run(main()) +" +``` + +## GPU Types + +Buildkite-managed GPUs are registered with `_BK` suffix: + +| GPU Type | Queue | SM Arch | +|----------|-------|---------| +| `B200_BK` | `b200` | 100 | +| `H100_BK` | `h100` | 90a | +| `MI300_BK` | `mi300` | (AMD) | + +## Environment Variables + +### For Kernelbot API/Backend + +- `BUILDKITE_API_TOKEN`: API token for submitting jobs + +### For Buildkite Agents (set by setup script) + +- `NVIDIA_VISIBLE_DEVICES`: GPU index for isolation +- `CUDA_VISIBLE_DEVICES`: Same as above +- `KERNELBOT_GPU_INDEX`: GPU index (0, 1, 2, ...) +- `KERNELBOT_CPUSET`: CPU cores for this agent +- `KERNELBOT_MEMORY`: Memory limit + +### For Jobs (passed via pipeline) + +- `KERNELBOT_RUN_ID`: Unique run identifier +- `KERNELBOT_PAYLOAD`: Base64+zlib compressed job config +- `KERNELBOT_QUEUE`: Target queue name + +## Troubleshooting + +### Agent not appearing in dashboard + +1. Check agent is running: `sudo systemctl status buildkite-agent` +2. Check logs: `sudo journalctl -u buildkite-agent -f` +3. Verify token is correct in `/etc/buildkite-agent/buildkite-agent.cfg` + +### Job stuck in queue + +1. Check agents are idle: Buildkite dashboard → Agents +2. Verify queue name matches agent tags +3. Check agent logs for errors + +### Docker permission denied + +```bash +sudo usermod -aG docker buildkite-agent +sudo systemctl restart buildkite-agent +``` + +### GPU not visible in container + +1. Verify nvidia-container-toolkit: `nvidia-ctk --version` +2. Configure docker runtime: `sudo nvidia-ctk runtime configure --runtime=docker` +3. Restart docker: `sudo systemctl restart docker` +4. 
Test: `docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi` + +### Package dependency conflicts (nvidia-container-toolkit) + +If you see version conflicts: +```bash +sudo apt-get install -y nvidia-container-toolkit=1.18.1-1 nvidia-container-toolkit-base=1.18.1-1 +``` + +## Resource Isolation + +| Resource | Method | Enforcement | +|----------|--------|-------------| +| GPU | `NVIDIA_VISIBLE_DEVICES` | Per-agent env var | +| CPU | `--cpuset-cpus` | Docker cgroups | +| Memory | `--memory` | Docker cgroups | +| Disk | Separate build paths | Filesystem | +| Network | Docker bridge | Container isolation | + +## Files Reference + +| File | Purpose | +|------|---------| +| `deployment/buildkite/setup-node.sh` | Vendor node setup script | +| `deployment/buildkite/pipeline.yml` | Buildkite pipeline config | +| `deployment/buildkite/Dockerfile` | Docker image for jobs | +| `src/libkernelbot/launchers/buildkite.py` | BuildkiteLauncher class | +| `src/runners/buildkite-runner.py` | Job execution script | +| `tests/e2e_buildkite_test.py` | E2E test script | From 8db475441bef488757127db6565582f1a92b8f77 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 15:52:41 -0800 Subject: [PATCH 03/27] Update Buildkite docs with troubleshooting lessons learned --- SKILLS/buildkite.md | 51 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md index e9b57c75..cad2ae9c 100644 --- a/SKILLS/buildkite.md +++ b/SKILLS/buildkite.md @@ -252,6 +252,57 @@ If you see version conflicts: sudo apt-get install -y nvidia-container-toolkit=1.18.1-1 nvidia-container-toolkit-base=1.18.1-1 ``` +### Agent fails with "Missing build-path" + +The config file needs `build-path` set: + +```bash +sudo nano /etc/buildkite-agent/buildkite-agent.cfg +``` + +Add this line: +``` +build-path="/var/lib/buildkite-agent/builds" +``` + +Then: +```bash +sudo mkdir -p /var/lib/buildkite-agent/builds +sudo chown 
buildkite-agent:buildkite-agent /var/lib/buildkite-agent/builds +sudo systemctl restart buildkite-agent +``` + +### Agent not appearing - "Could not find queue" + +You must create the queue in Buildkite web UI: +1. Go to **Agents** tab → **Default cluster** → **Queues** +2. Click **New Queue** +3. Enter queue name (e.g., `test`) +4. Select **Self hosted** +5. Click **Create Queue** + +### Jobs run on hosted agents instead of self-hosted + +Make sure your pipeline steps include the queue: + +```yaml +steps: + - label: ":rocket: Test Job" + command: "nvidia-smi" + agents: + queue: "test" # This is required! +``` + +Without `agents: queue:`, Buildkite uses hosted runners by default. + +### Git clone fails with "Permission denied (publickey)" + +The buildkite-agent user doesn't have SSH keys for GitHub. Fix by using HTTPS: + +```bash +cd /tmp && sudo -u buildkite-agent git config --global url."https://github.com/".insteadOf "git@github.com:" +``` + ## Resource Isolation | Resource | Method | Enforcement | From 2b97f309eb9b30bf60747ab64da12b414e4765e1 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 15:55:41 -0800 Subject: [PATCH 04/27] Add simplified Buildkite setup script with proper GPU isolation --- deployment/buildkite/setup-node-simple.sh | 121 ++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100755 deployment/buildkite/setup-node-simple.sh diff --git a/deployment/buildkite/setup-node-simple.sh b/deployment/buildkite/setup-node-simple.sh new file mode 100755 index 00000000..1fcbbaa4 --- /dev/null +++ b/deployment/buildkite/setup-node-simple.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# Buildkite GPU Node Setup - Simplified version +# Usage: sudo BUILDKITE_AGENT_TOKEN=xxx GPU_TYPE=test ./setup-node-simple.sh + +set -euo pipefail + +# === CONFIGURATION === +BUILDKITE_TOKEN="${BUILDKITE_AGENT_TOKEN:?Must set BUILDKITE_AGENT_TOKEN}" +GPU_TYPE="${GPU_TYPE:?Must set GPU_TYPE (e.g., b200, mi300, h100, test)}" 
+NODE_NAME="${NODE_NAME:-$(hostname)}" + +# Auto-detect GPU count +if command -v nvidia-smi &> /dev/null; then + GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -1) +else + GPU_COUNT="${GPU_COUNT:-1}" +fi + +echo "=== Buildkite GPU Node Setup ===" +echo "Node: ${NODE_NAME}" +echo "GPU Type: ${GPU_TYPE}" +echo "GPU Count: ${GPU_COUNT}" +echo "" + +# === INSTALL BUILDKITE AGENT === +if ! command -v buildkite-agent &> /dev/null; then + echo "Installing Buildkite Agent..." + apt-get update + apt-get install -y apt-transport-https gnupg + curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | \ + gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg + echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | \ + tee /etc/apt/sources.list.d/buildkite-agent.list + apt-get update + apt-get install -y buildkite-agent +fi + +# === STOP EXISTING AGENTS === +echo "Stopping existing agents..." +systemctl stop buildkite-agent 2>/dev/null || true +for i in $(seq 0 15); do + systemctl stop "buildkite-agent-gpu${i}" 2>/dev/null || true + systemctl disable "buildkite-agent-gpu${i}" 2>/dev/null || true +done + +# === CREATE DIRECTORIES === +mkdir -p /var/lib/buildkite-agent/builds +chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent + +# === CONFIGURE GIT TO USE HTTPS === +sudo -u buildkite-agent git config --global url."https://github.com/".insteadOf "git@github.com:" + +# === CREATE AGENT FOR EACH GPU === +echo "Creating ${GPU_COUNT} agents..." 
+ +for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + agent_name="${NODE_NAME}-gpu${gpu_idx}" + config_file="/etc/buildkite-agent/buildkite-agent-gpu${gpu_idx}.cfg" + build_dir="/var/lib/buildkite-agent/builds/gpu${gpu_idx}" + + mkdir -p "${build_dir}" + chown buildkite-agent:buildkite-agent "${build_dir}" + + # Write config + cat > "${config_file}" << EOF +token="${BUILDKITE_TOKEN}" +name="${agent_name}" +tags="queue=${GPU_TYPE},gpu=${GPU_TYPE},gpu-index=${gpu_idx},node=${NODE_NAME}" +build-path="${build_dir}" +hooks-path="/etc/buildkite-agent/hooks" +EOF + + # Write systemd service + cat > "/etc/systemd/system/buildkite-agent-gpu${gpu_idx}.service" << EOF +[Unit] +Description=Buildkite Agent (GPU ${gpu_idx}) +Documentation=https://buildkite.com/docs/agent/v3 +After=network.target + +[Service] +Type=simple +User=buildkite-agent +Environment=NVIDIA_VISIBLE_DEVICES=${gpu_idx} +Environment=CUDA_VISIBLE_DEVICES=${gpu_idx} +ExecStart=/usr/bin/buildkite-agent start --config ${config_file} +RestartSec=5 +Restart=on-failure +TimeoutStartSec=10 +TimeoutStopSec=60 + +[Install] +WantedBy=multi-user.target +EOF + + echo " Created agent ${gpu_idx}: GPU=${gpu_idx}" +done + +# === START AGENTS === +echo "Starting agents..." 
+systemctl daemon-reload + +for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + systemctl enable "buildkite-agent-gpu${gpu_idx}" + systemctl start "buildkite-agent-gpu${gpu_idx}" +done + +sleep 3 + +echo "" +echo "=== Agent Status ===" +for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + status=$(systemctl is-active "buildkite-agent-gpu${gpu_idx}" 2>/dev/null || echo "unknown") + echo " GPU ${gpu_idx}: ${status}" +done + +echo "" +echo "=== Setup Complete ===" +echo "Created ${GPU_COUNT} agents for queue: ${GPU_TYPE}" +echo "Each agent sees only its assigned GPU via NVIDIA_VISIBLE_DEVICES" +echo "" +echo "Check agents at: https://buildkite.com/organizations/YOUR_ORG/agents" From 41e141dce009461341e80e110680f42b2a35d13e Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 15:56:23 -0800 Subject: [PATCH 05/27] Fix git config permission error in setup script --- deployment/buildkite/setup-node-simple.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/deployment/buildkite/setup-node-simple.sh b/deployment/buildkite/setup-node-simple.sh index 1fcbbaa4..23506fd2 100755 --- a/deployment/buildkite/setup-node-simple.sh +++ b/deployment/buildkite/setup-node-simple.sh @@ -48,6 +48,7 @@ mkdir -p /var/lib/buildkite-agent/builds chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent # === CONFIGURE GIT TO USE HTTPS === +cd /tmp sudo -u buildkite-agent git config --global url."https://github.com/".insteadOf "git@github.com:" # === CREATE AGENT FOR EACH GPU === From 4c62ea3809c10c6ffb2f1ac6f0ebfaa102b3ae26 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 16:11:37 -0800 Subject: [PATCH 06/27] Update Buildkite setup script with all fixes: environment hook, git HTTPS, proper isolation --- SKILLS/buildkite.md | 84 ++++++++++------------- deployment/buildkite/setup-node-simple.sh | 60 ++++++++++++++-- 2 files changed, 89 insertions(+), 55 deletions(-) diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md index cad2ae9c..979ff197 100644 --- 
a/SKILLS/buildkite.md +++ b/SKILLS/buildkite.md @@ -55,68 +55,56 @@ The API token needs these scopes: ## Vendor Node Setup -### Automated Setup (Full) +### Prerequisites (Do This First in Buildkite UI) -For a fresh Ubuntu node with NVIDIA GPUs: +Before running the setup script on your node: + +1. **Create Buildkite account** at https://buildkite.com +2. **Create pipeline** named `kernelbot` +3. **Generate Agent Token**: Go to Agents → Agent Tokens → New Token +4. **Create Queue**: Go to Agents → Default cluster → Queues → New Queue + - Enter your GPU type as the key (e.g., `test`, `b200`, `h100`) + - Select **Self hosted** + - Click Create Queue + +### Run Setup Script + +On your GPU node: ```bash git clone https://github.com/gpu-mode/kernelbot.git cd kernelbot -git checkout buildkite-infrastructure -sudo BUILDKITE_AGENT_TOKEN= GPU_TYPE= ./deployment/buildkite/setup-node.sh +sudo BUILDKITE_AGENT_TOKEN= GPU_TYPE= ./deployment/buildkite/setup-node-simple.sh ``` -Environment variables: -- `BUILDKITE_AGENT_TOKEN` (required): Agent token from Buildkite -- `GPU_TYPE` (required): Queue name, e.g., `b200`, `h100`, `mi300`, `test` -- `GPU_COUNT` (optional): Number of GPUs (auto-detected) -- `CPUS_PER_GPU` (optional): CPUs per agent (default: 8) -- `RAM_PER_GPU` (optional): RAM per agent (default: 64g) -- `NODE_NAME` (optional): Node identifier (default: hostname) +The script will: +- Install Buildkite agent (if not present) +- Create one agent per GPU with proper isolation +- Configure git to use HTTPS (avoids SSH key issues) +- Create environment hook that sets `NVIDIA_VISIBLE_DEVICES` per job +- Start all agents as systemd services -### Manual Setup (Existing Docker/NVIDIA) +### Verify Setup -If Docker and nvidia-container-toolkit are already installed: +1. Check agents appear in Buildkite: https://buildkite.com/organizations/YOUR_ORG/agents +2. 
Run a test build with this pipeline: -```bash -# Install Buildkite agent -sudo apt-get install -y apt-transport-https gnupg -curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | \ - sudo gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg -echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | \ - sudo tee /etc/apt/sources.list.d/buildkite-agent.list -sudo apt-get update -sudo apt-get install -y buildkite-agent - -# Configure agent -export BUILDKITE_TOKEN="" -export GPU_TYPE="test" -export NODE_NAME=$(hostname) - -echo "token=\"${BUILDKITE_TOKEN}\" -name=\"${NODE_NAME}-gpu0\" -tags=\"queue=${GPU_TYPE},gpu=${GPU_TYPE},gpu-index=0,node=${NODE_NAME}\"" | \ - sudo tee /etc/buildkite-agent/buildkite-agent.cfg - -# Add to docker group and start -sudo usermod -aG docker buildkite-agent -sudo systemctl enable buildkite-agent -sudo systemctl start buildkite-agent +```yaml +steps: + - label: "GPU Test" + command: "echo NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES && nvidia-smi -L" + agents: + queue: "your-queue-name" ``` -### Verify Agent Connection - -Check the Buildkite dashboard: -``` -https://buildkite.com/organizations//agents -``` +### Environment Variables -Or via API: -```bash -curl -H "Authorization: Bearer " \ - https://api.buildkite.com/v2/organizations//agents -``` +The setup script sets these automatically: +- `GPU_TYPE` (required): Queue name matching what you created in Buildkite +- `BUILDKITE_AGENT_TOKEN` (required): Agent token from Buildkite +- `NODE_NAME` (optional): Defaults to hostname +- `GPU_COUNT` (optional): Auto-detected from nvidia-smi ## Pipeline Configuration diff --git a/deployment/buildkite/setup-node-simple.sh b/deployment/buildkite/setup-node-simple.sh index 23506fd2..424d007e 100755 --- a/deployment/buildkite/setup-node-simple.sh +++ b/deployment/buildkite/setup-node-simple.sh @@ -1,6 +1,14 
@@ #!/bin/bash -# Buildkite GPU Node Setup - Simplified version +# Buildkite GPU Node Setup # Usage: sudo BUILDKITE_AGENT_TOKEN=xxx GPU_TYPE=test ./setup-node-simple.sh +# +# PREREQUISITES: +# 1. Create a Buildkite account and pipeline named 'kernelbot' +# 2. Generate an Agent Token from: Agents > Agent Tokens +# 3. Create a queue in: Agents > Default cluster > Queues > New Queue +# - Enter your GPU_TYPE as the key (e.g., 'test', 'b200', 'h100') +# - Select 'Self hosted' +# 4. Run this script with the token and GPU type set -euo pipefail @@ -22,6 +30,12 @@ echo "GPU Type: ${GPU_TYPE}" echo "GPU Count: ${GPU_COUNT}" echo "" +# === CHECK ROOT === +if [[ $EUID -ne 0 ]]; then + echo "ERROR: This script must be run as root (use sudo)" + exit 1 +fi + # === INSTALL BUILDKITE AGENT === if ! command -v buildkite-agent &> /dev/null; then echo "Installing Buildkite Agent..." @@ -33,24 +47,44 @@ if ! command -v buildkite-agent &> /dev/null; then tee /etc/apt/sources.list.d/buildkite-agent.list apt-get update apt-get install -y buildkite-agent + echo "Buildkite Agent installed." +else + echo "Buildkite Agent already installed." fi # === STOP EXISTING AGENTS === echo "Stopping existing agents..." systemctl stop buildkite-agent 2>/dev/null || true +systemctl disable buildkite-agent 2>/dev/null || true for i in $(seq 0 15); do systemctl stop "buildkite-agent-gpu${i}" 2>/dev/null || true systemctl disable "buildkite-agent-gpu${i}" 2>/dev/null || true done # === CREATE DIRECTORIES === +echo "Creating directories..." mkdir -p /var/lib/buildkite-agent/builds +mkdir -p /etc/buildkite-agent/hooks chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent +chown -R buildkite-agent:buildkite-agent /etc/buildkite-agent -# === CONFIGURE GIT TO USE HTTPS === +# === CONFIGURE GIT TO USE HTTPS (avoids SSH key issues) === +echo "Configuring git to use HTTPS..." 
cd /tmp sudo -u buildkite-agent git config --global url."https://github.com/".insteadOf "git@github.com:" +# === CREATE ENVIRONMENT HOOK FOR GPU ISOLATION === +echo "Creating environment hook for GPU isolation..." +cat > /etc/buildkite-agent/hooks/environment << 'HOOKEOF' +#!/bin/bash +# GPU isolation hook - sets NVIDIA_VISIBLE_DEVICES based on agent's gpu-index tag +export NVIDIA_VISIBLE_DEVICES="${BUILDKITE_AGENT_META_DATA_GPU_INDEX}" +export CUDA_VISIBLE_DEVICES="${BUILDKITE_AGENT_META_DATA_GPU_INDEX}" +echo "GPU isolation: NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES}" +HOOKEOF +chmod +x /etc/buildkite-agent/hooks/environment +chown buildkite-agent:buildkite-agent /etc/buildkite-agent/hooks/environment + # === CREATE AGENT FOR EACH GPU === echo "Creating ${GPU_COUNT} agents..." @@ -70,6 +104,7 @@ tags="queue=${GPU_TYPE},gpu=${GPU_TYPE},gpu-index=${gpu_idx},node=${NODE_NAME}" build-path="${build_dir}" hooks-path="/etc/buildkite-agent/hooks" EOF + chown buildkite-agent:buildkite-agent "${config_file}" # Write systemd service cat > "/etc/systemd/system/buildkite-agent-gpu${gpu_idx}.service" << EOF @@ -81,8 +116,6 @@ After=network.target [Service] Type=simple User=buildkite-agent -Environment=NVIDIA_VISIBLE_DEVICES=${gpu_idx} -Environment=CUDA_VISIBLE_DEVICES=${gpu_idx} ExecStart=/usr/bin/buildkite-agent start --config ${config_file} RestartSec=5 Restart=on-failure @@ -93,7 +126,7 @@ TimeoutStopSec=60 WantedBy=multi-user.target EOF - echo " Created agent ${gpu_idx}: GPU=${gpu_idx}" + echo " Created agent ${gpu_idx}: ${agent_name}" done # === START AGENTS === @@ -116,7 +149,20 @@ done echo "" echo "=== Setup Complete ===" +echo "" echo "Created ${GPU_COUNT} agents for queue: ${GPU_TYPE}" -echo "Each agent sees only its assigned GPU via NVIDIA_VISIBLE_DEVICES" +echo "GPU isolation is handled via environment hook (NVIDIA_VISIBLE_DEVICES)" +echo "" +echo "IMPORTANT: Make sure you created the '${GPU_TYPE}' queue in Buildkite:" +echo " 1. 
Go to: https://buildkite.com/organizations/YOUR_ORG/clusters" +echo " 2. Click 'Default cluster' > 'Queues' > 'New Queue'" +echo " 3. Enter '${GPU_TYPE}' as the key, select 'Self hosted'" +echo "" +echo "Your agents should appear at: https://buildkite.com/organizations/YOUR_ORG/agents" echo "" -echo "Check agents at: https://buildkite.com/organizations/YOUR_ORG/agents" +echo "Test with this pipeline step:" +echo ' steps:' +echo ' - label: "GPU Test"' +echo ' command: "echo NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES && nvidia-smi -L"' +echo ' agents:' +echo " queue: \"${GPU_TYPE}\"" From 1e06bb37a88235bce1338dd6d52032a8fc556200 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 16:12:44 -0800 Subject: [PATCH 07/27] Add Docker-based pipeline with full resource isolation (GPU, CPU, RAM) --- deployment/buildkite/pipeline-test-docker.yml | 36 +++++++++++++++++++ deployment/buildkite/setup-node-simple.sh | 32 ++++++++++++++--- 2 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 deployment/buildkite/pipeline-test-docker.yml diff --git a/deployment/buildkite/pipeline-test-docker.yml b/deployment/buildkite/pipeline-test-docker.yml new file mode 100644 index 00000000..e9c9a360 --- /dev/null +++ b/deployment/buildkite/pipeline-test-docker.yml @@ -0,0 +1,36 @@ +# Simple Docker test pipeline for Buildkite +# Paste this into your pipeline settings to test Docker + GPU isolation + +steps: + - label: ":whale: Docker GPU Test" + agents: + queue: "test" # Change to your queue name + + plugins: + - docker#v5.11.0: + image: "nvidia/cuda:12.4.0-runtime-ubuntu22.04" + always-pull: true + runtime: nvidia + propagate-environment: true + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + # Resource constraints from environment hook + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + + command: | + echo "=== Environment ===" + echo "NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES" + echo 
"CUDA_VISIBLE_DEVICES=$$CUDA_VISIBLE_DEVICES" + echo "" + echo "=== GPU Info ===" + nvidia-smi + echo "" + echo "=== CPU Info ===" + nproc + echo "" + echo "=== Memory Info ===" + free -h + + timeout_in_minutes: 5 diff --git a/deployment/buildkite/setup-node-simple.sh b/deployment/buildkite/setup-node-simple.sh index 424d007e..66d004b1 100755 --- a/deployment/buildkite/setup-node-simple.sh +++ b/deployment/buildkite/setup-node-simple.sh @@ -74,13 +74,35 @@ cd /tmp sudo -u buildkite-agent git config --global url."https://github.com/".insteadOf "git@github.com:" # === CREATE ENVIRONMENT HOOK FOR GPU ISOLATION === -echo "Creating environment hook for GPU isolation..." +echo "Creating environment hook for GPU/CPU/RAM isolation..." cat > /etc/buildkite-agent/hooks/environment << 'HOOKEOF' #!/bin/bash -# GPU isolation hook - sets NVIDIA_VISIBLE_DEVICES based on agent's gpu-index tag -export NVIDIA_VISIBLE_DEVICES="${BUILDKITE_AGENT_META_DATA_GPU_INDEX}" -export CUDA_VISIBLE_DEVICES="${BUILDKITE_AGENT_META_DATA_GPU_INDEX}" -echo "GPU isolation: NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES}" +# Resource isolation hook - sets GPU, CPU, and memory limits based on agent's gpu-index + +GPU_INDEX="${BUILDKITE_AGENT_META_DATA_GPU_INDEX:-0}" +CPUS_PER_GPU="${CPUS_PER_GPU:-8}" +RAM_PER_GPU="${RAM_PER_GPU:-64g}" + +# GPU isolation +export NVIDIA_VISIBLE_DEVICES="${GPU_INDEX}" +export CUDA_VISIBLE_DEVICES="${GPU_INDEX}" + +# CPU isolation (assign a range of CPUs to each GPU) +CPU_START=$((GPU_INDEX * CPUS_PER_GPU)) +CPU_END=$((CPU_START + CPUS_PER_GPU - 1)) +export KERNELBOT_CPUSET="${CPU_START}-${CPU_END}" +export KERNELBOT_CPUS="${CPUS_PER_GPU}" + +# Memory isolation +export KERNELBOT_MEMORY="${RAM_PER_GPU}" + +# GPU index for the runner +export KERNELBOT_GPU_INDEX="${GPU_INDEX}" + +echo "=== Resource Isolation ===" +echo "GPU: ${NVIDIA_VISIBLE_DEVICES}" +echo "CPUs: ${KERNELBOT_CPUSET} (${KERNELBOT_CPUS} cores)" +echo "Memory: ${KERNELBOT_MEMORY}" HOOKEOF chmod +x 
/etc/buildkite-agent/hooks/environment chown buildkite-agent:buildkite-agent /etc/buildkite-agent/hooks/environment From 64187722abba8faac9a9357783c363cd843e1a6a Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 16:13:41 -0800 Subject: [PATCH 08/27] Auto-detect CPU/RAM and divide by GPU count in environment hook --- deployment/buildkite/setup-node-simple.sh | 27 +++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/deployment/buildkite/setup-node-simple.sh b/deployment/buildkite/setup-node-simple.sh index 66d004b1..7c67bd98 100755 --- a/deployment/buildkite/setup-node-simple.sh +++ b/deployment/buildkite/setup-node-simple.sh @@ -77,11 +77,25 @@ sudo -u buildkite-agent git config --global url."https://github.com/".insteadOf echo "Creating environment hook for GPU/CPU/RAM isolation..." cat > /etc/buildkite-agent/hooks/environment << 'HOOKEOF' #!/bin/bash -# Resource isolation hook - sets GPU, CPU, and memory limits based on agent's gpu-index +# Resource isolation hook - auto-detects and divides resources by GPU count GPU_INDEX="${BUILDKITE_AGENT_META_DATA_GPU_INDEX:-0}" -CPUS_PER_GPU="${CPUS_PER_GPU:-8}" -RAM_PER_GPU="${RAM_PER_GPU:-64g}" + +# Auto-detect total resources +TOTAL_CPUS=$(nproc) +TOTAL_RAM_KB=$(grep MemTotal /proc/meminfo | awk '{print $2}') +TOTAL_RAM_GB=$((TOTAL_RAM_KB / 1024 / 1024)) + +# Auto-detect GPU count +if command -v nvidia-smi &> /dev/null; then + GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -1) +else + GPU_COUNT=1 +fi + +# Calculate per-GPU allocation +CPUS_PER_GPU=$((TOTAL_CPUS / GPU_COUNT)) +RAM_PER_GPU=$((TOTAL_RAM_GB / GPU_COUNT)) # GPU isolation export NVIDIA_VISIBLE_DEVICES="${GPU_INDEX}" @@ -94,15 +108,14 @@ export KERNELBOT_CPUSET="${CPU_START}-${CPU_END}" export KERNELBOT_CPUS="${CPUS_PER_GPU}" # Memory isolation -export KERNELBOT_MEMORY="${RAM_PER_GPU}" +export KERNELBOT_MEMORY="${RAM_PER_GPU}g" # GPU index for the runner export 
KERNELBOT_GPU_INDEX="${GPU_INDEX}" echo "=== Resource Isolation ===" -echo "GPU: ${NVIDIA_VISIBLE_DEVICES}" -echo "CPUs: ${KERNELBOT_CPUSET} (${KERNELBOT_CPUS} cores)" -echo "Memory: ${KERNELBOT_MEMORY}" +echo "Machine: ${TOTAL_CPUS} CPUs, ${TOTAL_RAM_GB}GB RAM, ${GPU_COUNT} GPUs" +echo "This job: GPU ${NVIDIA_VISIBLE_DEVICES}, CPUs ${KERNELBOT_CPUSET}, RAM ${KERNELBOT_MEMORY}" HOOKEOF chmod +x /etc/buildkite-agent/hooks/environment chown buildkite-agent:buildkite-agent /etc/buildkite-agent/hooks/environment From 6e778695313442bb4970ad83975369d157d96746 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 16:16:37 -0800 Subject: [PATCH 09/27] Add plugins-path to agent config for Docker plugin support --- deployment/buildkite/setup-node-simple.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deployment/buildkite/setup-node-simple.sh b/deployment/buildkite/setup-node-simple.sh index 7c67bd98..af78686f 100755 --- a/deployment/buildkite/setup-node-simple.sh +++ b/deployment/buildkite/setup-node-simple.sh @@ -64,6 +64,7 @@ done # === CREATE DIRECTORIES === echo "Creating directories..." 
mkdir -p /var/lib/buildkite-agent/builds +mkdir -p /var/lib/buildkite-agent/plugins mkdir -p /etc/buildkite-agent/hooks chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent chown -R buildkite-agent:buildkite-agent /etc/buildkite-agent @@ -138,6 +139,7 @@ name="${agent_name}" tags="queue=${GPU_TYPE},gpu=${GPU_TYPE},gpu-index=${gpu_idx},node=${NODE_NAME}" build-path="${build_dir}" hooks-path="/etc/buildkite-agent/hooks" +plugins-path="/var/lib/buildkite-agent/plugins" EOF chown buildkite-agent:buildkite-agent "${config_file}" From 5ff1574136a9d71ad5ce685313b5e0fc69238bff Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 16:26:40 -0800 Subject: [PATCH 10/27] Update buildkite.md with comprehensive progress summary - Add Quick Start section for fast onboarding - Add Current Status showing working 2x L40S test - Add Working Docker Pipeline example with key points - Add troubleshooting for Docker runtime crashes and hook shebang - Document auto-resource detection (CPU/RAM divided by GPU count) - Add Summary of Key Decisions section --- SKILLS/buildkite.md | 104 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md index 979ff197..4818d040 100644 --- a/SKILLS/buildkite.md +++ b/SKILLS/buildkite.md @@ -11,6 +11,20 @@ Buildkite provides a parallel infrastructure for onboarding arbitrary GPU vendor - Clear, reproducible Docker environment - Automatic queue management +## Quick Start + +1. Create queue in Buildkite UI: Agents → Default cluster → Queues → New Queue (select "Self hosted") +2. Run setup script on your GPU node: + ```bash + sudo BUILDKITE_AGENT_TOKEN= GPU_TYPE= ./deployment/buildkite/setup-node-simple.sh + ``` +3. Test with `pipeline-test-docker.yml` + +## Current Status + +**Working**: Full GPU isolation with auto-resource detection. 
Tested on 2x NVIDIA L40S node with: +- Each agent gets 1 GPU, 8 CPUs, 144GB RAM (auto-calculated from 16 CPUs / 2 GPUs, 289GB / 2 GPUs) + ## Architecture ``` @@ -140,6 +154,40 @@ steps: ## Testing +### Working Docker Pipeline + +Use this tested pipeline configuration for GPU jobs: + +```yaml +steps: + - label: ":whale: Docker GPU Test" + agents: + queue: "test" # Must match your queue name + plugins: + - docker#v5.11.0: + image: "nvidia/cuda:12.4.0-runtime-ubuntu22.04" + always-pull: true + gpus: "all" # Use gpus instead of runtime: nvidia + propagate-environment: true + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + command: | + echo "=== Resource Isolation ===" + echo "NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES" + nvidia-smi + nproc + free -h + timeout_in_minutes: 5 +``` + +**Key points**: +- Use `gpus: "all"` instead of `runtime: nvidia` (more reliable) +- Use `$$NVIDIA_VISIBLE_DEVICES` (double dollar) in YAML to prevent variable stripping +- The environment hook auto-sets KERNELBOT_CPUS, KERNELBOT_MEMORY based on machine resources + ### End-to-End Test Run from your local machine: @@ -283,6 +331,33 @@ steps: Without `agents: queue:`, Buildkite uses hosted runners by default. +### Docker runtime crashes / "nvidia-container-runtime: no such file" + +Use `gpus: "all"` in the Docker plugin instead of `runtime: nvidia`: + +```yaml +plugins: + - docker#v5.11.0: + gpus: "all" # ✓ Use this + # runtime: nvidia # ✗ Avoid - can cause crashes +``` + +If issues persist, reinstall nvidia-container-toolkit: +```bash +sudo apt-get remove --purge nvidia-container-toolkit nvidia-container-toolkit-base +sudo apt-get install nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` + +### Environment hook not running + +Make sure the hook has a shebang line: +```bash +#!/bin/bash +# Rest of hook script... 
+``` + ### Git clone fails with "Permission denied (publickey)" The buildkite-agent user doesn't have SSH keys for GitHub. Fix by using HTTPS: @@ -305,9 +380,36 @@ cd /tmp && sudo -u buildkite-agent git config --global url."https://github.com/" | File | Purpose | |------|---------| -| `deployment/buildkite/setup-node.sh` | Vendor node setup script | +| `deployment/buildkite/setup-node-simple.sh` | Vendor node setup script (recommended) | | `deployment/buildkite/pipeline.yml` | Buildkite pipeline config | +| `deployment/buildkite/pipeline-test-docker.yml` | Docker test pipeline | | `deployment/buildkite/Dockerfile` | Docker image for jobs | | `src/libkernelbot/launchers/buildkite.py` | BuildkiteLauncher class | | `src/runners/buildkite-runner.py` | Job execution script | | `tests/e2e_buildkite_test.py` | E2E test script | + +## Auto-Resource Detection + +The environment hook automatically detects and divides machine resources: + +``` +Machine: 16 CPUs, 289GB RAM, 2 GPUs + ↓ +Per-GPU allocation: + - GPU 0: CPUs 0-7, 144GB RAM + - GPU 1: CPUs 8-15, 144GB RAM +``` + +This is calculated in the environment hook as: +- `CPUS_PER_GPU = TOTAL_CPUS / GPU_COUNT` +- `RAM_PER_GPU = TOTAL_RAM_GB / GPU_COUNT` +- `KERNELBOT_CPUSET = (GPU_INDEX * CPUS_PER_GPU) to ((GPU_INDEX + 1) * CPUS_PER_GPU - 1)` + +## Summary of Key Decisions + +1. **Use `gpus: "all"` not `runtime: nvidia`** - More reliable with nvidia-container-toolkit +2. **Environment hook for isolation** - Sets `NVIDIA_VISIBLE_DEVICES`, `KERNELBOT_*` vars before each job +3. **Auto-detect resources** - No hardcoded CPU/RAM values; divides machine resources by GPU count +4. **One agent per GPU** - Each agent has its own build path and GPU assignment +5. **HTTPS for git** - Avoids SSH key issues on buildkite-agent user +6. 
**Queue must exist first** - Create queue in Buildkite UI before agents can connect From 02c1c1070d45175595bb5ee0bcc9bbaed5da9c31 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 16:30:19 -0800 Subject: [PATCH 11/27] Add inline steps support for artifact testing - Add inline_steps parameter to _launch() for testing without pipeline config - Add create_artifact_test_steps() helper method - Update e2e test to use artifact mode by default (no Buildkite config needed) - Fix pipeline.yml to use gpus: "all" instead of runtime: nvidia - Add pipeline-artifact-test.yml for manual testing --- .../buildkite/pipeline-artifact-test.yml | 69 +++++++++++++++++ deployment/buildkite/pipeline.yml | 2 +- src/libkernelbot/launchers/buildkite.py | 77 +++++++++++++++++++ tests/e2e_buildkite_test.py | 47 ++++++----- 4 files changed, 176 insertions(+), 19 deletions(-) create mode 100644 deployment/buildkite/pipeline-artifact-test.yml diff --git a/deployment/buildkite/pipeline-artifact-test.yml b/deployment/buildkite/pipeline-artifact-test.yml new file mode 100644 index 00000000..d7c37aba --- /dev/null +++ b/deployment/buildkite/pipeline-artifact-test.yml @@ -0,0 +1,69 @@ +# Simple artifact test pipeline +# Tests: submit job -> run in Docker -> write result.json -> upload artifact -> download + +steps: + - label: ":package: Artifact Test" + agents: + queue: "${KERNELBOT_QUEUE:-test}" + + plugins: + - docker#v5.11.0: + image: "python:3.11-slim" + propagate-environment: true + environment: + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + - NVIDIA_VISIBLE_DEVICES + + command: | + python3 << 'PYEOF' + import base64 + import json + import os + import zlib + from datetime import datetime + + run_id = os.environ.get("KERNELBOT_RUN_ID", "unknown") + payload_b64 = os.environ.get("KERNELBOT_PAYLOAD", "") + + print("=== Artifact Test ===") + print(f"Run ID: {run_id}") + print(f"GPU: {os.environ.get('NVIDIA_VISIBLE_DEVICES', 'not set')}") + + # Decode payload if present + config = {} + 
if payload_b64: + try: + compressed = base64.b64decode(payload_b64) + config_json = zlib.decompress(compressed).decode("utf-8") + config = json.loads(config_json) + print(f"Decoded config: {json.dumps(config, indent=2)}") + except Exception as e: + print(f"Could not decode payload: {e}") + + # Create result + result = { + "success": True, + "error": None, + "run_id": run_id, + "timestamp": datetime.now().isoformat(), + "config_received": config, + "system": { + "gpu": os.environ.get("NVIDIA_VISIBLE_DEVICES", "none"), + }, + "runs": {} + } + + # Write result.json + with open("result.json", "w") as f: + json.dump(result, f, indent=2) + + print("\n=== Result ===") + print(json.dumps(result, indent=2)) + print("\nResult written to result.json") + PYEOF + + artifact_paths: + - "result.json" + + timeout_in_minutes: 5 diff --git a/deployment/buildkite/pipeline.yml b/deployment/buildkite/pipeline.yml index 13826a39..0c2da4ce 100644 --- a/deployment/buildkite/pipeline.yml +++ b/deployment/buildkite/pipeline.yml @@ -13,7 +13,7 @@ steps: - docker#v5.11.0: image: "${KERNELBOT_IMAGE:-ghcr.io/gpu-mode/kernelbot:latest}" always-pull: true - runtime: nvidia + gpus: "all" # Use gpus instead of runtime: nvidia for reliability # GPU isolation - agent exports NVIDIA_VISIBLE_DEVICES propagate-environment: true environment: diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py index 88c476a2..3dd44394 100644 --- a/src/libkernelbot/launchers/buildkite.py +++ b/src/libkernelbot/launchers/buildkite.py @@ -153,6 +153,7 @@ async def _launch( config: dict[str, Any], queue: str, status: RunProgressReporter, + inline_steps: list[dict[str, Any]] | None = None, ) -> BuildkiteResult: """ Launch a kernel evaluation job. 
@@ -162,6 +163,7 @@ async def _launch( config: Evaluation configuration dict queue: GPU queue name (e.g., "b200", "mi300") status: Progress reporter + inline_steps: Optional inline pipeline steps (for testing without pipeline config) Returns: BuildkiteResult with success status and results @@ -193,6 +195,10 @@ async def _launch( }, } + # If inline steps provided, use them instead of pipeline from repo + if inline_steps: + build_data["steps"] = inline_steps + try: response = await client.post(url, json=build_data) response.raise_for_status() @@ -357,3 +363,74 @@ async def get_queue_status(self, queue: str) -> dict[str, Any]: "idle": sum(1 for a in queue_agents if not a["busy"]), "agents": queue_agents, } + + def create_artifact_test_steps(self, queue: str) -> list[dict[str, Any]]: + """Create inline steps for artifact upload/download testing.""" + # Python script that decodes payload and writes result.json + script = ''' +import base64 +import json +import os +import zlib +from datetime import datetime + +run_id = os.environ.get("KERNELBOT_RUN_ID", "unknown") +payload_b64 = os.environ.get("KERNELBOT_PAYLOAD", "") + +print("=== Artifact Test ===") +print(f"Run ID: {run_id}") +print(f"GPU: {os.environ.get('NVIDIA_VISIBLE_DEVICES', 'not set')}") + +# Decode payload if present +config = {} +if payload_b64: + try: + compressed = base64.b64decode(payload_b64) + config_json = zlib.decompress(compressed).decode("utf-8") + config = json.loads(config_json) + print(f"Decoded config keys: {list(config.keys())}") + except Exception as e: + print(f"Could not decode payload: {e}") + +# Create result matching FullResult structure +result = { + "success": True, + "error": "", + "runs": {}, + "system": { + "gpu_name": os.environ.get("NVIDIA_VISIBLE_DEVICES", "unknown"), + "cuda_version": "test", + "python_version": "3.11", + }, +} + +# Write result.json +with open("result.json", "w") as f: + json.dump(result, f, indent=2) + +print("\\n=== Result ===") +print(json.dumps(result, 
indent=2)) +print("\\nResult written to result.json") +''' + return [ + { + "label": ":test_tube: Artifact Test", + "agents": {"queue": queue}, + "plugins": [ + { + "docker#v5.11.0": { + "image": "python:3.11-slim", + "propagate-environment": True, + "environment": [ + "KERNELBOT_PAYLOAD", + "KERNELBOT_RUN_ID", + "NVIDIA_VISIBLE_DEVICES", + ], + } + } + ], + "command": f"python3 -c {json.dumps(script)}", + "artifact_paths": ["result.json"], + "timeout_in_minutes": 5, + } + ] diff --git a/tests/e2e_buildkite_test.py b/tests/e2e_buildkite_test.py index d29bda8f..a1df0b94 100644 --- a/tests/e2e_buildkite_test.py +++ b/tests/e2e_buildkite_test.py @@ -6,9 +6,9 @@ This script: 1. Creates a simple test job -2. Submits it to Buildkite +2. Submits it to Buildkite with inline steps (no pipeline config needed) 3. Waits for completion -4. Prints the result +4. Downloads and prints the result artifact """ import argparse @@ -26,6 +26,12 @@ async def main(): parser.add_argument("--org", default="gpu-mode", help="Buildkite org slug") parser.add_argument("--pipeline", default="kernelbot", help="Buildkite pipeline slug") parser.add_argument("--dry-run", action="store_true", help="Just print config, don't submit") + parser.add_argument( + "--mode", + choices=["artifact", "full"], + default="artifact", + help="Test mode: artifact (simple inline test) or full (uses pipeline from repo)", + ) args = parser.parse_args() token = os.environ.get("BUILDKITE_API_TOKEN") @@ -45,29 +51,19 @@ async def main(): print(f"Organization: {config.org_slug}") print(f"Pipeline: {config.pipeline_slug}") print(f"Queue: {args.queue}") + print(f"Mode: {args.mode}") print() - # Simple test config - just print GPU info + # Simple test config test_config = { - "lang": "py", - "mode": "test", - "sources": { - "submission.py": """ -import torch -print(f"CUDA available: {torch.cuda.is_available()}") -if torch.cuda.is_available(): - print(f"GPU: {torch.cuda.get_device_name()}") - print(f"Device count: 
{torch.cuda.device_count()}") -""", - }, - "main": "submission.py", - "tests": [], - "benchmarks": [], + "test": True, + "message": "Hello from e2e test", } if args.dry_run: print("Dry run - config would be:") import json + print(json.dumps(test_config, indent=2)) return @@ -82,11 +78,19 @@ async def update(self, msg): print(f"[UPDATE] {msg}") print("Submitting test job...") + + # Use inline steps for artifact mode (no pipeline config needed in Buildkite) + inline_steps = None + if args.mode == "artifact": + inline_steps = launcher.create_artifact_test_steps(args.queue) + print("Using inline steps (no pipeline config needed)") + result = await launcher._launch( run_id="e2e-test", config=test_config, queue=args.queue, status=SimpleReporter(), + inline_steps=inline_steps, ) print() @@ -98,7 +102,11 @@ async def update(self, msg): print(f"Build URL: {result.build_url}") if result.result: import json - print(f"Result: {json.dumps(result.result, indent=2)}") + + print("Downloaded artifact:") + print(json.dumps(result.result, indent=2)) + else: + print("No artifact downloaded (result.json not found or download failed)") # Also test queue status print() @@ -110,6 +118,9 @@ async def update(self, msg): for agent in status.get("agents", []): print(f" - {agent['name']}: {agent['state']} (busy={agent['busy']})") + # Exit with appropriate code + sys.exit(0 if result.success else 1) + if __name__ == "__main__": asyncio.run(main()) From 59de25dc0fc412d89ec3961ef7d8b6cae4893955 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 16:37:34 -0800 Subject: [PATCH 12/27] Fix artifact download to follow S3 redirects Buildkite returns a 302 redirect to S3 for artifact downloads. The auth header shouldn't be forwarded to S3, so we now: 1. Request with follow_redirects=False 2. Extract the S3 URL from the Location header 3. Fetch from S3 with a clean client Also update test pipeline to write result.json artifact. 
--- deployment/buildkite/pipeline-test-docker.yml | 27 +++++++++++++++++-- src/libkernelbot/launchers/buildkite.py | 16 ++++++++--- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/deployment/buildkite/pipeline-test-docker.yml b/deployment/buildkite/pipeline-test-docker.yml index e9c9a360..bba1c955 100644 --- a/deployment/buildkite/pipeline-test-docker.yml +++ b/deployment/buildkite/pipeline-test-docker.yml @@ -1,5 +1,5 @@ # Simple Docker test pipeline for Buildkite -# Paste this into your pipeline settings to test Docker + GPU isolation +# Paste this into your pipeline settings to test Docker + GPU isolation + artifacts steps: - label: ":whale: Docker GPU Test" @@ -10,11 +10,13 @@ steps: - docker#v5.11.0: image: "nvidia/cuda:12.4.0-runtime-ubuntu22.04" always-pull: true - runtime: nvidia + gpus: "all" propagate-environment: true environment: - NVIDIA_VISIBLE_DEVICES - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID # Resource constraints from environment hook cpus: "${KERNELBOT_CPUS:-8}" memory: "${KERNELBOT_MEMORY:-64g}" @@ -23,6 +25,7 @@ steps: echo "=== Environment ===" echo "NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES" echo "CUDA_VISIBLE_DEVICES=$$CUDA_VISIBLE_DEVICES" + echo "KERNELBOT_RUN_ID=$$KERNELBOT_RUN_ID" echo "" echo "=== GPU Info ===" nvidia-smi @@ -32,5 +35,25 @@ steps: echo "" echo "=== Memory Info ===" free -h + echo "" + echo "=== Creating result.json ===" + cat > result.json << JSONEOF + { + "success": true, + "error": "", + "runs": {}, + "system": { + "gpu_name": "$$NVIDIA_VISIBLE_DEVICES", + "cuda_version": "12.4", + "python_version": "N/A" + } + } + JSONEOF + cat result.json + echo "" + echo "=== Done ===" + + artifact_paths: + - "result.json" timeout_in_minutes: 5 diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py index 3dd44394..0b654161 100644 --- a/src/libkernelbot/launchers/buildkite.py +++ b/src/libkernelbot/launchers/buildkite.py @@ -296,9 +296,19 @@ async 
def _download_result(self, build: dict) -> dict[str, Any] | None: for artifact in artifacts: if artifact.get("filename") == "result.json": download_url = artifact.get("download_url") - result_resp = await client.get(download_url) - result_resp.raise_for_status() - return result_resp.json() + # Buildkite returns a 302 redirect to S3 + # We need to follow it without the auth header + result_resp = await client.get(download_url, follow_redirects=False) + if result_resp.status_code == 302: + # Get the redirect URL and fetch without auth + s3_url = result_resp.headers.get("location") + async with httpx.AsyncClient(timeout=30.0) as s3_client: + result_resp = await s3_client.get(s3_url) + result_resp.raise_for_status() + return result_resp.json() + else: + result_resp.raise_for_status() + return result_resp.json() except Exception as e: logger.error(f"Failed to download artifacts: {e}") From c1596b5b6539f119ac88c6e9c098de66a126cd17 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 16:39:56 -0800 Subject: [PATCH 13/27] Update buildkite.md with E2E workflow documentation - Add read_artifacts to required API token scopes - Add S3 redirect handling to key decisions - Add complete E2E workflow diagram and explanation - Include verified test output showing successful artifact download - Document the 9-step flow from submission to result retrieval --- SKILLS/buildkite.md | 98 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 3 deletions(-) diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md index 4818d040..e3dce853 100644 --- a/SKILLS/buildkite.md +++ b/SKILLS/buildkite.md @@ -63,9 +63,10 @@ Buildkite provides a parallel infrastructure for onboarding arbitrary GPU vendor ### API Token Permissions The API token needs these scopes: -- `read_builds` -- `write_builds` -- `read_agents` (optional, for queue status) +- `read_builds` - Poll build status +- `write_builds` - Create/trigger builds +- `read_artifacts` - Download result.json 
artifact +- `read_agents` (optional) - Check queue status ## Vendor Node Setup @@ -413,3 +414,94 @@ This is calculated in the environment hook as: 4. **One agent per GPU** - Each agent has its own build path and GPU assignment 5. **HTTPS for git** - Avoids SSH key issues on buildkite-agent user 6. **Queue must exist first** - Create queue in Buildkite UI before agents can connect +7. **Follow S3 redirects for artifacts** - Buildkite returns 302 to S3; must fetch without auth header + +## E2E Workflow (Verified Working) + +The complete end-to-end flow for submitting jobs and retrieving results: + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Your Backend │────▶│ Buildkite │────▶│ GPU Runner │ +│ │ │ Cloud │ │ (Self-hosted) │ +│ BuildkiteLauncher │ │ │ │ +│ ._launch() │ │ Routes to │ │ Runs Docker │ +│ │ │ idle agent │ │ container │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + │ │ │ + │ 1. POST /builds │ │ + │ (payload encoded) │ │ + │──────────────────────▶│ │ + │ │ 2. Dispatch job │ + │ │──────────────────────▶│ + │ │ │ + │ │ │ 3. Run evaluation + │ │ │ Write result.json + │ │ │ + │ │ 4. Upload artifact │ + │ │◀──────────────────────│ + │ │ │ + │ 5. Poll status │ │ + │◀─────────────────────▶│ │ + │ │ │ + │ 6. Download artifact │ │ + │ (via S3 redirect) │ │ + │◀──────────────────────│ │ + │ │ │ + ▼ │ │ + Return result │ │ +``` + +### Verified Test Output + +``` +=== Buildkite E2E Test === +Organization: mark-saroufim +Pipeline: kernelbot +Queue: test +Mode: artifact + +Submitting test job... +[UPDATE] Build created: [28] +[UPDATE] Build completed: [28] + +=== Result === +Success: True +Build URL: https://buildkite.com/mark-saroufim/kernelbot/builds/28 +Downloaded artifact: +{ + "success": true, + "error": "", + "runs": {}, + "system": { + "gpu_name": "test", + "cuda_version": "12.4", + "python_version": "N/A" + } +} + +=== Queue Status === +Queue: test +Total agents: 0 +Idle agents: 0 +``` + +### How It Works + +1. 
**BuildkiteLauncher._launch()** encodes config as base64+zlib compressed payload +2. **POST to Buildkite API** creates a build with env vars (KERNELBOT_PAYLOAD, KERNELBOT_RUN_ID) +3. **Buildkite routes** the job to an idle agent in the specified queue +4. **Agent runs Docker container** with GPU isolation (NVIDIA_VISIBLE_DEVICES set by environment hook) +5. **Container writes result.json** to working directory +6. **Buildkite uploads artifact** to S3 +7. **BuildkiteLauncher polls** until build completes +8. **Downloads result.json** by following S3 redirect (without auth header) +9. **Returns parsed result** to caller + +### Running the E2E Test + +```bash +BUILDKITE_API_TOKEN= uv run python tests/e2e_buildkite_test.py \ + --org \ + --queue test +``` From 78c61c2bc864c3b9e1f0ac61f6add6104eac046f Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 16:46:18 -0800 Subject: [PATCH 14/27] Add Buildkite integration tests and real evaluation support - Add L40S_BK GPU type for test infrastructure - Create pipeline-eval.yml for running real kernel evaluations - Create tests/test_buildkite.py with integration tests matching modal/github pattern - Update submit_buildkite_job.py to support --eval flag for real evaluations - Add queue mapping for L40S_BK -> test queue --- deployment/buildkite/pipeline-eval.yml | 69 ++++++++++ scripts/submit_buildkite_job.py | 173 +++++++++++++++++++++++ src/libkernelbot/consts.py | 1 + src/libkernelbot/launchers/buildkite.py | 1 + tests/test_buildkite.py | 174 ++++++++++++++++++++++++ 5 files changed, 418 insertions(+) create mode 100644 deployment/buildkite/pipeline-eval.yml create mode 100755 scripts/submit_buildkite_job.py create mode 100644 tests/test_buildkite.py diff --git a/deployment/buildkite/pipeline-eval.yml b/deployment/buildkite/pipeline-eval.yml new file mode 100644 index 00000000..17ce8141 --- /dev/null +++ b/deployment/buildkite/pipeline-eval.yml @@ -0,0 +1,69 @@ +# Kernelbot Evaluation Pipeline for Buildkite +# 
This pipeline runs real kernel evaluations by cloning the repo and running the evaluator +# +# To use this pipeline: +# 1. Go to your Buildkite pipeline settings +# 2. Paste this YAML in the Steps section +# 3. Submit jobs via BuildkiteLauncher + +steps: + - label: ":rocket: Kernel Evaluation" + agents: + queue: "${KERNELBOT_QUEUE:-test}" + + plugins: + - docker#v5.11.0: + image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" + always-pull: false + gpus: "all" + propagate-environment: true + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + + command: | + set -e + + echo "=== Environment ===" + echo "NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES" + echo "KERNELBOT_RUN_ID=$$KERNELBOT_RUN_ID" + nvidia-smi -L + + echo "" + echo "=== Installing Dependencies ===" + apt-get update -qq + apt-get install -y -qq python3.11 python3.11-venv python3-pip git > /dev/null + + # Create and activate virtual environment + python3.11 -m venv /tmp/venv + source /tmp/venv/bin/activate + + # Install PyTorch and kernelbot + pip install --quiet torch triton numpy scipy + + # Clone kernelbot and install + git clone --depth 1 https://github.com/gpu-mode/kernelbot.git /tmp/kernelbot + cd /tmp/kernelbot + pip install --quiet -e . + + echo "" + echo "=== Running Evaluation ===" + python src/runners/buildkite-runner.py + + echo "" + echo "=== Done ===" + + artifact_paths: + - "result.json" + - "profile_data/**/*" + + timeout_in_minutes: 30 + + retry: + automatic: + - exit_status: -1 + limit: 2 diff --git a/scripts/submit_buildkite_job.py b/scripts/submit_buildkite_job.py new file mode 100755 index 00000000..e4d5a573 --- /dev/null +++ b/scripts/submit_buildkite_job.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +"""Submit a test job to Buildkite and download the result. 
+ +Usage: + # Simple test (just writes dummy result.json): + BUILDKITE_API_TOKEN=xxx python scripts/submit_buildkite_job.py + + # Real evaluation with vectoradd example: + BUILDKITE_API_TOKEN=xxx python scripts/submit_buildkite_job.py --eval vectoradd_py + + # Real evaluation with identity example: + BUILDKITE_API_TOKEN=xxx python scripts/submit_buildkite_job.py --eval identity_py +""" + +import argparse +import asyncio +import json +import os +import sys +from pathlib import Path + +# Add src to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + +from libkernelbot.consts import BuildkiteGPU, SubmissionMode +from libkernelbot.launchers.buildkite import BuildkiteConfig, BuildkiteLauncher +from libkernelbot.task import build_task_config, make_task_definition + + +class SimpleReporter: + async def push(self, msg): + print(f"[STATUS] {msg}") + + async def update(self, msg): + print(f"[UPDATE] {msg}") + + +async def main(): + parser = argparse.ArgumentParser(description="Submit a test job to Buildkite") + parser.add_argument("--org", default="mark-saroufim", help="Buildkite org slug") + parser.add_argument("--pipeline", default="kernelbot", help="Pipeline slug") + parser.add_argument("--queue", default="test", help="Queue name") + parser.add_argument("--run-id", default="manual-test", help="Run ID for this job") + parser.add_argument( + "--eval", + type=str, + default=None, + help="Run real evaluation with example (e.g., 'vectoradd_py', 'identity_py')", + ) + parser.add_argument( + "--submission", + type=str, + default=None, + help="Submission file to use (default: auto-detect)", + ) + args = parser.parse_args() + + token = os.environ.get("BUILDKITE_API_TOKEN") + if not token: + print("ERROR: Set BUILDKITE_API_TOKEN environment variable") + sys.exit(1) + + print("=== Buildkite Job Submission ===") + print(f"Org: {args.org}") + print(f"Pipeline: {args.pipeline}") + print(f"Queue: {args.queue}") + print(f"Run ID: {args.run_id}") + + launcher 
= BuildkiteLauncher( + BuildkiteConfig( + org_slug=args.org, + pipeline_slug=args.pipeline, + api_token=token, + ) + ) + + if args.eval: + # Real evaluation mode + print(f"Eval: {args.eval}") + print() + + project_root = Path(__file__).parent.parent + task_path = project_root / "examples" / args.eval + + if not task_path.exists(): + print(f"ERROR: Example '{args.eval}' not found at {task_path}") + print("Available examples:") + for p in (project_root / "examples").iterdir(): + if p.is_dir() and (p / "task.yml").exists(): + print(f" - {p.name}") + sys.exit(1) + + task_definition = make_task_definition(task_path) + + # Find submission file + if args.submission: + submission_file = task_path / args.submission + else: + # Try common submission names + for name in ["submission_triton.py", "submission.py", "submission_cuda_inline.py"]: + if (task_path / name).exists(): + submission_file = task_path / name + break + else: + print(f"ERROR: No submission file found in {task_path}") + sys.exit(1) + + print(f"Task: {task_path.name}") + print(f"Submission: {submission_file.name}") + + submission_content = submission_file.read_text() + + config = build_task_config( + task=task_definition.task, + submission_content=submission_content, + arch=0, + mode=SubmissionMode.TEST, + ) + + gpu_type = BuildkiteGPU.L40S_BK + result = await launcher.run_submission(config, gpu_type, SimpleReporter()) + + print() + print("=== Result ===") + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + print(f"System: {result.system}") + if result.runs: + for name, run in result.runs.items(): + print(f"\n{name}:") + print(f" Passed: {run.run.passed if run.run else 'N/A'}") + print(f" Duration: {run.run.duration if run.run else 'N/A'}s") + if run.run and run.run.result: + print(f" Result: {run.run.result}") + + else: + # Simple test mode + print("Mode: Simple test (no evaluation)") + print() + + config = { + "test": True, + "message": "Hello from manual test", + 
"run_id": args.run_id, + } + + print("Submitting job...") + result = await launcher._launch( + run_id=args.run_id, + config=config, + queue=args.queue, + status=SimpleReporter(), + ) + + print() + print("=== Result ===") + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + if result.build_url: + print(f"Build URL: {result.build_url}") + if result.result: + print("Downloaded artifact:") + print(json.dumps(result.result, indent=2)) + else: + print("No artifact downloaded") + + sys.exit(0 if (result.success if hasattr(result, "success") else True) else 1) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index b9f30d0e..0f518d3a 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -38,6 +38,7 @@ class BuildkiteGPU(Enum): B200_BK = "B200_BK" H100_BK = "H100_BK" MI300_BK = "MI300_BK" + L40S_BK = "L40S_BK" # Test infrastructure @dataclasses.dataclass diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py index 0b654161..c65cab27 100644 --- a/src/libkernelbot/launchers/buildkite.py +++ b/src/libkernelbot/launchers/buildkite.py @@ -98,6 +98,7 @@ def _get_queue_for_gpu(self, gpu_type: GPU) -> str: "B200_BK": "b200", "H100_BK": "h100", "MI300_BK": "mi300", + "L40S_BK": "test", # Test infrastructure } return queue_map.get(gpu_type.name, gpu_type.name.lower().replace("_bk", "")) diff --git a/tests/test_buildkite.py b/tests/test_buildkite.py new file mode 100644 index 00000000..cb051f86 --- /dev/null +++ b/tests/test_buildkite.py @@ -0,0 +1,174 @@ +"""Integration tests for Buildkite launcher. + +Usage: + BUILDKITE_API_TOKEN=xxx pytest tests/test_buildkite.py -v -m integration + +These tests require: +1. A Buildkite account with a 'kernelbot' pipeline +2. A self-hosted runner in the 'test' queue +3. 
The pipeline configured with deployment/buildkite/pipeline-eval.yml +""" + +import os +from pathlib import Path + +import pytest + +from libkernelbot.consts import BuildkiteGPU, SubmissionMode +from libkernelbot.launchers.buildkite import BuildkiteConfig, BuildkiteLauncher +from libkernelbot.report import RunProgressReporter +from libkernelbot.task import build_task_config, make_task_definition + + +class MockProgressReporter(RunProgressReporter): + """Test progress reporter that captures messages.""" + + def __init__(self, title: str = "Test Buildkite Run"): + super().__init__(title) + self.messages = [] + self.updates = [] + + async def push(self, message: str): + self.messages.append(message) + print(f"[STATUS] {message}") + + async def update(self, message: str): + self.updates.append(message) + print(f"[UPDATE] {message}") + + +@pytest.fixture(scope="session") +def buildkite_config(): + """Get Buildkite configuration from environment.""" + token = os.getenv("BUILDKITE_API_TOKEN") + if not token: + pytest.skip("Buildkite integration tests require BUILDKITE_API_TOKEN environment variable") + + org = os.getenv("BUILDKITE_ORG", "mark-saroufim") + pipeline = os.getenv("BUILDKITE_PIPELINE", "kernelbot") + + return BuildkiteConfig( + org_slug=org, + pipeline_slug=pipeline, + api_token=token, + ) + + +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize("gpu_type", [BuildkiteGPU.L40S_BK]) +async def test_buildkite_launcher_python_script( + project_root: Path, buildkite_config: BuildkiteConfig, gpu_type: BuildkiteGPU +): + """ + Test BuildkiteLauncher with a real Python script. + Uses the vectoradd_py example to verify end-to-end evaluation. 
+ """ + launcher = BuildkiteLauncher(buildkite_config) + reporter = MockProgressReporter("Buildkite Integration Test") + + # Load the vectoradd_py task + task_path = project_root / "examples" / "vectoradd_py" + if not task_path.exists(): + pytest.skip("examples/vectoradd_py not found - skipping Buildkite integration test") + + task_definition = make_task_definition(task_path) + submission_content = (task_path / "submission_triton.py").read_text() + + config = build_task_config( + task=task_definition.task, + submission_content=submission_content, + arch=0, # L40S uses Ada Lovelace architecture + mode=SubmissionMode.TEST, + ) + + result = await launcher.run_submission(config, gpu_type, reporter) + + # Basic structure and success + assert result.success, f"Expected successful run, got: {result.error}" + assert result.error == "" + assert isinstance(result.runs, dict) + + # System info + assert "L40S" in result.system.gpu or "NVIDIA" in result.system.gpu + assert "Linux" in result.system.platform + + # Test run structure + assert "test" in result.runs + test_run = result.runs["test"] + + # Run needs to succeed + assert test_run.run.success is True + assert test_run.run.passed is True + assert test_run.run.exit_code == 0 + assert test_run.run.duration > 0 + + # Test results + assert test_run.run.result["check"] == "pass" + test_count = int(test_run.run.result["test-count"]) + assert test_count >= 1 + + # Sanity check for timings + assert test_run.start < test_run.end + + # Check reporter messages + assert any("Buildkite" in msg or "queue" in msg for msg in reporter.messages) + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_buildkite_launcher_failing_script( + project_root: Path, buildkite_config: BuildkiteConfig +): + """ + Test BuildkiteLauncher with a script designed to fail. + Ensures we don't pass incorrect submissions. 
+ """ + launcher = BuildkiteLauncher(buildkite_config) + reporter = MockProgressReporter("Buildkite Failing Test") + gpu_type = BuildkiteGPU.L40S_BK + + # Load the identity_py task + task_path = project_root / "examples" / "identity_py" + if not task_path.exists(): + pytest.skip("examples/identity_py not found - skipping Buildkite integration test") + + task_definition = make_task_definition(task_path) + # Use a cheating script that should fail + submission_content = (task_path / "cheat-rng.py").read_text() + + task_definition.task.seed = 653212 + config = build_task_config( + task=task_definition.task, + submission_content=submission_content, + arch=0, + mode=SubmissionMode.LEADERBOARD, + ) + + result = await launcher.run_submission(config, gpu_type, reporter) + + # The workflow should run successfully + assert result.success, f"Expected successful workflow run, got: {result.error}" + assert result.error == "" + + # But the actual test or benchmark should fail + test_passed = result.runs.get("test", {}).run.passed if "test" in result.runs else True + benchmark_passed = result.runs.get("benchmark", {}).run.passed if "benchmark" in result.runs else True + + assert not (test_passed and benchmark_passed), "Expected at least one run to fail for cheating script" + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_buildkite_queue_status(buildkite_config: BuildkiteConfig): + """Test that we can query queue status.""" + launcher = BuildkiteLauncher(buildkite_config) + + status = await launcher.get_queue_status("test") + + assert "queue" in status + assert status["queue"] == "test" + assert "total" in status + assert "idle" in status + assert "agents" in status + assert isinstance(status["agents"], list) From 8ebdcbb218165d3930b38bf16e82d593b82eea6c Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 17:12:37 -0800 Subject: [PATCH 15/27] Fix artifact upload by copying result to mounted volume - Changed workdir to /workdir (the mounted checkout 
directory) - Copy result.json to /workdir before container exits - Use uv for Python package management - Clone from buildkite-infrastructure branch --- deployment/buildkite/pipeline-eval.yml | 50 +++++++++++++------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/deployment/buildkite/pipeline-eval.yml b/deployment/buildkite/pipeline-eval.yml index 17ce8141..4b5c2bd3 100644 --- a/deployment/buildkite/pipeline-eval.yml +++ b/deployment/buildkite/pipeline-eval.yml @@ -1,10 +1,5 @@ # Kernelbot Evaluation Pipeline for Buildkite -# This pipeline runs real kernel evaluations by cloning the repo and running the evaluator -# -# To use this pipeline: -# 1. Go to your Buildkite pipeline settings -# 2. Paste this YAML in the Steps section -# 3. Submit jobs via BuildkiteLauncher +# Mirrors GitHub runner: clone repo, install deps, run evaluation steps: - label: ":rocket: Kernel Evaluation" @@ -17,6 +12,7 @@ steps: always-pull: false gpus: "all" propagate-environment: true + shell: ["/bin/bash", "-e", "-c"] environment: - NVIDIA_VISIBLE_DEVICES - CUDA_VISIBLE_DEVICES @@ -24,46 +20,48 @@ steps: - KERNELBOT_RUN_ID cpus: "${KERNELBOT_CPUS:-8}" memory: "${KERNELBOT_MEMORY:-64g}" + workdir: /workdir command: | set -e echo "=== Environment ===" - echo "NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES" - echo "KERNELBOT_RUN_ID=$$KERNELBOT_RUN_ID" + echo "NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES" + echo "KERNELBOT_RUN_ID=$KERNELBOT_RUN_ID" nvidia-smi -L echo "" - echo "=== Installing Dependencies ===" + echo "=== Installing System Dependencies ===" apt-get update -qq - apt-get install -y -qq python3.11 python3.11-venv python3-pip git > /dev/null + apt-get install -y -qq curl ca-certificates git - # Create and activate virtual environment - python3.11 -m venv /tmp/venv - source /tmp/venv/bin/activate + echo "" + echo "=== Installing uv ===" + curl -LsSf https://astral.sh/uv/install.sh | sh + . 
/root/.local/bin/env - # Install PyTorch and kernelbot - pip install --quiet torch triton numpy scipy + echo "" + echo "=== Cloning Repository ===" + git clone --depth 1 --branch buildkite-infrastructure https://github.com/gpu-mode/kernelbot.git /opt/kernelbot + cd /opt/kernelbot - # Clone kernelbot and install - git clone --depth 1 https://github.com/gpu-mode/kernelbot.git /tmp/kernelbot - cd /tmp/kernelbot - pip install --quiet -e . + echo "" + echo "=== Installing Dependencies ===" + uv sync echo "" echo "=== Running Evaluation ===" - python src/runners/buildkite-runner.py + uv run python src/runners/buildkite-runner.py echo "" + echo "=== Copying Artifacts ===" + cp result.json /workdir/result.json + cp -r profile_data /workdir/profile_data 2>/dev/null || true + echo "=== Done ===" artifact_paths: - "result.json" - - "profile_data/**/*" + - "profile_data/*" timeout_in_minutes: 30 - - retry: - automatic: - - exit_status: -1 - limit: 2 From 690716c2e5373823f42c1800133d7c6ac9a706eb Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 17:14:21 -0800 Subject: [PATCH 16/27] Use buildkite-infrastructure branch for pipeline config --- src/libkernelbot/launchers/buildkite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py index c65cab27..b8c37bfe 100644 --- a/src/libkernelbot/launchers/buildkite.py +++ b/src/libkernelbot/launchers/buildkite.py @@ -180,7 +180,7 @@ async def _launch( build_data = { "commit": "HEAD", - "branch": "main", + "branch": "buildkite-infrastructure", "message": f"Kernel eval: {run_id}", "env": { "KERNELBOT_RUN_ID": run_id, From cd1f25ed2e016b3fb2b58ea4e3810272c403d341 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 17:26:04 -0800 Subject: [PATCH 17/27] Update pipeline-eval.yml with working configuration - Add PyTorch installation step - Use workdir: /workdir for artifact accessibility - Copy result.json to workdir 
before container exits - Activate venv before running evaluation --- deployment/buildkite/pipeline-eval.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/deployment/buildkite/pipeline-eval.yml b/deployment/buildkite/pipeline-eval.yml index 4b5c2bd3..8bfa573b 100644 --- a/deployment/buildkite/pipeline-eval.yml +++ b/deployment/buildkite/pipeline-eval.yml @@ -49,9 +49,14 @@ steps: echo "=== Installing Dependencies ===" uv sync + echo "" + echo "=== Installing PyTorch ===" + uv pip install torch triton numpy --index-url https://download.pytorch.org/whl/cu124 + echo "" echo "=== Running Evaluation ===" - uv run python src/runners/buildkite-runner.py + . .venv/bin/activate + python src/runners/buildkite-runner.py echo "" echo "=== Copying Artifacts ===" From 29f12397bc18f96e4331c4438c7711bf0a14b064 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 19:19:57 -0800 Subject: [PATCH 18/27] Update buildkite.md with working E2E evaluation docs - Add real evaluation job submission instructions - Add integration test documentation - Document operational model (no pre-built Docker image) - Clarify when admin action is/isn't needed - Note shared evaluation logic across all runners --- SKILLS/buildkite.md | 105 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md index e3dce853..785ffe01 100644 --- a/SKILLS/buildkite.md +++ b/SKILLS/buildkite.md @@ -505,3 +505,108 @@ BUILDKITE_API_TOKEN= uv run python tests/e2e_buildkite_test.py \ --org \ --queue test ``` + +## Real Evaluation Jobs + +### Submit a Real Kernel Evaluation + +```bash +BUILDKITE_API_TOKEN= uv run python scripts/submit_buildkite_job.py --eval vectoradd_py +``` + +This runs the full evaluation pipeline on actual GPU hardware and returns real benchmark results: + +``` +=== Result === +Success: True +System: SystemInfo(gpu='NVIDIA L40S', device_count=1, cpu='AMD EPYC 9254 24-Core Processor', 
runtime='CUDA', platform='Linux-5.15.0-164-generic-x86_64-with-glibc2.35', torch='2.6.0+cu124', hostname='...') + +test: + Passed: True + Duration: 3.18s + Result: {'test-count': '5', 'test.0.status': 'pass', 'test.1.status': 'pass', ...} +``` + +### Integration Tests + +Run the full integration test suite: + +```bash +BUILDKITE_API_TOKEN= uv run pytest tests/test_buildkite.py -v -m integration +``` + +Tests include: +- `test_buildkite_launcher_python_script` - Real evaluation with vectoradd_py +- `test_buildkite_launcher_failing_script` - Verifies cheating scripts correctly fail +- `test_buildkite_queue_status` - Tests agent queue API + +### Available Examples + +Any example in the `examples/` directory works: + +```bash +# List available examples +ls examples/ + +# Run a specific example +BUILDKITE_API_TOKEN=xxx uv run python scripts/submit_buildkite_job.py --eval identity_py +``` + +## Operational Model + +### No Pre-Built Docker Image (Current Setup) + +The pipeline does **NOT** use a pre-built Docker image. Each job: + +1. Uses base `nvidia/cuda:12.4.0-devel-ubuntu22.04` image +2. Installs dependencies at runtime: + - `uv` for Python package management + - Clones kernelbot repo from `buildkite-infrastructure` branch + - Runs `uv sync` to install project dependencies + - Runs `uv pip install torch triton numpy` for GPU packages +3. Runs the evaluation + +**Advantages:** +- No Dockerfile to maintain or rebuild +- No image registry to manage +- Always uses latest code from repo +- **No admin action needed** after code updates + +**Trade-off:** +- Slightly longer job startup time (~30-40 seconds for dependency installation) + +### When Admin Action Is Needed + +The only time the machine admin needs to run anything is: + +1. **Initial setup**: Run `setup-node-simple.sh` once when onboarding a new node +2. **Buildkite agent updates**: If Buildkite releases a new agent version (rare) +3. **System-level changes**: NVIDIA driver updates, OS updates, etc. 
+ +Code changes to kernelbot require **no admin action** - the pipeline clones fresh code each run. + +### Shared Evaluation Logic + +All runners (GitHub, Modal, Buildkite) use the exact same evaluation engine: + +```python +# src/runners/buildkite-runner.py:49 +from libkernelbot.run_eval import run_config +result = run_config(config) +``` + +This means: +- Any problem that works on GitHub/Modal works on Buildkite +- Same result format (`FullResult`) +- Same test/benchmark logic +- Same correctness checking + +## Current Branch + +The Buildkite infrastructure is on the `buildkite-infrastructure` branch. The pipeline clones from this branch: + +```yaml +git clone --depth 1 --branch buildkite-infrastructure https://github.com/gpu-mode/kernelbot.git +``` + +Once merged to `main`, update the pipeline config to use `main` branch. From 0bcb540eaefa7c3207ccd2741b192084b1eee700 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 19:24:41 -0800 Subject: [PATCH 19/27] Add Buildkite backend integration and pre-built image support Backend integration: - Register BuildkiteLauncher in create_backend() when BUILDKITE_API_TOKEN is set - Add BUILDKITE_API_TOKEN, BUILDKITE_ORG, BUILDKITE_PIPELINE env vars - Results now flow to database same as GitHub/Modal Pre-built Docker image for fast cold starts: - Add Dockerfile with all dependencies pre-installed - Add build-image.sh script for local image building - Add pipeline-fast.yml for using pre-built image (~5s vs ~40s cold start) - Update setup-node-simple.sh with BUILD_IMAGE=true option Update skills doc with operational model for both approaches --- SKILLS/buildkite.md | 52 ++++++++++++++++------ deployment/buildkite/Dockerfile | 46 ++++++++++---------- deployment/buildkite/build-image.sh | 43 ++++++++++++++++++ deployment/buildkite/pipeline-fast.yml | 53 +++++++++++++++++++++++ deployment/buildkite/setup-node-simple.sh | 24 ++++++++++ src/kernelbot/env.py | 5 +++ src/kernelbot/main.py | 14 +++++- 7 files changed, 200 
insertions(+), 37 deletions(-) create mode 100755 deployment/buildkite/build-image.sh create mode 100644 deployment/buildkite/pipeline-fast.yml diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md index 785ffe01..1718e3a5 100644 --- a/SKILLS/buildkite.md +++ b/SKILLS/buildkite.md @@ -554,16 +554,15 @@ BUILDKITE_API_TOKEN=xxx uv run python scripts/submit_buildkite_job.py --eval ide ## Operational Model -### No Pre-Built Docker Image (Current Setup) +### Option 1: No Pre-Built Image (Current Default) -The pipeline does **NOT** use a pre-built Docker image. Each job: +The pipeline installs dependencies at runtime. Each job: 1. Uses base `nvidia/cuda:12.4.0-devel-ubuntu22.04` image -2. Installs dependencies at runtime: +2. Installs dependencies at runtime (~30-40 seconds): - `uv` for Python package management - - Clones kernelbot repo from `buildkite-infrastructure` branch - - Runs `uv sync` to install project dependencies - - Runs `uv pip install torch triton numpy` for GPU packages + - Clones kernelbot repo + - Runs `uv sync` and `uv pip install torch triton numpy` 3. Runs the evaluation **Advantages:** @@ -573,17 +572,44 @@ The pipeline does **NOT** use a pre-built Docker image. Each job: - **No admin action needed** after code updates **Trade-off:** -- Slightly longer job startup time (~30-40 seconds for dependency installation) +- Slower cold starts (~40 seconds) -### When Admin Action Is Needed +### Option 2: Pre-Built Image (Fast Cold Starts) + +For faster cold starts (~5 seconds), build the Docker image on each node: -The only time the machine admin needs to run anything is: +```bash +# During initial setup: +sudo BUILDKITE_AGENT_TOKEN=xxx GPU_TYPE=test BUILD_IMAGE=true ./deployment/buildkite/setup-node-simple.sh -1. **Initial setup**: Run `setup-node-simple.sh` once when onboarding a new node -2. **Buildkite agent updates**: If Buildkite releases a new agent version (rare) -3. **System-level changes**: NVIDIA driver updates, OS updates, etc. 
+# Or build separately: +./deployment/buildkite/build-image.sh +``` + +Then update the Buildkite pipeline config to use the local image: +```yaml +image: "kernelbot:latest" +``` + +**When to rebuild the image:** +- When dependencies change (new PyTorch version, new packages) +- When you want the latest kernelbot code baked in +- NOT needed for problem/task changes (those come via config) + +**Rebuild command:** +```bash +./deployment/buildkite/build-image.sh +``` + +### When Admin Action Is Needed -Code changes to kernelbot require **no admin action** - the pipeline clones fresh code each run. +| Scenario | Action Required | +|----------|-----------------| +| Code changes (no deps) | None - pipeline clones fresh code | +| Dependency changes | Rebuild image: `./build-image.sh` | +| Initial node setup | Run `setup-node-simple.sh` once | +| NVIDIA driver updates | May need to rebuild image | +| Buildkite agent updates | Rare - Buildkite handles this | ### Shared Evaluation Logic diff --git a/deployment/buildkite/Dockerfile b/deployment/buildkite/Dockerfile index 3127a3a5..1a31aec2 100644 --- a/deployment/buildkite/Dockerfile +++ b/deployment/buildkite/Dockerfile @@ -1,4 +1,5 @@ # Kernelbot evaluation image +# Pre-built with all dependencies for fast cold starts FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 ENV DEBIAN_FRONTEND=noninteractive @@ -6,37 +7,36 @@ ENV PYTHONUNBUFFERED=1 # System packages RUN apt-get update && apt-get install -y --no-install-recommends \ - python3.11 \ - python3.11-dev \ - python3.11-venv \ - python3-pip \ - git \ - wget \ curl \ + ca-certificates \ + git \ build-essential \ ninja-build \ cmake \ && rm -rf /var/lib/apt/lists/* -# Set Python 3.11 as default -RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 +# Install uv +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +# Clone and install 
kernelbot +WORKDIR /opt/kernelbot +RUN git clone --depth 1 --branch buildkite-infrastructure https://github.com/gpu-mode/kernelbot.git . + +# Install dependencies with uv +RUN uv sync -# Upgrade pip -RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel +# Install PyTorch and GPU packages +RUN uv pip install torch triton numpy --index-url https://download.pytorch.org/whl/cu124 -# PyTorch + CUDA -RUN pip install --no-cache-dir \ - torch==2.4.0 \ - triton \ - numpy \ - scipy +# Ensure venv is activated for any commands +ENV VIRTUAL_ENV=/opt/kernelbot/.venv +ENV PATH="$VIRTUAL_ENV/bin:$PATH" -# Copy kernelbot -WORKDIR /app -COPY pyproject.toml . -COPY src/ src/ -RUN pip install --no-cache-dir -e . +# Verify installation +RUN python -c "import torch; print(f'PyTorch {torch.__version__}')" && \ + python -c "import triton; print(f'Triton installed')" && \ + python -c "from libkernelbot.run_eval import run_config; print('kernelbot installed')" # Default command -CMD ["python", "/app/src/runners/buildkite-runner.py"] +CMD ["python", "/opt/kernelbot/src/runners/buildkite-runner.py"] diff --git a/deployment/buildkite/build-image.sh b/deployment/buildkite/build-image.sh new file mode 100755 index 00000000..af718450 --- /dev/null +++ b/deployment/buildkite/build-image.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Build the kernelbot Docker image locally on a GPU node +# Usage: ./build-image.sh [--push] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" + +IMAGE_NAME="${KERNELBOT_IMAGE:-kernelbot:latest}" +BRANCH="${KERNELBOT_BRANCH:-buildkite-infrastructure}" + +echo "=== Building Kernelbot Image ===" +echo "Image: $IMAGE_NAME" +echo "Branch: $BRANCH" +echo "" + +# Update Dockerfile to use correct branch +sed -i "s|--branch [a-zA-Z0-9_-]*|--branch $BRANCH|g" "$SCRIPT_DIR/Dockerfile" 2>/dev/null || \ + sed -i '' "s|--branch [a-zA-Z0-9_-]*|--branch $BRANCH|g" "$SCRIPT_DIR/Dockerfile" + +echo "Building image..." +docker build -t "$IMAGE_NAME" -f "$SCRIPT_DIR/Dockerfile" "$REPO_ROOT" + +echo "" +echo "=== Build Complete ===" +echo "Image: $IMAGE_NAME" +docker images "$IMAGE_NAME" + +# Optional: push to registry +if [[ "${1:-}" == "--push" ]]; then + REGISTRY="${KERNELBOT_REGISTRY:-ghcr.io/gpu-mode}" + REMOTE_IMAGE="$REGISTRY/kernelbot:latest" + echo "" + echo "Pushing to $REMOTE_IMAGE..." + docker tag "$IMAGE_NAME" "$REMOTE_IMAGE" + docker push "$REMOTE_IMAGE" + echo "Pushed: $REMOTE_IMAGE" +fi + +echo "" +echo "To use this image, update your pipeline config:" +echo " image: \"$IMAGE_NAME\"" diff --git a/deployment/buildkite/pipeline-fast.yml b/deployment/buildkite/pipeline-fast.yml new file mode 100644 index 00000000..4d50a3fb --- /dev/null +++ b/deployment/buildkite/pipeline-fast.yml @@ -0,0 +1,53 @@ +# Kernelbot Fast Evaluation Pipeline +# Uses pre-built image for fast cold starts (~5s vs ~40s) +# +# Prerequisites: +# 1. Build image on node: ./deployment/buildkite/build-image.sh +# 2. 
Or pull from registry: docker pull ghcr.io/gpu-mode/kernelbot:latest + +steps: + - label: ":rocket: Kernel Evaluation" + agents: + queue: "${KERNELBOT_QUEUE:-test}" + + plugins: + - docker#v5.11.0: + image: "${KERNELBOT_IMAGE:-kernelbot:latest}" + always-pull: false + gpus: "all" + propagate-environment: true + shell: ["/bin/bash", "-e", "-c"] + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + workdir: /workdir + + command: | + set -e + + echo "=== Environment ===" + echo "NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES" + echo "KERNELBOT_RUN_ID=$KERNELBOT_RUN_ID" + nvidia-smi -L + + echo "" + echo "=== Running Evaluation ===" + cd /opt/kernelbot + python src/runners/buildkite-runner.py + + echo "" + echo "=== Copying Artifacts ===" + cp result.json /workdir/result.json + cp -r profile_data /workdir/profile_data 2>/dev/null || true + + echo "=== Done ===" + + artifact_paths: + - "result.json" + - "profile_data/*" + + timeout_in_minutes: 15 diff --git a/deployment/buildkite/setup-node-simple.sh b/deployment/buildkite/setup-node-simple.sh index af78686f..351a8a04 100755 --- a/deployment/buildkite/setup-node-simple.sh +++ b/deployment/buildkite/setup-node-simple.sh @@ -203,3 +203,27 @@ echo ' - label: "GPU Test"' echo ' command: "echo NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES && nvidia-smi -L"' echo ' agents:' echo " queue: \"${GPU_TYPE}\"" + +# === BUILD DOCKER IMAGE (optional) === +if [[ "${BUILD_IMAGE:-}" == "true" ]]; then + echo "" + echo "=== Building Docker Image ===" + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + + if [[ -f "$SCRIPT_DIR/Dockerfile" ]]; then + docker build -t kernelbot:latest -f "$SCRIPT_DIR/Dockerfile" "$SCRIPT_DIR/../.." 
+ echo "Docker image built: kernelbot:latest" + echo "" + echo "To use the fast pipeline, update Buildkite config to use:" + echo " image: \"kernelbot:latest\"" + else + echo "WARNING: Dockerfile not found at $SCRIPT_DIR/Dockerfile" + echo "Clone the repo first: git clone https://github.com/gpu-mode/kernelbot.git" + fi +fi + +echo "" +echo "For faster cold starts, build the Docker image:" +echo " BUILD_IMAGE=true $0" +echo "Or manually:" +echo " ./deployment/buildkite/build-image.sh" diff --git a/src/kernelbot/env.py b/src/kernelbot/env.py index 90dd276c..703f2b3c 100644 --- a/src/kernelbot/env.py +++ b/src/kernelbot/env.py @@ -33,6 +33,11 @@ env.GITHUB_WORKFLOW_BRANCH = os.getenv("GITHUB_WORKFLOW_BRANCH", get_github_branch_name()) env.PROBLEMS_REPO = os.getenv("PROBLEMS_REPO") +# Buildkite-specific constants +env.BUILDKITE_API_TOKEN = os.getenv("BUILDKITE_API_TOKEN") +env.BUILDKITE_ORG = os.getenv("BUILDKITE_ORG", "gpu-mode") +env.BUILDKITE_PIPELINE = os.getenv("BUILDKITE_PIPELINE", "kernelbot") + # Directory that will be used for local problem development. 
env.PROBLEM_DEV_DIR = os.getenv("PROBLEM_DEV_DIR", "examples") diff --git a/src/kernelbot/main.py b/src/kernelbot/main.py index 71736ee0..749a1d56 100644 --- a/src/kernelbot/main.py +++ b/src/kernelbot/main.py @@ -16,7 +16,8 @@ from libkernelbot import consts from libkernelbot.backend import KernelBackend from libkernelbot.background_submission_manager import BackgroundSubmissionManager -from libkernelbot.launchers import GitHubLauncher, ModalLauncher +from libkernelbot.launchers import BuildkiteLauncher, GitHubLauncher, ModalLauncher +from libkernelbot.launchers.buildkite import BuildkiteConfig from libkernelbot.utils import setup_logging logger = setup_logging(__name__) @@ -29,6 +30,17 @@ def create_backend(debug_mode: bool = False) -> KernelBackend: backend.register_launcher( GitHubLauncher(env.GITHUB_REPO, env.GITHUB_TOKEN, env.GITHUB_WORKFLOW_BRANCH) ) + + # Register Buildkite launcher if API token is configured + if env.BUILDKITE_API_TOKEN: + buildkite_config = BuildkiteConfig( + org_slug=env.BUILDKITE_ORG, + pipeline_slug=env.BUILDKITE_PIPELINE, + api_token=env.BUILDKITE_API_TOKEN, + ) + backend.register_launcher(BuildkiteLauncher(buildkite_config)) + logger.info("Buildkite launcher registered") + return backend From c0dfd97f7656438b1e58adaba143b41256dfdda6 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 19:29:58 -0800 Subject: [PATCH 20/27] Add L40S_BK to GPU_TO_SM and clarify env var docs - Add L40S_BK SM arch (89 - Ada Lovelace) to GPU_TO_SM mapping - Document env vars by location: - Heroku/Backend: BUILDKITE_API_TOKEN, BUILDKITE_ORG, BUILDKITE_PIPELINE - GPU Nodes: BUILDKITE_AGENT_TOKEN (set by admin), auto-set vars - Jobs: KERNELBOT_* vars passed via API --- SKILLS/buildkite.md | 47 ++++++++++++++++++++++++++++---------- src/libkernelbot/consts.py | 1 + 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md index 1718e3a5..3166668d 100644 --- a/SKILLS/buildkite.md +++ 
b/SKILLS/buildkite.md @@ -233,26 +233,49 @@ Buildkite-managed GPUs are registered with `_BK` suffix: | `B200_BK` | `b200` | 100 | | `H100_BK` | `h100` | 90a | | `MI300_BK` | `mi300` | (AMD) | +| `L40S_BK` | `test` | 89 (Ada Lovelace) | ## Environment Variables -### For Kernelbot API/Backend +### On Heroku/Backend (where the app runs) -- `BUILDKITE_API_TOKEN`: API token for submitting jobs +These are set in Heroku config vars or your `.env` file: -### For Buildkite Agents (set by setup script) +| Variable | Required | Description | +|----------|----------|-------------| +| `BUILDKITE_API_TOKEN` | Yes | API token for submitting jobs and downloading artifacts. Get from Buildkite → Personal Settings → API Access Tokens | +| `BUILDKITE_ORG` | No | Organization slug (default: `gpu-mode`) | +| `BUILDKITE_PIPELINE` | No | Pipeline slug (default: `kernelbot`) | -- `NVIDIA_VISIBLE_DEVICES`: GPU index for isolation -- `CUDA_VISIBLE_DEVICES`: Same as above -- `KERNELBOT_GPU_INDEX`: GPU index (0, 1, 2, ...) -- `KERNELBOT_CPUSET`: CPU cores for this agent -- `KERNELBOT_MEMORY`: Memory limit +**API Token Permissions Required:** +- `read_builds` - Poll build status +- `write_builds` - Create/trigger builds +- `read_artifacts` - Download result.json artifact +- `read_agents` (optional) - Check queue status + +### On GPU Runner Nodes -### For Jobs (passed via pipeline) +These are set during node setup: -- `KERNELBOT_RUN_ID`: Unique run identifier -- `KERNELBOT_PAYLOAD`: Base64+zlib compressed job config -- `KERNELBOT_QUEUE`: Target queue name +| Variable | Set By | Description | +|----------|--------|-------------| +| `BUILDKITE_AGENT_TOKEN` | Admin (setup script) | Agent token for connecting to Buildkite | +| `NVIDIA_VISIBLE_DEVICES` | Environment hook | GPU index for isolation (auto-set per job) | +| `CUDA_VISIBLE_DEVICES` | Environment hook | Same as above | +| `KERNELBOT_GPU_INDEX` | Environment hook | GPU index (0, 1, 2, ...) 
| +| `KERNELBOT_CPUSET` | Environment hook | CPU cores for this agent | +| `KERNELBOT_MEMORY` | Environment hook | Memory limit for Docker | + +### Passed to Jobs (via Buildkite API) + +These are set automatically by the launcher: + +| Variable | Description | +|----------|-------------| +| `KERNELBOT_RUN_ID` | Unique run identifier | +| `KERNELBOT_PAYLOAD` | Base64+zlib compressed job config | +| `KERNELBOT_QUEUE` | Target queue name | +| `KERNELBOT_IMAGE` | Docker image to use | ## Troubleshooting diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index 0f518d3a..3f52737b 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -133,6 +133,7 @@ class RankCriterion(Enum): "B200_BK": "100", "H100_BK": "90a", "MI300_BK": None, + "L40S_BK": "89", # Ada Lovelace } From f459820a0bf1f181fa6ac105d40c81225f2b54e2 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 20:36:50 -0800 Subject: [PATCH 21/27] Add E2E test with database and document known limitations - Add scripts/e2e_buildkite_with_db.py for full end-to-end testing - Creates test leaderboard in PostgreSQL - Submits real kernel evaluation to Buildkite - Stores results in database with scoring - Supports test and leaderboard modes - Update SKILLS/buildkite.md with: - E2E testing instructions and verified results - Known limitations section for reviewers: - Cold start overhead (20-50s breakdown) - Dependency installation tradeoffs (3 options) - GPU isolation gaps (compute, memory, PCIe, disk) - Queue management limitations - Security considerations - Future improvements checklist Tested: Leaderboard mode working with scoring on L40S --- SKILLS/buildkite.md | 195 +++++++++++++++++- scripts/e2e_buildkite_with_db.py | 335 +++++++++++++++++++++++++++++++ 2 files changed, 528 insertions(+), 2 deletions(-) create mode 100644 scripts/e2e_buildkite_with_db.py diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md index 3166668d..079e5b73 100644 --- a/SKILLS/buildkite.md 
+++ b/SKILLS/buildkite.md @@ -200,7 +200,7 @@ BUILDKITE_API_TOKEN= uv run python tests/e2e_buildkite_test.py --queu Options: - `--queue `: Target queue (default: test) -- `--org `: Buildkite org (default: gpu-mode) +- `--org `: Buildkite org (default: mark-saroufim) - `--pipeline `: Pipeline name (default: kernelbot) - `--dry-run`: Print config without submitting @@ -244,7 +244,7 @@ These are set in Heroku config vars or your `.env` file: | Variable | Required | Description | |----------|----------|-------------| | `BUILDKITE_API_TOKEN` | Yes | API token for submitting jobs and downloading artifacts. Get from Buildkite → Personal Settings → API Access Tokens | -| `BUILDKITE_ORG` | No | Organization slug (default: `gpu-mode`) | +| `BUILDKITE_ORG` | No | Organization slug (default: `mark-saroufim`) | | `BUILDKITE_PIPELINE` | No | Pipeline slug (default: `kernelbot`) | **API Token Permissions Required:** @@ -659,3 +659,194 @@ git clone --depth 1 --branch buildkite-infrastructure https://github.com/gpu-mod ``` Once merged to `main`, update the pipeline config to use `main` branch. + +## E2E Testing with Database + +A comprehensive end-to-end test script is available that: +1. Creates a test leaderboard in the database +2. Submits a real kernel evaluation to Buildkite +3. Stores results in PostgreSQL +4. 
Verifies data integrity + +### Running E2E Tests + +```bash +# Test mode (correctness only) +BUILDKITE_API_TOKEN=xxx uv run python scripts/e2e_buildkite_with_db.py \ + --org mark-saroufim --queue test + +# Leaderboard mode (with benchmarks and scoring) +BUILDKITE_API_TOKEN=xxx uv run python scripts/e2e_buildkite_with_db.py \ + --org mark-saroufim --queue test --mode leaderboard + +# With cleanup (delete test leaderboard after) +BUILDKITE_API_TOKEN=xxx uv run python scripts/e2e_buildkite_with_db.py \ + --org mark-saroufim --queue test --mode leaderboard --cleanup +``` + +### Verified Working (2026-02-04) + +| Mode | Status | Details | +|------|--------|---------| +| Test | ✅ | 5 tests passed, ~3.4s duration | +| Benchmark | ✅ | 30 runs, 4.07ms mean | +| Leaderboard | ✅ | Score computed and stored | +| Database | ✅ | All runs stored with system info | + +--- + +## Known Limitations & Review Notes + +This section documents known limitations and tradeoffs for code reviewers. + +### 1. Cold Start Overhead + +**Problem**: Each job incurs significant startup overhead: + +| Phase | Time | Notes | +|-------|------|-------| +| Docker pull | 10-30s | First run only if image not cached | +| Container start | 2-5s | Includes cgroup setup | +| Python imports | 5-10s | PyTorch, Triton, etc. | +| Code clone | 3-5s | If using runtime install | +| **Total cold start** | **20-50s** | Varies by image caching | + +**Current Approach**: We use a pre-built Docker image (`ghcr.io/gpu-mode/kernelbot:latest`) with dependencies baked in. This reduces cold start to ~10-15s after first pull. + +### 2. 
Dependency Installation Tradeoffs + +There are two operational models with different tradeoffs: + +#### Option A: Pre-Built Image (Current Default) +```yaml +image: "ghcr.io/gpu-mode/kernelbot:latest" +``` +- **Pros**: Fast cold starts (~5-10s), consistent environment +- **Cons**: Must rebuild image for dependency changes, requires image registry +- **When to rebuild**: PyTorch version change, new packages, security updates + +#### Option B: Runtime Installation +```yaml +image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" +command: | + pip install torch triton numpy + python eval.py +``` +- **Pros**: Always latest dependencies, no image maintenance +- **Cons**: Slow cold starts (~40-60s), network dependency, version drift +- **Use when**: Testing new dependencies, development + +#### Option C: Cached Dependencies on Host +```yaml +volumes: + - "/var/lib/buildkite-agent/cache/pip:/root/.cache/pip:rw" +``` +- **Pros**: Fast after first run, no image rebuild needed +- **Cons**: Cache invalidation complexity, disk usage, per-node setup +- **Use when**: Frequent dependency changes, limited registry access + +**Recommendation**: Use Option A (pre-built image) for production. Use Option B for development/testing new dependencies. + +### 3. 
GPU Isolation Limitations + +**Current Isolation Model**: +- GPU isolation via `NVIDIA_VISIBLE_DEVICES` environment variable +- CPU isolation via Docker `--cpuset-cpus` +- Memory isolation via Docker `--memory` + +**Known Gaps**: + +| Resource | Isolation Level | Notes | +|----------|-----------------|-------| +| GPU Compute | ✅ Strong | Only assigned GPU visible | +| GPU Memory | ⚠️ Partial | Other processes could exhaust VRAM if running | +| PCIe Bandwidth | ❌ None | Shared across all GPUs | +| NVLink | ❌ None | If present, shared | +| CPU Cache | ⚠️ Partial | L3 cache shared across cores | +| Network | ⚠️ Partial | Docker bridge, but shared bandwidth | +| Disk I/O | ❌ None | Shared unless using separate volumes | + +**Potential Issues**: +- **Noisy neighbor**: One job could impact another via shared resources +- **VRAM exhaustion**: If host processes use GPU memory +- **Timing variability**: Benchmark results may vary due to shared resources + +**Mitigations**: +- Run one agent per GPU (current approach) +- Use dedicated benchmark nodes for competition scoring +- Monitor for outlier results + +### 4. Artifact Handling + +**Current Flow**: +1. Job writes `result.json` to working directory +2. Buildkite agent uploads to S3 +3. Backend downloads via Buildkite API (302 redirect to S3) + +**Limitations**: +- **Size limit**: ~100MB per artifact (Buildkite limit) +- **Retention**: 6 months by default +- **Download latency**: 1-2s for small files, more for large profiles + +### 5. Queue Management + +**Current Model**: One queue per GPU type (e.g., `b200`, `h100`, `mi300`) + +**Limitations**: +- No priority queuing (FIFO only) +- No job preemption +- No fair-share scheduling between users +- Queue depth visibility requires API calls + +**Potential Improvements**: +- Implement priority via build metadata +- Add rate limiting per user +- Create admin queue for verification runs + +### 6. 
Error Handling + +**Automatic Retries**: +```yaml +retry: + automatic: + - exit_status: -1 # Infrastructure failure + limit: 2 + - exit_status: 255 # Agent disconnect + limit: 1 +``` + +**Not Automatically Retried**: +- Compilation errors (user code issue) +- Test failures (user code issue) +- Timeout (15 min default) +- OOM errors + +### 7. Security Considerations + +**Sandboxing**: +- Jobs run in Docker containers +- No host network access +- Limited volume mounts + +**Risks**: +- User code has full GPU access (could mine crypto briefly) +- User code could attempt network attacks (mitigated by Docker networking) +- Large submissions could exhaust disk space + +**Mitigations**: +- Timeout limits (15 min) +- Disk quotas (via Docker) +- Network isolation (Docker bridge) +- Result validation before storing + +--- + +## Future Improvements + +- [ ] Add MIG (Multi-Instance GPU) support for H100/A100 +- [ ] Implement job priority queuing +- [ ] Add per-user rate limiting +- [ ] Support multi-GPU jobs for distributed problems +- [ ] Add warm pool of pre-started containers +- [ ] Implement result caching for identical submissions + diff --git a/scripts/e2e_buildkite_with_db.py b/scripts/e2e_buildkite_with_db.py new file mode 100644 index 00000000..629463fe --- /dev/null +++ b/scripts/e2e_buildkite_with_db.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python3 +"""End-to-end test for Buildkite integration with database storage. + +This script: +1. Creates a test leaderboard in the local database +2. Submits a real kernel evaluation job to Buildkite +3. Stores results in the PostgreSQL database +4. 
Verifies everything is stored correctly + +Usage: + BUILDKITE_API_TOKEN=xxx uv run python scripts/e2e_buildkite_with_db.py + +Options: + --queue Buildkite queue (default: test) + --org Buildkite org (default: gpu-mode) + --pipeline Pipeline name (default: kernelbot) + --example Example to run (default: vectoradd_py) + --cleanup Delete the test leaderboard after the test + --dry-run Print config without submitting +""" + +import argparse +import asyncio +import datetime +import os +import sys +from pathlib import Path +from types import SimpleNamespace + +# Add src to path for local testing +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + + +class SimpleReporter: + """Simple progress reporter for CLI output.""" + + def __init__(self, title: str = ""): + self.title = title + self.messages = [] + + async def push(self, msg): + self.messages.append(msg) + print(f" [PUSH] {msg}") + + async def update(self, msg): + print(f" [UPDATE] {msg}") + + async def update_title(self, title): + self.title = title + print(f" [TITLE] {title}") + + async def display_report(self, title, report): + print(f"\n [REPORT] {title}") + for line in report: + print(f" {line}") + + +class MultiReporter: + """Multi-run progress reporter.""" + + def __init__(self): + self.runs = [] + + def add_run(self, name: str) -> SimpleReporter: + reporter = SimpleReporter(name) + self.runs.append(reporter) + print(f"\n--- Run: {name} ---") + return reporter + + async def show(self, msg): + print(f"\n[SHOW] {msg}") + + +async def main(): + parser = argparse.ArgumentParser(description="E2E Buildkite test with database storage") + parser.add_argument("--queue", default="test", help="Buildkite queue (default: test)") + parser.add_argument("--org", default="gpu-mode", help="Buildkite org slug") + parser.add_argument("--pipeline", default="kernelbot", help="Pipeline slug") + parser.add_argument("--example", default="vectoradd_py", help="Example to run") + parser.add_argument("--mode", 
choices=["test", "leaderboard"], default="test", help="Submission mode") + parser.add_argument("--cleanup", action="store_true", help="Delete test leaderboard after test") + parser.add_argument("--dry-run", action="store_true", help="Print config without submitting") + args = parser.parse_args() + + # Check for required environment variables + token = os.environ.get("BUILDKITE_API_TOKEN") + if not token: + print("ERROR: BUILDKITE_API_TOKEN environment variable not set") + print("\nTo get a token:") + print(" 1. Go to https://buildkite.com/user/api-access-tokens") + print(" 2. Create token with: read_builds, write_builds, read_artifacts, read_agents") + sys.exit(1) + + database_url = os.environ.get("DATABASE_URL", "postgresql://marksaroufim@localhost:5432/kernelbot") + disable_ssl = os.environ.get("DISABLE_SSL", "true") + + print("=" * 60) + print("Buildkite E2E Test with Database Storage") + print("=" * 60) + print(f"Organization: {args.org}") + print(f"Pipeline: {args.pipeline}") + print(f"Queue: {args.queue}") + print(f"Example: {args.example}") + print(f"Mode: {args.mode}") + print(f"Database: {database_url}") + print() + + # Import kernelbot modules + from libkernelbot.backend import KernelBackend + from libkernelbot.consts import BuildkiteGPU, SubmissionMode + from libkernelbot.launchers.buildkite import BuildkiteConfig, BuildkiteLauncher + from libkernelbot.leaderboard_db import LeaderboardDB + from libkernelbot.task import make_task_definition + + # Set up database connection + env = SimpleNamespace( + DATABASE_URL=database_url, + DISABLE_SSL=disable_ssl, + ) + + db = LeaderboardDB(url=database_url, ssl_mode="disable" if disable_ssl else "require") + + # Find example + project_root = Path(__file__).parent.parent + task_path = project_root / "examples" / args.example + + if not task_path.exists(): + print(f"ERROR: Example '{args.example}' not found at {task_path}") + print("Available examples:") + for p in (project_root / "examples").iterdir(): + if 
p.is_dir() and (p / "task.yml").exists(): + print(f" - {p.name}") + sys.exit(1) + + # Load task definition + task_definition = make_task_definition(task_path) + leaderboard_name = f"e2e-test-{args.example}" + + # Find submission file + for name in ["submission_triton.py", "submission.py", "submission_cuda_inline.py"]: + if (task_path / name).exists(): + submission_file = task_path / name + break + else: + print(f"ERROR: No submission file found in {task_path}") + sys.exit(1) + + submission_code = submission_file.read_text() + + print(f"Task: {task_path.name}") + print(f"Submission: {submission_file.name}") + print(f"Leaderboard: {leaderboard_name}") + + if args.dry_run: + print("\n[DRY RUN] Would create leaderboard and submit job") + print(f" Task config keys: {list(task_definition.task.config.keys()) if task_definition.task.config else 'None'}") + return + + # Step 1: Create test leaderboard + print("\n" + "=" * 60) + print("Step 1: Creating test leaderboard") + print("=" * 60) + + with db: + # Check if leaderboard already exists + existing = db.get_leaderboard_names() + if leaderboard_name in existing: + print(f" Leaderboard '{leaderboard_name}' already exists, deleting...") + db.delete_leaderboard(leaderboard_name, force=True) + + # Create leaderboard + deadline = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(days=30) + lb_id = db.create_leaderboard( + name=leaderboard_name, + deadline=deadline, + definition=task_definition, + creator_id=1, # Test user + forum_id=0, + gpu_types=["L40S_BK"], # Buildkite test queue GPU + ) + print(f" Created leaderboard with ID: {lb_id}") + + # Step 2: Set up backend with Buildkite launcher + print("\n" + "=" * 60) + print("Step 2: Setting up Buildkite launcher") + print("=" * 60) + + launcher = BuildkiteLauncher( + BuildkiteConfig( + org_slug=args.org, + pipeline_slug=args.pipeline, + api_token=token, + ) + ) + + # Check queue status + queue_status = await launcher.get_queue_status(args.queue) + print(f" 
Queue: {queue_status.get('queue')}") + print(f" Total agents: {queue_status.get('total')}") + print(f" Idle agents: {queue_status.get('idle')}") + for agent in queue_status.get("agents", []): + print(f" - {agent['name']}: {agent['state']} (busy={agent['busy']})") + + if queue_status.get("total", 0) == 0: + print("\n WARNING: No agents in queue. Job may wait indefinitely.") + print(" Make sure you have agents running on the Buildkite queue.") + + # Step 3: Create submission and run evaluation + print("\n" + "=" * 60) + print("Step 3: Creating submission and running evaluation") + print("=" * 60) + + with db: + # Create submission entry + submission_id = db.create_submission( + leaderboard=leaderboard_name, + file_name=submission_file.name, + user_id=1, # Test user + code=submission_code, + time=datetime.datetime.now(datetime.timezone.utc), + user_name="e2e-test-user", + ) + print(f" Created submission with ID: {submission_id}") + + # Build task config + from libkernelbot.task import build_task_config + + submission_mode = SubmissionMode.LEADERBOARD if args.mode == "leaderboard" else SubmissionMode.TEST + config = build_task_config( + task=task_definition.task, + submission_content=submission_code, + arch=0, # Will be set by runner + mode=submission_mode, + ) + config["submission_id"] = submission_id + + # Run on Buildkite + print("\n Submitting to Buildkite...") + gpu_type = BuildkiteGPU.L40S_BK + reporter = SimpleReporter(f"Test run on {gpu_type.name}") + + result = await launcher.run_submission(config, gpu_type, reporter) + + print(f"\n Result: success={result.success}") + if result.error: + print(f" Error: {result.error}") + print(f" System: {result.system}") + + # Step 4: Store results in database + print("\n" + "=" * 60) + print("Step 4: Storing results in database") + print("=" * 60) + + if result.success: + with db: + for run_name, run_result in result.runs.items(): + if run_result.run is None: + print(f" Skipping {run_name}: no run result") + continue + + 
score = None + if run_name == "leaderboard" and run_result.run.passed: + # Compute score for leaderboard runs + from libkernelbot.submission import compute_score + score = compute_score(result, task_definition.task, submission_id) + + db.create_submission_run( + submission=submission_id, + start=run_result.start, + end=run_result.end, + mode=run_name, + runner=gpu_type.name, + score=score, + secret=False, + compilation=run_result.compilation, + result=run_result.run, + system=result.system, + ) + print(f" Stored run: {run_name} (passed={run_result.run.passed}, duration={run_result.run.duration:.2f}s)") + + # Mark submission as done + db.mark_submission_done(submission_id) + print(f"\n Marked submission {submission_id} as done") + + # Step 5: Verify data in database + print("\n" + "=" * 60) + print("Step 5: Verifying data in database") + print("=" * 60) + + with db: + submission = db.get_submission_by_id(submission_id) + if submission: + print(f" Submission ID: {submission['submission_id']}") + print(f" Leaderboard: {submission['leaderboard_name']}") + print(f" File: {submission['file_name']}") + print(f" Done: {submission['done']}") + print(f" Runs: {len(submission['runs'])}") + for run in submission['runs']: + print(f" - {run['mode']}: passed={run['passed']}, runner={run['runner']}") + if run.get('system'): + gpu_name = run['system'].get('gpu', 'unknown') if isinstance(run['system'], dict) else 'unknown' + print(f" GPU: {gpu_name}") + else: + print(" ERROR: Could not retrieve submission from database!") + + # Step 6: Show summary + print("\n" + "=" * 60) + print("Summary") + print("=" * 60) + print(f" Leaderboard: {leaderboard_name}") + print(f" Submission ID: {submission_id}") + print(f" Success: {result.success}") + if result.runs: + for name, run in result.runs.items(): + if run.run: + print(f" {name}: passed={run.run.passed}, duration={run.run.duration:.2f}s") + + # Cleanup if requested + if args.cleanup: + print("\n" + "=" * 60) + print("Cleanup") + print("=" 
* 60) + with db: + db.delete_leaderboard(leaderboard_name, force=True) + print(f" Deleted leaderboard: {leaderboard_name}") + + print("\n" + "=" * 60) + print("E2E Test Complete!") + print("=" * 60) + + sys.exit(0 if result.success else 1) + + +if __name__ == "__main__": + asyncio.run(main()) From df9c2027b6d614dfa642aafcf1c37c51980a4bae Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 20:40:31 -0800 Subject: [PATCH 22/27] Fix CI: skip gracefully when KERNELBOT_PAYLOAD not set - Update buildkite-runner.py to exit with code 0 when no payload (push/PR triggers don't set payload, only API triggers do) - Add note to pipeline.yml about API-only triggering - Fix lint issues in e2e_buildkite_with_db.py: - Remove unused imports - Fix line length issues --- deployment/buildkite/pipeline.yml | 3 +++ scripts/e2e_buildkite_with_db.py | 14 +++++--------- src/runners/buildkite-runner.py | 9 +++++++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/deployment/buildkite/pipeline.yml b/deployment/buildkite/pipeline.yml index 0c2da4ce..9939658d 100644 --- a/deployment/buildkite/pipeline.yml +++ b/deployment/buildkite/pipeline.yml @@ -1,5 +1,8 @@ # Kernelbot Evaluation Pipeline # Jobs target specific GPU queue, Buildkite routes to idle agent +# +# NOTE: This pipeline is designed to be triggered via API with KERNELBOT_PAYLOAD. +# Direct push/PR triggers will skip gracefully. 
steps: - label: ":rocket: Kernel Evaluation" diff --git a/scripts/e2e_buildkite_with_db.py b/scripts/e2e_buildkite_with_db.py index 629463fe..1f3a9f29 100644 --- a/scripts/e2e_buildkite_with_db.py +++ b/scripts/e2e_buildkite_with_db.py @@ -25,7 +25,6 @@ import os import sys from pathlib import Path -from types import SimpleNamespace # Add src to path for local testing sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) @@ -106,18 +105,12 @@ async def main(): print() # Import kernelbot modules - from libkernelbot.backend import KernelBackend from libkernelbot.consts import BuildkiteGPU, SubmissionMode from libkernelbot.launchers.buildkite import BuildkiteConfig, BuildkiteLauncher from libkernelbot.leaderboard_db import LeaderboardDB from libkernelbot.task import make_task_definition # Set up database connection - env = SimpleNamespace( - DATABASE_URL=database_url, - DISABLE_SSL=disable_ssl, - ) - db = LeaderboardDB(url=database_url, ssl_mode="disable" if disable_ssl else "require") # Find example @@ -153,7 +146,8 @@ async def main(): if args.dry_run: print("\n[DRY RUN] Would create leaderboard and submit job") - print(f" Task config keys: {list(task_definition.task.config.keys()) if task_definition.task.config else 'None'}") + config_keys = list(task_definition.task.config.keys()) if task_definition.task.config else "None" + print(f" Task config keys: {config_keys}") return # Step 1: Create test leaderboard @@ -276,7 +270,9 @@ async def main(): result=run_result.run, system=result.system, ) - print(f" Stored run: {run_name} (passed={run_result.run.passed}, duration={run_result.run.duration:.2f}s)") + passed = run_result.run.passed + duration = run_result.run.duration + print(f" Stored run: {run_name} (passed={passed}, duration={duration:.2f}s)") # Mark submission as done db.mark_submission_done(submission_id) diff --git a/src/runners/buildkite-runner.py b/src/runners/buildkite-runner.py index d865bf2c..716270db 100644 --- 
a/src/runners/buildkite-runner.py +++ b/src/runners/buildkite-runner.py @@ -29,8 +29,13 @@ def main(): print() if not payload_b64: - print("ERROR: KERNELBOT_PAYLOAD not set", file=sys.stderr) - sys.exit(1) + # No payload means this was triggered by push/PR, not API + # Exit gracefully so CI doesn't fail + print("KERNELBOT_PAYLOAD not set - this build was triggered by push/PR, not API.") + print("Skipping evaluation. To run an evaluation, trigger via BuildkiteLauncher API.") + print() + print("=== Skipped (no payload) ===") + sys.exit(0) # Decode payload try: From 14d27f3cab30b21b1f9aaf7f66b0f03d9b18e55f Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 20:41:39 -0800 Subject: [PATCH 23/27] Add Buildkite integration tests to CI workflow - Add integration-tests-buildkite job to testing.yml - Runs pytest -m integration tests/test_buildkite.py - Uses BUILDKITE_API_TOKEN secret - Matches pattern of Modal and GitHub integration tests --- .github/workflows/testing.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index c34818f2..cb90e3a0 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -63,3 +63,16 @@ jobs: - uses: astral-sh/setup-uv@v4 - run: uv sync --extra dev - run: uv run pytest -m integration tests/test_github.py -v + + integration-tests-buildkite: + runs-on: ubuntu-latest + timeout-minutes: 30 + # Skip for Dependabot PRs as they don't have access to secrets + if: github.actor != 'dependabot[bot]' + env: + BUILDKITE_API_TOKEN: ${{ secrets.BUILDKITE_API_TOKEN }} + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v4 + - run: uv sync --extra dev + - run: uv run pytest -m integration tests/test_buildkite.py -v From 95b0a8496bbc48ba2edaa4334210d3bd10b79378 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 20:42:56 -0800 Subject: [PATCH 24/27] Fix lint: add noqa for C901 complexity in scripts --- 
scripts/e2e_buildkite_with_db.py | 2 +- scripts/submit_buildkite_job.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/e2e_buildkite_with_db.py b/scripts/e2e_buildkite_with_db.py index 1f3a9f29..dd68dc23 100644 --- a/scripts/e2e_buildkite_with_db.py +++ b/scripts/e2e_buildkite_with_db.py @@ -70,7 +70,7 @@ async def show(self, msg): print(f"\n[SHOW] {msg}") -async def main(): +async def main(): # noqa: C901 parser = argparse.ArgumentParser(description="E2E Buildkite test with database storage") parser.add_argument("--queue", default="test", help="Buildkite queue (default: test)") parser.add_argument("--org", default="gpu-mode", help="Buildkite org slug") diff --git a/scripts/submit_buildkite_job.py b/scripts/submit_buildkite_job.py index e4d5a573..8d835657 100755 --- a/scripts/submit_buildkite_job.py +++ b/scripts/submit_buildkite_job.py @@ -35,7 +35,7 @@ async def update(self, msg): print(f"[UPDATE] {msg}") -async def main(): +async def main(): # noqa: C901 parser = argparse.ArgumentParser(description="Submit a test job to Buildkite") parser.add_argument("--org", default="mark-saroufim", help="Buildkite org slug") parser.add_argument("--pipeline", default="kernelbot", help="Pipeline slug") From 38069fc718461b6e6b59abcaf783b6c4701a84fb Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 20:45:22 -0800 Subject: [PATCH 25/27] Fix default Buildkite org to mark-saroufim --- scripts/e2e_buildkite_with_db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/e2e_buildkite_with_db.py b/scripts/e2e_buildkite_with_db.py index dd68dc23..d7ff3a09 100644 --- a/scripts/e2e_buildkite_with_db.py +++ b/scripts/e2e_buildkite_with_db.py @@ -12,7 +12,7 @@ Options: --queue Buildkite queue (default: test) - --org Buildkite org (default: gpu-mode) + --org Buildkite org (default: mark-saroufim) --pipeline Pipeline name (default: kernelbot) --example Example to run (default: vectoradd_py) --cleanup Delete the test 
leaderboard after the test @@ -73,7 +73,7 @@ async def show(self, msg): async def main(): # noqa: C901 parser = argparse.ArgumentParser(description="E2E Buildkite test with database storage") parser.add_argument("--queue", default="test", help="Buildkite queue (default: test)") - parser.add_argument("--org", default="gpu-mode", help="Buildkite org slug") + parser.add_argument("--org", default="mark-saroufim", help="Buildkite org slug") parser.add_argument("--pipeline", default="kernelbot", help="Pipeline slug") parser.add_argument("--example", default="vectoradd_py", help="Example to run") parser.add_argument("--mode", choices=["test", "leaderboard"], default="test", help="Submission mode") From 3c585f5bd205f5cd32342dbf8c052a2d4456d528 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 20:52:11 -0800 Subject: [PATCH 26/27] Add org/billing limitation to buildkite docs --- SKILLS/buildkite.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md index 079e5b73..b3222cb0 100644 --- a/SKILLS/buildkite.md +++ b/SKILLS/buildkite.md @@ -839,6 +839,24 @@ retry: - Network isolation (Docker bridge) - Result validation before storing +### 8. Organization & Billing + +**Current State**: Running under personal `mark-saroufim` Buildkite org. 
+ +**Limitations**: +- **Not production-ready**: Personal org has limited visibility/access controls +- **Billing unclear**: Need to understand Buildkite pricing for self-hosted agents + - Self-hosted agents are free, but there may be limits on concurrent builds + - Artifact storage (S3) costs depend on volume +- **Access management**: Personal org doesn't support team-based permissions + +**TODO before production**: +- [ ] Create official `gpu-mode` Buildkite organization +- [ ] Understand billing model for high-volume usage +- [ ] Set up proper team access controls +- [ ] Configure SSO/SAML if needed +- [ ] Review artifact retention policies and costs + --- ## Future Improvements From 2397cca590d83ba85ebc2d937d187aa0715e25d8 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 4 Feb 2026 20:57:00 -0800 Subject: [PATCH 27/27] Fix default Buildkite org to mark-saroufim in env and launcher --- src/kernelbot/env.py | 2 +- src/libkernelbot/launchers/buildkite.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernelbot/env.py b/src/kernelbot/env.py index 703f2b3c..380ed55b 100644 --- a/src/kernelbot/env.py +++ b/src/kernelbot/env.py @@ -35,7 +35,7 @@ # Buildkite-specific constants env.BUILDKITE_API_TOKEN = os.getenv("BUILDKITE_API_TOKEN") -env.BUILDKITE_ORG = os.getenv("BUILDKITE_ORG", "gpu-mode") +env.BUILDKITE_ORG = os.getenv("BUILDKITE_ORG", "mark-saroufim") env.BUILDKITE_PIPELINE = os.getenv("BUILDKITE_PIPELINE", "kernelbot") # Directory that will be used for local problem development. 
diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py index b8c37bfe..f160a2c7 100644 --- a/src/libkernelbot/launchers/buildkite.py +++ b/src/libkernelbot/launchers/buildkite.py @@ -40,7 +40,7 @@ class BuildkiteConfig: """Buildkite launcher configuration.""" - org_slug: str = "gpu-mode" + org_slug: str = "mark-saroufim" pipeline_slug: str = "kernelbot" api_token: str = field(default_factory=lambda: os.environ.get("BUILDKITE_API_TOKEN", ""))