docker · ericcurtin · Apr 21, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026
diff --git a/.versions b/.versions
@@ -1,7 +1,7 @@
 GO_VERSION=1.25
-VLLM_VERSION=0.19.0
-VLLM_UPSTREAM_VERSION=0.17.1
-VLLM_METAL_RELEASE=v0.1.0-20260320-122309
+VLLM_VERSION=0.19.1
+VLLM_UPSTREAM_VERSION=0.19.0
+VLLM_METAL_RELEASE=v0.2.0-20260420-142150
 DIFFUSERS_RELEASE=v0.1.0-20260216-000000
 SGLANG_VERSION=0.5.6
 LLAMA_SERVER_VERSION=latest

diff --git a/Dockerfile b/Dockerfile
@@ -90,7 +90,7 @@ ENTRYPOINT ["/app/model-runner"]
 # --- vLLM variant ---
 FROM llamacpp AS vllm
 
-ARG VLLM_VERSION=0.19.0
+ARG VLLM_VERSION=0.19.1
 ARG VLLM_CUDA_VERSION=cu130
 ARG VLLM_PYTHON_TAG=cp38-abi3
 ARG TARGETARCH

diff --git a/pkg/inference/backends/vllm/vllm_metal.go b/pkg/inference/backends/vllm/vllm_metal.go
@@ -26,7 +26,7 @@ import (
 const (
 	defaultInstallDir = ".docker/model-runner/vllm-metal"
 	// vllmMetalVersion is the vllm-metal release tag to download from Docker Hub.
-	vllmMetalVersion = "v0.1.0-20260320-122309"
+	vllmMetalVersion = "v0.2.0-20260420-142150"
 )
 
 var (
@@ -176,6 +176,49 @@ func (v *vllmMetal) downloadAndExtract(ctx context.Context, _ *http.Client) erro
 		return fmt.Errorf("failed to make python3 executable: %w", err)
 	}
 
+	// Copy pre-built Metal kernel extensions to the user's cache directory
+	// so vllm-metal skips JIT compilation at runtime (the macOS sandbox
+	// blocks clang++ invocations needed by the JIT compiler). This step is
+	// best-effort: failures are logged as warnings and never fail the
+	// install, and a tarball without a prebuilt directory is not an error.
+	prebuiltDir := filepath.Join(v.installDir, "prebuilt")
+	prebuiltEntries, readErr := os.ReadDir(prebuiltDir)
+	homeDir, homeErr := os.UserHomeDir()
+	switch {
+	case os.IsNotExist(readErr):
+		// Nothing was bundled with this release, so there is nothing to copy.
+	case readErr != nil:
+		v.log.Warn("failed to read prebuilt extension directory", "dir", prebuiltDir, "error", readErr)
+	case homeErr != nil:
+		v.log.Warn("failed to locate home directory for prebuilt extension cache", "error", homeErr)
+	default:
+		cacheDir := filepath.Join(homeDir, ".cache", "vllm-metal")
+		if mkErr := os.MkdirAll(cacheDir, 0755); mkErr != nil {
+			v.log.Warn("failed to create prebuilt extension cache directory", "cacheDir", cacheDir, "error", mkErr)
+			break // leaves the switch; nothing can be copied without the directory
+		}
+		copied := 0
+		for _, entry := range prebuiltEntries {
+			if entry.IsDir() {
+				continue // os.ReadFile cannot copy a directory
+			}
+			src := filepath.Join(prebuiltDir, entry.Name())
+			dst := filepath.Join(cacheDir, entry.Name())
+			data, cpErr := os.ReadFile(src)
+			if cpErr == nil {
+				cpErr = os.WriteFile(dst, data, 0755)
+			}
+			if cpErr != nil {
+				v.log.Warn("failed to copy prebuilt extension", "file", entry.Name(), "error", cpErr)
+				continue
+			}
+			copied++
+		}
+		if copied > 0 {
+			v.log.Info("Copied pre-built Metal kernel extension to cache", "cacheDir", cacheDir, "files", copied)
+		}
+	}
+
 	v.log.Info("vllm-metal installed successfully", "version", vllmMetalVersion)
 	return nil
 }

diff --git a/pkg/sandbox/sandbox_darwin.go b/pkg/sandbox/sandbox_darwin.go
@@ -79,13 +79,18 @@ const ConfigurationPython = `(version 1)
     (subpath "/private/tmp")
     (subpath "[HOMEDIR]/Library/Containers/com.docker.docker/Data")
     (subpath "[WORKDIR]")
-    (subpath "[HOMEDIR]/.cache/vllm"))
+    (subpath "[HOMEDIR]/.cache/vllm")
+    (subpath "[HOMEDIR]/.cache/vllm-metal"))
 (allow file-read*
     (subpath "[HOMEDIR]/.docker/models")
     (subpath "[HOMEDIR]/Library/Containers/com.docker.docker/Data")
     (subpath "[WORKDIR]")
     (subpath "[HOMEDIR]/.cache/vllm")
+    (subpath "[HOMEDIR]/.cache/vllm-metal")
     (subpath "/private/tmp"))
+;;; Allow loading pre-compiled Metal kernel extensions from the vllm-metal cache.
+(allow file-map-executable
+    (subpath "[HOMEDIR]/.cache/vllm-metal"))
 `
 
 // ConfigurationLlamaCpp is the sandbox configuration for llama.cpp processes.

diff --git a/scripts/build-vllm-metal-tarball.sh b/scripts/build-vllm-metal-tarball.sh
@@ -70,6 +70,20 @@ curl -fsSL -O "$VLLM_METAL_WHEEL_URL"
 uv pip install --python "$PYTHON_DIR/bin/python3" --system vllm_metal-*.whl
 rm -f vllm_metal-*.whl
 
+# Pre-compile the paged_ops Metal kernel extension so users don't need Xcode CLT
+# at runtime (the macOS sandbox blocks clang++ invocations).  build.py caches the
+# compiled .so under ~/.cache/vllm-metal/; we redirect $HOME so the artefact
+# lands in a known temp location we can bundle into the tarball.
+echo "Pre-compiling vllm-metal paged_ops extension..."
+HOME="$WORK_DIR" "$PYTHON_DIR/bin/python3" -c "from vllm_metal.metal.build import build; build()"
+mkdir -p "$PYTHON_DIR/prebuilt"
+# Fail loudly if the build produced no artefact: a tarball shipped without it
+# would leave users needing the runtime JIT compile that the sandbox blocks.
+if ! cp "$WORK_DIR/.cache/vllm-metal/"*_paged_ops* "$PYTHON_DIR/prebuilt/"; then
+    echo "Error: pre-compiled paged_ops extension not found under $WORK_DIR/.cache/vllm-metal" >&2
+    exit 1
+fi
+
 # Strip files not needed at runtime to reduce tarball size
 echo "Stripping unnecessary files..."
 rm -rf "$PYTHON_DIR/include"