diff --git a/.versions b/.versions index dde091af..194904c2 100644 --- a/.versions +++ b/.versions @@ -1,7 +1,7 @@ GO_VERSION=1.25 -VLLM_VERSION=0.19.0 -VLLM_UPSTREAM_VERSION=0.17.1 -VLLM_METAL_RELEASE=v0.1.0-20260320-122309 +VLLM_VERSION=0.19.1 +VLLM_UPSTREAM_VERSION=0.19.0 +VLLM_METAL_RELEASE=v0.2.0-20260420-142150 DIFFUSERS_RELEASE=v0.1.0-20260216-000000 SGLANG_VERSION=0.5.6 LLAMA_SERVER_VERSION=latest diff --git a/Dockerfile b/Dockerfile index 0412f3cc..976d723c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -90,7 +90,7 @@ ENTRYPOINT ["/app/model-runner"] # --- vLLM variant --- FROM llamacpp AS vllm -ARG VLLM_VERSION=0.19.0 +ARG VLLM_VERSION=0.19.1 ARG VLLM_CUDA_VERSION=cu130 ARG VLLM_PYTHON_TAG=cp38-abi3 ARG TARGETARCH diff --git a/pkg/inference/backends/vllm/vllm_metal.go b/pkg/inference/backends/vllm/vllm_metal.go index 1840d940..8f25a24d 100644 --- a/pkg/inference/backends/vllm/vllm_metal.go +++ b/pkg/inference/backends/vllm/vllm_metal.go @@ -26,7 +26,7 @@ import ( const ( defaultInstallDir = ".docker/model-runner/vllm-metal" // vllmMetalVersion is the vllm-metal release tag to download from Docker Hub. - vllmMetalVersion = "v0.1.0-20260320-122309" + vllmMetalVersion = "v0.2.0-20260420-142150" ) var ( @@ -176,6 +176,29 @@ func (v *vllmMetal) downloadAndExtract(ctx context.Context, _ *http.Client) erro return fmt.Errorf("failed to make python3 executable: %w", err) } + // Copy pre-built Metal kernel extension to the user's cache directory + // so vllm-metal skips JIT compilation at runtime (the macOS sandbox + // blocks clang++ invocations needed by the JIT compiler). + homeDir, err := os.UserHomeDir() + if err == nil { + cacheDir := filepath.Join(homeDir, ".cache", "vllm-metal") + prebuiltDir := filepath.Join(v.installDir, "prebuilt") + if entries, readErr := os.ReadDir(prebuiltDir); readErr == nil { + if mkErr := os.MkdirAll(cacheDir, 0755); mkErr == nil { + for _, entry := range entries { + src := filepath.Join(prebuiltDir, entry.Name()) + dst := filepath.Join(cacheDir, entry.Name()) + if data, cpErr := os.ReadFile(src); cpErr == nil { + if wErr := os.WriteFile(dst, data, 0755); wErr != nil { + v.log.Warn("failed to copy prebuilt extension", "file", entry.Name(), "error", wErr) + } + } + } + v.log.Info("Copied pre-built Metal kernel extension to cache", "cacheDir", cacheDir) + } + } + } + v.log.Info("vllm-metal installed successfully", "version", vllmMetalVersion) return nil } diff --git a/pkg/sandbox/sandbox_darwin.go b/pkg/sandbox/sandbox_darwin.go index 42793ee0..25eee928 100644 --- a/pkg/sandbox/sandbox_darwin.go +++ b/pkg/sandbox/sandbox_darwin.go @@ -79,13 +79,18 @@ const ConfigurationPython = `(version 1) (subpath "/private/tmp") (subpath "[HOMEDIR]/Library/Containers/com.docker.docker/Data") (subpath "[WORKDIR]") - (subpath "[HOMEDIR]/.cache/vllm")) + (subpath "[HOMEDIR]/.cache/vllm") + (subpath "[HOMEDIR]/.cache/vllm-metal")) (allow file-read* (subpath "[HOMEDIR]/.docker/models") (subpath "[HOMEDIR]/Library/Containers/com.docker.docker/Data") (subpath "[WORKDIR]") (subpath "[HOMEDIR]/.cache/vllm") + (subpath "[HOMEDIR]/.cache/vllm-metal") (subpath "/private/tmp")) +;;; Allow loading pre-compiled Metal kernel extensions from the vllm-metal cache. +(allow file-map-executable + (subpath "[HOMEDIR]/.cache/vllm-metal")) ` // ConfigurationLlamaCpp is the sandbox configuration for llama.cpp processes. diff --git a/scripts/build-vllm-metal-tarball.sh b/scripts/build-vllm-metal-tarball.sh index 4a04f9f9..5d0a16b9 100755 --- a/scripts/build-vllm-metal-tarball.sh +++ b/scripts/build-vllm-metal-tarball.sh @@ -70,6 +70,15 @@ curl -fsSL -O "$VLLM_METAL_WHEEL_URL" uv pip install --python "$PYTHON_DIR/bin/python3" --system vllm_metal-*.whl rm -f vllm_metal-*.whl +# Pre-compile the paged_ops Metal kernel extension so users don't need Xcode CLT +# at runtime (the macOS sandbox blocks clang++ invocations). build.py caches the +# compiled .so under ~/.cache/vllm-metal/; we redirect $HOME so the artefact +# lands in a known temp location we can bundle into the tarball. +echo "Pre-compiling vllm-metal paged_ops extension..." +HOME="$WORK_DIR" "$PYTHON_DIR/bin/python3" -c "from vllm_metal.metal.build import build; build()" +mkdir -p "$PYTHON_DIR/prebuilt" +cp "$WORK_DIR/.cache/vllm-metal/"*_paged_ops* "$PYTHON_DIR/prebuilt/" + # Strip files not needed at runtime to reduce tarball size echo "Stripping unnecessary files..." rm -rf "$PYTHON_DIR/include"