Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .versions
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
GO_VERSION=1.25
VLLM_VERSION=0.19.0
VLLM_UPSTREAM_VERSION=0.17.1
VLLM_METAL_RELEASE=v0.1.0-20260320-122309
VLLM_VERSION=0.19.1
VLLM_UPSTREAM_VERSION=0.19.0
VLLM_METAL_RELEASE=v0.2.0-20260420-142150
DIFFUSERS_RELEASE=v0.1.0-20260216-000000
SGLANG_VERSION=0.5.6
LLAMA_SERVER_VERSION=latest
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ ENTRYPOINT ["/app/model-runner"]
# --- vLLM variant ---
FROM llamacpp AS vllm

ARG VLLM_VERSION=0.19.0
ARG VLLM_VERSION=0.19.1
ARG VLLM_CUDA_VERSION=cu130
ARG VLLM_PYTHON_TAG=cp38-abi3
ARG TARGETARCH
Expand Down
25 changes: 24 additions & 1 deletion pkg/inference/backends/vllm/vllm_metal.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
const (
defaultInstallDir = ".docker/model-runner/vllm-metal"
// vllmMetalVersion is the vllm-metal release tag to download from Docker Hub.
vllmMetalVersion = "v0.1.0-20260320-122309"
vllmMetalVersion = "v0.2.0-20260420-142150"
)

var (
Expand Down Expand Up @@ -176,6 +176,29 @@ func (v *vllmMetal) downloadAndExtract(ctx context.Context, _ *http.Client) erro
return fmt.Errorf("failed to make python3 executable: %w", err)
}

// Copy pre-built Metal kernel extension to the user's cache directory
// so vllm-metal skips JIT compilation at runtime (the macOS sandbox
// blocks clang++ invocations needed by the JIT compiler).
homeDir, err := os.UserHomeDir()
if err == nil {
cacheDir := filepath.Join(homeDir, ".cache", "vllm-metal")
prebuiltDir := filepath.Join(v.installDir, "prebuilt")
if entries, readErr := os.ReadDir(prebuiltDir); readErr == nil {
if mkErr := os.MkdirAll(cacheDir, 0755); mkErr == nil {
for _, entry := range entries {
src := filepath.Join(prebuiltDir, entry.Name())
dst := filepath.Join(cacheDir, entry.Name())
if data, cpErr := os.ReadFile(src); cpErr == nil {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: Consider surfacing or aggregating non-transient errors during prebuilt cache setup instead of fully swallowing them.

This block currently ignores all errors except a WARN on failed writes. Since this cache helps avoid JIT failures in the sandbox, it’d be helpful to separate best-effort failures from configuration issues. At minimum, consider logging when os.UserHomeDir, os.ReadDir, or os.MkdirAll fail, or logging once when prebuilt cache setup is skipped entirely, to ease debugging of misconfigured environments.

if wErr := os.WriteFile(dst, data, 0755); wErr != nil {
v.log.Warn("failed to copy prebuilt extension", "file", entry.Name(), "error", wErr)
}
}
}
v.log.Info("Copied pre-built Metal kernel extension to cache", "cacheDir", cacheDir)
}
}
}

v.log.Info("vllm-metal installed successfully", "version", vllmMetalVersion)
return nil
}
Expand Down
7 changes: 6 additions & 1 deletion pkg/sandbox/sandbox_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,18 @@ const ConfigurationPython = `(version 1)
(subpath "/private/tmp")
(subpath "[HOMEDIR]/Library/Containers/com.docker.docker/Data")
(subpath "[WORKDIR]")
(subpath "[HOMEDIR]/.cache/vllm"))
(subpath "[HOMEDIR]/.cache/vllm")
(subpath "[HOMEDIR]/.cache/vllm-metal"))
(allow file-read*
(subpath "[HOMEDIR]/.docker/models")
(subpath "[HOMEDIR]/Library/Containers/com.docker.docker/Data")
(subpath "[WORKDIR]")
(subpath "[HOMEDIR]/.cache/vllm")
(subpath "[HOMEDIR]/.cache/vllm-metal")
(subpath "/private/tmp"))
;;; Allow loading pre-compiled Metal kernel extensions from the vllm-metal cache.
(allow file-map-executable
(subpath "[HOMEDIR]/.cache/vllm-metal"))
`

// ConfigurationLlamaCpp is the sandbox configuration for llama.cpp processes.
Expand Down
9 changes: 9 additions & 0 deletions scripts/build-vllm-metal-tarball.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,15 @@ curl -fsSL -O "$VLLM_METAL_WHEEL_URL"
uv pip install --python "$PYTHON_DIR/bin/python3" --system vllm_metal-*.whl
rm -f vllm_metal-*.whl

# Pre-compile the paged_ops Metal kernel extension so users don't need Xcode CLT
# at runtime (the macOS sandbox blocks clang++ invocations). build.py caches the
# compiled .so under ~/.cache/vllm-metal/; we redirect $HOME so the artefact
# lands in a known temp location we can bundle into the tarball.
echo "Pre-compiling vllm-metal paged_ops extension..."
# The HOME assignment prefixes a single command, so it applies to this python3
# invocation only; the rest of the script keeps the caller's real $HOME.
HOME="$WORK_DIR" "$PYTHON_DIR/bin/python3" -c "from vllm_metal.metal.build import build; build()"
# Stage the compiled artefact(s) in $PYTHON_DIR/prebuilt. The glob picks up every
# cache entry whose name contains "_paged_ops". With default shell options an
# unmatched glob is passed to cp literally, so cp errors out if the build
# produced nothing.
# NOTE(review): presumably the script runs under `set -e`, so a failed build or
# copy aborts the tarball build -- confirm at the top of the script.
mkdir -p "$PYTHON_DIR/prebuilt"
cp "$WORK_DIR/.cache/vllm-metal/"*_paged_ops* "$PYTHON_DIR/prebuilt/"

# Strip files not needed at runtime to reduce tarball size
echo "Stripping unnecessary files..."
rm -rf "$PYTHON_DIR/include"
Expand Down
Loading