4 changes: 4 additions & 0 deletions .gitignore
@@ -11,4 +11,8 @@ vendor/
llamacpp/build
llamacpp/install

# vllm-metal build artifacts
.vllm-metal/
vllm-metal-macos-arm64-*.tar.gz

.DS_Store
103 changes: 101 additions & 2 deletions Makefile
@@ -26,7 +26,7 @@ DOCKER_BUILD_ARGS := \
BUILD_DMR ?= 1

# Main targets
.PHONY: build run clean test integration-tests test-docker-ce-installation docker-build docker-build-multiplatform docker-run docker-build-vllm docker-run-vllm docker-build-sglang docker-run-sglang docker-run-impl help validate lint docker-build-diffusers docker-run-diffusers
.PHONY: build run clean test integration-tests test-docker-ce-installation docker-build docker-build-multiplatform docker-run docker-build-vllm docker-run-vllm docker-build-sglang docker-run-sglang docker-run-impl help validate lint docker-build-diffusers docker-run-diffusers vllm-metal-build vllm-metal-install vllm-metal-dev vllm-metal-clean
# Default target
.DEFAULT_GOAL := build

@@ -144,7 +144,94 @@ docker-run-impl:
DEBUG="${DEBUG}" \
scripts/docker-run.sh

# Show help
# vllm-metal (macOS ARM64 only, requires Python 3.12 for wheel compatibility)
VLLM_METAL_RELEASE ?= v0.1.0-20260126-121650
VLLM_METAL_INSTALL_DIR := $(HOME)/.docker/model-runner/vllm-metal
VLLM_METAL_TARBALL := vllm-metal-macos-arm64-$(VLLM_METAL_RELEASE).tar.gz

vllm-metal-build:
@if [ -f "$(VLLM_METAL_TARBALL)" ]; then \
echo "Tarball already exists: $(VLLM_METAL_TARBALL)"; \
else \
echo "Building vllm-metal tarball..."; \
scripts/build-vllm-metal-tarball.sh $(VLLM_METAL_RELEASE) $(VLLM_METAL_TARBALL); \
echo "Tarball created: $(VLLM_METAL_TARBALL)"; \
fi

vllm-metal-install:
@VERSION_FILE="$(VLLM_METAL_INSTALL_DIR)/.vllm-metal-version"; \
if [ -f "$$VERSION_FILE" ] && [ "$$(cat "$$VERSION_FILE")" = "$(VLLM_METAL_RELEASE)" ]; then \
echo "vllm-metal $(VLLM_METAL_RELEASE) already installed"; \
exit 0; \
fi; \
if [ ! -f "$(VLLM_METAL_TARBALL)" ]; then \
echo "Error: $(VLLM_METAL_TARBALL) not found. Run 'make vllm-metal-build' first."; \
exit 1; \
fi; \
echo "Installing vllm-metal to $(VLLM_METAL_INSTALL_DIR)..."; \
PYTHON_BIN=""; \
if command -v python3.12 >/dev/null 2>&1; then \
PYTHON_BIN="python3.12"; \
elif command -v python3 >/dev/null 2>&1; then \
version=$$(python3 --version 2>&1 | grep -oE '[0-9]+\.[0-9]+'); \
if [ "$$version" = "3.12" ]; then \
PYTHON_BIN="python3"; \
fi; \
fi; \
if [ -z "$$PYTHON_BIN" ]; then \
echo "Error: Python 3.12 required (vllm-metal wheel is built for cp312)"; \
echo "Install with: brew install python@3.12"; \
exit 1; \
fi; \
echo "Using Python 3.12 from $$(which $$PYTHON_BIN)"; \
rm -rf "$(VLLM_METAL_INSTALL_DIR)"; \
$$PYTHON_BIN -m venv "$(VLLM_METAL_INSTALL_DIR)"; \
SITE_PACKAGES="$(VLLM_METAL_INSTALL_DIR)/lib/python3.12/site-packages"; \
mkdir -p "$$SITE_PACKAGES"; \
tar -xzf "$(VLLM_METAL_TARBALL)" -C "$$SITE_PACKAGES"; \
echo "$(VLLM_METAL_RELEASE)" > "$$VERSION_FILE"; \
echo "vllm-metal $(VLLM_METAL_RELEASE) installed successfully!"

vllm-metal-dev:
@if [ -z "$(VLLM_METAL_PATH)" ]; then \
echo "Usage: make vllm-metal-dev VLLM_METAL_PATH=../vllm-metal"; \
exit 1; \
fi
@PYTHON_BIN=""; \
if command -v python3.12 >/dev/null 2>&1; then \
PYTHON_BIN="python3.12"; \
elif command -v python3 >/dev/null 2>&1; then \
version=$$(python3 --version 2>&1 | grep -oE '[0-9]+\.[0-9]+'); \
if [ "$$version" = "3.12" ]; then \
PYTHON_BIN="python3"; \
fi; \
fi; \
if [ -z "$$PYTHON_BIN" ]; then \
echo "Error: Python 3.12 required"; \
echo "Install with: brew install python@3.12"; \
exit 1; \
fi; \
echo "Installing vllm-metal from $(VLLM_METAL_PATH)..."; \
rm -rf "$(VLLM_METAL_INSTALL_DIR)"; \
$$PYTHON_BIN -m venv "$(VLLM_METAL_INSTALL_DIR)"; \
. "$(VLLM_METAL_INSTALL_DIR)/bin/activate" && \
VLLM_VERSION="0.13.0" && \
WORK_DIR=$$(mktemp -d) && \
curl -fsSL -o "$$WORK_DIR/vllm.tar.gz" "https://github.com/vllm-project/vllm/releases/download/v$$VLLM_VERSION/vllm-$$VLLM_VERSION.tar.gz" && \
tar -xzf "$$WORK_DIR/vllm.tar.gz" -C "$$WORK_DIR" && \
pip install -r "$$WORK_DIR/vllm-$$VLLM_VERSION/requirements/cpu.txt" && \
pip install -e "$(VLLM_METAL_PATH)" && \
pip install -r "$$WORK_DIR/vllm-$$VLLM_VERSION/requirements/common.txt" && \
rm -rf "$$WORK_DIR" && \
echo "dev" > "$(VLLM_METAL_INSTALL_DIR)/.vllm-metal-version"; \
echo "vllm-metal dev installed from $(VLLM_METAL_PATH)"

vllm-metal-clean:
@echo "Removing vllm-metal installation and build artifacts..."
rm -rf "$(VLLM_METAL_INSTALL_DIR)"
rm -f $(VLLM_METAL_TARBALL)
@echo "vllm-metal cleaned!"

help:
@echo "Available targets:"
@echo " build - Build the Go application"
@@ -164,6 +251,10 @@ help:
@echo " docker-run-sglang - Run SGLang Docker container"
@echo " docker-build-diffusers - Build Diffusers Docker image"
@echo " docker-run-diffusers - Run Diffusers Docker container"
@echo " vllm-metal-build - Build vllm-metal tarball locally (macOS ARM64)"
@echo " vllm-metal-install - Install vllm-metal from local tarball"
@echo " vllm-metal-dev - Install vllm-metal from local source (editable)"
@echo " vllm-metal-clean - Clean vllm-metal installation and tarball"
@echo " help - Show this help message"
@echo ""
@echo "Backend configuration options:"
@@ -174,3 +265,11 @@ help:
@echo " make run LLAMA_ARGS=\"--verbose --jinja -ngl 999 --ctx-size 2048\""
@echo " make run LOCAL_LLAMA=1"
@echo " make docker-run LLAMA_ARGS=\"--verbose --jinja -ngl 999 --threads 4 --ctx-size 2048\""
@echo ""
@echo "vllm-metal (macOS ARM64 only, requires Python 3.12):"
@echo " 1. Auto-pull from Docker Hub (clean dev installs first: make vllm-metal-clean):"
@echo " make run"
@echo " 2. Build and install from tarball:"
@echo " make vllm-metal-build && make vllm-metal-install && make run"
@echo " 3. Install from local source (for development):"
@echo " make vllm-metal-dev VLLM_METAL_PATH=../vllm-metal && make run"
4 changes: 4 additions & 0 deletions cmd/cli/desktop/api.go
@@ -34,6 +34,10 @@ type OpenAIChatResponse struct {
Role string `json:"role,omitempty"`
ReasoningContent string `json:"reasoning_content,omitempty"`
} `json:"delta"`
Message struct {
Content string `json:"content"`
Role string `json:"role,omitempty"`
} `json:"message"`
Index int `json:"index"`
FinishReason string `json:"finish_reason"`
} `json:"choices"`
155 changes: 106 additions & 49 deletions cmd/cli/desktop/desktop.go
@@ -440,74 +440,131 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
TotalTokens int `json:"total_tokens"`
}

scanner := bufio.NewScanner(resp.Body)
for scanner.Scan() {
// Check if context was cancelled
select {
case <-ctx.Done():
return assistantResponse.String(), ctx.Err()
default:
}
// Read the first line to detect if this is SSE streaming or a regular JSON response
reader := bufio.NewReader(resp.Body)
firstLine, err := reader.ReadString('\n')
if err != nil && !errors.Is(err, io.EOF) {
Reviewer comment (Contributor): What would happen in case of error, and the error not being io.EOF?
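// Note: bufio.Reader.ReadString returns the bytes read before the error
// together with the error itself; io.EOF here only means the body ended
// without a trailing newline, so any other read error is surfaced to the
// caller via the return below.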

return assistantResponse.String(), fmt.Errorf("error reading response: %w", err)
}
firstLine = strings.TrimSpace(firstLine)

line := scanner.Text()
if line == "" {
continue
// Check if this is a non-streaming JSON response (doesn't start with "data: ")
Reviewer comment (Contributor): I was wondering if we could use the headers to tell whether this is a streaming response. At this point we don't have the headers available, but maybe we could check them in an earlier step: if the response has Content-Type: text/event-stream, it is streaming; otherwise it is not. That would reduce a lot of the logic in here, I guess.
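A minimal sketch of the header-based check the reviewer suggests, not part of this PR; it assumes resp is the *http.Response already in scope and that the backend sets Content-Type reliably:

// isSSE reports whether a response is Server-Sent Events, judged from its
// Content-Type header (hypothetical helper, shown for illustration only).
func isSSE(resp *http.Response) bool {
	return strings.HasPrefix(resp.Header.Get("Content-Type"), "text/event-stream")
}

The body-sniffing approach taken in this PR avoids relying on that header, at the cost of the first-line bookkeeping below.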

if firstLine != "" && !strings.HasPrefix(firstLine, "data: ") {
// This might be a regular JSON response - read the rest and try to parse it
restOfBody, readErr := io.ReadAll(reader)
if readErr != nil {
return assistantResponse.String(), fmt.Errorf("error reading response body: %w", readErr)
}
fullBody := firstLine + string(restOfBody)

if !strings.HasPrefix(line, "data: ") {
continue
var nonStreamResp OpenAIChatResponse
if err := json.Unmarshal([]byte(fullBody), &nonStreamResp); err != nil {
return assistantResponse.String(), fmt.Errorf("error parsing response: %w", err)
}

data := strings.TrimPrefix(line, "data: ")

if data == "[DONE]" {
break
// Extract content from non-streaming response
if len(nonStreamResp.Choices) > 0 && nonStreamResp.Choices[0].Message.Content != "" {
content := nonStreamResp.Choices[0].Message.Content
outputFunc(content)
assistantResponse.WriteString(content)
}

var streamResp OpenAIChatResponse
if err := json.Unmarshal([]byte(data), &streamResp); err != nil {
return assistantResponse.String(), fmt.Errorf("error parsing stream response: %w", err)
if nonStreamResp.Usage != nil {
finalUsage = nonStreamResp.Usage
}

if streamResp.Usage != nil {
finalUsage = streamResp.Usage
} else {
// SSE streaming response - process line by line
scanner := bufio.NewScanner(reader)

// Process the first line if it was SSE data
if strings.HasPrefix(firstLine, "data: ") {
data := strings.TrimPrefix(firstLine, "data: ")
if data != "[DONE]" {
var streamResp OpenAIChatResponse
if err := json.Unmarshal([]byte(data), &streamResp); err == nil {
if streamResp.Usage != nil {
finalUsage = streamResp.Usage
}
if len(streamResp.Choices) > 0 {
if streamResp.Choices[0].Delta.Content != "" {
chunk := streamResp.Choices[0].Delta.Content
printerState = chatPrinterContent
outputFunc(chunk)
assistantResponse.WriteString(chunk)
}
}
}
}
}

if len(streamResp.Choices) > 0 {
if streamResp.Choices[0].Delta.ReasoningContent != "" {
chunk := streamResp.Choices[0].Delta.ReasoningContent
if printerState == chatPrinterContent {
outputFunc("\n\n")
}
if printerState != chatPrinterReasoning {
const thinkingHeader = "Thinking:\n"
for scanner.Scan() {
// Check if context was cancelled
select {
case <-ctx.Done():
return assistantResponse.String(), ctx.Err()
default:
}

line := scanner.Text()
if line == "" {
continue
}

if !strings.HasPrefix(line, "data: ") {
continue
}

data := strings.TrimPrefix(line, "data: ")

if data == "[DONE]" {
break
}

var streamResp OpenAIChatResponse
if err := json.Unmarshal([]byte(data), &streamResp); err != nil {
return assistantResponse.String(), fmt.Errorf("error parsing stream response: %w", err)
}

if streamResp.Usage != nil {
finalUsage = streamResp.Usage
}

if len(streamResp.Choices) > 0 {
if streamResp.Choices[0].Delta.ReasoningContent != "" {
chunk := streamResp.Choices[0].Delta.ReasoningContent
if printerState == chatPrinterContent {
outputFunc("\n\n")
}
if printerState != chatPrinterReasoning {
const thinkingHeader = "Thinking:\n"
if reasoningFmt != nil {
reasoningFmt.Print(thinkingHeader)
} else {
outputFunc(thinkingHeader)
}
}
printerState = chatPrinterReasoning
if reasoningFmt != nil {
reasoningFmt.Print(thinkingHeader)
reasoningFmt.Print(chunk)
} else {
outputFunc(thinkingHeader)
outputFunc(chunk)
}
}
printerState = chatPrinterReasoning
if reasoningFmt != nil {
reasoningFmt.Print(chunk)
} else {
if streamResp.Choices[0].Delta.Content != "" {
chunk := streamResp.Choices[0].Delta.Content
if printerState == chatPrinterReasoning {
outputFunc("\n\n--\n\n")
}
printerState = chatPrinterContent
outputFunc(chunk)
assistantResponse.WriteString(chunk)
}
}
if streamResp.Choices[0].Delta.Content != "" {
chunk := streamResp.Choices[0].Delta.Content
if printerState == chatPrinterReasoning {
outputFunc("\n\n--\n\n")
}
printerState = chatPrinterContent
outputFunc(chunk)
assistantResponse.WriteString(chunk)
}
}
}

if err := scanner.Err(); err != nil {
return assistantResponse.String(), fmt.Errorf("error reading response stream: %w", err)
if err := scanner.Err(); err != nil {
return assistantResponse.String(), fmt.Errorf("error reading response stream: %w", err)
}
}

if finalUsage != nil {
23 changes: 23 additions & 0 deletions main.go
@@ -19,8 +19,10 @@ import (
"github.com/docker/model-runner/pkg/inference/backends/mlx"
"github.com/docker/model-runner/pkg/inference/backends/sglang"
"github.com/docker/model-runner/pkg/inference/backends/vllm"
"github.com/docker/model-runner/pkg/inference/backends/vllmmetal"
"github.com/docker/model-runner/pkg/inference/config"
"github.com/docker/model-runner/pkg/inference/models"
"github.com/docker/model-runner/pkg/inference/platform"
"github.com/docker/model-runner/pkg/inference/scheduling"
"github.com/docker/model-runner/pkg/metrics"
"github.com/docker/model-runner/pkg/middleware"
@@ -85,6 +87,7 @@ func main() {
sglangServerPath := os.Getenv("SGLANG_SERVER_PATH")
mlxServerPath := os.Getenv("MLX_SERVER_PATH")
diffusersServerPath := os.Getenv("DIFFUSERS_SERVER_PATH")
vllmMetalServerPath := os.Getenv("VLLM_METAL_SERVER_PATH")

// Create a proxy-aware HTTP transport
// Use a safe type assertion with fallback, and explicitly set Proxy to http.ProxyFromEnvironment
@@ -117,6 +120,9 @@ func main() {
if mlxServerPath != "" {
log.Infof("MLX_SERVER_PATH: %s", mlxServerPath)
}
if vllmMetalServerPath != "" {
log.Infof("VLLM_METAL_SERVER_PATH: %s", vllmMetalServerPath)
}

// Create llama.cpp configuration from environment variables
llamaCppConfig := createLlamaCppConfigFromEnv()
@@ -177,6 +183,19 @@ func main() {
log.Fatalf("unable to initialize diffusers backend: %v", err)
}

var vllmMetalBackend inference.Backend
if platform.SupportsVLLMMetal() {
vllmMetalBackend, err = vllmmetal.New(
log,
modelManager,
log.WithFields(logrus.Fields{"component": vllmmetal.Name}),
vllmMetalServerPath,
)
if err != nil {
log.Warnf("Failed to initialize vllm-metal backend: %v", err)
}
}

backends := map[string]inference.Backend{
llamacpp.Name: llamaCppBackend,
mlx.Name: mlxBackend,
@@ -185,6 +204,10 @@ func main() {
}
registerVLLMBackend(backends, vllmBackend)

if vllmMetalBackend != nil {
backends[vllmmetal.Name] = vllmMetalBackend
}

scheduler := scheduling.NewScheduler(
log,
backends,