4 changes: 4 additions & 0 deletions .gitignore
@@ -11,4 +11,8 @@ vendor/
llamacpp/build
llamacpp/install

# vllm-metal build artifacts
.vllm-metal/
vllm-metal-macos-arm64-*.tar.gz

.DS_Store
103 changes: 101 additions & 2 deletions Makefile
@@ -26,7 +26,7 @@ DOCKER_BUILD_ARGS := \
BUILD_DMR ?= 1

# Main targets
.PHONY: build run clean test integration-tests test-docker-ce-installation docker-build docker-build-multiplatform docker-run docker-build-vllm docker-run-vllm docker-build-sglang docker-run-sglang docker-run-impl help validate lint docker-build-diffusers docker-run-diffusers
.PHONY: build run clean test integration-tests test-docker-ce-installation docker-build docker-build-multiplatform docker-run docker-build-vllm docker-run-vllm docker-build-sglang docker-run-sglang docker-run-impl help validate lint docker-build-diffusers docker-run-diffusers vllm-metal-build vllm-metal-install vllm-metal-dev vllm-metal-clean
# Default target
.DEFAULT_GOAL := build

@@ -144,7 +144,94 @@ docker-run-impl:
DEBUG="${DEBUG}" \
scripts/docker-run.sh

# Show help
# vllm-metal (macOS ARM64 only, requires Python 3.12 for wheel compatibility)
VLLM_METAL_RELEASE ?= v0.1.0-20260126-121650
VLLM_METAL_INSTALL_DIR := $(HOME)/.docker/model-runner/vllm-metal
VLLM_METAL_TARBALL := vllm-metal-macos-arm64-$(VLLM_METAL_RELEASE).tar.gz

vllm-metal-build:
@if [ -f "$(VLLM_METAL_TARBALL)" ]; then \
echo "Tarball already exists: $(VLLM_METAL_TARBALL)"; \
else \
echo "Building vllm-metal tarball..."; \
scripts/build-vllm-metal-tarball.sh $(VLLM_METAL_RELEASE) $(VLLM_METAL_TARBALL); \
echo "Tarball created: $(VLLM_METAL_TARBALL)"; \
fi

vllm-metal-install:
@VERSION_FILE="$(VLLM_METAL_INSTALL_DIR)/.vllm-metal-version"; \
if [ -f "$$VERSION_FILE" ] && [ "$$(cat "$$VERSION_FILE")" = "$(VLLM_METAL_RELEASE)" ]; then \
echo "vllm-metal $(VLLM_METAL_RELEASE) already installed"; \
exit 0; \
fi; \
if [ ! -f "$(VLLM_METAL_TARBALL)" ]; then \
echo "Error: $(VLLM_METAL_TARBALL) not found. Run 'make vllm-metal-build' first."; \
exit 1; \
fi; \
echo "Installing vllm-metal to $(VLLM_METAL_INSTALL_DIR)..."; \
PYTHON_BIN=""; \
if command -v python3.12 >/dev/null 2>&1; then \
PYTHON_BIN="python3.12"; \
elif command -v python3 >/dev/null 2>&1; then \
version=$$(python3 --version 2>&1 | grep -oE '[0-9]+\.[0-9]+'); \
if [ "$$version" = "3.12" ]; then \
PYTHON_BIN="python3"; \
fi; \
fi; \
if [ -z "$$PYTHON_BIN" ]; then \
echo "Error: Python 3.12 required (vllm-metal wheel is built for cp312)"; \
echo "Install with: brew install python@3.12"; \
exit 1; \
fi; \
echo "Using Python 3.12 from $$(which $$PYTHON_BIN)"; \
rm -rf "$(VLLM_METAL_INSTALL_DIR)"; \
$$PYTHON_BIN -m venv "$(VLLM_METAL_INSTALL_DIR)"; \
SITE_PACKAGES="$(VLLM_METAL_INSTALL_DIR)/lib/python3.12/site-packages"; \
mkdir -p "$$SITE_PACKAGES"; \
tar -xzf "$(VLLM_METAL_TARBALL)" -C "$$SITE_PACKAGES"; \
echo "$(VLLM_METAL_RELEASE)" > "$$VERSION_FILE"; \
echo "vllm-metal $(VLLM_METAL_RELEASE) installed successfully!"

vllm-metal-dev:
@if [ -z "$(VLLM_METAL_PATH)" ]; then \
echo "Usage: make vllm-metal-dev VLLM_METAL_PATH=../vllm-metal"; \
exit 1; \
fi
@PYTHON_BIN=""; \
if command -v python3.12 >/dev/null 2>&1; then \
PYTHON_BIN="python3.12"; \
elif command -v python3 >/dev/null 2>&1; then \
version=$$(python3 --version 2>&1 | grep -oE '[0-9]+\.[0-9]+'); \
if [ "$$version" = "3.12" ]; then \
PYTHON_BIN="python3"; \
fi; \
fi; \
if [ -z "$$PYTHON_BIN" ]; then \
echo "Error: Python 3.12 required"; \
echo "Install with: brew install python@3.12"; \
exit 1; \
fi; \
echo "Installing vllm-metal from $(VLLM_METAL_PATH)..."; \
rm -rf "$(VLLM_METAL_INSTALL_DIR)"; \
$$PYTHON_BIN -m venv "$(VLLM_METAL_INSTALL_DIR)"; \
. "$(VLLM_METAL_INSTALL_DIR)/bin/activate" && \
VLLM_VERSION="0.13.0" && \
WORK_DIR=$$(mktemp -d) && \
curl -fsSL -o "$$WORK_DIR/vllm.tar.gz" "https://github.com/vllm-project/vllm/releases/download/v$$VLLM_VERSION/vllm-$$VLLM_VERSION.tar.gz" && \
tar -xzf "$$WORK_DIR/vllm.tar.gz" -C "$$WORK_DIR" && \
pip install -r "$$WORK_DIR/vllm-$$VLLM_VERSION/requirements/cpu.txt" && \
pip install -e "$(VLLM_METAL_PATH)" && \
pip install -r "$$WORK_DIR/vllm-$$VLLM_VERSION/requirements/common.txt" && \
rm -rf "$$WORK_DIR" && \
echo "dev" > "$(VLLM_METAL_INSTALL_DIR)/.vllm-metal-version"; \
echo "vllm-metal dev installed from $(VLLM_METAL_PATH)"

vllm-metal-clean:
@echo "Removing vllm-metal installation and build artifacts..."
rm -rf "$(VLLM_METAL_INSTALL_DIR)"
rm -f $(VLLM_METAL_TARBALL)
@echo "vllm-metal cleaned!"

help:
@echo "Available targets:"
@echo " build - Build the Go application"
@@ -164,6 +251,10 @@ help:
@echo " docker-run-sglang - Run SGLang Docker container"
@echo " docker-build-diffusers - Build Diffusers Docker image"
@echo " docker-run-diffusers - Run Diffusers Docker container"
@echo " vllm-metal-build - Build vllm-metal tarball locally (macOS ARM64)"
@echo " vllm-metal-install - Install vllm-metal from local tarball"
@echo " vllm-metal-dev - Install vllm-metal from local source (editable)"
@echo " vllm-metal-clean - Clean vllm-metal installation and tarball"
@echo " help - Show this help message"
@echo ""
@echo "Backend configuration options:"
@@ -174,3 +265,11 @@ help:
@echo " make run LLAMA_ARGS=\"--verbose --jinja -ngl 999 --ctx-size 2048\""
@echo " make run LOCAL_LLAMA=1"
@echo " make docker-run LLAMA_ARGS=\"--verbose --jinja -ngl 999 --threads 4 --ctx-size 2048\""
@echo ""
@echo "vllm-metal (macOS ARM64 only, requires Python 3.12):"
@echo " 1. Auto-pull from Docker Hub (clean dev installs first: make vllm-metal-clean):"
@echo " make run"
@echo " 2. Build and install from tarball:"
@echo " make vllm-metal-build && make vllm-metal-install && make run"
@echo " 3. Install from local source (for development):"
@echo " make vllm-metal-dev VLLM_METAL_PATH=../vllm-metal && make run"
4 changes: 4 additions & 0 deletions cmd/cli/desktop/api.go
@@ -34,6 +34,10 @@ type OpenAIChatResponse struct {
Role string `json:"role,omitempty"`
ReasoningContent string `json:"reasoning_content,omitempty"`
} `json:"delta"`
Message struct {
Content string `json:"content"`
Role string `json:"role,omitempty"`
} `json:"message"`
Index int `json:"index"`
FinishReason string `json:"finish_reason"`
} `json:"choices"`
155 changes: 106 additions & 49 deletions cmd/cli/desktop/desktop.go
@@ -440,74 +440,131 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
TotalTokens int `json:"total_tokens"`
}

scanner := bufio.NewScanner(resp.Body)
for scanner.Scan() {
// Check if context was cancelled
select {
case <-ctx.Done():
return assistantResponse.String(), ctx.Err()
default:
}
// Read the first line to detect if this is SSE streaming or a regular JSON response
reader := bufio.NewReader(resp.Body)
firstLine, err := reader.ReadString('\n')
if err != nil && !errors.Is(err, io.EOF) {
Reviewer comment (Contributor): What would happen in case of error, and the error not being io.EOF?
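// Note: bufio.Reader.ReadString returns the bytes read before the error
// together with the error itself; io.EOF here only means the body ended
// without a trailing newline, so any other read error is surfaced to the
// caller via the return below.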

return assistantResponse.String(), fmt.Errorf("error reading response: %w", err)
}
firstLine = strings.TrimSpace(firstLine)

line := scanner.Text()
if line == "" {
continue
// Check if this is a non-streaming JSON response (doesn't start with "data: ")
Reviewer comment (Contributor): I was wondering if we could use the headers to tell whether this is a streaming response. At this point we don't have the headers available, but maybe we could check them in an earlier step: if the response has Content-Type: text/event-stream, it is streaming; otherwise it is not. That would reduce a lot of the logic in here, I guess.
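A minimal sketch of the header-based check the reviewer suggests, not part of this PR; it assumes resp is the *http.Response already in scope and that the backend sets Content-Type reliably:

// isSSE reports whether a response is Server-Sent Events, judged from its
// Content-Type header (hypothetical helper, shown for illustration only).
func isSSE(resp *http.Response) bool {
	return strings.HasPrefix(resp.Header.Get("Content-Type"), "text/event-stream")
}

The body-sniffing approach taken in this PR avoids relying on that header, at the cost of the first-line bookkeeping below.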

if firstLine != "" && !strings.HasPrefix(firstLine, "data: ") {
// This might be a regular JSON response - read the rest and try to parse it
restOfBody, readErr := io.ReadAll(reader)
if readErr != nil {
return assistantResponse.String(), fmt.Errorf("error reading response body: %w", readErr)
}
fullBody := firstLine + string(restOfBody)

if !strings.HasPrefix(line, "data: ") {
continue
var nonStreamResp OpenAIChatResponse
if err := json.Unmarshal([]byte(fullBody), &nonStreamResp); err != nil {
return assistantResponse.String(), fmt.Errorf("error parsing response: %w", err)
}

data := strings.TrimPrefix(line, "data: ")

if data == "[DONE]" {
break
// Extract content from non-streaming response
if len(nonStreamResp.Choices) > 0 && nonStreamResp.Choices[0].Message.Content != "" {
content := nonStreamResp.Choices[0].Message.Content
outputFunc(content)
assistantResponse.WriteString(content)
}

var streamResp OpenAIChatResponse
if err := json.Unmarshal([]byte(data), &streamResp); err != nil {
return assistantResponse.String(), fmt.Errorf("error parsing stream response: %w", err)
if nonStreamResp.Usage != nil {
finalUsage = nonStreamResp.Usage
}

if streamResp.Usage != nil {
finalUsage = streamResp.Usage
} else {
// SSE streaming response - process line by line
scanner := bufio.NewScanner(reader)

// Process the first line if it was SSE data
if strings.HasPrefix(firstLine, "data: ") {
data := strings.TrimPrefix(firstLine, "data: ")
if data != "[DONE]" {
var streamResp OpenAIChatResponse
if err := json.Unmarshal([]byte(data), &streamResp); err == nil {
if streamResp.Usage != nil {
finalUsage = streamResp.Usage
}
if len(streamResp.Choices) > 0 {
if streamResp.Choices[0].Delta.Content != "" {
chunk := streamResp.Choices[0].Delta.Content
printerState = chatPrinterContent
outputFunc(chunk)
assistantResponse.WriteString(chunk)
}
}
}
}
}

if len(streamResp.Choices) > 0 {
if streamResp.Choices[0].Delta.ReasoningContent != "" {
chunk := streamResp.Choices[0].Delta.ReasoningContent
if printerState == chatPrinterContent {
outputFunc("\n\n")
}
if printerState != chatPrinterReasoning {
const thinkingHeader = "Thinking:\n"
for scanner.Scan() {
// Check if context was cancelled
select {
case <-ctx.Done():
return assistantResponse.String(), ctx.Err()
default:
}

line := scanner.Text()
if line == "" {
continue
}

if !strings.HasPrefix(line, "data: ") {
continue
}

data := strings.TrimPrefix(line, "data: ")

if data == "[DONE]" {
break
}

var streamResp OpenAIChatResponse
if err := json.Unmarshal([]byte(data), &streamResp); err != nil {
return assistantResponse.String(), fmt.Errorf("error parsing stream response: %w", err)
}

if streamResp.Usage != nil {
finalUsage = streamResp.Usage
}

if len(streamResp.Choices) > 0 {
if streamResp.Choices[0].Delta.ReasoningContent != "" {
chunk := streamResp.Choices[0].Delta.ReasoningContent
if printerState == chatPrinterContent {
outputFunc("\n\n")
}
if printerState != chatPrinterReasoning {
const thinkingHeader = "Thinking:\n"
if reasoningFmt != nil {
reasoningFmt.Print(thinkingHeader)
} else {
outputFunc(thinkingHeader)
}
}
printerState = chatPrinterReasoning
if reasoningFmt != nil {
reasoningFmt.Print(thinkingHeader)
reasoningFmt.Print(chunk)
} else {
outputFunc(thinkingHeader)
outputFunc(chunk)
}
}
printerState = chatPrinterReasoning
if reasoningFmt != nil {
reasoningFmt.Print(chunk)
} else {
if streamResp.Choices[0].Delta.Content != "" {
chunk := streamResp.Choices[0].Delta.Content
if printerState == chatPrinterReasoning {
outputFunc("\n\n--\n\n")
}
printerState = chatPrinterContent
outputFunc(chunk)
assistantResponse.WriteString(chunk)
}
}
if streamResp.Choices[0].Delta.Content != "" {
chunk := streamResp.Choices[0].Delta.Content
if printerState == chatPrinterReasoning {
outputFunc("\n\n--\n\n")
}
printerState = chatPrinterContent
outputFunc(chunk)
assistantResponse.WriteString(chunk)
}
}
}

if err := scanner.Err(); err != nil {
return assistantResponse.String(), fmt.Errorf("error reading response stream: %w", err)
if err := scanner.Err(); err != nil {
return assistantResponse.String(), fmt.Errorf("error reading response stream: %w", err)
}
}

if finalUsage != nil {
23 changes: 23 additions & 0 deletions main.go
@@ -19,8 +19,10 @@ import (
"github.com/docker/model-runner/pkg/inference/backends/mlx"
"github.com/docker/model-runner/pkg/inference/backends/sglang"
"github.com/docker/model-runner/pkg/inference/backends/vllm"
"github.com/docker/model-runner/pkg/inference/backends/vllmmetal"
"github.com/docker/model-runner/pkg/inference/config"
"github.com/docker/model-runner/pkg/inference/models"
"github.com/docker/model-runner/pkg/inference/platform"
"github.com/docker/model-runner/pkg/inference/scheduling"
"github.com/docker/model-runner/pkg/metrics"
"github.com/docker/model-runner/pkg/middleware"
@@ -85,6 +87,7 @@ func main() {
sglangServerPath := os.Getenv("SGLANG_SERVER_PATH")
mlxServerPath := os.Getenv("MLX_SERVER_PATH")
diffusersServerPath := os.Getenv("DIFFUSERS_SERVER_PATH")
vllmMetalServerPath := os.Getenv("VLLM_METAL_SERVER_PATH")

// Create a proxy-aware HTTP transport
// Use a safe type assertion with fallback, and explicitly set Proxy to http.ProxyFromEnvironment
@@ -117,6 +120,9 @@ func main() {
if mlxServerPath != "" {
log.Infof("MLX_SERVER_PATH: %s", mlxServerPath)
}
if vllmMetalServerPath != "" {
log.Infof("VLLM_METAL_SERVER_PATH: %s", vllmMetalServerPath)
}

// Create llama.cpp configuration from environment variables
llamaCppConfig := createLlamaCppConfigFromEnv()
@@ -177,6 +183,19 @@ func main() {
log.Fatalf("unable to initialize diffusers backend: %v", err)
}

var vllmMetalBackend inference.Backend
if platform.SupportsVLLMMetal() {
vllmMetalBackend, err = vllmmetal.New(
log,
modelManager,
log.WithFields(logrus.Fields{"component": vllmmetal.Name}),
vllmMetalServerPath,
)
if err != nil {
log.Warnf("Failed to initialize vllm-metal backend: %v", err)
}
}

backends := map[string]inference.Backend{
llamacpp.Name: llamaCppBackend,
mlx.Name: mlxBackend,
@@ -185,6 +204,10 @@ func main() {
}
registerVLLMBackend(backends, vllmBackend)

if vllmMetalBackend != nil {
backends[vllmmetal.Name] = vllmMetalBackend
}

scheduler := scheduling.NewScheduler(
log,
backends,