Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions cmd/cli/commands/install-runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/docker/model-runner/pkg/inference/backends/diffusers"
"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
"github.com/docker/model-runner/pkg/inference/backends/vllm"
"github.com/docker/model-runner/pkg/inference/backends/vllmmetal"
"github.com/spf13/cobra"
)

Expand All @@ -28,7 +29,7 @@ const (
// installation will try to reach the model runner while waiting for it to
// be ready.
installWaitRetryInterval = 500 * time.Millisecond
backendUsage = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "). Default: " + llamacpp.Name
backendUsage = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "|" + vllmmetal.Name + "). Default: " + llamacpp.Name
)

// waitForStandaloneRunnerAfterInstall waits for a standalone model runner
Expand Down Expand Up @@ -237,6 +238,17 @@ type runnerOptions struct {

// runInstallOrStart is shared logic for install-runner and start-runner commands
func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error {
// vllm-metal is installed on-demand via the running model runner,
// not as a standalone container. This applies to all engine kinds.
if opts.backend == vllmmetal.Name {
cmd.Println("Installing vllm-metal backend...")
if err := desktopClient.InstallBackend(vllmmetal.Name); err != nil {
return fmt.Errorf("failed to install vllm-metal backend: %w", err)
}
cmd.Println("vllm-metal backend installed successfully")
return nil
}
Comment thread
doringeman marked this conversation as resolved.

var vllmOnWSL bool
// Ensure that we're running in a supported model runner context.
engineKind := modelRunner.EngineKind()
Expand Down Expand Up @@ -324,7 +336,7 @@ func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error
}

// Validate backend selection
validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name}
validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name, vllmmetal.Name}
if opts.backend != "" {
isValid := false
for _, valid := range validBackends {
Expand Down
76 changes: 73 additions & 3 deletions cmd/cli/desktop/desktop.go
Original file line number Diff line number Diff line change
Expand Up @@ -479,12 +479,58 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
TotalTokens int `json:"total_tokens"`
}

// Detect streaming vs non-streaming response via Content-Type header
// Use a buffered reader so we can consume server-sent progress
// lines (e.g. "Installing vllm-metal backend...") that arrive
// before the actual SSE or JSON inference response.
br := bufio.NewReader(resp.Body)

// Consume any plain-text progress lines that precede the real
// response. We peek ahead: if the next non-empty content starts
// with '{' (JSON) or "data:" / ":" (SSE), the progress section
// is over and we fall through to normal processing.
for {
peek, err := br.Peek(1)
if err != nil {
break
}
// JSON object or SSE stream — stop consuming progress lines.
if peek[0] == '{' || peek[0] == ':' {
break
}
line, err := br.ReadString('\n')
if err != nil && line == "" {
break
}
line = strings.TrimRight(line, "\r\n")
if line == "" {
continue
}
// SSE data line — stop, let the normal SSE parser handle it.
if strings.HasPrefix(line, "data:") {
// Put the line back by chaining a reader with the rest.
br = bufio.NewReader(io.MultiReader(
strings.NewReader(line+"\n"),
br,
))
break
}
// Progress message — print to stderr.
fmt.Fprintln(os.Stderr, line)
}

// Detect streaming vs non-streaming response. Because server-sent
// progress lines may have been flushed before the Content-Type was
// set, we also peek at the body content to detect SSE.
isStreaming := strings.HasPrefix(resp.Header.Get("Content-Type"), "text/event-stream")
if !isStreaming {
if peek, err := br.Peek(5); err == nil {
isStreaming = strings.HasPrefix(string(peek), "data:")
}
}

if !isStreaming {
// Non-streaming JSON response
body, err := io.ReadAll(resp.Body)
body, err := io.ReadAll(br)
if err != nil {
return assistantResponse.String(), fmt.Errorf("error reading response body: %w", err)
}
Expand All @@ -506,7 +552,7 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
}
} else {
// SSE streaming response - process line by line
scanner := bufio.NewScanner(resp.Body)
scanner := bufio.NewScanner(br)

for scanner.Scan() {
// Check if context was cancelled
Expand Down Expand Up @@ -782,6 +828,30 @@ func (c *Client) ShowConfigs(modelFilter string) ([]scheduling.ModelConfigEntry,
return configs, nil
}

// InstallBackend triggers on-demand installation of a deferred backend
// (e.g. vllm-metal) by POSTing to the model runner's install-backend
// endpoint. It blocks until the server reports success or failure.
func (c *Client) InstallBackend(backend string) error {
	endpoint := inference.InferencePrefix + "/install-backend"

	// Encode the request body: {"backend": "<name>"}.
	payload, err := json.Marshal(struct {
		Backend string `json:"backend"`
	}{Backend: backend})
	if err != nil {
		return fmt.Errorf("error marshaling request: %w", err)
	}

	resp, err := c.doRequest(http.MethodPost, endpoint, bytes.NewReader(payload))
	if err != nil {
		return c.handleQueryError(err, endpoint)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		return nil
	}

	// Include the server's error text to aid diagnosis; a failed read
	// simply yields an empty detail string.
	body, _ := io.ReadAll(resp.Body)
	return fmt.Errorf("install backend failed with status %s: %s", resp.Status, string(body))
}
Comment thread
doringeman marked this conversation as resolved.

func (c *Client) ConfigureBackend(request scheduling.ConfigureRequest) error {
configureBackendPath := inference.InferencePrefix + "/_configure"
jsonData, err := json.Marshal(request)
Expand Down
3 changes: 2 additions & 1 deletion cmd/cli/docs/reference/docker_model_install-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ plink: docker_model.yaml
options:
- option: backend
value_type: string
description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
description: |
Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
deprecated: false
hidden: false
experimental: false
Expand Down
3 changes: 2 additions & 1 deletion cmd/cli/docs/reference/docker_model_reinstall-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ plink: docker_model.yaml
options:
- option: backend
value_type: string
description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
description: |
Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
deprecated: false
hidden: false
experimental: false
Expand Down
3 changes: 2 additions & 1 deletion cmd/cli/docs/reference/docker_model_start-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ plink: docker_model.yaml
options:
- option: backend
value_type: string
description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
description: |
Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
deprecated: false
hidden: false
experimental: false
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/docs/reference/model_install-runner.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Install Docker Model Runner (Docker Engine only)

| Name | Type | Default | Description |
|:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp |
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp |
| `--debug` | `bool` | | Enable debug logging |
| `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner |
| `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) |
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/docs/reference/model_reinstall-runner.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Reinstall Docker Model Runner (Docker Engine only)

| Name | Type | Default | Description |
|:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp |
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp |
| `--debug` | `bool` | | Enable debug logging |
| `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner |
| `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) |
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/docs/reference/model_start-runner.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Start Docker Model Runner (Docker Engine only)

| Name | Type | Default | Description |
|:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp |
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp |
| `--debug` | `bool` | | Enable debug logging |
| `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner |
| `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) |
Expand Down
7 changes: 7 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,12 @@ func main() {
backends[vllmmetal.Name] = vllmMetalBackend
}

// Backends whose installation is deferred until explicitly requested.
var deferredBackends []string
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this defferedBackends idea 👏

if vllmMetalBackend != nil {
deferredBackends = append(deferredBackends, vllmmetal.Name)
}

scheduler := scheduling.NewScheduler(
log,
backends,
Expand All @@ -220,6 +226,7 @@ func main() {
"",
false,
),
deferredBackends,
)

// Create the HTTP handler for the scheduler
Expand Down
3 changes: 3 additions & 0 deletions pkg/inference/scheduling/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ const (
// enough to encompass any real-world request but also small enough to avoid
// DoS attacks.
maximumOpenAIInferenceRequestSize = 10 * 1024 * 1024

// modelCLIUserAgentPrefix is the user-agent prefix set by the model CLI.
modelCLIUserAgentPrefix = "docker-model-cli/"
)

// trimRequestPathToOpenAIRoot trims a request path to start at the first
Expand Down
66 changes: 65 additions & 1 deletion pkg/inference/scheduling/http_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ func (h *HTTPHandler) routeHandlers() map[string]http.HandlerFunc {
m["GET "+inference.InferencePrefix+"/v1/models"] = h.handleModels
m["GET "+inference.InferencePrefix+"/v1/models/{name...}"] = h.handleModels

m["POST "+inference.InferencePrefix+"/install-backend"] = h.InstallBackend
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we use only a POST request to /engines to make the API more RESTful, or keep verbs in the path like we do for actions such as ps and df?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could... do you want to do it in this PR? Or start an initiative for re-reviewing the whole API?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, sorry for not being clear. No need to change anything, I was just wondering what would be better since we follow both patterns. I usually prefer a RESTful API, but I’m also perfectly fine using verbs in the URL.

m["GET "+inference.InferencePrefix+"/status"] = h.GetBackendStatus
m["GET "+inference.InferencePrefix+"/ps"] = h.GetRunningBackends
m["GET "+inference.InferencePrefix+"/df"] = h.GetDiskUsage
Expand Down Expand Up @@ -197,11 +198,28 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
backend = h.scheduler.selectBackendForModel(model, backend, request.Model)
}

// If a deferred backend needs on-demand installation and the request
// comes from the model CLI, stream progress messages so the user sees
// what is happening while the download runs.
autoInstall := h.scheduler.installer.deferredBackends[backend.Name()] &&
!h.scheduler.installer.isInstalled(backend.Name()) &&
strings.Contains(r.UserAgent(), modelCLIUserAgentPrefix)
if autoInstall {
fmt.Fprintf(w, "Installing %s backend...\n", backend.Name())
if f, ok := w.(http.Flusher); ok {
f.Flush()
}
}

// Wait for the corresponding backend installation to complete or fail. We
// don't allow any requests to be scheduled for a backend until it has
// completed installation.
if err := h.scheduler.installer.wait(r.Context(), backend.Name()); err != nil {
if errors.Is(err, ErrBackendNotFound) {
if autoInstall {
// Headers are already sent (200 OK) from the progress
// line, so we can only write the error as plain text.
fmt.Fprintf(w, "backend installation failed: %v\n", err)
} else if errors.Is(err, ErrBackendNotFound) {
http.Error(w, err.Error(), http.StatusNotFound)
} else if errors.Is(err, errInstallerNotStarted) {
http.Error(w, err.Error(), http.StatusServiceUnavailable)
Expand All @@ -211,6 +229,8 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
// shutting down (since that will also cancel the request context).
// Either way, provide a response, even if it's ignored.
http.Error(w, "service unavailable", http.StatusServiceUnavailable)
} else if errors.Is(err, errBackendNotInstalled) {
http.Error(w, fmt.Sprintf("backend %q is not installed; run: docker model install-runner --backend %s", backend.Name(), backend.Name()), http.StatusPreconditionFailed)
} else if errors.Is(err, vllm.ErrorNotFound) {
http.Error(w, err.Error(), http.StatusPreconditionFailed)
} else {
Expand All @@ -219,6 +239,13 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
return
}

if autoInstall {
fmt.Fprintf(w, "%s backend installed successfully\n", backend.Name())
if f, ok := w.(http.Flusher); ok {
f.Flush()
}
}

modelID := h.scheduler.modelManager.ResolveID(request.Model)

// Request a runner to execute the request and defer its release.
Expand Down Expand Up @@ -336,6 +363,43 @@ func (h *HTTPHandler) Unload(w http.ResponseWriter, r *http.Request) {
}
}

// installBackendRequest is the JSON body for the install-backend endpoint.
type installBackendRequest struct {
	// Backend is the name of the deferred backend to install
	// (e.g. "vllm-metal", as sent by the model CLI).
	Backend string `json:"backend"`
}

// InstallBackend handles POST <inference-prefix>/install-backend requests.
// It triggers on-demand installation of a deferred backend.
func (h *HTTPHandler) InstallBackend(w http.ResponseWriter, r *http.Request) {
body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
if err != nil {
var maxBytesError *http.MaxBytesError
if errors.As(err, &maxBytesError) {
http.Error(w, "request too large", http.StatusBadRequest)
} else {
http.Error(w, "failed to read request body", http.StatusInternalServerError)
}
return
Comment thread
doringeman marked this conversation as resolved.
}

var req installBackendRequest
if err := json.Unmarshal(body, &req); err != nil || req.Backend == "" {
http.Error(w, "invalid request: backend is required", http.StatusBadRequest)
return
}

if err := h.scheduler.InstallBackend(r.Context(), req.Backend); err != nil {
if errors.Is(err, ErrBackendNotFound) {
http.Error(w, err.Error(), http.StatusNotFound)
} else {
http.Error(w, fmt.Sprintf("backend installation failed: %v", err), http.StatusInternalServerError)
}
return
}

w.WriteHeader(http.StatusOK)
}

// Configure handles POST <inference-prefix>/{backend}/_configure requests.
func (h *HTTPHandler) Configure(w http.ResponseWriter, r *http.Request) {
// Determine the requested backend and ensure that it's valid.
Expand Down
Loading
Loading