From e24fb26d3926a9fa8d4f179fc4e16139ee831e43 Mon Sep 17 00:00:00 2001 From: Dorin Geman Date: Tue, 10 Feb 2026 19:43:43 +0200 Subject: [PATCH 1/5] feat: make vllm-metal backend installation opt-in Signed-off-by: Dorin Geman --- cmd/cli/commands/install-runner.go | 16 ++- cmd/cli/desktop/desktop.go | 24 ++++ .../docker_model_install-runner.yaml | 3 +- .../docker_model_reinstall-runner.yaml | 3 +- .../reference/docker_model_start-runner.yaml | 3 +- .../docs/reference/model_install-runner.md | 2 +- .../docs/reference/model_reinstall-runner.md | 2 +- cmd/cli/docs/reference/model_start-runner.md | 2 +- main.go | 7 + pkg/inference/scheduling/http_handler.go | 35 +++++ pkg/inference/scheduling/installer.go | 121 +++++++++++++++++- pkg/inference/scheduling/scheduler.go | 40 ++++-- pkg/inference/scheduling/scheduler_test.go | 2 +- 13 files changed, 235 insertions(+), 25 deletions(-) diff --git a/cmd/cli/commands/install-runner.go b/cmd/cli/commands/install-runner.go index b986a93ff..50c1538e7 100644 --- a/cmd/cli/commands/install-runner.go +++ b/cmd/cli/commands/install-runner.go @@ -17,6 +17,7 @@ import ( "github.com/docker/model-runner/pkg/inference/backends/diffusers" "github.com/docker/model-runner/pkg/inference/backends/llamacpp" "github.com/docker/model-runner/pkg/inference/backends/vllm" + "github.com/docker/model-runner/pkg/inference/backends/vllmmetal" "github.com/spf13/cobra" ) @@ -28,7 +29,7 @@ const ( // installation will try to reach the model runner while waiting for it to // be ready. installWaitRetryInterval = 500 * time.Millisecond - backendUsage = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "). Default: " + llamacpp.Name + backendUsage = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "|" + vllmmetal.Name + "). Default: " + llamacpp.Name ) // waitForStandaloneRunnerAfterInstall waits for a standalone model runner @@ -237,6 +238,17 @@ type runnerOptions struct { // runInstallOrStart is shared logic for install-runner and start-runner commands func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error { + // vllm-metal is installed on-demand via the running model runner, + // not as a standalone container. This applies to all engine kinds. + if opts.backend == vllmmetal.Name { + cmd.Println("Installing vllm-metal backend...") + if err := desktopClient.InstallBackend(vllmmetal.Name); err != nil { + return fmt.Errorf("failed to install vllm-metal backend: %w", err) + } + cmd.Println("vllm-metal backend installed successfully") + return nil + } + var vllmOnWSL bool // Ensure that we're running in a supported model runner context. engineKind := modelRunner.EngineKind() @@ -324,7 +336,7 @@ func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error } // Validate backend selection - validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name} + validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name, vllmmetal.Name} if opts.backend != "" { isValid := false for _, valid := range validBackends { diff --git a/cmd/cli/desktop/desktop.go b/cmd/cli/desktop/desktop.go index 593f40fcb..1e864bc75 100644 --- a/cmd/cli/desktop/desktop.go +++ b/cmd/cli/desktop/desktop.go @@ -782,6 +782,30 @@ func (c *Client) ShowConfigs(modelFilter string) ([]scheduling.ModelConfigEntry, return configs, nil } +// InstallBackend triggers on-demand installation of a deferred backend +func (c *Client) InstallBackend(backend string) error { + installPath := inference.InferencePrefix + "/install-backend" + jsonData, err := json.Marshal(struct { + Backend string `json:"backend"` + }{Backend: backend}) + if err != nil { + return fmt.Errorf("error marshaling request: %w", err) + } + + resp, err := c.doRequest(http.MethodPost, installPath, bytes.NewReader(jsonData)) + if err != nil { + return c.handleQueryError(err, installPath) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("install backend failed with status %s: %s", resp.Status, string(body)) + } + + return nil +} + func (c *Client) ConfigureBackend(request scheduling.ConfigureRequest) error { configureBackendPath := inference.InferencePrefix + "/_configure" jsonData, err := json.Marshal(request) diff --git a/cmd/cli/docs/reference/docker_model_install-runner.yaml b/cmd/cli/docs/reference/docker_model_install-runner.yaml index 562fd6bbf..a2ea045f3 100644 --- a/cmd/cli/docs/reference/docker_model_install-runner.yaml +++ b/cmd/cli/docs/reference/docker_model_install-runner.yaml @@ -8,7 +8,8 @@ plink: docker_model.yaml options: - option: backend value_type: string - description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp' + description: | + Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp deprecated: false hidden: false experimental: false diff --git a/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml b/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml index 35d328b55..90ca05e10 100644 --- a/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml +++ b/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml @@ -8,7 +8,8 @@ plink: docker_model.yaml options: - option: backend value_type: string - description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp' + description: | + Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp deprecated: false hidden: false experimental: false diff --git a/cmd/cli/docs/reference/docker_model_start-runner.yaml b/cmd/cli/docs/reference/docker_model_start-runner.yaml index 740e36c53..1cb635bba 100644 --- a/cmd/cli/docs/reference/docker_model_start-runner.yaml +++ b/cmd/cli/docs/reference/docker_model_start-runner.yaml @@ -10,7 +10,8 @@ plink: docker_model.yaml options: - option: backend value_type: string - description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp' + description: | + Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp deprecated: false hidden: false experimental: false diff --git a/cmd/cli/docs/reference/model_install-runner.md b/cmd/cli/docs/reference/model_install-runner.md index de40a5028..42949c791 100644 --- a/cmd/cli/docs/reference/model_install-runner.md +++ b/cmd/cli/docs/reference/model_install-runner.md @@ -7,7 +7,7 @@ Install Docker Model Runner (Docker Engine only) | Name | Type | Default | Description | |:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------| -| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp | +| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp | | `--debug` | `bool` | | Enable debug logging | | `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner | | `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) | diff --git a/cmd/cli/docs/reference/model_reinstall-runner.md b/cmd/cli/docs/reference/model_reinstall-runner.md index 457b322e5..3e7fd8441 100644 --- a/cmd/cli/docs/reference/model_reinstall-runner.md +++ b/cmd/cli/docs/reference/model_reinstall-runner.md @@ -7,7 +7,7 @@ Reinstall Docker Model Runner (Docker Engine only) | Name | Type | Default | Description | |:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------| -| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp | +| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp | | `--debug` | `bool` | | Enable debug logging | | `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner | | `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) | diff --git a/cmd/cli/docs/reference/model_start-runner.md b/cmd/cli/docs/reference/model_start-runner.md index 24cf2fe12..b6fdb0d7a 100644 --- a/cmd/cli/docs/reference/model_start-runner.md +++ b/cmd/cli/docs/reference/model_start-runner.md @@ -7,7 +7,7 @@ Start Docker Model Runner (Docker Engine only) | Name | Type | Default | Description | |:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------| -| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp | +| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp | | `--debug` | `bool` | | Enable debug logging | | `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner | | `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) | diff --git a/main.go b/main.go index 2711a25ba..40a374890 100644 --- a/main.go +++ b/main.go @@ -208,6 +208,12 @@ func main() { backends[vllmmetal.Name] = vllmMetalBackend } + // Backends whose installation is deferred until explicitly requested. + var deferredBackends []string + if vllmMetalBackend != nil { + deferredBackends = append(deferredBackends, vllmmetal.Name) + } + scheduler := scheduling.NewScheduler( log, backends, @@ -220,6 +226,7 @@ func main() { "", false, ), + deferredBackends, ) // Create the HTTP handler for the scheduler diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go index 2205cde8f..cdcd59db2 100644 --- a/pkg/inference/scheduling/http_handler.go +++ b/pkg/inference/scheduling/http_handler.go @@ -97,6 +97,7 @@ func (h *HTTPHandler) routeHandlers() map[string]http.HandlerFunc { m["GET "+inference.InferencePrefix+"/v1/models"] = h.handleModels m["GET "+inference.InferencePrefix+"/v1/models/{name...}"] = h.handleModels + m["POST "+inference.InferencePrefix+"/install-backend"] = h.InstallBackend m["GET "+inference.InferencePrefix+"/status"] = h.GetBackendStatus m["GET "+inference.InferencePrefix+"/ps"] = h.GetRunningBackends m["GET "+inference.InferencePrefix+"/df"] = h.GetDiskUsage @@ -211,6 +212,8 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque // shutting down (since that will also cancel the request context). // Either way, provide a response, even if it's ignored. http.Error(w, "service unavailable", http.StatusServiceUnavailable) + } else if errors.Is(err, errBackendNotInstalled) { + http.Error(w, fmt.Sprintf("backend %q is not installed; run: docker model install-runner --backend %s", backend.Name(), backend.Name()), http.StatusPreconditionFailed) } else if errors.Is(err, vllm.ErrorNotFound) { http.Error(w, err.Error(), http.StatusPreconditionFailed) } else { @@ -336,6 +339,38 @@ func (h *HTTPHandler) Unload(w http.ResponseWriter, r *http.Request) { } } +// installBackendRequest is the JSON body for the install-backend endpoint. +type installBackendRequest struct { + Backend string `json:"backend"` +} + +// InstallBackend handles POST /install-backend requests. +// It triggers on-demand installation of a deferred backend. +func (h *HTTPHandler) InstallBackend(w http.ResponseWriter, r *http.Request) { + body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize)) + if err != nil { + http.Error(w, "failed to read request body", http.StatusInternalServerError) + return + } + + var req installBackendRequest + if err := json.Unmarshal(body, &req); err != nil || req.Backend == "" { + http.Error(w, "invalid request: backend is required", http.StatusBadRequest) + return + } + + if err := h.scheduler.InstallBackend(r.Context(), req.Backend); err != nil { + if errors.Is(err, ErrBackendNotFound) { + http.Error(w, err.Error(), http.StatusNotFound) + } else { + http.Error(w, fmt.Sprintf("backend installation failed: %v", err), http.StatusInternalServerError) + } + return + } + + w.WriteHeader(http.StatusOK) +} + // Configure handles POST /{backend}/_configure requests. func (h *HTTPHandler) Configure(w http.ResponseWriter, r *http.Request) { // Determine the requested backend and ensure that it's valid. diff --git a/pkg/inference/scheduling/installer.go b/pkg/inference/scheduling/installer.go index ae3aba839..2178f2e85 100644 --- a/pkg/inference/scheduling/installer.go +++ b/pkg/inference/scheduling/installer.go @@ -4,6 +4,7 @@ import ( "context" "errors" "net/http" + "sync" "sync/atomic" "github.com/docker/model-runner/pkg/inference" @@ -17,6 +18,9 @@ var ( // errInstallerShuttingDown indicates that the installer's run loop has been // terminated and the installer is shutting down. errInstallerShuttingDown = errors.New("backend installer shutting down") + // errBackendNotInstalled indicates that a deferred backend has not been + // installed. Callers should install it via installBackend before use. + errBackendNotInstalled = errors.New("backend not installed") ) // installStatus tracks the installation status of a backend. @@ -44,14 +48,28 @@ type installer struct { started atomic.Bool // statuses maps backend names to their installation statuses. statuses map[string]*installStatus + // deferredBackends tracks backends whose installation is deferred until + // explicitly requested via installBackend. + deferredBackends map[string]bool + // mu protects on-demand installation via installBackend. + mu sync.Mutex } -// newInstaller creates a new backend installer. +// newInstaller creates a new backend installer. Backends listed in +// deferredBackends are skipped during the automatic run loop and must be +// installed on-demand via installBackend. func newInstaller( log logging.Logger, backends map[string]inference.Backend, httpClient *http.Client, + deferredBackends []string, ) *installer { + // Build the deferred set. + deferred := make(map[string]bool, len(deferredBackends)) + for _, name := range deferredBackends { + deferred[name] = true + } + // Create status trackers. statuses := make(map[string]*installStatus, len(backends)) for name := range backends { @@ -63,10 +81,11 @@ func newInstaller( // Create the installer. return &installer{ - log: log, - backends: backends, - httpClient: httpClient, - statuses: statuses, + log: log, + backends: backends, + httpClient: httpClient, + statuses: statuses, + deferredBackends: deferred, } } @@ -84,6 +103,25 @@ func (i *installer) run(ctx context.Context) { // ubiquitous backend and mlx as a relatively lightweight backend (on macOS // only), this granularity is probably less of a concern. for name, backend := range i.backends { + // For deferred backends, check if they are already installed on disk + // from a previous session. Only call Install() (which verifies the + // existing installation) when files are present, to avoid triggering + // a download. + if i.deferredBackends[name] { + status := i.statuses[name] + if diskUsage, err := backend.GetDiskUsage(); err == nil && diskUsage > 0 { + if err := backend.Install(ctx, i.httpClient); err != nil { + status.err = err + close(status.failed) + } else { + close(status.installed) + } + } + // If not on disk, leave channels open so wait() returns + // errBackendNotInstalled. + continue + } + status := i.statuses[name] var installedClosed bool @@ -114,6 +152,8 @@ func (i *installer) run(ctx context.Context) { } // wait waits for installation of the specified backend to complete or fail. +// For deferred backends that have never been installed, it returns +// errBackendNotInstalled immediately instead of blocking. func (i *installer) wait(ctx context.Context, backend string) error { // Grab the backend status. status, ok := i.statuses[backend] @@ -121,6 +161,20 @@ func (i *installer) wait(ctx context.Context, backend string) error { return ErrBackendNotFound } + // For deferred backends, check whether installation has completed without + // blocking. This doesn't depend on the installer being started, since + // deferred backends are installed on-demand, not by the run loop. + if i.deferredBackends[backend] { + select { + case <-status.installed: + return nil + case <-status.failed: + return status.err + default: + return errBackendNotInstalled + } + } + // If the installer hasn't started, then don't poll for readiness, because // it may never come. If it has started, then even if it's cancelled we can // be sure that we'll at least see failure for all backend installations. @@ -138,3 +192,60 @@ func (i *installer) wait(ctx context.Context, backend string) error { return status.err } } + +// installBackend triggers on-demand installation of a deferred backend. +// It is idempotent: if the backend is already installed, it returns nil. +func (i *installer) installBackend(ctx context.Context, name string) error { + i.mu.Lock() + defer i.mu.Unlock() + + backend, ok := i.backends[name] + if !ok { + return ErrBackendNotFound + } + + status := i.statuses[name] + + // Already installed — nothing to do. + select { + case <-status.installed: + return nil + default: + } + + // If previously failed, reset status for retry. + select { + case <-status.failed: + status = &installStatus{ + installed: make(chan struct{}), + failed: make(chan struct{}), + } + i.statuses[name] = status + default: + } + + // Perform installation. + if err := backend.Install(ctx, i.httpClient); err != nil { + status.err = err + close(status.failed) + return err + } + + close(status.installed) + return nil +} + +// isInstalled returns true if the given backend has completed installation. +// It is non-blocking. +func (i *installer) isInstalled(name string) bool { + status, ok := i.statuses[name] + if !ok { + return false + } + select { + case <-status.installed: + return true + default: + return false + } +} diff --git a/pkg/inference/scheduling/scheduler.go b/pkg/inference/scheduling/scheduler.go index a0b1e4679..0ce6beea3 100644 --- a/pkg/inference/scheduling/scheduler.go +++ b/pkg/inference/scheduling/scheduler.go @@ -42,9 +42,14 @@ type Scheduler struct { tracker *metrics.Tracker // openAIRecorder is used to record OpenAI API inference requests and responses. openAIRecorder *metrics.OpenAIRecorder + // deferredBackends lists backends whose installation is deferred until + // explicitly requested (e.g. via install-backend endpoint). + deferredBackends []string } -// NewScheduler creates a new inference scheduler. +// NewScheduler creates a new inference scheduler. Backends listed in +// deferredBackends are not installed automatically during startup; they must +// be installed on-demand via InstallBackend. func NewScheduler( log logging.Logger, backends map[string]inference.Backend, @@ -52,19 +57,21 @@ func NewScheduler( modelManager *models.Manager, httpClient *http.Client, tracker *metrics.Tracker, + deferredBackends []string, ) *Scheduler { openAIRecorder := metrics.NewOpenAIRecorder(log.WithField("component", "openai-recorder"), modelManager) // Create the scheduler. s := &Scheduler{ - log: log, - backends: backends, - defaultBackend: defaultBackend, - modelManager: modelManager, - installer: newInstaller(log, backends, httpClient), - loader: newLoader(log, backends, modelManager, openAIRecorder), - tracker: tracker, - openAIRecorder: openAIRecorder, + log: log, + backends: backends, + defaultBackend: defaultBackend, + modelManager: modelManager, + installer: newInstaller(log, backends, httpClient, deferredBackends), + loader: newLoader(log, backends, modelManager, openAIRecorder), + tracker: tracker, + openAIRecorder: openAIRecorder, + deferredBackends: deferredBackends, } // Scheduler successfully initialized. @@ -105,8 +112,14 @@ func (s *Scheduler) selectBackendForModel(model types.Model, backend inference.B } if config.GetFormat() == types.FormatSafetensors { - // Prefer vllm-metal for safetensors models on macOS (most feature-rich for Metal) + // Prefer vllm-metal for safetensors models on macOS (most feature-rich for Metal), + // but only if it has been installed. if vllmMetalBackend, ok := s.backends[vllmmetal.Name]; ok && vllmMetalBackend != nil { + if s.installer.isInstalled(vllmmetal.Name) { + return vllmMetalBackend + } + s.log.Infof("vllm-metal backend is available but not installed. "+ + "To install, run: docker model install-runner --backend %s", vllmmetal.Name) return vllmMetalBackend } // Fall back to MLX on macOS @@ -131,7 +144,12 @@ func (s *Scheduler) selectBackendForModel(model types.Model, backend inference.B // ResetInstaller resets the backend installer with a new HTTP client. func (s *Scheduler) ResetInstaller(httpClient *http.Client) { - s.installer = newInstaller(s.log, s.backends, httpClient) + s.installer = newInstaller(s.log, s.backends, httpClient, s.deferredBackends) +} + +// InstallBackend triggers on-demand installation of a deferred backend. +func (s *Scheduler) InstallBackend(ctx context.Context, name string) error { + return s.installer.installBackend(ctx, name) } // GetRunningBackendsInfo returns information about all running backends as a slice diff --git a/pkg/inference/scheduling/scheduler_test.go b/pkg/inference/scheduling/scheduler_test.go index 7e8e3fd3f..0c1d4a71b 100644 --- a/pkg/inference/scheduling/scheduler_test.go +++ b/pkg/inference/scheduling/scheduler_test.go @@ -33,7 +33,7 @@ func TestCors(t *testing.T) { discard := logrus.New() discard.SetOutput(io.Discard) log := logrus.NewEntry(discard) - s := NewScheduler(log, nil, nil, nil, nil, nil) + s := NewScheduler(log, nil, nil, nil, nil, nil, nil) httpHandler := NewHTTPHandler(s, nil, []string{"*"}) req := httptest.NewRequest(http.MethodOptions, "http://model-runner.docker.internal"+tt.path, http.NoBody) req.Header.Set("Origin", "docker.com") From 18efa90acc2731fcf25e6aafbb1c8e89c68d62d8 Mon Sep 17 00:00:00 2001 From: Dorin Geman Date: Tue, 10 Feb 2026 19:59:48 +0200 Subject: [PATCH 2/5] fix: differentiate MaxBytesError from other read failures in the InstallBackend handler Signed-off-by: Dorin Geman --- pkg/inference/scheduling/http_handler.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go index cdcd59db2..7a7839147 100644 --- a/pkg/inference/scheduling/http_handler.go +++ b/pkg/inference/scheduling/http_handler.go @@ -349,7 +349,12 @@ type installBackendRequest struct { func (h *HTTPHandler) InstallBackend(w http.ResponseWriter, r *http.Request) { body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize)) if err != nil { - http.Error(w, "failed to read request body", http.StatusInternalServerError) + var maxBytesError *http.MaxBytesError + if errors.As(err, &maxBytesError) { + http.Error(w, "request too large", http.StatusBadRequest) + } else { + http.Error(w, "failed to read request body", http.StatusInternalServerError) + } return } From 07f29dfeceff1be34875125515c6133a1dc491e1 Mon Sep 17 00:00:00 2001 From: Dorin Geman Date: Tue, 10 Feb 2026 20:08:24 +0200 Subject: [PATCH 3/5] fix(installer): protect statuses map against concurrent read/write Signed-off-by: Dorin Geman --- pkg/inference/scheduling/installer.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pkg/inference/scheduling/installer.go b/pkg/inference/scheduling/installer.go index 2178f2e85..2b51c9442 100644 --- a/pkg/inference/scheduling/installer.go +++ b/pkg/inference/scheduling/installer.go @@ -51,8 +51,9 @@ type installer struct { // deferredBackends tracks backends whose installation is deferred until // explicitly requested via installBackend. deferredBackends map[string]bool - // mu protects on-demand installation via installBackend. - mu sync.Mutex + // mu protects statuses map mutations in installBackend. Readers + // (wait, isInstalled) take an RLock; installBackend takes a full Lock. + mu sync.RWMutex } // newInstaller creates a new backend installer. Backends listed in @@ -155,8 +156,10 @@ func (i *installer) run(ctx context.Context) { // For deferred backends that have never been installed, it returns // errBackendNotInstalled immediately instead of blocking. func (i *installer) wait(ctx context.Context, backend string) error { - // Grab the backend status. + // Grab the backend status under a read lock, since installBackend may replace entries in the map. + i.mu.RLock() status, ok := i.statuses[backend] + i.mu.RUnlock() if !ok { return ErrBackendNotFound } @@ -238,7 +241,9 @@ func (i *installer) installBackend(ctx context.Context, name string) error { // isInstalled returns true if the given backend has completed installation. // It is non-blocking. func (i *installer) isInstalled(name string) bool { + i.mu.RLock() status, ok := i.statuses[name] + i.mu.RUnlock() if !ok { return false } From a5efa9f4c589084ed754acce9aaf663279c37763 Mon Sep 17 00:00:00 2001 From: Dorin Geman Date: Wed, 11 Feb 2026 11:03:11 +0200 Subject: [PATCH 4/5] feat: auto-pull deferred backends on demand Signed-off-by: Dorin Geman --- pkg/inference/scheduling/installer.go | 42 ++++++++++++++++++++------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/pkg/inference/scheduling/installer.go b/pkg/inference/scheduling/installer.go index 2b51c9442..8703ad031 100644 --- a/pkg/inference/scheduling/installer.go +++ b/pkg/inference/scheduling/installer.go @@ -54,6 +54,10 @@ type installer struct { // mu protects statuses map mutations in installBackend. Readers // (wait, isInstalled) take an RLock; installBackend takes a full Lock. mu sync.RWMutex + // installMu serializes on-demand install operations so that only one + // goroutine performs the actual download at a time. Held independently + // of mu so that long-running installs don't block map readers. + installMu sync.Mutex } // newInstaller creates a new backend installer. Backends listed in @@ -118,8 +122,8 @@ func (i *installer) run(ctx context.Context) { close(status.installed) } } - // If not on disk, leave channels open so wait() returns - // errBackendNotInstalled. + // If not on disk, leave channels open so wait() can trigger + // on-demand installation when the backend is first needed. continue } @@ -153,8 +157,9 @@ func (i *installer) run(ctx context.Context) { } // wait waits for installation of the specified backend to complete or fail. -// For deferred backends that have never been installed, it returns -// errBackendNotInstalled immediately instead of blocking. +// For deferred backends that have not yet been installed, it triggers +// on-demand installation (auto-pull), blocking until complete or the caller's +// context is cancelled. func (i *installer) wait(ctx context.Context, backend string) error { // Grab the backend status under a read lock, since installBackend may replace entries in the map. i.mu.RLock() @@ -164,9 +169,8 @@ func (i *installer) wait(ctx context.Context, backend string) error { return ErrBackendNotFound } - // For deferred backends, check whether installation has completed without - // blocking. This doesn't depend on the installer being started, since - // deferred backends are installed on-demand, not by the run loop. + // For deferred backends, check whether installation has already completed. + // If not, trigger on-demand installation (auto-pull). if i.deferredBackends[backend] { select { case <-status.installed: @@ -174,7 +178,7 @@ func (i *installer) wait(ctx context.Context, backend string) error { case <-status.failed: return status.err default: - return errBackendNotInstalled + return i.installBackend(ctx, backend) } } @@ -198,16 +202,24 @@ func (i *installer) wait(ctx context.Context, backend string) error { // installBackend triggers on-demand installation of a deferred backend. // It is idempotent: if the backend is already installed, it returns nil. +// installMu serializes actual downloads so only one goroutine installs at a +// time, while mu is held only briefly for map reads/writes so that other +// goroutines calling wait() or isInstalled() are not blocked during the +// (potentially long) Install() call. func (i *installer) installBackend(ctx context.Context, name string) error { - i.mu.Lock() - defer i.mu.Unlock() + // Serialize install operations so only one download runs at a time. + i.installMu.Lock() + defer i.installMu.Unlock() backend, ok := i.backends[name] if !ok { return ErrBackendNotFound } + // Check current status under read lock. + i.mu.RLock() status := i.statuses[name] + i.mu.RUnlock() // Already installed — nothing to do. select { @@ -223,12 +235,20 @@ func (i *installer) installBackend(ctx context.Context, name string) error { installed: make(chan struct{}), failed: make(chan struct{}), } + i.mu.Lock() i.statuses[name] = status + i.mu.Unlock() default: } - // Perform installation. + // Perform installation without holding mu. if err := backend.Install(ctx, i.httpClient); err != nil { + // If the caller's context was cancelled (e.g. Ctrl-C), don't + // permanently mark the backend as failed — leave channels open + // so the next request can retry. + if ctx.Err() != nil { + return err + } status.err = err close(status.failed) return err From 8b6c1ae1dba25ff6a102d818c9fb40a2650d7bb0 Mon Sep 17 00:00:00 2001 From: Dorin Geman Date: Wed, 11 Feb 2026 11:20:42 +0200 Subject: [PATCH 5/5] feat: stream backend installation progress to CLI users Signed-off-by: Dorin Geman --- cmd/cli/desktop/desktop.go | 52 ++++++++++++++++++++++-- pkg/inference/scheduling/api.go | 3 ++ pkg/inference/scheduling/http_handler.go | 26 +++++++++++- 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/cmd/cli/desktop/desktop.go b/cmd/cli/desktop/desktop.go index 1e864bc75..b2b5376bc 100644 --- a/cmd/cli/desktop/desktop.go +++ b/cmd/cli/desktop/desktop.go @@ -479,12 +479,58 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv TotalTokens int `json:"total_tokens"` } - // Detect streaming vs non-streaming response via Content-Type header + // Use a buffered reader so we can consume server-sent progress + // lines (e.g. "Installing vllm-metal backend...") that arrive + // before the actual SSE or JSON inference response. + br := bufio.NewReader(resp.Body) + + // Consume any plain-text progress lines that precede the real + // response. We peek ahead: if the next non-empty content starts + // with '{' (JSON) or "data:" / ":" (SSE), the progress section + // is over and we fall through to normal processing. + for { + peek, err := br.Peek(1) + if err != nil { + break + } + // JSON object or SSE stream — stop consuming progress lines. + if peek[0] == '{' || peek[0] == ':' { + break + } + line, err := br.ReadString('\n') + if err != nil && line == "" { + break + } + line = strings.TrimRight(line, "\r\n") + if line == "" { + continue + } + // SSE data line — stop, let the normal SSE parser handle it. + if strings.HasPrefix(line, "data:") { + // Put the line back by chaining a reader with the rest. + br = bufio.NewReader(io.MultiReader( + strings.NewReader(line+"\n"), + br, + )) + break + } + // Progress message — print to stderr. + fmt.Fprintln(os.Stderr, line) + } + + // Detect streaming vs non-streaming response. Because server-sent + // progress lines may have been flushed before the Content-Type was + // set, we also peek at the body content to detect SSE. isStreaming := strings.HasPrefix(resp.Header.Get("Content-Type"), "text/event-stream") + if !isStreaming { + if peek, err := br.Peek(5); err == nil { + isStreaming = strings.HasPrefix(string(peek), "data:") + } + } if !isStreaming { // Non-streaming JSON response - body, err := io.ReadAll(resp.Body) + body, err := io.ReadAll(br) if err != nil { return assistantResponse.String(), fmt.Errorf("error reading response body: %w", err) } @@ -506,7 +552,7 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv } } else { // SSE streaming response - process line by line - scanner := bufio.NewScanner(resp.Body) + scanner := bufio.NewScanner(br) for scanner.Scan() { // Check if context was cancelled diff --git a/pkg/inference/scheduling/api.go b/pkg/inference/scheduling/api.go index 7cd444a4b..143a072b1 100644 --- a/pkg/inference/scheduling/api.go +++ b/pkg/inference/scheduling/api.go @@ -13,6 +13,9 @@ const ( // enough to encompass any real-world request but also small enough to avoid // DoS attacks. maximumOpenAIInferenceRequestSize = 10 * 1024 * 1024 + + // modelCLIUserAgentPrefix is the user-agent prefix set by the model CLI. + modelCLIUserAgentPrefix = "docker-model-cli/" ) // trimRequestPathToOpenAIRoot trims a request path to start at the first diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go index 7a7839147..90050cac8 100644 --- a/pkg/inference/scheduling/http_handler.go +++ b/pkg/inference/scheduling/http_handler.go @@ -198,11 +198,28 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque backend = h.scheduler.selectBackendForModel(model, backend, request.Model) } + // If a deferred backend needs on-demand installation and the request + // comes from the model CLI, stream progress messages so the user sees + // what is happening while the download runs. + autoInstall := h.scheduler.installer.deferredBackends[backend.Name()] && + !h.scheduler.installer.isInstalled(backend.Name()) && + strings.Contains(r.UserAgent(), modelCLIUserAgentPrefix) + if autoInstall { + fmt.Fprintf(w, "Installing %s backend...\n", backend.Name()) + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + } + // Wait for the corresponding backend installation to complete or fail. We // don't allow any requests to be scheduled for a backend until it has // completed installation. if err := h.scheduler.installer.wait(r.Context(), backend.Name()); err != nil { - if errors.Is(err, ErrBackendNotFound) { + if autoInstall { + // Headers are already sent (200 OK) from the progress + // line, so we can only write the error as plain text. + fmt.Fprintf(w, "backend installation failed: %v\n", err) + } else if errors.Is(err, ErrBackendNotFound) { http.Error(w, err.Error(), http.StatusNotFound) } else if errors.Is(err, errInstallerNotStarted) { http.Error(w, err.Error(), http.StatusServiceUnavailable) @@ -222,6 +239,13 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque return } + if autoInstall { + fmt.Fprintf(w, "%s backend installed successfully\n", backend.Name()) + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + } + modelID := h.scheduler.modelManager.ResolveID(request.Model) // Request a runner to execute the request and defer its release.