From e24fb26d3926a9fa8d4f179fc4e16139ee831e43 Mon Sep 17 00:00:00 2001
From: Dorin Geman <dorin.geman@docker.com>
Date: Tue, 10 Feb 2026 19:43:43 +0200
Subject: [PATCH 1/5] feat: make vllm-metal backend installation opt-in

Signed-off-by: Dorin Geman <dorin.geman@docker.com>
---
 cmd/cli/commands/install-runner.go            |  16 ++-
 cmd/cli/desktop/desktop.go                    |  24 ++++
 .../docker_model_install-runner.yaml          |   3 +-
 .../docker_model_reinstall-runner.yaml        |   3 +-
 .../reference/docker_model_start-runner.yaml  |   3 +-
 .../docs/reference/model_install-runner.md    |   2 +-
 .../docs/reference/model_reinstall-runner.md  |   2 +-
 cmd/cli/docs/reference/model_start-runner.md  |   2 +-
 main.go                                       |   7 +
 pkg/inference/scheduling/http_handler.go      |  35 +++++
 pkg/inference/scheduling/installer.go         | 121 +++++++++++++++++-
 pkg/inference/scheduling/scheduler.go         |  40 ++++--
 pkg/inference/scheduling/scheduler_test.go    |   2 +-
 13 files changed, 235 insertions(+), 25 deletions(-)

diff --git a/cmd/cli/commands/install-runner.go b/cmd/cli/commands/install-runner.go
index b986a93ff..50c1538e7 100644
--- a/cmd/cli/commands/install-runner.go
+++ b/cmd/cli/commands/install-runner.go
@@ -17,6 +17,7 @@ import (
 	"github.com/docker/model-runner/pkg/inference/backends/diffusers"
 	"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
 	"github.com/docker/model-runner/pkg/inference/backends/vllm"
+	"github.com/docker/model-runner/pkg/inference/backends/vllmmetal"
 	"github.com/spf13/cobra"
 )
 
@@ -28,7 +29,7 @@ const (
 	// installation will try to reach the model runner while waiting for it to
 	// be ready.
 	installWaitRetryInterval = 500 * time.Millisecond
-	backendUsage             = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "). Default: " + llamacpp.Name
+	backendUsage             = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "|" + vllmmetal.Name + "). Default: " + llamacpp.Name
 )
 
 // waitForStandaloneRunnerAfterInstall waits for a standalone model runner
@@ -237,6 +238,17 @@ type runnerOptions struct {
 
 // runInstallOrStart is shared logic for install-runner and start-runner commands
 func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error {
+	// vllm-metal is installed on-demand via the running model runner,
+	// not as a standalone container. This applies to all engine kinds.
+	if opts.backend == vllmmetal.Name {
+		cmd.Println("Installing vllm-metal backend...")
+		if err := desktopClient.InstallBackend(vllmmetal.Name); err != nil {
+			return fmt.Errorf("failed to install vllm-metal backend: %w", err)
+		}
+		cmd.Println("vllm-metal backend installed successfully")
+		return nil
+	}
+
 	var vllmOnWSL bool
 	// Ensure that we're running in a supported model runner context.
 	engineKind := modelRunner.EngineKind()
@@ -324,7 +336,7 @@ func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error
 	}
 
 	// Validate backend selection
-	validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name}
+	validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name, vllmmetal.Name}
 	if opts.backend != "" {
 		isValid := false
 		for _, valid := range validBackends {
diff --git a/cmd/cli/desktop/desktop.go b/cmd/cli/desktop/desktop.go
index 593f40fcb..1e864bc75 100644
--- a/cmd/cli/desktop/desktop.go
+++ b/cmd/cli/desktop/desktop.go
@@ -782,6 +782,30 @@ func (c *Client) ShowConfigs(modelFilter string) ([]scheduling.ModelConfigEntry,
 	return configs, nil
 }
 
+// InstallBackend triggers on-demand installation of a deferred backend
+func (c *Client) InstallBackend(backend string) error {
+	installPath := inference.InferencePrefix + "/install-backend"
+	jsonData, err := json.Marshal(struct {
+		Backend string `json:"backend"`
+	}{Backend: backend})
+	if err != nil {
+		return fmt.Errorf("error marshaling request: %w", err)
+	}
+
+	resp, err := c.doRequest(http.MethodPost, installPath, bytes.NewReader(jsonData))
+	if err != nil {
+		return c.handleQueryError(err, installPath)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("install backend failed with status %s: %s", resp.Status, string(body))
+	}
+
+	return nil
+}
+
 func (c *Client) ConfigureBackend(request scheduling.ConfigureRequest) error {
 	configureBackendPath := inference.InferencePrefix + "/_configure"
 	jsonData, err := json.Marshal(request)
diff --git a/cmd/cli/docs/reference/docker_model_install-runner.yaml b/cmd/cli/docs/reference/docker_model_install-runner.yaml
index 562fd6bbf..a2ea045f3 100644
--- a/cmd/cli/docs/reference/docker_model_install-runner.yaml
+++ b/cmd/cli/docs/reference/docker_model_install-runner.yaml
@@ -8,7 +8,8 @@ plink: docker_model.yaml
 options:
     - option: backend
       value_type: string
-      description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
+      description: |
+        Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
       deprecated: false
       hidden: false
       experimental: false
diff --git a/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml b/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml
index 35d328b55..90ca05e10 100644
--- a/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml
+++ b/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml
@@ -8,7 +8,8 @@ plink: docker_model.yaml
 options:
     - option: backend
       value_type: string
-      description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
+      description: |
+        Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
       deprecated: false
       hidden: false
       experimental: false
diff --git a/cmd/cli/docs/reference/docker_model_start-runner.yaml b/cmd/cli/docs/reference/docker_model_start-runner.yaml
index 740e36c53..1cb635bba 100644
--- a/cmd/cli/docs/reference/docker_model_start-runner.yaml
+++ b/cmd/cli/docs/reference/docker_model_start-runner.yaml
@@ -10,7 +10,8 @@ plink: docker_model.yaml
 options:
     - option: backend
       value_type: string
-      description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
+      description: |
+        Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
       deprecated: false
       hidden: false
       experimental: false
diff --git a/cmd/cli/docs/reference/model_install-runner.md b/cmd/cli/docs/reference/model_install-runner.md
index de40a5028..42949c791 100644
--- a/cmd/cli/docs/reference/model_install-runner.md
+++ b/cmd/cli/docs/reference/model_install-runner.md
@@ -7,7 +7,7 @@ Install Docker Model Runner (Docker Engine only)
 
 | Name             | Type     | Default     | Description                                                                                            |
 |:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
-| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp                                       |
+| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp                           |
 | `--debug`        | `bool`   |             | Enable debug logging                                                                                   |
 | `--do-not-track` | `bool`   |             | Do not track models usage in Docker Model Runner                                                       |
 | `--gpu`          | `string` | `auto`      | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann)                                               |
diff --git a/cmd/cli/docs/reference/model_reinstall-runner.md b/cmd/cli/docs/reference/model_reinstall-runner.md
index 457b322e5..3e7fd8441 100644
--- a/cmd/cli/docs/reference/model_reinstall-runner.md
+++ b/cmd/cli/docs/reference/model_reinstall-runner.md
@@ -7,7 +7,7 @@ Reinstall Docker Model Runner (Docker Engine only)
 
 | Name             | Type     | Default     | Description                                                                                            |
 |:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
-| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp                                       |
+| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp                           |
 | `--debug`        | `bool`   |             | Enable debug logging                                                                                   |
 | `--do-not-track` | `bool`   |             | Do not track models usage in Docker Model Runner                                                       |
 | `--gpu`          | `string` | `auto`      | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann)                                               |
diff --git a/cmd/cli/docs/reference/model_start-runner.md b/cmd/cli/docs/reference/model_start-runner.md
index 24cf2fe12..b6fdb0d7a 100644
--- a/cmd/cli/docs/reference/model_start-runner.md
+++ b/cmd/cli/docs/reference/model_start-runner.md
@@ -7,7 +7,7 @@ Start Docker Model Runner (Docker Engine only)
 
 | Name             | Type     | Default     | Description                                                                                            |
 |:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
-| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp                                       |
+| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp                           |
 | `--debug`        | `bool`   |             | Enable debug logging                                                                                   |
 | `--do-not-track` | `bool`   |             | Do not track models usage in Docker Model Runner                                                       |
 | `--gpu`          | `string` | `auto`      | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann)                                               |
diff --git a/main.go b/main.go
index 2711a25ba..40a374890 100644
--- a/main.go
+++ b/main.go
@@ -208,6 +208,12 @@ func main() {
 		backends[vllmmetal.Name] = vllmMetalBackend
 	}
 
+	// Backends whose installation is deferred until explicitly requested.
+	var deferredBackends []string
+	if vllmMetalBackend != nil {
+		deferredBackends = append(deferredBackends, vllmmetal.Name)
+	}
+
 	scheduler := scheduling.NewScheduler(
 		log,
 		backends,
@@ -220,6 +226,7 @@ func main() {
 			"",
 			false,
 		),
+		deferredBackends,
 	)
 
 	// Create the HTTP handler for the scheduler
diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go
index 2205cde8f..cdcd59db2 100644
--- a/pkg/inference/scheduling/http_handler.go
+++ b/pkg/inference/scheduling/http_handler.go
@@ -97,6 +97,7 @@ func (h *HTTPHandler) routeHandlers() map[string]http.HandlerFunc {
 	m["GET "+inference.InferencePrefix+"/v1/models"] = h.handleModels
 	m["GET "+inference.InferencePrefix+"/v1/models/{name...}"] = h.handleModels
 
+	m["POST "+inference.InferencePrefix+"/install-backend"] = h.InstallBackend
 	m["GET "+inference.InferencePrefix+"/status"] = h.GetBackendStatus
 	m["GET "+inference.InferencePrefix+"/ps"] = h.GetRunningBackends
 	m["GET "+inference.InferencePrefix+"/df"] = h.GetDiskUsage
@@ -211,6 +212,8 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
 			// shutting down (since that will also cancel the request context).
 			// Either way, provide a response, even if it's ignored.
 			http.Error(w, "service unavailable", http.StatusServiceUnavailable)
+		} else if errors.Is(err, errBackendNotInstalled) {
+			http.Error(w, fmt.Sprintf("backend %q is not installed; run: docker model install-runner --backend %s", backend.Name(), backend.Name()), http.StatusPreconditionFailed)
 		} else if errors.Is(err, vllm.ErrorNotFound) {
 			http.Error(w, err.Error(), http.StatusPreconditionFailed)
 		} else {
@@ -336,6 +339,38 @@ func (h *HTTPHandler) Unload(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+// installBackendRequest is the JSON body for the install-backend endpoint.
+type installBackendRequest struct {
+	Backend string `json:"backend"`
+}
+
+// InstallBackend handles POST <inference-prefix>/install-backend requests.
+// It triggers on-demand installation of a deferred backend.
+func (h *HTTPHandler) InstallBackend(w http.ResponseWriter, r *http.Request) {
+	body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
+	if err != nil {
+		http.Error(w, "failed to read request body", http.StatusInternalServerError)
+		return
+	}
+
+	var req installBackendRequest
+	if err := json.Unmarshal(body, &req); err != nil || req.Backend == "" {
+		http.Error(w, "invalid request: backend is required", http.StatusBadRequest)
+		return
+	}
+
+	if err := h.scheduler.InstallBackend(r.Context(), req.Backend); err != nil {
+		if errors.Is(err, ErrBackendNotFound) {
+			http.Error(w, err.Error(), http.StatusNotFound)
+		} else {
+			http.Error(w, fmt.Sprintf("backend installation failed: %v", err), http.StatusInternalServerError)
+		}
+		return
+	}
+
+	w.WriteHeader(http.StatusOK)
+}
+
 // Configure handles POST <inference-prefix>/{backend}/_configure requests.
 func (h *HTTPHandler) Configure(w http.ResponseWriter, r *http.Request) {
 	// Determine the requested backend and ensure that it's valid.
diff --git a/pkg/inference/scheduling/installer.go b/pkg/inference/scheduling/installer.go
index ae3aba839..2178f2e85 100644
--- a/pkg/inference/scheduling/installer.go
+++ b/pkg/inference/scheduling/installer.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"net/http"
+	"sync"
 	"sync/atomic"
 
 	"github.com/docker/model-runner/pkg/inference"
@@ -17,6 +18,9 @@ var (
 	// errInstallerShuttingDown indicates that the installer's run loop has been
 	// terminated and the installer is shutting down.
 	errInstallerShuttingDown = errors.New("backend installer shutting down")
+	// errBackendNotInstalled indicates that a deferred backend has not been
+	// installed. Callers should install it via installBackend before use.
+	errBackendNotInstalled = errors.New("backend not installed")
 )
 
 // installStatus tracks the installation status of a backend.
@@ -44,14 +48,28 @@ type installer struct {
 	started atomic.Bool
 	// statuses maps backend names to their installation statuses.
 	statuses map[string]*installStatus
+	// deferredBackends tracks backends whose installation is deferred until
+	// explicitly requested via installBackend.
+	deferredBackends map[string]bool
+	// mu protects on-demand installation via installBackend.
+	mu sync.Mutex
 }
 
-// newInstaller creates a new backend installer.
+// newInstaller creates a new backend installer. Backends listed in
+// deferredBackends are skipped during the automatic run loop and must be
+// installed on-demand via installBackend.
 func newInstaller(
 	log logging.Logger,
 	backends map[string]inference.Backend,
 	httpClient *http.Client,
+	deferredBackends []string,
 ) *installer {
+	// Build the deferred set.
+	deferred := make(map[string]bool, len(deferredBackends))
+	for _, name := range deferredBackends {
+		deferred[name] = true
+	}
+
 	// Create status trackers.
 	statuses := make(map[string]*installStatus, len(backends))
 	for name := range backends {
@@ -63,10 +81,11 @@ func newInstaller(
 
 	// Create the installer.
 	return &installer{
-		log:        log,
-		backends:   backends,
-		httpClient: httpClient,
-		statuses:   statuses,
+		log:              log,
+		backends:         backends,
+		httpClient:       httpClient,
+		statuses:         statuses,
+		deferredBackends: deferred,
 	}
 }
 
@@ -84,6 +103,25 @@ func (i *installer) run(ctx context.Context) {
 	// ubiquitous backend and mlx as a relatively lightweight backend (on macOS
 	// only), this granularity is probably less of a concern.
 	for name, backend := range i.backends {
+		// For deferred backends, check if they are already installed on disk
+		// from a previous session. Only call Install() (which verifies the
+		// existing installation) when files are present, to avoid triggering
+		// a download.
+		if i.deferredBackends[name] {
+			status := i.statuses[name]
+			if diskUsage, err := backend.GetDiskUsage(); err == nil && diskUsage > 0 {
+				if err := backend.Install(ctx, i.httpClient); err != nil {
+					status.err = err
+					close(status.failed)
+				} else {
+					close(status.installed)
+				}
+			}
+			// If not on disk, leave channels open so wait() returns
+			// errBackendNotInstalled.
+			continue
+		}
+
 		status := i.statuses[name]
 
 		var installedClosed bool
@@ -114,6 +152,8 @@ func (i *installer) run(ctx context.Context) {
 }
 
 // wait waits for installation of the specified backend to complete or fail.
+// For deferred backends that have never been installed, it returns
+// errBackendNotInstalled immediately instead of blocking.
 func (i *installer) wait(ctx context.Context, backend string) error {
 	// Grab the backend status.
 	status, ok := i.statuses[backend]
@@ -121,6 +161,20 @@ func (i *installer) wait(ctx context.Context, backend string) error {
 		return ErrBackendNotFound
 	}
 
+	// For deferred backends, check whether installation has completed without
+	// blocking. This doesn't depend on the installer being started, since
+	// deferred backends are installed on-demand, not by the run loop.
+	if i.deferredBackends[backend] {
+		select {
+		case <-status.installed:
+			return nil
+		case <-status.failed:
+			return status.err
+		default:
+			return errBackendNotInstalled
+		}
+	}
+
 	// If the installer hasn't started, then don't poll for readiness, because
 	// it may never come. If it has started, then even if it's cancelled we can
 	// be sure that we'll at least see failure for all backend installations.
@@ -138,3 +192,60 @@ func (i *installer) wait(ctx context.Context, backend string) error {
 		return status.err
 	}
 }
+
+// installBackend triggers on-demand installation of a deferred backend.
+// It is idempotent: if the backend is already installed, it returns nil.
+func (i *installer) installBackend(ctx context.Context, name string) error {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	backend, ok := i.backends[name]
+	if !ok {
+		return ErrBackendNotFound
+	}
+
+	status := i.statuses[name]
+
+	// Already installed — nothing to do.
+	select {
+	case <-status.installed:
+		return nil
+	default:
+	}
+
+	// If previously failed, reset status for retry.
+	select {
+	case <-status.failed:
+		status = &installStatus{
+			installed: make(chan struct{}),
+			failed:    make(chan struct{}),
+		}
+		i.statuses[name] = status
+	default:
+	}
+
+	// Perform installation.
+	if err := backend.Install(ctx, i.httpClient); err != nil {
+		status.err = err
+		close(status.failed)
+		return err
+	}
+
+	close(status.installed)
+	return nil
+}
+
+// isInstalled returns true if the given backend has completed installation.
+// It is non-blocking.
+func (i *installer) isInstalled(name string) bool {
+	status, ok := i.statuses[name]
+	if !ok {
+		return false
+	}
+	select {
+	case <-status.installed:
+		return true
+	default:
+		return false
+	}
+}
diff --git a/pkg/inference/scheduling/scheduler.go b/pkg/inference/scheduling/scheduler.go
index a0b1e4679..0ce6beea3 100644
--- a/pkg/inference/scheduling/scheduler.go
+++ b/pkg/inference/scheduling/scheduler.go
@@ -42,9 +42,14 @@ type Scheduler struct {
 	tracker *metrics.Tracker
 	// openAIRecorder is used to record OpenAI API inference requests and responses.
 	openAIRecorder *metrics.OpenAIRecorder
+	// deferredBackends lists backends whose installation is deferred until
+	// explicitly requested (e.g. via install-backend endpoint).
+	deferredBackends []string
 }
 
-// NewScheduler creates a new inference scheduler.
+// NewScheduler creates a new inference scheduler. Backends listed in
+// deferredBackends are not installed automatically during startup; they must
+// be installed on-demand via InstallBackend.
 func NewScheduler(
 	log logging.Logger,
 	backends map[string]inference.Backend,
@@ -52,19 +57,21 @@ func NewScheduler(
 	modelManager *models.Manager,
 	httpClient *http.Client,
 	tracker *metrics.Tracker,
+	deferredBackends []string,
 ) *Scheduler {
 	openAIRecorder := metrics.NewOpenAIRecorder(log.WithField("component", "openai-recorder"), modelManager)
 
 	// Create the scheduler.
 	s := &Scheduler{
-		log:            log,
-		backends:       backends,
-		defaultBackend: defaultBackend,
-		modelManager:   modelManager,
-		installer:      newInstaller(log, backends, httpClient),
-		loader:         newLoader(log, backends, modelManager, openAIRecorder),
-		tracker:        tracker,
-		openAIRecorder: openAIRecorder,
+		log:              log,
+		backends:         backends,
+		defaultBackend:   defaultBackend,
+		modelManager:     modelManager,
+		installer:        newInstaller(log, backends, httpClient, deferredBackends),
+		loader:           newLoader(log, backends, modelManager, openAIRecorder),
+		tracker:          tracker,
+		openAIRecorder:   openAIRecorder,
+		deferredBackends: deferredBackends,
 	}
 
 	// Scheduler successfully initialized.
@@ -105,8 +112,14 @@ func (s *Scheduler) selectBackendForModel(model types.Model, backend inference.B
 	}
 
 	if config.GetFormat() == types.FormatSafetensors {
-		// Prefer vllm-metal for safetensors models on macOS (most feature-rich for Metal)
+		// Prefer vllm-metal for safetensors models on macOS (most feature-rich for Metal),
+		// but only if it has been installed.
 		if vllmMetalBackend, ok := s.backends[vllmmetal.Name]; ok && vllmMetalBackend != nil {
+			if s.installer.isInstalled(vllmmetal.Name) {
+				return vllmMetalBackend
+			}
+			s.log.Infof("vllm-metal backend is available but not installed. "+
+				"To install, run: docker model install-runner --backend %s", vllmmetal.Name)
 			return vllmMetalBackend
 		}
 		// Fall back to MLX on macOS
@@ -131,7 +144,12 @@ func (s *Scheduler) selectBackendForModel(model types.Model, backend inference.B
 
 // ResetInstaller resets the backend installer with a new HTTP client.
 func (s *Scheduler) ResetInstaller(httpClient *http.Client) {
-	s.installer = newInstaller(s.log, s.backends, httpClient)
+	s.installer = newInstaller(s.log, s.backends, httpClient, s.deferredBackends)
+}
+
+// InstallBackend triggers on-demand installation of a deferred backend.
+func (s *Scheduler) InstallBackend(ctx context.Context, name string) error {
+	return s.installer.installBackend(ctx, name)
 }
 
 // GetRunningBackendsInfo returns information about all running backends as a slice
diff --git a/pkg/inference/scheduling/scheduler_test.go b/pkg/inference/scheduling/scheduler_test.go
index 7e8e3fd3f..0c1d4a71b 100644
--- a/pkg/inference/scheduling/scheduler_test.go
+++ b/pkg/inference/scheduling/scheduler_test.go
@@ -33,7 +33,7 @@ func TestCors(t *testing.T) {
 			discard := logrus.New()
 			discard.SetOutput(io.Discard)
 			log := logrus.NewEntry(discard)
-			s := NewScheduler(log, nil, nil, nil, nil, nil)
+			s := NewScheduler(log, nil, nil, nil, nil, nil, nil)
 			httpHandler := NewHTTPHandler(s, nil, []string{"*"})
 			req := httptest.NewRequest(http.MethodOptions, "http://model-runner.docker.internal"+tt.path, http.NoBody)
 			req.Header.Set("Origin", "docker.com")

From 18efa90acc2731fcf25e6aafbb1c8e89c68d62d8 Mon Sep 17 00:00:00 2001
From: Dorin Geman <dorin.geman@docker.com>
Date: Tue, 10 Feb 2026 19:59:48 +0200
Subject: [PATCH 2/5] fix: differentiate MaxBytesError from other read failures
 in the InstallBackend handler

Signed-off-by: Dorin Geman <dorin.geman@docker.com>
---
 pkg/inference/scheduling/http_handler.go | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go
index cdcd59db2..7a7839147 100644
--- a/pkg/inference/scheduling/http_handler.go
+++ b/pkg/inference/scheduling/http_handler.go
@@ -349,7 +349,12 @@ type installBackendRequest struct {
 func (h *HTTPHandler) InstallBackend(w http.ResponseWriter, r *http.Request) {
 	body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
 	if err != nil {
-		http.Error(w, "failed to read request body", http.StatusInternalServerError)
+		var maxBytesError *http.MaxBytesError
+		if errors.As(err, &maxBytesError) {
+			http.Error(w, "request too large", http.StatusBadRequest)
+		} else {
+			http.Error(w, "failed to read request body", http.StatusInternalServerError)
+		}
 		return
 	}
 

From 07f29dfeceff1be34875125515c6133a1dc491e1 Mon Sep 17 00:00:00 2001
From: Dorin Geman <dorin.geman@docker.com>
Date: Tue, 10 Feb 2026 20:08:24 +0200
Subject: [PATCH 3/5] fix(installer): protect statuses map against concurrent
 read/write

Signed-off-by: Dorin Geman <dorin.geman@docker.com>
---
 pkg/inference/scheduling/installer.go | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/pkg/inference/scheduling/installer.go b/pkg/inference/scheduling/installer.go
index 2178f2e85..2b51c9442 100644
--- a/pkg/inference/scheduling/installer.go
+++ b/pkg/inference/scheduling/installer.go
@@ -51,8 +51,9 @@ type installer struct {
 	// deferredBackends tracks backends whose installation is deferred until
 	// explicitly requested via installBackend.
 	deferredBackends map[string]bool
-	// mu protects on-demand installation via installBackend.
-	mu sync.Mutex
+	// mu protects statuses map mutations in installBackend. Readers
+	// (wait, isInstalled) take an RLock; installBackend takes a full Lock.
+	mu sync.RWMutex
 }
 
 // newInstaller creates a new backend installer. Backends listed in
@@ -155,8 +156,10 @@ func (i *installer) run(ctx context.Context) {
 // For deferred backends that have never been installed, it returns
 // errBackendNotInstalled immediately instead of blocking.
 func (i *installer) wait(ctx context.Context, backend string) error {
-	// Grab the backend status.
+	// Grab the backend status under a read lock, since installBackend may replace entries in the map.
+	i.mu.RLock()
 	status, ok := i.statuses[backend]
+	i.mu.RUnlock()
 	if !ok {
 		return ErrBackendNotFound
 	}
@@ -238,7 +241,9 @@ func (i *installer) installBackend(ctx context.Context, name string) error {
 // isInstalled returns true if the given backend has completed installation.
 // It is non-blocking.
 func (i *installer) isInstalled(name string) bool {
+	i.mu.RLock()
 	status, ok := i.statuses[name]
+	i.mu.RUnlock()
 	if !ok {
 		return false
 	}

From a5efa9f4c589084ed754acce9aaf663279c37763 Mon Sep 17 00:00:00 2001
From: Dorin Geman <dorin.geman@docker.com>
Date: Wed, 11 Feb 2026 11:03:11 +0200
Subject: [PATCH 4/5] feat: auto-pull deferred backends on demand

Signed-off-by: Dorin Geman <dorin.geman@docker.com>
---
 pkg/inference/scheduling/installer.go | 42 ++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/pkg/inference/scheduling/installer.go b/pkg/inference/scheduling/installer.go
index 2b51c9442..8703ad031 100644
--- a/pkg/inference/scheduling/installer.go
+++ b/pkg/inference/scheduling/installer.go
@@ -54,6 +54,10 @@ type installer struct {
 	// mu protects statuses map mutations in installBackend. Readers
 	// (wait, isInstalled) take an RLock; installBackend takes a full Lock.
 	mu sync.RWMutex
+	// installMu serializes on-demand install operations so that only one
+	// goroutine performs the actual download at a time. Held independently
+	// of mu so that long-running installs don't block map readers.
+	installMu sync.Mutex
 }
 
 // newInstaller creates a new backend installer. Backends listed in
@@ -118,8 +122,8 @@ func (i *installer) run(ctx context.Context) {
 					close(status.installed)
 				}
 			}
-			// If not on disk, leave channels open so wait() returns
-			// errBackendNotInstalled.
+			// If not on disk, leave channels open so wait() can trigger
+			// on-demand installation when the backend is first needed.
 			continue
 		}
 
@@ -153,8 +157,9 @@ func (i *installer) run(ctx context.Context) {
 }
 
 // wait waits for installation of the specified backend to complete or fail.
-// For deferred backends that have never been installed, it returns
-// errBackendNotInstalled immediately instead of blocking.
+// For deferred backends that have not yet been installed, it triggers
+// on-demand installation (auto-pull), blocking until complete or the caller's
+// context is cancelled.
 func (i *installer) wait(ctx context.Context, backend string) error {
 	// Grab the backend status under a read lock, since installBackend may replace entries in the map.
 	i.mu.RLock()
@@ -164,9 +169,8 @@ func (i *installer) wait(ctx context.Context, backend string) error {
 		return ErrBackendNotFound
 	}
 
-	// For deferred backends, check whether installation has completed without
-	// blocking. This doesn't depend on the installer being started, since
-	// deferred backends are installed on-demand, not by the run loop.
+	// For deferred backends, check whether installation has already completed.
+	// If not, trigger on-demand installation (auto-pull).
 	if i.deferredBackends[backend] {
 		select {
 		case <-status.installed:
@@ -174,7 +178,7 @@ func (i *installer) wait(ctx context.Context, backend string) error {
 		case <-status.failed:
 			return status.err
 		default:
-			return errBackendNotInstalled
+			return i.installBackend(ctx, backend)
 		}
 	}
 
@@ -198,16 +202,24 @@ func (i *installer) wait(ctx context.Context, backend string) error {
 
 // installBackend triggers on-demand installation of a deferred backend.
 // It is idempotent: if the backend is already installed, it returns nil.
+// installMu serializes actual downloads so only one goroutine installs at a
+// time, while mu is held only briefly for map reads/writes so that other
+// goroutines calling wait() or isInstalled() are not blocked during the
+// (potentially long) Install() call.
 func (i *installer) installBackend(ctx context.Context, name string) error {
-	i.mu.Lock()
-	defer i.mu.Unlock()
+	// Serialize install operations so only one download runs at a time.
+	i.installMu.Lock()
+	defer i.installMu.Unlock()
 
 	backend, ok := i.backends[name]
 	if !ok {
 		return ErrBackendNotFound
 	}
 
+	// Check current status under read lock.
+	i.mu.RLock()
 	status := i.statuses[name]
+	i.mu.RUnlock()
 
 	// Already installed — nothing to do.
 	select {
@@ -223,12 +235,20 @@ func (i *installer) installBackend(ctx context.Context, name string) error {
 			installed: make(chan struct{}),
 			failed:    make(chan struct{}),
 		}
+		i.mu.Lock()
 		i.statuses[name] = status
+		i.mu.Unlock()
 	default:
 	}
 
-	// Perform installation.
+	// Perform installation without holding mu.
 	if err := backend.Install(ctx, i.httpClient); err != nil {
+		// If the caller's context was cancelled (e.g. Ctrl-C), don't
+		// permanently mark the backend as failed — leave channels open
+		// so the next request can retry.
+		if ctx.Err() != nil {
+			return err
+		}
 		status.err = err
 		close(status.failed)
 		return err

From 8b6c1ae1dba25ff6a102d818c9fb40a2650d7bb0 Mon Sep 17 00:00:00 2001
From: Dorin Geman <dorin.geman@docker.com>
Date: Wed, 11 Feb 2026 11:20:42 +0200
Subject: [PATCH 5/5] feat: stream backend installation progress to CLI users

Signed-off-by: Dorin Geman <dorin.geman@docker.com>
---
 cmd/cli/desktop/desktop.go               | 52 ++++++++++++++++++++++--
 pkg/inference/scheduling/api.go          |  3 ++
 pkg/inference/scheduling/http_handler.go | 26 +++++++++++-
 3 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/cmd/cli/desktop/desktop.go b/cmd/cli/desktop/desktop.go
index 1e864bc75..b2b5376bc 100644
--- a/cmd/cli/desktop/desktop.go
+++ b/cmd/cli/desktop/desktop.go
@@ -479,12 +479,58 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
 		TotalTokens      int `json:"total_tokens"`
 	}
 
-	// Detect streaming vs non-streaming response via Content-Type header
+	// Use a buffered reader so we can consume server-sent progress
+	// lines (e.g. "Installing vllm-metal backend...") that arrive
+	// before the actual SSE or JSON inference response.
+	br := bufio.NewReader(resp.Body)
+
+	// Consume any plain-text progress lines that precede the real
+	// response. We peek ahead: if the next non-empty content starts
+	// with '{' (JSON) or "data:" / ":" (SSE), the progress section
+	// is over and we fall through to normal processing.
+	for {
+		peek, err := br.Peek(1)
+		if err != nil {
+			break
+		}
+		// JSON object or SSE stream — stop consuming progress lines.
+		if peek[0] == '{' || peek[0] == ':' {
+			break
+		}
+		line, err := br.ReadString('\n')
+		if err != nil && line == "" {
+			break
+		}
+		line = strings.TrimRight(line, "\r\n")
+		if line == "" {
+			continue
+		}
+		// SSE data line — stop, let the normal SSE parser handle it.
+		if strings.HasPrefix(line, "data:") {
+			// Put the line back by chaining a reader with the rest.
+			br = bufio.NewReader(io.MultiReader(
+				strings.NewReader(line+"\n"),
+				br,
+			))
+			break
+		}
+		// Progress message — print to stderr.
+		fmt.Fprintln(os.Stderr, line)
+	}
+
+	// Detect streaming vs non-streaming response. Because server-sent
+	// progress lines may have been flushed before the Content-Type was
+	// set, we also peek at the body content to detect SSE.
 	isStreaming := strings.HasPrefix(resp.Header.Get("Content-Type"), "text/event-stream")
+	if !isStreaming {
+		if peek, err := br.Peek(5); err == nil {
+			isStreaming = strings.HasPrefix(string(peek), "data:")
+		}
+	}
 
 	if !isStreaming {
 		// Non-streaming JSON response
-		body, err := io.ReadAll(resp.Body)
+		body, err := io.ReadAll(br)
 		if err != nil {
 			return assistantResponse.String(), fmt.Errorf("error reading response body: %w", err)
 		}
@@ -506,7 +552,7 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
 		}
 	} else {
 		// SSE streaming response - process line by line
-		scanner := bufio.NewScanner(resp.Body)
+		scanner := bufio.NewScanner(br)
 
 		for scanner.Scan() {
 			// Check if context was cancelled
diff --git a/pkg/inference/scheduling/api.go b/pkg/inference/scheduling/api.go
index 7cd444a4b..143a072b1 100644
--- a/pkg/inference/scheduling/api.go
+++ b/pkg/inference/scheduling/api.go
@@ -13,6 +13,9 @@ const (
 	// enough to encompass any real-world request but also small enough to avoid
 	// DoS attacks.
 	maximumOpenAIInferenceRequestSize = 10 * 1024 * 1024
+
+	// modelCLIUserAgentPrefix is the user-agent prefix set by the model CLI.
+	modelCLIUserAgentPrefix = "docker-model-cli/"
 )
 
 // trimRequestPathToOpenAIRoot trims a request path to start at the first
diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go
index 7a7839147..90050cac8 100644
--- a/pkg/inference/scheduling/http_handler.go
+++ b/pkg/inference/scheduling/http_handler.go
@@ -198,11 +198,28 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
 		backend = h.scheduler.selectBackendForModel(model, backend, request.Model)
 	}
 
+	// If a deferred backend needs on-demand installation and the request
+	// comes from the model CLI, stream progress messages so the user sees
+	// what is happening while the download runs.
+	autoInstall := h.scheduler.installer.deferredBackends[backend.Name()] &&
+		!h.scheduler.installer.isInstalled(backend.Name()) &&
+		strings.Contains(r.UserAgent(), modelCLIUserAgentPrefix)
+	if autoInstall {
+		fmt.Fprintf(w, "Installing %s backend...\n", backend.Name())
+		if f, ok := w.(http.Flusher); ok {
+			f.Flush()
+		}
+	}
+
 	// Wait for the corresponding backend installation to complete or fail. We
 	// don't allow any requests to be scheduled for a backend until it has
 	// completed installation.
 	if err := h.scheduler.installer.wait(r.Context(), backend.Name()); err != nil {
-		if errors.Is(err, ErrBackendNotFound) {
+		if autoInstall {
+			// Headers are already sent (200 OK) from the progress
+			// line, so we can only write the error as plain text.
+			fmt.Fprintf(w, "backend installation failed: %v\n", err)
+		} else if errors.Is(err, ErrBackendNotFound) {
 			http.Error(w, err.Error(), http.StatusNotFound)
 		} else if errors.Is(err, errInstallerNotStarted) {
 			http.Error(w, err.Error(), http.StatusServiceUnavailable)
@@ -222,6 +239,13 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
 		return
 	}
 
+	if autoInstall {
+		fmt.Fprintf(w, "%s backend installed successfully\n", backend.Name())
+		if f, ok := w.(http.Flusher); ok {
+			f.Flush()
+		}
+	}
+
 	modelID := h.scheduler.modelManager.ResolveID(request.Model)
 
 	// Request a runner to execute the request and defer its release.