From 5e44e00d61c2693772e48e534e44425a5589506b Mon Sep 17 00:00:00 2001 From: Jvst Me Date: Wed, 5 Mar 2025 12:00:47 +0100 Subject: [PATCH] Try more offers when starting a job Increase the number of offers tried from 15 to 25. Also make this configurable. In particular, this is needed for RunPod Community Cloud spot offers, because availability detection is imprecise there, so many offers may turn out to be unavailable. The increased limit can be seen as a temporary measure until we improve the availability detection in RunPod. --- docs/docs/reference/environment-variables.md | 2 ++ .../server/background/tasks/process_submitted_jobs.py | 3 ++- src/dstack/_internal/server/settings.py | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/docs/reference/environment-variables.md b/docs/docs/reference/environment-variables.md index ab075bf64..3ba520995 100644 --- a/docs/docs/reference/environment-variables.md +++ b/docs/docs/reference/environment-variables.md @@ -112,6 +112,8 @@ For more details on the options below, refer to the [server deployment](../guide * `DSTACK_SERVER_ROOT_LOG_LEVEL` – Sets root logger log level. Defaults to `ERROR`. * `DSTACK_SERVER_UVICORN_LOG_LEVEL` – Sets uvicorn logger log level. Defaults to `ERROR`. + * `DSTACK_SERVER_MAX_OFFERS_TRIED` – Sets how many instance offers to try when starting a job. Defaults to `25`. + Setting a high value can degrade server performance. * `DSTACK_RUNNER_VERSION` – Sets exact runner version for debug. Defaults to `latest`. Ignored if `DSTACK_RUNNER_DOWNLOAD_URL` is set. * `DSTACK_RUNNER_DOWNLOAD_URL` – Overrides `dstack-runner` binary download URL. * `DSTACK_SHIM_DOWNLOAD_URL` – Overrides `dstack-shim` binary download URL. 
diff --git a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py index c32f04af1..f45b44db7 100644 --- a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py @@ -35,6 +35,7 @@ ) from dstack._internal.core.models.volumes import Volume from dstack._internal.core.services.profiles import get_termination +from dstack._internal.server import settings from dstack._internal.server.db import get_db, get_session_ctx from dstack._internal.server.models import ( FleetModel, @@ -452,7 +453,7 @@ async def _run_job_on_new_instance( ) # Limit number of offers tried to prevent long-running processing # in case all offers fail. - for backend, offer in offers[:15]: + for backend, offer in offers[: settings.MAX_OFFERS_TRIED]: logger.debug( "%s: trying %s in %s/%s for $%0.4f per hour", fmt(job_model), diff --git a/src/dstack/_internal/server/settings.py b/src/dstack/_internal/server/settings.py index 786769209..47eb68a82 100644 --- a/src/dstack/_internal/server/settings.py +++ b/src/dstack/_internal/server/settings.py @@ -31,6 +31,8 @@ DB_POOL_SIZE = int(os.getenv("DSTACK_DB_POOL_SIZE", 10)) DB_MAX_OVERFLOW = int(os.getenv("DSTACK_DB_MAX_OVERFLOW", 10)) +MAX_OFFERS_TRIED = int(os.getenv("DSTACK_SERVER_MAX_OFFERS_TRIED", 25)) + SERVER_CONFIG_DISABLED = os.getenv("DSTACK_SERVER_CONFIG_DISABLED") is not None SERVER_CONFIG_ENABLED = not SERVER_CONFIG_DISABLED