From 5984308c177d3bbd390e52a71b69e692ca80956b Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Thu, 13 Jul 2023 15:00:48 +0500 Subject: [PATCH] Handle backend creds becoming invalid --- cli/dstack/_internal/backend/aws/__init__.py | 9 ++++ .../_internal/backend/azure/__init__.py | 41 +++++++++++-------- cli/dstack/_internal/backend/gcp/auth.py | 8 +++- cli/dstack/_internal/core/error.py | 8 +++- .../hub/background/tasks/resubmit_jobs.py | 19 +++++++-- cli/dstack/_internal/hub/routers/artifacts.py | 3 +- .../_internal/hub/routers/configurations.py | 3 +- cli/dstack/_internal/hub/routers/jobs.py | 3 +- cli/dstack/_internal/hub/routers/link.py | 3 +- cli/dstack/_internal/hub/routers/logs.py | 3 +- cli/dstack/_internal/hub/routers/projects.py | 2 +- cli/dstack/_internal/hub/routers/repos.py | 3 +- cli/dstack/_internal/hub/routers/runners.py | 3 +- cli/dstack/_internal/hub/routers/runs.py | 3 +- cli/dstack/_internal/hub/routers/secrets.py | 3 +- cli/dstack/_internal/hub/routers/storage.py | 3 +- cli/dstack/_internal/hub/routers/tags.py | 3 +- cli/dstack/_internal/hub/routers/util.py | 13 ++++++ .../{routers => services/backends}/cache.py | 1 - 19 files changed, 85 insertions(+), 49 deletions(-) rename cli/dstack/_internal/hub/{routers => services/backends}/cache.py (98%) diff --git a/cli/dstack/_internal/backend/aws/__init__.py b/cli/dstack/_internal/backend/aws/__init__.py index 6e4f8326d..34d2c05bd 100644 --- a/cli/dstack/_internal/backend/aws/__init__.py +++ b/cli/dstack/_internal/backend/aws/__init__.py @@ -1,5 +1,6 @@ from typing import Optional +import botocore.exceptions from boto3 import Session from dstack._internal.backend.aws import utils as aws_utils @@ -10,6 +11,7 @@ from dstack._internal.backend.aws.storage import AWSStorage from dstack._internal.backend.base import ComponentBasedBackend from dstack._internal.backend.base import runs as base_runs +from dstack._internal.core.error import BackendAuthError class AwsBackend(ComponentBasedBackend): @@ -46,6 +48,7 @@ def __init__( logs_client=aws_utils.get_logs_client(self._session), bucket_name=self.backend_config.bucket_name, ) + self._check_credentials() @classmethod def load(cls) -> Optional["AwsBackend"]: @@ -73,3 +76,9 @@ def create_run(self, repo_id: str) -> str: aws_utils.get_logs_client(self._session), self.backend_config.bucket_name, repo_id ) return base_runs.create_run(self._storage) + + def _check_credentials(self): + try: + self.list_repo_heads() + except (botocore.exceptions.ClientError, botocore.exceptions.NoCredentialsError): + raise BackendAuthError() diff --git a/cli/dstack/_internal/backend/azure/__init__.py b/cli/dstack/_internal/backend/azure/__init__.py index d51ed4c42..24e4c9a9d 100644 --- a/cli/dstack/_internal/backend/azure/__init__.py +++ b/cli/dstack/_internal/backend/azure/__init__.py @@ -1,6 +1,7 @@ from typing import Optional from azure.core.credentials import TokenCredential +from azure.core.exceptions import ClientAuthenticationError from azure.identity import ClientSecretCredential, DefaultAzureCredential from dstack._internal.backend.azure.compute import AzureCompute @@ -9,6 +10,7 @@ from dstack._internal.backend.azure.secrets import AzureSecretsManager from dstack._internal.backend.azure.storage import AzureStorage from dstack._internal.backend.base import ComponentBasedBackend +from dstack._internal.core.error import BackendAuthError class AzureBackend(ComponentBasedBackend): @@ -25,24 +27,27 @@ def __init__(self, backend_config: AzureConfig, credential: Optional[TokenCreden ) else: credential = DefaultAzureCredential() - self._secrets_manager = AzureSecretsManager( - credential=credential, - vault_url=self.backend_config.vault_url, - ) - self._storage = AzureStorage( - credential=credential, - storage_account=self.backend_config.storage_account, - ) - self._compute = AzureCompute( - credential=credential, - azure_config=self.backend_config, - ) - self._logging = AzureLogging( - credential=credential, - subscription_id=self.backend_config.subscription_id, - resource_group=self.backend_config.resource_group, - storage_account=self.backend_config.storage_account, - ) + try: + self._secrets_manager = AzureSecretsManager( + credential=credential, + vault_url=self.backend_config.vault_url, + ) + self._storage = AzureStorage( + credential=credential, + storage_account=self.backend_config.storage_account, + ) + self._compute = AzureCompute( + credential=credential, + azure_config=self.backend_config, + ) + self._logging = AzureLogging( + credential=credential, + subscription_id=self.backend_config.subscription_id, + resource_group=self.backend_config.resource_group, + storage_account=self.backend_config.storage_account, + ) + except ClientAuthenticationError: + raise BackendAuthError() @classmethod def load(cls) -> Optional["AzureBackend"]: diff --git a/cli/dstack/_internal/backend/gcp/auth.py b/cli/dstack/_internal/backend/gcp/auth.py index 71e7d5b55..c3ad37b30 100644 --- a/cli/dstack/_internal/backend/gcp/auth.py +++ b/cli/dstack/_internal/backend/gcp/auth.py @@ -5,11 +5,12 @@ import google.auth import googleapiclient.discovery import googleapiclient.errors +from google.auth.exceptions import DefaultCredentialsError from google.oauth2 import service_account from dstack._internal.backend.gcp import utils as gcp_utils from dstack._internal.backend.gcp.config import GCPConfig -from dstack._internal.core.error import BackendError +from dstack._internal.core.error import BackendAuthError, BackendError class NotEnoughPermissionError(BackendError): @@ -21,7 +22,10 @@ def authenticate(backend_config: GCPConfig): return service_account.Credentials.from_service_account_info( json.loads(backend_config.credentials["data"]) ) - default_credentials, _ = google.auth.default() + try: + default_credentials, _ = google.auth.default() + except DefaultCredentialsError: + raise BackendAuthError() service_account_email = backend_config.credentials["service_account_email"] iam_service = googleapiclient.discovery.build("iam", "v1", credentials=default_credentials) diff --git a/cli/dstack/_internal/core/error.py b/cli/dstack/_internal/core/error.py index a8403dc56..23c5a456e 100644 --- a/cli/dstack/_internal/core/error.py +++ b/cli/dstack/_internal/core/error.py @@ -7,8 +7,12 @@ def __init__(self, message: Optional[str] = None): class BackendError(DstackError): - def __init__(self, message: Optional[str] = None): - self.message = message + pass + + +class BackendAuthError(BackendError): + code = "invalid_backend_credentials" + message = "Backend credentials are invalid" class NoMatchingInstanceError(BackendError): diff --git a/cli/dstack/_internal/hub/background/tasks/resubmit_jobs.py b/cli/dstack/_internal/hub/background/tasks/resubmit_jobs.py index 98adc6f29..88a45e72a 100644 --- a/cli/dstack/_internal/hub/background/tasks/resubmit_jobs.py +++ b/cli/dstack/_internal/hub/background/tasks/resubmit_jobs.py @@ -1,13 +1,17 @@ from typing import List from dstack._internal.backend.base import Backend +from dstack._internal.core.error import BackendAuthError from dstack._internal.core.job import JobStatus from dstack._internal.hub.db.models import Project from dstack._internal.hub.repository.projects import ProjectManager -from dstack._internal.hub.routers.cache import get_backend from dstack._internal.hub.services.backends import get_configurator +from dstack._internal.hub.services.backends.cache import get_backend from dstack._internal.hub.utils.common import run_async from dstack._internal.utils.common import get_milliseconds_since_epoch +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) RESUBMISSION_INTERVAL = 60 @@ -19,11 +23,19 @@ async def resubmit_jobs(): async def _resubmit_projects_jobs(projects: List[Project]): for project in projects: - backend = await get_backend(project) - configurator = get_configurator(backend) + logger.info("Resubmitting jobs for %s project", project.name) + try: + backend = await get_backend(project) + except BackendAuthError: + logger.warning( + "Credentials for %s project are invalid. Skipping job resubmission.", project.name + ) + continue + configurator = get_configurator(backend.name) if configurator is None: continue await run_async(_resubmit_backend_jobs, backend) + logger.info("Finished resubmitting jobs for %s project", project.name) def _resubmit_backend_jobs(backend: Backend): @@ -59,3 +71,4 @@ def _resubmit_backend_jobs(backend: Backend): job=job, failed_to_start_job_new_status=JobStatus.FAILED, ) + logger.info("Resubmitted job %s", job.job_id) diff --git a/cli/dstack/_internal/hub/routers/artifacts.py b/cli/dstack/_internal/hub/routers/artifacts.py index 68b088276..b13e195f5 100644 --- a/cli/dstack/_internal/hub/routers/artifacts.py +++ b/cli/dstack/_internal/hub/routers/artifacts.py @@ -4,8 +4,7 @@ from dstack._internal.core.artifact import Artifact from dstack._internal.hub.models import ArtifactsList -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import get_project +from dstack._internal.hub.routers.util import get_backend, get_project from dstack._internal.hub.security.permissions import ProjectMember from dstack._internal.hub.utils.common import run_async diff --git a/cli/dstack/_internal/hub/routers/configurations.py b/cli/dstack/_internal/hub/routers/configurations.py index 5f6fbfdd7..80b28ddbc 100644 --- a/cli/dstack/_internal/hub/routers/configurations.py +++ b/cli/dstack/_internal/hub/routers/configurations.py @@ -2,8 +2,7 @@ from dstack._internal.core.repo import RepoRef from dstack._internal.hub.db.models import User -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import get_project +from dstack._internal.hub.routers.util import get_backend, get_project from dstack._internal.hub.security.permissions import Authenticated, ProjectMember from dstack._internal.hub.utils.common import run_async diff --git a/cli/dstack/_internal/hub/routers/jobs.py b/cli/dstack/_internal/hub/routers/jobs.py index 41236490c..97343437d 100644 --- a/cli/dstack/_internal/hub/routers/jobs.py +++ b/cli/dstack/_internal/hub/routers/jobs.py @@ -5,8 +5,7 @@ from dstack._internal.core.job import Job, JobHead from dstack._internal.hub.db.models import User from dstack._internal.hub.models import JobHeadList, JobsGet, JobsList -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import get_project +from dstack._internal.hub.routers.util import get_backend, get_project from dstack._internal.hub.security.permissions import Authenticated, ProjectMember from dstack._internal.hub.utils.common import run_async diff --git a/cli/dstack/_internal/hub/routers/link.py b/cli/dstack/_internal/hub/routers/link.py index 0fffe320d..9b815da4d 100644 --- a/cli/dstack/_internal/hub/routers/link.py +++ b/cli/dstack/_internal/hub/routers/link.py @@ -4,8 +4,7 @@ from dstack._internal.backend.local import LocalBackend from dstack._internal.hub.models import StorageLink -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import get_project +from dstack._internal.hub.routers.util import get_backend, get_project from dstack._internal.hub.security.permissions import ProjectMember from dstack._internal.hub.utils.common import run_async diff --git a/cli/dstack/_internal/hub/routers/logs.py b/cli/dstack/_internal/hub/routers/logs.py index e9552d4e3..149fc3887 100644 --- a/cli/dstack/_internal/hub/routers/logs.py +++ b/cli/dstack/_internal/hub/routers/logs.py @@ -6,8 +6,7 @@ from dstack._internal.core.log_event import LogEvent from dstack._internal.hub.models import PollLogs -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import get_project +from dstack._internal.hub.routers.util import get_backend, get_project from dstack._internal.hub.security.permissions import ProjectMember from dstack._internal.hub.utils.common import run_async from dstack._internal.utils.common import get_current_datetime diff --git a/cli/dstack/_internal/hub/routers/projects.py b/cli/dstack/_internal/hub/routers/projects.py index 55c22a659..b7eade317 100644 --- a/cli/dstack/_internal/hub/routers/projects.py +++ b/cli/dstack/_internal/hub/routers/projects.py @@ -12,7 +12,6 @@ ProjectValues, ) from dstack._internal.hub.repository.projects import ProjectManager -from dstack._internal.hub.routers.cache import clear_backend_cache from dstack._internal.hub.routers.util import error_detail, get_backend_configurator, get_project from dstack._internal.hub.security.permissions import ( Authenticated, @@ -21,6 +20,7 @@ ensure_user_project_admin, ) from dstack._internal.hub.services.backends.base import BackendConfigError +from dstack._internal.hub.services.backends.cache import clear_backend_cache from dstack._internal.hub.utils.common import run_async router = APIRouter(prefix="/api/projects", tags=["project"]) diff --git a/cli/dstack/_internal/hub/routers/repos.py b/cli/dstack/_internal/hub/routers/repos.py index 3fc33c1c6..526469780 100644 --- a/cli/dstack/_internal/hub/routers/repos.py +++ b/cli/dstack/_internal/hub/routers/repos.py @@ -4,8 +4,7 @@ from dstack._internal.core.repo import RemoteRepoCredentials, RepoHead, RepoRef from dstack._internal.hub.models import RepoHeadGet, ReposDelete, ReposUpdate, SaveRepoCredentials -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import error_detail, get_project +from dstack._internal.hub.routers.util import error_detail, get_backend, get_project from dstack._internal.hub.security.permissions import ProjectMember from dstack._internal.hub.utils.common import run_async diff --git a/cli/dstack/_internal/hub/routers/runners.py b/cli/dstack/_internal/hub/routers/runners.py index be196f3dc..eb7c4fb7b 100644 --- a/cli/dstack/_internal/hub/routers/runners.py +++ b/cli/dstack/_internal/hub/routers/runners.py @@ -4,8 +4,7 @@ from dstack._internal.core.error import NoMatchingInstanceError from dstack._internal.core.job import Job, JobStatus from dstack._internal.hub.models import StopRunners -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import error_detail, get_project +from dstack._internal.hub.routers.util import error_detail, get_backend, get_project from dstack._internal.hub.security.permissions import ProjectMember from dstack._internal.hub.utils.common import run_async diff --git a/cli/dstack/_internal/hub/routers/runs.py b/cli/dstack/_internal/hub/routers/runs.py index 29043242d..9492bd6b8 100644 --- a/cli/dstack/_internal/hub/routers/runs.py +++ b/cli/dstack/_internal/hub/routers/runs.py @@ -12,8 +12,7 @@ from dstack._internal.core.run import RunHead from dstack._internal.hub.db.models import User from dstack._internal.hub.models import RunsDelete, RunsGetPlan, RunsList, RunsStop -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import error_detail, get_project +from dstack._internal.hub.routers.util import error_detail, get_backend, get_project from dstack._internal.hub.security.permissions import Authenticated, ProjectMember from dstack._internal.hub.utils.common import run_async diff --git a/cli/dstack/_internal/hub/routers/secrets.py b/cli/dstack/_internal/hub/routers/secrets.py index 7e3ea6e9b..e843b90ab 100644 --- a/cli/dstack/_internal/hub/routers/secrets.py +++ b/cli/dstack/_internal/hub/routers/secrets.py @@ -5,8 +5,7 @@ from dstack._internal.core.repo import RepoRef from dstack._internal.core.secret import Secret from dstack._internal.hub.models import SecretAddUpdate -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import error_detail, get_project +from dstack._internal.hub.routers.util import error_detail, get_backend, get_project from dstack._internal.hub.security.permissions import ProjectMember from dstack._internal.hub.utils.common import run_async diff --git a/cli/dstack/_internal/hub/routers/storage.py b/cli/dstack/_internal/hub/routers/storage.py index b0671cbf0..f80780b8c 100644 --- a/cli/dstack/_internal/hub/routers/storage.py +++ b/cli/dstack/_internal/hub/routers/storage.py @@ -8,8 +8,7 @@ from dstack._internal.backend.base import Backend from dstack._internal.backend.local import LocalBackend from dstack._internal.hub.models import FileObject -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import error_detail, get_project +from dstack._internal.hub.routers.util import error_detail, get_backend, get_project from dstack._internal.hub.security.permissions import ProjectMember diff --git a/cli/dstack/_internal/hub/routers/tags.py b/cli/dstack/_internal/hub/routers/tags.py index 31c99654d..72cb4c7ac 100644 --- a/cli/dstack/_internal/hub/routers/tags.py +++ b/cli/dstack/_internal/hub/routers/tags.py @@ -5,8 +5,7 @@ from dstack._internal.core.repo import RepoRef from dstack._internal.core.tag import TagHead from dstack._internal.hub.models import AddTagPath, AddTagRun -from dstack._internal.hub.routers.cache import get_backend -from dstack._internal.hub.routers.util import error_detail, get_project +from dstack._internal.hub.routers.util import error_detail, get_backend, get_project from dstack._internal.hub.security.permissions import ProjectMember from dstack._internal.hub.utils.common import run_async diff --git a/cli/dstack/_internal/hub/routers/util.py b/cli/dstack/_internal/hub/routers/util.py index d11a6f741..909e2e4e2 100644 --- a/cli/dstack/_internal/hub/routers/util.py +++ b/cli/dstack/_internal/hub/routers/util.py @@ -2,8 +2,11 @@ from fastapi import HTTPException, status +from dstack._internal.backend.base import Backend +from dstack._internal.core.error import BackendAuthError from dstack._internal.hub.models import Project from dstack._internal.hub.repository.projects import ProjectManager +from dstack._internal.hub.services.backends import cache as backends_cache from dstack._internal.hub.services.backends import get_configurator from dstack._internal.hub.services.backends.base import Configurator @@ -19,6 +22,16 @@ async def get_project(project_name: str) -> Project: return project +async def get_backend(project: Project) -> Optional[Backend]: + try: + return await backends_cache.get_backend(project) + except BackendAuthError: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=error_detail(BackendAuthError.message, code=BackendAuthError.code), + ) + + def get_backend_configurator(backend_type: str) -> Configurator: configurator = get_configurator(backend_type) if configurator is None: diff --git a/cli/dstack/_internal/hub/routers/cache.py b/cli/dstack/_internal/hub/services/backends/cache.py similarity index 98% rename from cli/dstack/_internal/hub/routers/cache.py rename to cli/dstack/_internal/hub/services/backends/cache.py index 02d21f500..758cd68b8 100644 --- a/cli/dstack/_internal/hub/routers/cache.py +++ b/cli/dstack/_internal/hub/services/backends/cache.py @@ -1,4 +1,3 @@ -import json from typing import Optional from dstack._internal.backend.base import Backend