Skip to content

Commit

Permalink
fix: fix the error message for auto checkpoint download (#5201)
Browse files Browse the repository at this point in the history
(cherry picked from commit 98ef84e)
  • Loading branch information
hanyucui authored and dzhu committed Oct 10, 2022
1 parent 0e9f57b commit 56372bb
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
26 changes: 21 additions & 5 deletions harness/determined/common/experimental/checkpoint/_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,7 @@ def download(self, path: Optional[str] = None, mode: DownloadMode = DownloadMode
self._download_via_master(self._session, self.uuid, local_ckpt_dir)

elif mode == DownloadMode.AUTO:
try:
self._download_direct(checkpoint_storage, local_ckpt_dir)
except errors.NoDirectStorageAccess:
logging.info("Unable to download directly, proxying download through master")
self._download_via_master(self._session, self.uuid, local_ckpt_dir)
self._download_auto(checkpoint_storage, local_ckpt_dir)

else:
raise ValueError(f"Unknown download mode {mode}")
Expand All @@ -190,6 +186,26 @@ def download(self, path: Optional[str] = None, mode: DownloadMode = DownloadMode

return str(local_ckpt_dir)

def _download_auto(
    self, checkpoint_storage: Dict[str, Any], local_ckpt_dir: pathlib.Path
) -> None:
    """Download the checkpoint directly, falling back to the master proxy.

    A direct download from checkpoint storage is attempted first. If it
    fails with ``errors.NoDirectStorageAccess`` and the storage ``type`` is
    ``"s3"``, the download is retried through the master.

    Args:
        checkpoint_storage: Checkpoint storage configuration; its ``"type"``
            entry is read to decide whether the proxied fallback applies.
        local_ckpt_dir: Local directory the checkpoint is downloaded into.

    Raises:
        errors.NoDirectStorageAccess: The direct download failed and the
            storage type is not ``"s3"``, so no fallback is attempted.
        errors.MultipleDownloadsFailed: The direct download failed and the
            proxied download through master failed as well.
    """
    try:
        self._download_direct(checkpoint_storage, local_ckpt_dir)
    except errors.NoDirectStorageAccess:
        # The fallback stays inside this handler so that the bare ``raise``
        # re-raises the direct-download error and any later failure keeps
        # it in its exception context.
        can_proxy = checkpoint_storage["type"] == "s3"
        if not can_proxy:
            raise

        logging.info("Unable to download directly, proxying download through master")
        try:
            self._download_via_master(self._session, self.uuid, local_ckpt_dir)
        except Exception as proxy_error:
            both_failed_msg = (
                "Auto checkpoint download mode was enabled. "
                "Attempted direct download and proxied download through master "
                "but they both failed."
            )
            raise errors.MultipleDownloadsFailed(both_failed_msg) from proxy_error

def _download_direct(
self, checkpoint_storage: Dict[str, Any], local_ckpt_dir: pathlib.Path
) -> None:
Expand Down
6 changes: 6 additions & 0 deletions harness/determined/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,9 @@ class ProxiedDownloadFailed(Exception):
"""Proxied checkpoint download through master failed"""

pass


class MultipleDownloadsFailed(Exception):
    """Raised when more than one checkpoint download method was tried and all failed.

    Carries no state beyond the standard ``Exception`` arguments.
    """

0 comments on commit 56372bb

Please sign in to comment.