Introduce dvc.hash_info.HashInfo and use it for tree hash results.

The get_hash / get_file_hash / get_dir_hash / save_dir_info helpers used to
return a (name, value) tuple; with this patch they return a HashInfo
dataclass whose truthiness follows its value. Callers are switched from
tuple unpacking / [1] indexing to the .name / .value attributes.

Review notes on this revision of the patch:
  * BaseTree.get_hash returns a falsy HashInfo(PARAM_CHECKSUM, None) for a
    missing path (same as the gs tree does for a missing blob), because the
    patched callers read .name / .value on the result unconditionally.
  * RepoDependency._get_checksum returns .value on the file branch as well,
    so both visible branches return the hash string.

diff --git a/dvc/cache/base.py b/dvc/cache/base.py
index 38e84869c5..59728b1d3d 100644
--- a/dvc/cache/base.py
+++ b/dvc/cache/base.py
@@ -132,10 +132,8 @@ def changed(self, path_info, hash_info):
             logger.debug("cache for '%s'('%s') has changed.", path_info, hash_)
             return True
 
-        typ, actual = self.tree.get_hash(path_info)
-        assert typ == self.tree.PARAM_CHECKSUM
-
-        if hash_ != actual:
+        actual = self.tree.get_hash(path_info)
+        if hash_ != actual.value:
             logger.debug(
                 "hash value '%s' for '%s' has changed (actual '%s').",
                 hash_,
@@ -319,7 +317,7 @@ def changed_cache_file(self, hash_):
             )
             return False
 
-        _, actual = self.tree.get_hash(cache_info)
+        actual = self.tree.get_hash(cache_info)
 
         logger.debug(
             "cache '%s' expected '%s' actual '%s'", cache_info, hash_, actual,
@@ -328,7 +326,7 @@ def changed_cache_file(self, hash_):
         if not hash_ or not actual:
             return True
 
-        if actual.split(".")[0] == hash_.split(".")[0]:
+        if actual.value.split(".")[0] == hash_.split(".")[0]:
             # making cache file read-only so we don't need to check it
             # next time
             self.tree.protect(cache_info)
@@ -634,5 +632,5 @@ def merge(self, ancestor_info, our_info, their_info):
         their = self.get_dir_cache(their_hash)
 
         merged = self._merge_dirs(ancestor, our, their)
-        typ, merged_hash = self.tree.save_dir_info(merged)
-        return {typ: merged_hash}
+        hash_info = self.tree.save_dir_info(merged)
+        return {hash_info.name: hash_info.value}
diff --git a/dvc/cache/local.py b/dvc/cache/local.py
index d0a9ff8949..3d63ada81c 100644
--- a/dvc/cache/local.py
+++ b/dvc/cache/local.py
@@ -91,14 +91,14 @@ def hashes_exist(
     def already_cached(self, path_info):
         assert path_info.scheme in ["", "local"]
 
-        typ, current_md5 = self.tree.get_hash(path_info)
+        current = self.tree.get_hash(path_info)
 
-        assert typ == "md5"
+        assert current.name == "md5"
 
-        if not current_md5:
+        if not current:
             return False
 
-        return not self.changed_cache(current_md5)
+        return not self.changed_cache(current.value)
 
     def _verify_link(self, path_info, link_type):
         if link_type == "hardlink" and self.tree.getsize(path_info) == 0:
diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py
index 015297dce4..01bc44115a 100644
--- a/dvc/dependency/repo.py
+++ b/dvc/dependency/repo.py
@@ -66,7 +66,7 @@ def _get_checksum(self, locked=True):
                 if tree.isdir(path):
                     return self.repo.cache.local.tree.get_hash(
                         path, tree=tree
-                    )[1]
-                return tree.get_file_hash(path)
+                    ).value
+                return tree.get_file_hash(path).value
 
     def workspace_status(self):
diff --git a/dvc/hash_info.py b/dvc/hash_info.py
new file mode 100644
index 0000000000..c3422d174f
--- /dev/null
+++ b/dvc/hash_info.py
@@ -0,0 +1,10 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class HashInfo:
+    name: str
+    value: str
+
+    def __bool__(self):
+        return bool(self.value)
diff --git a/dvc/output/base.py b/dvc/output/base.py
index a6bc95d41b..1c9968ba9d 100644
--- a/dvc/output/base.py
+++ b/dvc/output/base.py
@@ -177,7 +177,7 @@ def checksum(self):
         return self.info.get(self.tree.PARAM_CHECKSUM)
 
     def get_checksum(self):
-        return self.tree.get_hash(self.path_info)[1]
+        return self.tree.get_hash(self.path_info).value
 
     @property
     def is_dir_checksum(self):
diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py
index 19838aa770..f02bf8b9eb 100644
--- a/dvc/repo/diff.py
+++ b/dvc/repo/diff.py
@@ -37,7 +37,7 @@ def _to_path(output):
 
         def _to_checksum(output):
             if on_working_tree:
-                return self.cache.local.tree.get_hash(output.path_info)[1]
+                return self.cache.local.tree.get_hash(output.path_info).value
             return output.checksum
 
         def _exists(output):
diff --git a/dvc/tree/azure.py b/dvc/tree/azure.py
index 1b9c88c298..1aaa16b7aa 100644
--- a/dvc/tree/azure.py
+++ b/dvc/tree/azure.py
@@ -5,6 +5,7 @@
 
 from funcy import cached_property, wrap_prop
 
+from dvc.hash_info import HashInfo
 from dvc.path_info import CloudURLInfo
 from dvc.progress import Tqdm
 from dvc.scheme import Schemes
@@ -153,7 +154,7 @@ def remove(self, path_info):
         ).delete_blob()
 
     def get_file_hash(self, path_info):
-        return self.PARAM_CHECKSUM, self.get_etag(path_info)
+        return HashInfo(self.PARAM_CHECKSUM, self.get_etag(path_info))
 
     def _upload(
         self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs
diff --git a/dvc/tree/base.py b/dvc/tree/base.py
index 41f0e1a90f..c0e3060c1d 100644
--- a/dvc/tree/base.py
+++ b/dvc/tree/base.py
@@ -15,6 +15,7 @@
     DvcIgnoreInCollectedDirError,
     RemoteCacheRequiredError,
 )
+from dvc.hash_info import HashInfo
 from dvc.ignore import DvcIgnore
 from dvc.path_info import PathInfo, URLInfo
 from dvc.progress import Tqdm
@@ -242,7 +243,7 @@ def get_hash(self, path_info, **kwargs):
         )
 
         if not self.exists(path_info):
-            return self.PARAM_CHECKSUM, None
+            return HashInfo(self.PARAM_CHECKSUM, None)
 
         # pylint: disable=assignment-from-none
         hash_ = self.state.get(path_info)
@@ -260,17 +261,17 @@ def get_hash(self, path_info, **kwargs):
             hash_ = None
 
         if hash_:
-            return self.PARAM_CHECKSUM, hash_
+            return HashInfo(self.PARAM_CHECKSUM, hash_)
 
         if self.isdir(path_info):
-            typ, hash_ = self.get_dir_hash(path_info, **kwargs)
+            hash_info = self.get_dir_hash(path_info, **kwargs)
         else:
-            typ, hash_ = self.get_file_hash(path_info)
+            hash_info = self.get_file_hash(path_info)
 
-        if hash_ and self.exists(path_info):
-            self.state.save(path_info, hash_)
+        if hash_info and self.exists(path_info):
+            self.state.save(path_info, hash_info.value)
 
-        return typ, hash_
+        return hash_info
 
     def get_file_hash(self, path_info):
         raise NotImplementedError
@@ -294,8 +295,8 @@ def path_to_hash(self, path):
         return "".join(parts)
 
     def save_info(self, path_info, **kwargs):
-        typ, hash_ = self.get_hash(path_info, **kwargs)
-        return {typ: hash_}
+        hash_info = self.get_hash(path_info, **kwargs)
+        return {hash_info.name: hash_info.value}
 
     def _calculate_hashes(self, file_infos):
         file_infos = list(file_infos)
@@ -306,9 +307,7 @@ def _calculate_hashes(self, file_infos):
         ) as pbar:
             worker = pbar.wrap_fn(self.get_file_hash)
             with ThreadPoolExecutor(max_workers=self.hash_jobs) as executor:
-                hashes = (
-                    value for typ, value in executor.map(worker, file_infos)
-                )
+                hashes = (hi.value for hi in executor.map(worker, file_infos))
                 return dict(zip(file_infos, hashes))
 
     def _collect_dir(self, path_info, **kwargs):
@@ -346,17 +345,17 @@ def _collect_dir(self, path_info, **kwargs):
         return sorted(result, key=itemgetter(self.PARAM_RELPATH))
 
     def save_dir_info(self, dir_info):
-        typ, hash_, tmp_info = self._get_dir_info_hash(dir_info)
-        new_info = self.cache.tree.hash_to_path_info(hash_)
-        if self.cache.changed_cache_file(hash_):
+        hash_info, tmp_info = self._get_dir_info_hash(dir_info)
+        new_info = self.cache.tree.hash_to_path_info(hash_info.value)
+        if self.cache.changed_cache_file(hash_info.value):
             self.cache.tree.makedirs(new_info.parent)
             self.cache.tree.move(
                 tmp_info, new_info, mode=self.cache.CACHE_MODE
             )
 
-        self.state.save(new_info, hash_)
+        self.state.save(new_info, hash_info.value)
 
-        return typ, hash_
+        return hash_info
 
     def _get_dir_info_hash(self, dir_info):
         # Sorting the list by path to ensure reproducibility
@@ -371,8 +370,9 @@ def _get_dir_info_hash(self, dir_info):
         to_info = tree.path_info / tmp_fname("")
         tree.upload(from_info, to_info, no_progress_bar=True)
 
-        typ, hash_ = tree.get_file_hash(to_info)
-        return typ, hash_ + self.CHECKSUM_DIR_SUFFIX, to_info
+        hash_info = tree.get_file_hash(to_info)
+        hash_info.value += self.CHECKSUM_DIR_SUFFIX
+        return hash_info, to_info
 
     def upload(self, from_info, to_info, name=None, no_progress_bar=False):
         if not hasattr(self, "_upload"):
diff --git a/dvc/tree/dvc.py b/dvc/tree/dvc.py
index db917d76d7..128683b380 100644
--- a/dvc/tree/dvc.py
+++ b/dvc/tree/dvc.py
@@ -2,6 +2,7 @@
 import os
 
 from dvc.exceptions import OutputNotFoundError
+from dvc.hash_info import HashInfo
 from dvc.path_info import PathInfo
 
 from ._metadata import Metadata
@@ -245,7 +246,7 @@ def get_dir_hash(self, path_info, **kwargs):
                 out = outs[0]
                 # other code expects us to fetch the dir at this point
                 self._fetch_dir(out, **kwargs)
-                return out.tree.PARAM_CHECKSUM, out.checksum
+                return HashInfo(out.tree.PARAM_CHECKSUM, out.checksum)
         except OutputNotFoundError:
             pass
 
@@ -257,11 +258,11 @@ def get_file_hash(self, path_info):
             raise OutputNotFoundError
         out = outs[0]
         if out.is_dir_checksum:
-            return (
+            return HashInfo(
                 out.tree.PARAM_CHECKSUM,
                 self._get_granular_checksum(path_info, out),
             )
-        return out.tree.PARAM_CHECKSUM, out.checksum
+        return HashInfo(out.tree.PARAM_CHECKSUM, out.checksum)
 
     def metadata(self, path_info):
         path_info = PathInfo(os.path.abspath(path_info))
diff --git a/dvc/tree/gs.py b/dvc/tree/gs.py
index c35a490b46..b95622d03b 100644
--- a/dvc/tree/gs.py
+++ b/dvc/tree/gs.py
@@ -7,6 +7,7 @@
 from funcy import cached_property, wrap_prop
 
 from dvc.exceptions import DvcException
+from dvc.hash_info import HashInfo
 from dvc.path_info import CloudURLInfo
 from dvc.progress import Tqdm
 from dvc.scheme import Schemes
@@ -189,11 +190,11 @@ def get_file_hash(self, path_info):
         path = path_info.path
         blob = self.gs.bucket(bucket).get_blob(path)
         if not blob:
-            return self.PARAM_CHECKSUM, None
+            return HashInfo(self.PARAM_CHECKSUM, None)
 
         b64_md5 = blob.md5_hash
         md5 = base64.b64decode(b64_md5)
-        return (
+        return HashInfo(
             self.PARAM_CHECKSUM,
             codecs.getencoder("hex")(md5)[0].decode("utf-8"),
         )
diff --git a/dvc/tree/hdfs.py b/dvc/tree/hdfs.py
index 9ba6010be0..60c0da7fe6 100644
--- a/dvc/tree/hdfs.py
+++ b/dvc/tree/hdfs.py
@@ -7,6 +7,7 @@
 from contextlib import closing, contextmanager
 from urllib.parse import urlparse
 
+from dvc.hash_info import HashInfo
 from dvc.scheme import Schemes
 from dvc.utils import fix_env, tmp_fname
 
@@ -175,7 +176,9 @@ def get_file_hash(self, path_info):
         stdout = self.hadoop_fs(
             f"checksum {path_info.url}", user=path_info.user
         )
-        return self.PARAM_CHECKSUM, self._group(regex, stdout, "checksum")
+        return HashInfo(
+            self.PARAM_CHECKSUM, self._group(regex, stdout, "checksum")
+        )
 
     def _upload(self, from_file, to_info, **_kwargs):
         with self.hdfs(to_info) as hdfs:
diff --git a/dvc/tree/http.py b/dvc/tree/http.py
index d818c5116f..18d793cf37 100644
--- a/dvc/tree/http.py
+++ b/dvc/tree/http.py
@@ -6,6 +6,7 @@
 
 import dvc.prompt as prompt
 from dvc.exceptions import DvcException, HTTPError
+from dvc.hash_info import HashInfo
 from dvc.path_info import HTTPURLInfo
 from dvc.progress import Tqdm
 from dvc.scheme import Schemes
@@ -151,7 +152,7 @@ def get_file_hash(self, path_info):
                 "Content-MD5 header for '{url}'".format(url=url)
             )
 
-        return self.PARAM_CHECKSUM, etag
+        return HashInfo(self.PARAM_CHECKSUM, etag)
 
     def _download(self, from_info, to_file, name=None, no_progress_bar=False):
         response = self.request("GET", from_info.url, stream=True)
diff --git a/dvc/tree/local.py b/dvc/tree/local.py
index 73efccf4b7..17b38f0136 100644
--- a/dvc/tree/local.py
+++ b/dvc/tree/local.py
@@ -7,6 +7,7 @@
 from shortuuid import uuid
 
 from dvc.exceptions import DvcException
+from dvc.hash_info import HashInfo
 from dvc.path_info import PathInfo
 from dvc.scheme import Schemes
 from dvc.system import System
@@ -309,7 +310,7 @@ def is_protected(self, path_info):
         return stat.S_IMODE(mode) == self.CACHE_MODE
 
     def get_file_hash(self, path_info):
-        return self.PARAM_CHECKSUM, file_md5(path_info)[0]
+        return HashInfo(self.PARAM_CHECKSUM, file_md5(path_info)[0])
 
     @staticmethod
     def getsize(path_info):
diff --git a/dvc/tree/repo.py b/dvc/tree/repo.py
index 0d92e71706..49c4b467bf 100644
--- a/dvc/tree/repo.py
+++ b/dvc/tree/repo.py
@@ -11,6 +11,7 @@
 
 from dvc.dvcfile import is_valid_filename
 from dvc.exceptions import OutputNotFoundError
+from dvc.hash_info import HashInfo
 from dvc.path_info import PathInfo
 from dvc.utils import file_md5, is_exec
 from dvc.utils.fs import copy_fobj_to_file, makedirs
@@ -332,7 +333,7 @@ def get_file_hash(self, path_info):
                 return dvc_tree.get_file_hash(path_info)
             except OutputNotFoundError:
                 pass
-        return self.PARAM_CHECKSUM, file_md5(path_info, self)[0]
+        return HashInfo(self.PARAM_CHECKSUM, file_md5(path_info, self)[0])
 
     def copytree(self, top, dest):
         top = PathInfo(top)
diff --git a/dvc/tree/s3.py b/dvc/tree/s3.py
index d57a5af92b..1e7eda813e 100644
--- a/dvc/tree/s3.py
+++ b/dvc/tree/s3.py
@@ -7,6 +7,7 @@
 
 from dvc.config import ConfigError
 from dvc.exceptions import DvcException, ETagMismatchError
+from dvc.hash_info import HashInfo
 from dvc.path_info import CloudURLInfo
 from dvc.progress import Tqdm
 from dvc.scheme import Schemes
@@ -332,10 +333,7 @@ def _copy(cls, s3, from_info, to_info, extra_args):
 
     def get_file_hash(self, path_info):
         with self._get_obj(path_info) as obj:
-            return (
-                self.PARAM_CHECKSUM,
-                obj.e_tag.strip('"'),
-            )
+            return HashInfo(self.PARAM_CHECKSUM, obj.e_tag.strip('"'),)
 
     def _upload(self, from_file, to_info, name=None, no_progress_bar=False):
         with self._get_obj(to_info) as obj:
diff --git a/dvc/tree/ssh/__init__.py b/dvc/tree/ssh/__init__.py
index c5ecccbaa3..f22ae8946e 100644
--- a/dvc/tree/ssh/__init__.py
+++ b/dvc/tree/ssh/__init__.py
@@ -10,6 +10,7 @@
 from funcy import first, memoize, silent, wrap_with
 
 import dvc.prompt as prompt
+from dvc.hash_info import HashInfo
 from dvc.scheme import Schemes
 
 from ..base import BaseTree
@@ -238,7 +239,7 @@ def get_file_hash(self, path_info):
             raise NotImplementedError
 
         with self.ssh(path_info) as ssh:
-            return self.PARAM_CHECKSUM, ssh.md5(path_info.path)
+            return HashInfo(self.PARAM_CHECKSUM, ssh.md5(path_info.path))
 
     def getsize(self, path_info):
         with self.ssh(path_info) as ssh:
diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py
index 8909a77547..9bb2705173 100644
--- a/dvc/tree/webdav.py
+++ b/dvc/tree/webdav.py
@@ -7,6 +7,7 @@
 
 from dvc.config import ConfigError
 from dvc.exceptions import DvcException
+from dvc.hash_info import HashInfo
 from dvc.path_info import HTTPURLInfo, WebDAVURLInfo
 from dvc.progress import Tqdm
 from dvc.scheme import Schemes
@@ -142,7 +143,7 @@ def get_file_hash(self, path_info):
                 "Content-MD5 header for '{url}'".format(url=path_info.url)
             )
 
-        return self.PARAM_CHECKSUM, etag
+        return HashInfo(self.PARAM_CHECKSUM, etag)
 
     # Checks whether path points to directory
     def isdir(self, path_info):
diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py
index 149d3baa46..2e5ec7743e 100644
--- a/tests/func/test_tree.py
+++ b/tests/func/test_tree.py
@@ -218,7 +218,7 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud):
     # into dvc.cache, not fetched or streamed from a remote
     tree = RepoTree(erepo_dir.dvc, stream=True)
     expected = [
-        tree.get_file_hash(PathInfo(erepo_dir / path))[1]
+        tree.get_file_hash(PathInfo(erepo_dir / path)).value
         for path in ("dir/bar", "dir/subdir/foo")
     ]
 
diff --git a/tests/unit/remote/test_azure.py b/tests/unit/remote/test_azure.py
index 0f49045e39..da312d3bb6 100644
--- a/tests/unit/remote/test_azure.py
+++ b/tests/unit/remote/test_azure.py
@@ -36,7 +36,9 @@ def test_get_file_hash(tmp_dir, azure):
     to_info = azure
     tree.upload(PathInfo("foo"), to_info)
     assert tree.exists(to_info)
-    _, hash_ = tree.get_file_hash(to_info)
+    hash_info = tree.get_file_hash(to_info)
+    assert hash_info.name == "etag"
+    hash_ = hash_info.value
     assert hash_
     assert isinstance(hash_, str)
     assert hash_.strip("'").strip('"') == hash_
diff --git a/tests/unit/tree/test_dvc.py b/tests/unit/tree/test_dvc.py
index 6d32d96c7a..2966ebf8ba 100644
--- a/tests/unit/tree/test_dvc.py
+++ b/tests/unit/tree/test_dvc.py
@@ -3,6 +3,7 @@
 
 import pytest
 
+from dvc.hash_info import HashInfo
 from dvc.path_info import PathInfo
 from dvc.tree.dvc import DvcTree
 
@@ -206,9 +207,8 @@ def test_isdvc(tmp_dir, dvc):
 def test_get_hash_file(tmp_dir, dvc):
     tmp_dir.dvc_gen({"foo": "foo"})
     tree = DvcTree(dvc)
-    assert tree.get_hash(PathInfo(tmp_dir) / "foo") == (
-        "md5",
-        "acbd18db4cc2f85cedef654fccc4a4d8",
+    assert tree.get_hash(PathInfo(tmp_dir) / "foo") == HashInfo(
+        "md5", "acbd18db4cc2f85cedef654fccc4a4d8",
     )
 
 
@@ -218,9 +218,8 @@ def test_get_hash_dir(tmp_dir, dvc, mocker):
     )
     tree = DvcTree(dvc)
     get_file_hash_spy = mocker.spy(tree, "get_file_hash")
-    assert tree.get_hash(PathInfo(tmp_dir) / "dir") == (
-        "md5",
-        "8761c4e9acad696bee718615e23e22db.dir",
+    assert tree.get_hash(PathInfo(tmp_dir) / "dir") == HashInfo(
+        "md5", "8761c4e9acad696bee718615e23e22db.dir",
     )
     assert not get_file_hash_spy.called
 
@@ -231,11 +230,9 @@ def test_get_hash_granular(tmp_dir, dvc):
     )
     tree = DvcTree(dvc, fetch=True)
     subdir = PathInfo(tmp_dir) / "dir" / "subdir"
-    assert tree.get_hash(subdir) == (
-        "md5",
-        "af314506f1622d107e0ed3f14ec1a3b5.dir",
+    assert tree.get_hash(subdir) == HashInfo(
+        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
     )
-    assert tree.get_hash(subdir / "data") == (
-        "md5",
-        "8d777f385d3dfec8815d20f7496026dc",
+    assert tree.get_hash(subdir / "data") == HashInfo(
+        "md5", "8d777f385d3dfec8815d20f7496026dc",
     )
diff --git a/tests/unit/tree/test_repo.py b/tests/unit/tree/test_repo.py
index 8d1a348aa7..40998c1063 100644
--- a/tests/unit/tree/test_repo.py
+++ b/tests/unit/tree/test_repo.py
@@ -4,6 +4,7 @@
 
 import pytest
 
+from dvc.hash_info import HashInfo
 from dvc.path_info import PathInfo
 from dvc.tree.repo import RepoTree
 
@@ -363,9 +364,8 @@ def test_get_hash_cached_file(tmp_dir, dvc, mocker):
     tmp_dir.dvc_gen({"foo": "foo"})
     tree = RepoTree(dvc)
     dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash")
-    assert tree.get_hash(PathInfo(tmp_dir) / "foo") == (
-        "md5",
-        "acbd18db4cc2f85cedef654fccc4a4d8",
+    assert tree.get_hash(PathInfo(tmp_dir) / "foo") == HashInfo(
+        "md5", "acbd18db4cc2f85cedef654fccc4a4d8",
     )
     assert dvc_tree_spy.called
 
@@ -377,9 +377,8 @@ def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
     tree = RepoTree(dvc)
     get_file_hash_spy = mocker.spy(tree, "get_file_hash")
     dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_dir_hash")
-    assert tree.get_hash(PathInfo(tmp_dir) / "dir") == (
-        "md5",
-        "8761c4e9acad696bee718615e23e22db.dir",
+    assert tree.get_hash(PathInfo(tmp_dir) / "dir") == HashInfo(
+        "md5", "8761c4e9acad696bee718615e23e22db.dir",
     )
     assert not get_file_hash_spy.called
     assert dvc_tree_spy.called
@@ -392,12 +391,10 @@ def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
     tree = RepoTree(dvc, fetch=True)
     dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash")
     subdir = PathInfo(tmp_dir) / "dir" / "subdir"
-    assert tree.get_hash(subdir) == (
-        "md5",
-        "af314506f1622d107e0ed3f14ec1a3b5.dir",
+    assert tree.get_hash(subdir) == HashInfo(
+        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
     )
-    assert tree.get_hash(subdir / "data") == (
-        "md5",
-        "8d777f385d3dfec8815d20f7496026dc",
+    assert tree.get_hash(subdir / "data") == HashInfo(
+        "md5", "8d777f385d3dfec8815d20f7496026dc",
     )
     assert dvc_tree_spy.called