Skip to content

Commit

Permalink
index: md5: use hash_file
Browse files (browse the repository at this point in the history)
The main issue is that we don't use the md5 provided by the fs (e.g. dvcfs),
which results in needlessly recomputing hashes. We can just use the
tried-and-tested `hash_file` here for now.

Fixes iterative/dvc#10059
  • Loading branch information
efiop committed Nov 17, 2023
1 parent 1161685 commit 80975fc
Showing 1 changed file with 10 additions and 26 deletions.
36 changes: 10 additions & 26 deletions src/dvc_data/index/save.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,8 +4,7 @@

from dvc_objects.fs.callbacks import DEFAULT_CALLBACK

from ..hashfile.hash import DEFAULT_ALGORITHM
from ..hashfile.hash_info import HashInfo
from ..hashfile.hash import DEFAULT_ALGORITHM, hash_file
from ..hashfile.meta import Meta
from ..hashfile.tree import Tree

Expand All @@ -25,7 +24,6 @@ def md5(
name: str = DEFAULT_ALGORITHM,
check_meta: bool = True,
) -> None:
from ..hashfile.hash import fobj_md5
from .index import DataIndexEntry

entries = {}
Expand All @@ -39,37 +37,23 @@ def md5(

fs, path = index.storage_map.get_storage(entry, storage)

info = None
if check_meta:
try:
meta = Meta.from_info(fs.info(path), fs.protocol)
info = fs.info(path)
except FileNotFoundError:
continue

meta = Meta.from_info(info, fs.protocol)
if entry.meta != meta:
continue

if state:
_, hash_info = state.get(path, fs)
if hash_info:
entries[key] = DataIndexEntry(
key=entry.key,
meta=entry.meta,
hash_info=hash_info,
)
continue

with fs.open(path, "rb") as fobj:
entries[key] = DataIndexEntry(
key=entry.key,
meta=entry.meta,
hash_info=HashInfo(
name,
fobj_md5(fobj, name=name),
),
)

if state:
state.save(path, fs, entries[key].hash_info)
meta, hash_info = hash_file(path, fs, name, state=state, info=info)
entries[key] = DataIndexEntry(
key=entry.key,
meta=entry.meta,
hash_info=hash_info,
)

for key, entry in entries.items():
index[key] = entry
Expand Down

0 comments on commit 80975fc

Please sign in to comment.