From 244000b5e540cace5e05edafa56e0cbae2db57d7 Mon Sep 17 00:00:00 2001
From: Derrick Gilland
Date: Wed, 1 Jul 2015 18:03:11 -0400
Subject: [PATCH] Add "is_duplicate" attribute to HashAddress and make
 HashFS.put() return HashAddress with "is_duplicate=True" when file with same
 hash already exists on disk. Refs #1.

---
 CHANGES.rst          |  2 ++
 README.rst           |  3 +++
 hashfs/hashfs.py     | 22 +++++++++++++++++-----
 tests/test_hashfs.py | 11 +++++++++++
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 8a6cded..d292dc1 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -3,6 +3,8 @@ Changelog
 
 
 - Rename private method ``HashFS.copy`` to ``HashFS._copy``.
+- Add ``is_duplicate`` attribute to ``HashAddress``.
+- Make ``HashFS.put()`` return ``HashAddress`` with ``is_duplicate=True`` when file with same hash already exists on disk.
 
 
 v0.4.0 (2015-06-03)
diff --git a/README.rst b/README.rst
index ac0aa81..aa275e3 100644
--- a/README.rst
+++ b/README.rst
@@ -104,6 +104,9 @@ Add content to the folder using either readable objects (e.g. ``StringIO``) or f
     # The path relative to fs.root.
     address.relpath
 
+    # Whether the file previously existed.
+    address.is_duplicate
+
 
 Retrieving File Address
 -----------------------
diff --git a/hashfs/hashfs.py b/hashfs/hashfs.py
index 3e6545d..dae8b7a 100644
--- a/hashfs/hashfs.py
+++ b/hashfs/hashfs.py
@@ -64,9 +64,9 @@ def put(self, file, extension=None):
 
         with closing(stream):
             id = self.computehash(stream)
-            filepath = self.copy(stream, id, extension)
+            filepath, is_duplicate = self._copy(stream, id, extension)
 
-        return HashAddress(id, self.relpath(filepath), filepath)
+        return HashAddress(id, self.relpath(filepath), filepath, is_duplicate)
 
     def _copy(self, stream, id, extension=None):
         """Copy the contents of `stream` onto disk with an optional file
@@ -77,11 +77,14 @@ def _copy(self, stream, id, extension=None):
 
         # Only copy file if it doesn't already exist.
         if not os.path.isfile(filepath):
+            is_duplicate = False
             with tmpfile(stream, self.fmode) as fname:
                 self.makepath(os.path.dirname(filepath))
                 shutil.copy(fname, filepath)
+        else:
+            is_duplicate = True
 
-        return filepath
+        return (filepath, is_duplicate)
 
     def get(self, file):
         """Return :class:`HashAdress` from given id or path. If `file` does not
@@ -329,15 +332,24 @@ def __len__(self):
         return self.count()
 
 
-class HashAddress(namedtuple('HashAddress', ['id', 'relpath', 'abspath'])):
+class HashAddress(namedtuple('HashAddress',
+                             ['id', 'relpath', 'abspath', 'is_duplicate'])):
     """File address containing file's path on disk and it's content hash ID.
 
     Attributes:
         id (str): Hash ID (hexdigest) of file contents.
        relpath (str): Relative path location to :attr:`HashFS.root`.
        abspath (str): Absoluate path location of file on disk.
+        is_duplicate (boolean, optional): Whether the hash address created was
+            a duplicate of a previously existing file. Can only be ``True``
+            after a put operation. Defaults to ``False``.
     """
-    pass
+    def __new__(cls, id, relpath, abspath, is_duplicate=False):
+        return super(HashAddress, cls).__new__(cls,
+                                               id,
+                                               relpath,
+                                               abspath,
+                                               is_duplicate)
 
 
 class Stream(object):
diff --git a/tests/test_hashfs.py b/tests/test_hashfs.py
index 1ad4f30..3a26ebc 100755
--- a/tests/test_hashfs.py
+++ b/tests/test_hashfs.py
@@ -96,6 +96,14 @@ def test_hashfs_put_file(fs, filepath):
         assert fileobj.read() == to_bytes(filepath.read())
 
 
+def test_hashfs_put_duplicate(fs, stringio):
+    address_a = fs.put(stringio)
+    address_b = fs.put(stringio)
+
+    assert not address_a.is_duplicate
+    assert address_b.is_duplicate
+
+
 @pytest.mark.parametrize('extension', [
     'txt',
     '.txt',
@@ -108,6 +116,7 @@ def test_hashfs_put_extension(fs, stringio, extension):
     assert_file_put(fs, address)
     assert os.path.sep in address.abspath
     assert os.path.splitext(address.abspath)[1].endswith(extension)
+    assert not address.is_duplicate
 
 
 def test_hashfs_put_error(fs):
@@ -121,6 +130,7 @@ def test_hashfs_address(fs, stringio):
     assert fs.root not in address.relpath
     assert os.path.join(fs.root, address.relpath) == address.abspath
     assert address.relpath.replace(os.sep, '') == address.id
+    assert not address.is_duplicate
 
 
 @pytest.mark.parametrize('extension,address_attr', [
@@ -166,6 +176,7 @@ def test_hashfs_contains(fs, stringio):
 def test_hashfs_get(fs, stringio):
     address = fs.put(stringio)
+    assert not address.is_duplicate
 
     assert fs.get(address.id) == address
     assert fs.get(address.relpath) == address
     assert fs.get(address.abspath) == address