Better access to compression/filter information #2180

Open: wants to merge 5 commits into master
Changes from all commits
20 changes: 20 additions & 0 deletions docs/high/dataset.rst
@@ -623,6 +623,11 @@ Reference
String with the currently applied compression filter, or None if
compression is not enabled for this dataset. See :ref:`dataset_compression`.

This only recognises the built-in compression options ``'gzip'``,
``'lzf'`` and ``'szip'``. From h5py 3.8, any other compression filter is
reported as ``'unknown'``. Use :attr:`filter_ids` and :attr:`filter_names`
to get more complete information.

.. attribute:: compression_opts

Options for the compression filter. See :ref:`dataset_compression`.
@@ -641,6 +646,21 @@ Reference

Whether Fletcher32 checksumming is enabled (T/F). See :ref:`dataset_fletcher32`.

.. attribute:: filter_ids
filter_names

The numeric filter IDs and the string names (as stored in the file) of
the filters in use. Each attribute is a tuple.

Filters are mostly used to compress data, but can also do things like
checksumming (see :ref:`dataset_compression`). Other attributes listed
above provide convenient shortcuts to check on common filters.
IDs for filters built into h5py can be found in the :mod:`h5py.h5z`
module, while filter IDs from plugins are listed in `HDF Group's registry
<https://portal.hdfgroup.org/display/support/Registered+Filter+Plugins>`_.

.. versionadded:: 3.8

.. attribute:: fillvalue

Value used when reading uninitialized portions of the dataset, or None
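For illustration (not part of the diff), a minimal usage sketch of the documented behaviour; it sticks to built-in filters so it runs without plugins, and the printed values are indicative only:

import h5py

with h5py.File('filters-demo.h5', 'w') as f:
    ds = f.create_dataset('x', shape=(1000,), dtype='f4',
                          compression='gzip', shuffle=True, fletcher32=True)
    # Built-in compressors are still reported by name
    print(ds.compression)    # 'gzip'
    # New attributes from this PR: the full pipeline, in application order
    print(ds.filter_ids)     # e.g. (2, 1, 3) -> shuffle, deflate, fletcher32
    print(ds.filter_names)   # filter names as stored in the file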
19 changes: 18 additions & 1 deletion h5py/_hl/dataset.py
@@ -98,7 +98,7 @@ def make_new_dset(parent, shape=None, dtype=None, data=None, name=None,
compression = 'gzip'

# Legacy
if compression in _LEGACY_GZIP_COMPRESSION_VALS and (compression is not False):
if compression_opts is not None:
raise TypeError("Conflict in compression options")
compression_opts = compression
@@ -560,6 +560,8 @@ def compression(self):
for x in ('gzip','lzf','szip'):
if x in self._filters:
return x
if any(f not in filters._COMP_FILTERS for f in self._filters):
return 'unknown' # Filter from a plugin
return None

@property
@@ -568,6 +570,21 @@ def compression_opts(self):
""" Compression setting. Int(0-9) for gzip, 2-tuple for szip. """
return self._filters.get(self.compression, None)

@property
@with_phil
def filter_ids(self):
"""Numeric IDs of HDF5 filters used for this dataset"""
pl = self._dcpl
return tuple([pl.get_filter(i)[0] for i in range(pl.get_nfilters())])

@property
@with_phil
def filter_names(self):
"""Names, as stored in the file, of the filters used for this dataset"""
pl = self._dcpl
return tuple([pl.get_filter(i)[3].decode('utf-8', 'surrogateescape')
for i in range(pl.get_nfilters())])

@property
@with_phil
def shuffle(self):
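For context, these properties are thin wrappers over the dataset creation property list; a rough low-level equivalent, assuming an already-open dataset ds, would be:

plist = ds.id.get_create_plist()
for i in range(plist.get_nfilters()):
    # get_filter(i) returns (filter_id, flags, options, name)
    filter_id, flags, opts, name = plist.get_filter(i)
    print(filter_id, name.decode('utf-8', 'surrogateescape'))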
166 changes: 137 additions & 29 deletions h5py/_hl/filters.py
@@ -143,6 +143,113 @@ class Gzip(FilterRefBase):
def __init__(self, level=DEFAULT_GZIP):
self.filter_options = (level,)


class FilterPipeline:
def __init__(self, filters: list):
self.filters = filters

@classmethod
def from_plist(cls, plist):
return cls([plist.get_filter(i) for i in range(plist.get_nfilters())])

def apply_to_plist(self, plist):
# Remove all existing filters
nf = plist.get_nfilters()
while nf:
fid = plist.get_filter(0)[0]
plist.remove_filter(fid)
nf_new = plist.get_nfilters()
assert nf_new < nf # Protect against infinite loops
nf = nf_new

# Add filters
for fid, flags, opts, _ in self.filters:
plist.set_filter(fid, flags, opts)

def copy(self):
return type(self)(self.filters.copy())

def __eq__(self, other):
return [f[:3] for f in self.filters] == [f[:3] for f in other.filters]

def find(self, filter_id):
for i, row in enumerate(self.filters):
if row[0] == filter_id:
return i

def drop(self, filter_id):
i = self.find(filter_id)
if i is not None:
del self.filters[i]

def add_or_replace(self, filter_id, insert_at, options=(), flags=1):
i = self.find(filter_id)
if i is None:
self.filters.insert(insert_at, (filter_id, flags, options, b''))
else:
self.filters[i] = (filter_id, flags, options, self.filters[i][3])

def set_scaleoffset(self, scaleoffset, dtype):
    if scaleoffset is False:
        self.drop(h5z.FILTER_SCALEOFFSET)
        return
    if dtype.kind in ('u', 'i'):
        so_opts = (h5z.SO_INT, scaleoffset)
    else:  # dtype.kind == 'f'
        so_opts = (h5z.SO_FLOAT_DSCALE, scaleoffset)
    self.add_or_replace(h5z.FILTER_SCALEOFFSET, 0, options=so_opts)

def set_shuffle(self, shuffle):
if shuffle:
# Shuffle goes after scaleoffset, but before any compression
insert_at = 0
if self.filters and (self.filters[0][0] == h5z.FILTER_SCALEOFFSET):
insert_at = 1
self.add_or_replace(h5z.FILTER_SHUFFLE, insert_at)
else:
self.drop(h5z.FILTER_SHUFFLE)

def clear_compression(self):
# Wipe out any filters already in the pipeline, except those that are
# separate from h5py's compression keyword arg
non_compression_filters = {
h5z.FILTER_SCALEOFFSET, h5z.FILTER_SHUFFLE, h5z.FILTER_FLETCHER32
}
self.filters = [f for f in self.filters if f[0] in non_compression_filters]

def set_compression_filter(self, filter_id: int, opts=(), allow_unknown=False):
if not allow_unknown and not h5z.filter_avail(filter_id):
raise ValueError("Unknown compression filter number: %s" % filter_id)

self.clear_compression()
insert_at = 0
while insert_at < len(self.filters) and (self.filters[insert_at][0] in {
h5z.FILTER_SCALEOFFSET, h5z.FILTER_SHUFFLE
}):
insert_at += 1

self.filters.insert(insert_at, (filter_id, h5z.FLAG_OPTIONAL, opts, b''))

def set_deflate(self, level=5):
self.set_compression_filter(h5z.FILTER_DEFLATE, (level,))

def set_lzf(self):
self.set_compression_filter(h5z.FILTER_LZF)

def set_szip(self, szmethod, szpix):
opts = {'ec': h5z.SZIP_EC_OPTION_MASK, 'nn': h5z.SZIP_NN_OPTION_MASK}
self.set_compression_filter(h5z.FILTER_SZIP, (opts[szmethod], szpix))

def set_fletcher32(self, enabled):
# `fletcher32` must come after `compression`, otherwise, if `compression`
# is "szip" and the data is 64bit, the fletcher32 checksum will be wrong
# (see GitHub issue #953).
if enabled:
insert_at = len(self.filters)
# flags=0 means not optional (matching dcpl.set_fletcher32())
self.add_or_replace(h5z.FILTER_FLETCHER32, insert_at, flags=0)
else:
self.drop(h5z.FILTER_FLETCHER32)

def fill_dcpl(plist, shape, dtype, chunks, compression, compression_opts,
shuffle, fletcher32, maxshape, scaleoffset, external,
allow_unknown_filter=False):
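To make the intent of the new helper concrete, here is a hypothetical round-trip sketch; FilterPipeline is an internal class introduced in this diff, so the import path and behaviour are assumptions based on the code above:

import h5py
from h5py import h5p, h5z
from h5py._hl.filters import FilterPipeline  # internal helper added by this PR

with h5py.File('pipeline-demo.h5', 'w') as f:
    src = f.create_dataset('src', shape=(100,), dtype='u2', chunks=(10,),
                           compression='gzip', shuffle=True)
    # Read the source dataset's pipeline, swap the compressor (shuffle is kept),
    # then write the result onto a fresh dataset creation property list.
    pipeline = FilterPipeline.from_plist(src.id.get_create_plist())
    pipeline.set_lzf()
    assert [entry[0] for entry in pipeline.filters] == [h5z.FILTER_SHUFFLE, h5z.FILTER_LZF]
    dcpl = h5p.create(h5p.DATASET_CREATE)
    dcpl.set_chunk((10,))  # filters require a chunked layout
    pipeline.apply_to_plist(dcpl)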
@@ -244,49 +351,50 @@ def rq_tuple(tpl, name):
external = _normalize_external(external)
# End argument validation

for item in external:
    plist.set_external(*item)

if chunks is None and any((
    shuffle, fletcher32, compression, maxshape, scaleoffset is not None
)):
    chunks = True

if maxshape is True:
    maxshape = (None,)*len(shape)

pre_chunked = plist.get_layout() == h5d.CHUNKED and plist.get_chunk() != ()
if chunks is True:
    # Guess chunk shape unless a passed-in property list already has chunks
    chunks = None if pre_chunked else guess_chunk(shape, maxshape, dtype.itemsize)

if chunks is not None:
    plist.set_chunk(chunks)
    plist.set_fill_time(h5d.FILL_TIME_ALLOC)  # prevent resize glitch

filters = FilterPipeline.from_plist(plist)
prior_filters = filters.copy()

if scaleoffset is not None:
    filters.set_scaleoffset(scaleoffset, dtype)

if shuffle is not None:
    filters.set_shuffle(shuffle)

if compression is False:
    filters.clear_compression()
elif compression == 'gzip':
    filters.set_deflate(gzip_level)
elif compression == 'lzf':
    filters.set_lzf()
elif compression == 'szip':
    filters.set_szip(szmethod, szpix)
elif isinstance(compression, int):
    filters.set_compression_filter(
        compression, compression_opts, allow_unknown_filter
    )

if fletcher32 is not None:
    filters.set_fletcher32(fletcher32)

if filters != prior_filters:
    filters.apply_to_plist(plist)

return plist

24 changes: 14 additions & 10 deletions h5py/_hl/group.py
@@ -298,7 +298,7 @@ def require_dataset(self, name, shape, dtype, exact=False, **kwds):

return dset

def create_dataset_like(self, name, other, **kwargs):
""" Create a dataset similar to `other`.

name
@@ -313,22 +313,26 @@ def create_dataset_like(self, name, other, **kwupdate):
shape and dtype, in which case the provided values take precedence over
those from `other`.
"""
kwargs.setdefault('shape', other.shape)
kwargs.setdefault('dtype', other.dtype)
dcpl = other.id.get_create_plist()
# track_times needs to be passed specifically, because otherwise h5py
# applies its default (False) to override the HDF5 default.
kwargs.setdefault('track_times', dcpl.get_obj_track_times())

# Special case: the maxshape property always exists, but if we pass it
# to create_dataset, the new dataset will automatically get chunked
# layout. So we copy it only if it is different from shape.
if other.maxshape != other.shape:
    kwargs.setdefault('maxshape', other.maxshape)

# For these keywords, passing None means turning something off.
# We use False to distinguish this from 'no change'.
for k in ('compression', 'scaleoffset'):
    if kwargs.get(k, ...) is None:
        kwargs[k] = False

return self.create_dataset(name, dcpl=dcpl, **kwargs)

def require_group(self, name):
# TODO: support kwargs like require_dataset
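The net effect on create_dataset_like, mirroring the tests below, is roughly this (a sketch, assuming an open file f):

src = f.create_dataset('src', shape=(100,), dtype='u2', chunks=(25,),
                       compression='gzip', shuffle=True)
same = f.create_dataset_like('same', src)                      # copies chunks and the whole filter pipeline
plain = f.create_dataset_like('plain', src, compression=None)  # None now removes compression
lzf = f.create_dataset_like('lzf', src, compression='lzf')     # swaps only the compression filter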
44 changes: 44 additions & 0 deletions h5py/tests/test_dataset.py
@@ -949,6 +949,38 @@ def test_maxshape(self):
self.assertEqual(similar.shape, (10,))
self.assertEqual(similar.maxshape, (20,))

@ut.skipIf('gzip' not in h5py.filters.encode, "DEFLATE is not installed")
def test_chunking_compression(self):
foo = self.f.create_dataset(
'foo', dtype=np.uint16, shape=(100,), chunks=(21,), compression='gzip'
)
bar = self.f.create_dataset_like('bar', foo)
self.assertEqual(bar.shape, (100,))
self.assertEqual(bar.chunks, (21,))
self.assertEqual(bar.filter_ids, (h5py.h5z.FILTER_DEFLATE,))

@ut.skipIf('gzip' not in h5py.filters.encode, "DEFLATE is not installed")
def test_remove_compression(self):
foo = self.f.create_dataset(
'foo', dtype=np.uint16, shape=(100,), chunks=(21,), compression='gzip'
)
baz = self.f.create_dataset_like('baz', foo, compression=None)
self.assertEqual(baz.filter_ids, ())
self.assertEqual(baz.chunks, (21,))

@ut.skipIf('gzip' not in h5py.filters.encode, "DEFLATE is not installed")
@ut.skipIf('lzf' not in h5py.filters.encode, "LZF is not installed")
def test_replace_compression(self):
foo = self.f.create_dataset(
'foo', dtype=np.uint16, shape=(100,), chunks=(21,), compression='gzip',
shuffle=True, fletcher32=True,
)
baz = self.f.create_dataset_like('baz', foo, compression='lzf')
self.assertEqual(baz.filter_ids, (
h5py.h5z.FILTER_SHUFFLE, h5py.h5z.FILTER_LZF, h5py.h5z.FILTER_FLETCHER32
))
self.assertEqual(baz.chunks, (21,))

class TestChunkIterator(BaseDataset):
def test_no_chunks(self):
dset = self.f.create_dataset("foo", ())
@@ -1857,6 +1889,7 @@ def test_allow_unknown_filter(writable_file):
allow_unknown_filter=True
)
assert str(fake_filter_id) in ds._filters
assert ds.compression == 'unknown'


def test_dset_chunk_cache():
Expand Down Expand Up @@ -1951,3 +1984,14 @@ def test_virtual_prefix_require(self):
self.assertEqual(virtual_prefix, virtual_prefix_readback)
self.assertIsInstance(dset, Dataset)
self.assertEqual(dset.shape, (10, 3))


def test_filter_properties(writable_file):
ds = writable_file.create_dataset(
'foo', shape=1000, dtype=np.float32,
fletcher32=True, shuffle=True, compression='lzf'
)
assert ds.filter_ids == (
h5py.h5z.FILTER_SHUFFLE, h5py.h5z.FILTER_LZF, h5py.h5z.FILTER_FLETCHER32
)
assert ds.filter_names == ('shuffle', 'lzf', 'fletcher32')