Better access to compression/filter information #2180

Open: wants to merge 5 commits into master
Changes from all commits
20 changes: 20 additions & 0 deletions docs/high/dataset.rst
@@ -623,6 +623,11 @@ Reference
String with the currently applied compression filter, or None if
compression is not enabled for this dataset. See :ref:`dataset_compression`.

This only recognises the built-in compression options ``'gzip'``,
``'lzf'`` and ``'szip'``. From h5py 3.8, any other compression filter is
reported as ``'unknown'``. Use :attr:`filter_ids` and :attr:`filter_names`
to get more complete information.

.. attribute:: compression_opts

Options for the compression filter. See :ref:`dataset_compression`.
@@ -641,6 +646,21 @@ Reference

Whether Fletcher32 checksumming is enabled (T/F). See :ref:`dataset_fletcher32`.

.. attribute:: filter_ids
filter_names

The numeric filter IDs and the string names (as stored in the file) of
the filters in use. Each attribute is a tuple.

Filters are mostly used to compress data, but can also do things like
checksumming (see :ref:`dataset_compression`). Other attributes listed
above provide convenient shortcuts to check on common filters.
IDs for filters built into h5py can be found in the :mod:`h5py.h5z`
module, while filter IDs from plugins are listed in `HDF Group's registry
<https://portal.hdfgroup.org/display/support/Registered+Filter+Plugins>`_.

.. versionadded:: 3.8

.. attribute:: fillvalue

Value used when reading uninitialized portions of the dataset, or None
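For illustration (not part of the diff), a minimal usage sketch of the documented behaviour; it sticks to built-in filters so it runs without plugins, and the printed values are indicative only:

import h5py

with h5py.File('filters-demo.h5', 'w') as f:
    ds = f.create_dataset('x', shape=(1000,), dtype='f4',
                          compression='gzip', shuffle=True, fletcher32=True)
    # Built-in compressors are still reported by name
    print(ds.compression)    # 'gzip'
    # New attributes from this PR: the full pipeline, in application order
    print(ds.filter_ids)     # e.g. (2, 1, 3) -> shuffle, deflate, fletcher32
    print(ds.filter_names)   # filter names as stored in the file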
19 changes: 18 additions & 1 deletion h5py/_hl/dataset.py
@@ -98,7 +98,7 @@ def make_new_dset(parent, shape=None, dtype=None, data=None, name=None,
compression = 'gzip'

# Legacy
if compression in _LEGACY_GZIP_COMPRESSION_VALS and (compression is not False):
if compression_opts is not None:
raise TypeError("Conflict in compression options")
compression_opts = compression
@@ -560,6 +560,8 @@ def compression(self):
for x in ('gzip','lzf','szip'):
if x in self._filters:
return x
if any(f not in filters._COMP_FILTERS for f in self._filters):
return 'unknown' # Filter from a plugin
return None

@property
@@ -568,6 +570,21 @@ def compression_opts(self):
""" Compression setting. Int(0-9) for gzip, 2-tuple for szip. """
return self._filters.get(self.compression, None)

@property
@with_phil
def filter_ids(self):
"""Numeric IDs of HDF5 filters used for this dataset"""
pl = self._dcpl
return tuple([pl.get_filter(i)[0] for i in range(pl.get_nfilters())])

@property
@with_phil
def filter_names(self):
"""Names, as stored in the file, of the filters used for this dataset"""
pl = self._dcpl
return tuple([pl.get_filter(i)[3].decode('utf-8', 'surrogateescape')
for i in range(pl.get_nfilters())])

@property
@with_phil
def shuffle(self):
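For context, these properties are thin wrappers over the dataset creation property list; a rough low-level equivalent, assuming an already-open dataset ds, would be:

plist = ds.id.get_create_plist()
for i in range(plist.get_nfilters()):
    # get_filter(i) returns (filter_id, flags, options, name)
    filter_id, flags, opts, name = plist.get_filter(i)
    print(filter_id, name.decode('utf-8', 'surrogateescape'))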
166 changes: 137 additions & 29 deletions h5py/_hl/filters.py
@@ -143,6 +143,113 @@ class Gzip(FilterRefBase):
def __init__(self, level=DEFAULT_GZIP):
self.filter_options = (level,)


class FilterPipeline:
def __init__(self, filters: list):
self.filters = filters

@classmethod
def from_plist(cls, plist):
return cls([plist.get_filter(i) for i in range(plist.get_nfilters())])

def apply_to_plist(self, plist):
# Remove all existing filters
nf = plist.get_nfilters()
while nf:
fid = plist.get_filter(0)[0]
plist.remove_filter(fid)
nf_new = plist.get_nfilters()
assert nf_new < nf # Protect against infinite loops
nf = nf_new

# Add filters
for fid, flags, opts, _ in self.filters:
plist.set_filter(fid, flags, opts)

def copy(self):
return type(self)(self.filters.copy())

def __eq__(self, other):
return [f[:3] for f in self.filters] == [f[:3] for f in other.filters]

def find(self, filter_id):
for i, row in enumerate(self.filters):
if row[0] == filter_id:
return i

def drop(self, filter_id):
i = self.find(filter_id)
if i is not None:
del self.filters[i]

def add_or_replace(self, filter_id, insert_at, options=(), flags=1):
i = self.find(filter_id)
if i is None:
self.filters.insert(insert_at, (filter_id, flags, options, b''))
else:
self.filters[i] = (filter_id, flags, options, self.filters[i][3])

def set_scaleoffset(self, scaleoffset, dtype):
    if scaleoffset is False:
        self.drop(h5z.FILTER_SCALEOFFSET)
        return
    if dtype.kind in ('u', 'i'):
        so_opts = (h5z.SO_INT, scaleoffset)
    else:  # dtype.kind == 'f'
        so_opts = (h5z.SO_FLOAT_DSCALE, scaleoffset)
    self.add_or_replace(h5z.FILTER_SCALEOFFSET, 0, options=so_opts)

def set_shuffle(self, shuffle):
if shuffle:
# Shuffle goes after scaleoffset, but before any compression
insert_at = 0
if self.filters and (self.filters[0][0] == h5z.FILTER_SCALEOFFSET):
insert_at = 1
self.add_or_replace(h5z.FILTER_SHUFFLE, insert_at)
else:
self.drop(h5z.FILTER_SHUFFLE)

def clear_compression(self):
# Wipe out any filters already in the pipeline, except those that are
# separate from h5py's compression keyword arg
non_compression_filters = {
h5z.FILTER_SCALEOFFSET, h5z.FILTER_SHUFFLE, h5z.FILTER_FLETCHER32
}
self.filters = [f for f in self.filters if f[0] in non_compression_filters]

def set_compression_filter(self, filter_id: int, opts=(), allow_unknown=False):
if not allow_unknown and not h5z.filter_avail(filter_id):
raise ValueError("Unknown compression filter number: %s" % filter_id)

self.clear_compression()
insert_at = 0
while insert_at < len(self.filters) and (self.filters[insert_at][0] in {
h5z.FILTER_SCALEOFFSET, h5z.FILTER_SHUFFLE
}):
insert_at += 1

self.filters.insert(insert_at, (filter_id, h5z.FLAG_OPTIONAL, opts, b''))

def set_deflate(self, level=5):
self.set_compression_filter(h5z.FILTER_DEFLATE, (level,))

def set_lzf(self):
self.set_compression_filter(h5z.FILTER_LZF)

def set_szip(self, szmethod, szpix):
opts = {'ec': h5z.SZIP_EC_OPTION_MASK, 'nn': h5z.SZIP_NN_OPTION_MASK}
self.set_compression_filter(h5z.FILTER_SZIP, (opts[szmethod], szpix))

def set_fletcher32(self, enabled):
# `fletcher32` must come after `compression`, otherwise, if `compression`
# is "szip" and the data is 64bit, the fletcher32 checksum will be wrong
# (see GitHub issue #953).
if enabled:
insert_at = len(self.filters)
# flags=0 means not optional (matching dcpl.set_fletcher32())
self.add_or_replace(h5z.FILTER_FLETCHER32, insert_at, flags=0)
else:
self.drop(h5z.FILTER_FLETCHER32)

def fill_dcpl(plist, shape, dtype, chunks, compression, compression_opts,
shuffle, fletcher32, maxshape, scaleoffset, external,
allow_unknown_filter=False):
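To make the intent of the new helper concrete, here is a hypothetical round-trip sketch; FilterPipeline is an internal class introduced in this diff, so the import path and behaviour are assumptions based on the code above:

import h5py
from h5py import h5p, h5z
from h5py._hl.filters import FilterPipeline  # internal helper added by this PR

with h5py.File('pipeline-demo.h5', 'w') as f:
    src = f.create_dataset('src', shape=(100,), dtype='u2', chunks=(10,),
                           compression='gzip', shuffle=True)
    # Read the source dataset's pipeline, swap the compressor (shuffle is kept),
    # then write the result onto a fresh dataset creation property list.
    pipeline = FilterPipeline.from_plist(src.id.get_create_plist())
    pipeline.set_lzf()
    assert [entry[0] for entry in pipeline.filters] == [h5z.FILTER_SHUFFLE, h5z.FILTER_LZF]
    dcpl = h5p.create(h5p.DATASET_CREATE)
    dcpl.set_chunk((10,))  # filters require a chunked layout
    pipeline.apply_to_plist(dcpl)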
@@ -244,49 +351,50 @@ def rq_tuple(tpl, name):
external = _normalize_external(external)
# End argument validation

for item in external:
    plist.set_external(*item)

if chunks is None and any((
    shuffle, fletcher32, compression, maxshape, scaleoffset is not None
)):
    chunks = True

if maxshape is True:
    maxshape = (None,)*len(shape)

pre_chunked = plist.get_layout() == h5d.CHUNKED and plist.get_chunk() != ()
if chunks is True:
    # Guess chunk shape unless a passed-in property list already has chunks
    chunks = None if pre_chunked else guess_chunk(shape, maxshape, dtype.itemsize)

if chunks is not None:
    plist.set_chunk(chunks)
    plist.set_fill_time(h5d.FILL_TIME_ALLOC)  # prevent resize glitch

filters = FilterPipeline.from_plist(plist)
prior_filters = filters.copy()

if scaleoffset is not None:
    filters.set_scaleoffset(scaleoffset, dtype)

if shuffle is not None:
    filters.set_shuffle(shuffle)

if compression is False:
    filters.clear_compression()
elif compression == 'gzip':
    filters.set_deflate(gzip_level)
elif compression == 'lzf':
    filters.set_lzf()
elif compression == 'szip':
    filters.set_szip(szmethod, szpix)
elif isinstance(compression, int):
    filters.set_compression_filter(
        compression, compression_opts, allow_unknown_filter
    )

if fletcher32 is not None:
    filters.set_fletcher32(fletcher32)

if filters != prior_filters:
    filters.apply_to_plist(plist)

return plist

24 changes: 14 additions & 10 deletions h5py/_hl/group.py
@@ -298,7 +298,7 @@ def require_dataset(self, name, shape, dtype, exact=False, **kwds):

return dset

def create_dataset_like(self, name, other, **kwargs):
""" Create a dataset similar to `other`.

name
@@ -313,22 +313,26 @@ def create_dataset_like(self, name, other, **kwupdate):
shape and dtype, in which case the provided values take precedence over
those from `other`.
"""
kwargs.setdefault('shape', other.shape)
kwargs.setdefault('dtype', other.dtype)
dcpl = other.id.get_create_plist()
# track_times needs to be passed specifically, because otherwise h5py
# applies its default (False) to override the HDF5 default.
kwargs.setdefault('track_times', dcpl.get_obj_track_times())

# Special case: the maxshape property always exists, but if we pass it
# to create_dataset, the new dataset will automatically get chunked
# layout. So we copy it only if it is different from shape.
if other.maxshape != other.shape:
    kwargs.setdefault('maxshape', other.maxshape)

# For these keywords, passing None means turning something off.
# We use False to distinguish this from 'no change'.
for k in ('compression', 'scaleoffset'):
    if kwargs.get(k, ...) is None:
        kwargs[k] = False

return self.create_dataset(name, dcpl=dcpl, **kwargs)

def require_group(self, name):
# TODO: support kwargs like require_dataset
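The net effect on create_dataset_like, mirroring the tests below, is roughly this (a sketch, assuming an open file f):

src = f.create_dataset('src', shape=(100,), dtype='u2', chunks=(25,),
                       compression='gzip', shuffle=True)
same = f.create_dataset_like('same', src)                      # copies chunks and the whole filter pipeline
plain = f.create_dataset_like('plain', src, compression=None)  # None now removes compression
lzf = f.create_dataset_like('lzf', src, compression='lzf')     # swaps only the compression filter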
44 changes: 44 additions & 0 deletions h5py/tests/test_dataset.py
@@ -949,6 +949,38 @@ def test_maxshape(self):
self.assertEqual(similar.shape, (10,))
self.assertEqual(similar.maxshape, (20,))

@ut.skipIf('gzip' not in h5py.filters.encode, "DEFLATE is not installed")
def test_chunking_compression(self):
foo = self.f.create_dataset(
'foo', dtype=np.uint16, shape=(100,), chunks=(21,), compression='gzip'
)
bar = self.f.create_dataset_like('bar', foo)
self.assertEqual(bar.shape, (100,))
self.assertEqual(bar.chunks, (21,))
self.assertEqual(bar.filter_ids, (h5py.h5z.FILTER_DEFLATE,))

@ut.skipIf('gzip' not in h5py.filters.encode, "DEFLATE is not installed")
def test_remove_compression(self):
foo = self.f.create_dataset(
'foo', dtype=np.uint16, shape=(100,), chunks=(21,), compression='gzip'
)
baz = self.f.create_dataset_like('baz', foo, compression=None)
self.assertEqual(baz.filter_ids, ())
self.assertEqual(baz.chunks, (21,))

@ut.skipIf('gzip' not in h5py.filters.encode, "DEFLATE is not installed")
@ut.skipIf('lzf' not in h5py.filters.encode, "LZF is not installed")
def test_replace_compression(self):
foo = self.f.create_dataset(
'foo', dtype=np.uint16, shape=(100,), chunks=(21,), compression='gzip',
shuffle=True, fletcher32=True,
)
baz = self.f.create_dataset_like('baz', foo, compression='lzf')
self.assertEqual(baz.filter_ids, (
h5py.h5z.FILTER_SHUFFLE, h5py.h5z.FILTER_LZF, h5py.h5z.FILTER_FLETCHER32
))
self.assertEqual(baz.chunks, (21,))

class TestChunkIterator(BaseDataset):
def test_no_chunks(self):
dset = self.f.create_dataset("foo", ())
@@ -1857,6 +1889,7 @@ def test_allow_unknown_filter(writable_file):
allow_unknown_filter=True
)
assert str(fake_filter_id) in ds._filters
assert ds.compression == 'unknown'


def test_dset_chunk_cache():
Expand Down Expand Up @@ -1951,3 +1984,14 @@ def test_virtual_prefix_require(self):
self.assertEqual(virtual_prefix, virtual_prefix_readback)
self.assertIsInstance(dset, Dataset)
self.assertEqual(dset.shape, (10, 3))


def test_filter_properties(writable_file):
ds = writable_file.create_dataset(
'foo', shape=1000, dtype=np.float32,
fletcher32=True, shuffle=True, compression='lzf'
)
assert ds.filter_ids == (
h5py.h5z.FILTER_SHUFFLE, h5py.h5z.FILTER_LZF, h5py.h5z.FILTER_FLETCHER32
)
assert ds.filter_names == ('shuffle', 'lzf', 'fletcher32')