From e25aba51e48c7a8d74b530b3f8d44ab3443ed39a Mon Sep 17 00:00:00 2001 From: Matthew Larson Date: Tue, 5 Dec 2023 16:22:13 -0600 Subject: [PATCH] Implement low level H5Dread_multi call --- h5py/_proxy.pxd | 4 +- h5py/_proxy.pyx | 183 +++++++++++++++++++--------- h5py/api_functions.txt | 3 + h5py/h5d.pyx | 98 ++++++++++++--- h5py/tests/__init__.py | 2 +- h5py/tests/test_h5d_direct_chunk.py | 70 +++++++++++ news/dset_multi.rst | 4 + 7 files changed, 288 insertions(+), 76 deletions(-) create mode 100644 news/dset_multi.rst diff --git a/h5py/_proxy.pxd b/h5py/_proxy.pxd index 5fd2cc685..49b79440d 100644 --- a/h5py/_proxy.pxd +++ b/h5py/_proxy.pxd @@ -12,7 +12,7 @@ from .defs cimport * cdef herr_t attr_rw(hid_t attr, hid_t mtype, void *progbuf, int read) except -1 -cdef herr_t dset_rw(hid_t dset, hid_t mtype, hid_t mspace, hid_t fspace, - hid_t dxpl, void* progbuf, int read) except -1 +cdef herr_t dset_rw(size_t count, hid_t* _dset, hid_t* _mtype, hid_t* _mspace, hid_t* _fspace, + hid_t dxpl, void **progbuf, int read) except -1 cdef htri_t needs_bkg_buffer(hid_t src, hid_t dst) except -1 diff --git a/h5py/_proxy.pyx b/h5py/_proxy.pyx index e40504f53..12f853020 100644 --- a/h5py/_proxy.pyx +++ b/h5py/_proxy.pyx @@ -12,6 +12,7 @@ """ Proxy functions for read/write, to work around the HDF5 bogus type issue. """ +import numpy as np include "config.pxi" @@ -81,85 +82,153 @@ cdef herr_t attr_rw(hid_t attr, hid_t mtype, void *progbuf, int read) except -1: # ============================================================================= # Proxy for vlen buf workaround +cdef herr_t dset_rw(size_t count, hid_t* dset, hid_t* mtype, hid_t* mspace, hid_t* _fspace, + hid_t dxpl, void **progbuf, int read) except -1: -cdef herr_t dset_rw(hid_t dset, hid_t mtype, hid_t mspace, hid_t fspace, - hid_t dxpl, void* progbuf, int read) except -1: - - cdef htri_t need_bkg - cdef hid_t dstype = -1 # Dataset datatype + cdef hid_t plist_id = -1 cdef hid_t rawdstype = -1 - cdef hid_t dspace = -1 # Dataset dataspace - cdef hid_t cspace = -1 # Temporary contiguous dataspaces - cdef void* back_buf = NULL - cdef void* conv_buf = NULL - cdef hsize_t npoints + cdef hid_t* dstype = NULL # Dataset datatype + cdef hid_t* cspace = NULL # Temporary contiguous dataspaces + cdef hid_t* mspace_tmp = NULL + cdef hid_t* fspace_tmp = NULL + + cdef htri_t* need_bkg = NULL + + cdef void** back_buf = NULL + cdef void** conv_buf = NULL + + cdef hsize_t* npoints = NULL + + cdef bint rw_needs_proxy = False try: + # Make local list of mem/file spaces which may be freely modified + mspace_tmp = malloc(sizeof(hid_t*) * count) + fspace_tmp = malloc(sizeof(hid_t*) * count) + dstype = malloc(sizeof(hid_t*) * count) + + for i in range(count): + mspace_tmp[i] = mspace[i] + fspace_tmp[i] = _fspace[i] + # Issue 372: when a compound type is involved, using the dataset type # may result in uninitialized data being sent to H5Tconvert for fields # not present in the memory type. Limit the type used for the dataset # to only those fields present in the memory type. We can't use the # memory type directly because of course that triggers HDFFV-1063. 
- if (H5Tget_class(mtype) == H5T_COMPOUND) and (not read): - rawdstype = H5Dget_type(dset) - dstype = make_reduced_type(mtype, rawdstype) - H5Tclose(rawdstype) - else: - dstype = H5Dget_type(dset) - - if not (needs_proxy(dstype) or needs_proxy(mtype)): - if read: - H5Dread(dset, mtype, mspace, fspace, dxpl, progbuf) + for i in range(count): + if (H5Tget_class(mtype[i]) == H5T_COMPOUND) and (not read): + rawdstype = H5Dget_type(dset[i]) + dstype[i] = make_reduced_type(mtype[i], rawdstype) + H5Tclose(rawdstype) + rawdstype = -1 else: - H5Dwrite(dset, mtype, mspace, fspace, dxpl, progbuf) - else: - - if mspace == H5S_ALL and fspace != H5S_ALL: - mspace = fspace - elif mspace != H5S_ALL and fspace == H5S_ALL: - fspace = mspace - elif mspace == H5S_ALL and fspace == H5S_ALL: - fspace = mspace = dspace = H5Dget_space(dset) - - npoints = H5Sget_select_npoints(mspace) - cspace = H5Screate_simple(1, &npoints, NULL) + dstype[i] = H5Dget_type(dset[i]) - conv_buf = create_buffer(H5Tget_size(dstype), H5Tget_size(mtype), npoints) + rw_needs_proxy = rw_needs_proxy or (needs_proxy(dstype[i]) or needs_proxy(mtype[i])) - # Only create a (contiguous) backing buffer if absolutely - # necessary. Note this buffer always has memory type. + if not rw_needs_proxy: if read: - need_bkg = needs_bkg_buffer(dstype, mtype) + if count > 1: + H5Dread_multi(count, dset, mtype, mspace_tmp, fspace_tmp, dxpl, progbuf) + else: + H5Dread(dset[0], mtype[0], mspace_tmp[0], fspace_tmp[0], dxpl, progbuf[0]) else: - need_bkg = needs_bkg_buffer(mtype, dstype) - if need_bkg: - back_buf = create_buffer(H5Tget_size(dstype), H5Tget_size(mtype), npoints) + if count > 1: + H5Dwrite_multi(count, dset, mtype, mspace_tmp, fspace_tmp, dxpl, progbuf) + else: + H5Dwrite(dset[0], mtype[0],mspace_tmp[0], fspace_tmp[0], dxpl, progbuf[0]) + else: + cspace = malloc(sizeof(hid_t*) * count) + need_bkg = malloc(sizeof(htri_t) * count) + back_buf = malloc(sizeof(void*) * count) + conv_buf = malloc(sizeof(void*) * count) + npoints = malloc(sizeof(hsize_t) * count) + + for i in range(count): + back_buf[i] = NULL + conv_buf[i] = NULL + + for i in range(count): + if mspace_tmp[i] == H5S_ALL and fspace_tmp[i] != H5S_ALL: + mspace_tmp[i] = fspace_tmp[i] + elif mspace_tmp[i] != H5S_ALL and fspace_tmp[i] == H5S_ALL: + fspace_tmp[i] = mspace_tmp[i] + elif mspace_tmp[i] == H5S_ALL and fspace_tmp[i] == H5S_ALL: + mspace_tmp[i] = fspace_tmp[i] = H5Dget_space(dset[i]) + + npoints[i] = H5Sget_select_npoints(mspace_tmp[i]) + cspace[i] = H5Screate_simple(1, &npoints[i], NULL) + + conv_buf[i] = create_buffer(H5Tget_size(dstype[i]), H5Tget_size(mtype[i]), npoints[i]) + + # Only create a (contiguous) backing buffer if absolutely + # necessary. Note this buffer always has memory type. 
if read: - h5py_copy(mtype, mspace, back_buf, progbuf, H5PY_GATHER) + need_bkg[i] = needs_bkg_buffer(dstype[i], mtype[i]) + else: + need_bkg[i] = needs_bkg_buffer(mtype[i], dstype[i]) + + if need_bkg[i]: + back_buf[i] = create_buffer(H5Tget_size(dstype[i]), H5Tget_size(mtype[i]), npoints[i]) + if read: + h5py_copy(mtype[i], mspace_tmp[i], back_buf[i], progbuf[i], H5PY_GATHER) if read: - H5Dread(dset, dstype, cspace, fspace, dxpl, conv_buf) - H5Tconvert(dstype, mtype, npoints, conv_buf, back_buf, dxpl) - h5py_copy(mtype, mspace, conv_buf, progbuf, H5PY_SCATTER) + if count > 1: + H5Dread_multi(count, dset, mtype, mspace_tmp, fspace_tmp, dxpl, conv_buf) + else: + H5Dread(dset[0], dstype[0], cspace[0], fspace_tmp[0], dxpl, conv_buf[0]) + + for i in range(count): + H5Tconvert(dstype[i], mtype[i], npoints[i], conv_buf[i], back_buf[i], dxpl) + h5py_copy(mtype[i], mspace_tmp[i], conv_buf[i], progbuf[i], H5PY_SCATTER) else: - h5py_copy(mtype, mspace, conv_buf, progbuf, H5PY_GATHER) - H5Tconvert(mtype, dstype, npoints, conv_buf, back_buf, dxpl) - H5Dwrite(dset, dstype, cspace, fspace, dxpl, conv_buf) - H5Dvlen_reclaim(dstype, cspace, H5P_DEFAULT, conv_buf) + for i in range(count): + h5py_copy(mtype[i], mspace_tmp[i], conv_buf[i], progbuf[i], H5PY_GATHER) + H5Tconvert(mtype[i], dstype[i], npoints[i], conv_buf[i], back_buf[i], dxpl) - finally: - free(back_buf) - free(conv_buf) - if dstype > 0: - H5Tclose(dstype) - if dspace > 0: - H5Sclose(dspace) - if cspace > 0: - H5Sclose(cspace) + if count > 1: + H5Dwrite_multi(count, dset, dstype, cspace, fspace_tmp, dxpl, conv_buf) + else: + H5Dwrite(dset[0], dstype[0], cspace[0], fspace_tmp[0], dxpl, conv_buf[0]) - return 0 + for i in range(count): + H5Dvlen_reclaim(dstype[i], cspace[i], H5P_DEFAULT, conv_buf[i]) + finally: + + for i in range(count): + if (back_buf != NULL) and (need_bkg[i]) and (back_buf[i] != NULL): + free(back_buf[i]) + + if (conv_buf != NULL) and (conv_buf[i] != NULL): + free(conv_buf[i]) + + if cspace and (cspace[i] > 0): + H5Sclose(cspace[i]) + if dstype and (dstype[i] > 0): + H5Tclose(dstype[i]) + + if mspace_tmp != NULL: + free(mspace_tmp) + if fspace_tmp != NULL: + free(fspace_tmp) + + if npoints != NULL: + free(npoints) + if need_bkg != NULL: + free(need_bkg) + if back_buf != NULL: + free(back_buf) + if conv_buf != NULL: + free(conv_buf) + if cspace != NULL: + free(cspace) + + if rawdstype > 0: + H5Tclose(rawdstype) cdef hid_t make_reduced_type(hid_t mtype, hid_t dstype): # Go through dstype, pick out the fields which also appear in mtype, and diff --git a/h5py/api_functions.txt b/h5py/api_functions.txt index 1a1a226da..93284a8d6 100644 --- a/h5py/api_functions.txt +++ b/h5py/api_functions.txt @@ -99,6 +99,9 @@ hdf5: herr_t H5Dread(hid_t dset_id, hid_t mem_type_id, hid_t mem_space_id, hid_t file_space_id, hid_t plist_id, void *buf) nogil herr_t H5Dwrite(hid_t dset_id, hid_t mem_type, hid_t mem_space, hid_t file_space, hid_t xfer_plist, void* buf) nogil + 1.14.0-1.16.99 herr_t H5Dread_multi(size_t count, hid_t *dset_id, hid_t *mem_type_id, hid_t *mem_space_id, hid_t *file_space_id, hid_t dxpl_id, void **buf) + 1.14.0-1.16.99 herr_t H5Dwrite_multi(size_t count, hid_t *dset_id, hid_t *mem_type_id, hid_t *mem_space_id, hid_t *file_space_id, hid_t dxpl_id, const void **buf) + herr_t H5Dextend(hid_t dataset_id, hsize_t *size) nogil herr_t H5Dfill(void *fill, hid_t fill_type_id, void *buf, hid_t buf_type_id, hid_t space_id) nogil diff --git a/h5py/h5d.pyx b/h5py/h5d.pyx index e03c06997..a791d9ac8 100644 --- a/h5py/h5d.pyx +++ b/h5py/h5d.pyx 
@@ -18,6 +18,8 @@ include "config.pxi" from collections import namedtuple cimport cython from ._objects cimport pdefault +# TODO +import numpy as np from numpy cimport ndarray, import_array, PyArray_DATA from .utils cimport check_numpy_read, check_numpy_write, \ convert_tuple, convert_dims, emalloc, efree @@ -26,6 +28,7 @@ from .h5s cimport SpaceID from .h5p cimport PropID, propwrap from ._proxy cimport dset_rw +from ._hl import selections as sel from ._objects import phil, with_phil from cpython cimport PyBUF_ANY_CONTIGUOUS, \ PyBuffer_Release, \ @@ -151,6 +154,55 @@ def open(ObjectID loc not None, char* name, PropID dapl=None): """ return DatasetID(H5Dopen(loc.id, name, pdefault(dapl))) +IF HDF5_VERSION >= (1, 14, 0): + @with_phil + def read_multi(count, list dataset_ids, list mspace_ids, list fspace_ids, + list type_ids, list bufs not None, PropID dxpl=None): + """ (int count, list dataset_ids, list mspace_ids, list fspace_ids, + list type_ids, list bufs not None, PropID dxpl=None) + + Read raw data from a set of datasets into the provided buffers. + + For each dataset that will be read, its id, the id of a corresponding memory + and file space, a type id, and a buffer should be provided. The dataset + transfer property list applies to all transfers. + """ + + cdef hid_t* type_hids + cdef hid_t* mspace_hids + cdef hid_t* fspace_hids + cdef hid_t* dataset_hids + cdef void** buffer_ptrs + + cdef hid_t plist_id + + try: + buffer_ptrs = malloc(count * sizeof(void*)) + type_hids = malloc(count * sizeof(hid_t*)) + mspace_hids = malloc(count * sizeof(hid_t*)) + fspace_hids = malloc(count * sizeof(hid_t*)) + dataset_hids = malloc(count * sizeof(hid_t*)) + + plist_id = pdefault(dxpl) + + for i in range(count): + buffer_ptrs[i] = PyArray_DATA(bufs[i]) + type_hids[i] = type_ids[i] + mspace_hids[i] = mspace_ids[i] + fspace_hids[i] = fspace_ids[i] + dataset_hids[i] = dataset_ids[i] + + dset_rw(count, dataset_hids, type_hids, mspace_hids, fspace_hids, plist_id, buffer_ptrs, 1) + + finally: + free(type_hids) + free(mspace_hids) + free(fspace_hids) + free(dataset_hids) + free(buffer_ptrs) + + + # --- Proxy functions for safe(r) threading ----------------------------------- @@ -224,22 +276,29 @@ cdef class DatasetID(ObjectID): this is not the case, ValueError will be raised and the read will fail. Keyword dxpl may be a dataset transfer property list. """ - cdef hid_t self_id, mtype_id, mspace_id, fspace_id, plist_id - cdef void* data + cdef hid_t *self_id + cdef hid_t *mtype_id + cdef hid_t *mspace_id + cdef hid_t *fspace_id + cdef hid_t plist_id + + cdef void** data cdef int oldflags if mtype is None: mtype = py_create(arr_obj.dtype) check_numpy_write(arr_obj, -1) - self_id = self.id - mtype_id = mtype.id - mspace_id = mspace.id - fspace_id = fspace.id + self_id = &self.id + mtype_id = &mtype.id + mspace_id = &mspace.id + fspace_id = &fspace.id plist_id = pdefault(dxpl) - data = PyArray_DATA(arr_obj) - dset_rw(self_id, mtype_id, mspace_id, fspace_id, plist_id, data, 1) + data_tmp = PyArray_DATA(arr_obj) + data = &data_tmp + + dset_rw(1, self_id, mtype_id, mspace_id, fspace_id, plist_id, data, 1) @with_phil @@ -264,22 +323,29 @@ cdef class DatasetID(ObjectID): The provided Numpy array must be C-contiguous. If this is not the case, ValueError will be raised and the read will fail. 
""" - cdef hid_t self_id, mtype_id, mspace_id, fspace_id, plist_id - cdef void* data + cdef hid_t *self_id + cdef hid_t *mtype_id + cdef hid_t *mspace_id + cdef hid_t *fspace_id + cdef hid_t plist_id + + cdef void** data cdef int oldflags if mtype is None: mtype = py_create(arr_obj.dtype) check_numpy_read(arr_obj, -1) - self_id = self.id - mtype_id = mtype.id - mspace_id = mspace.id - fspace_id = fspace.id + self_id = &(self.id) + mtype_id = &(mtype.id) + mspace_id = &(mspace.id) + fspace_id = &(fspace.id) plist_id = pdefault(dxpl) - data = PyArray_DATA(arr_obj) - dset_rw(self_id, mtype_id, mspace_id, fspace_id, plist_id, data, 0) + data_tmp = PyArray_DATA(arr_obj) + data = &data_tmp + + dset_rw(1, self_id, mtype_id, mspace_id, fspace_id, plist_id, data, 0) @with_phil diff --git a/h5py/tests/__init__.py b/h5py/tests/__init__.py index 3fb68a2d3..807bfd5b5 100644 --- a/h5py/tests/__init__.py +++ b/h5py/tests/__init__.py @@ -18,6 +18,6 @@ def run_tests(args=''): from shlex import split from subprocess import call from sys import executable - cli = [executable, "-m", "pytest", "--pyargs", "h5py"] + cli = [executable, "-m", "pytest", "-vv", "--pyargs", "h5py"] cli.extend(split(args)) return call(cli) diff --git a/h5py/tests/test_h5d_direct_chunk.py b/h5py/tests/test_h5d_direct_chunk.py index ad489a846..d98ac3a0a 100644 --- a/h5py/tests/test_h5d_direct_chunk.py +++ b/h5py/tests/test_h5d_direct_chunk.py @@ -186,3 +186,73 @@ def test_fail_buffer_not_contiguous(self, writable_file): out = array[:, :, ::2] # Array is not contiguous with pytest.raises(ValueError): dataset.id.read_direct_chunk((0, 0), out=out) + + +class TestReadMulti(TestCase): + def test_read_multi_one_dataset(self): + filename = self.mktemp().encode() + with h5py.File(filename, "w") as filehandle: + shape = (10, 10, 10) + dt = numpy.int32 + + # Write data + data_in = numpy.reshape(numpy.arange(numpy.prod(shape)), shape) + dataset = filehandle.create_dataset("data", shape, + dtype=dt, data=data_in) + + self.assertTrue(numpy.array_equal(data_in, dataset[...])) + + mspace_id = h5py.h5s.create_simple(shape) + fspace_id = h5py.h5s.create_simple(shape) + type_id = dataset.id.get_type() + data_out = numpy.zeros(shape=shape, dtype=dt) + + # Read back data and verify + h5py.h5d.read_multi(1, [dataset.id.id], [mspace_id.id], + [fspace_id.id], [type_id.id], [data_out], + None) + + self.assertTrue(numpy.array_equal(data_in, data_out)) + + def test_read_multi_many_datasets(self): + filename = self.mktemp().encode() + with h5py.File(filename, "w") as filehandle: + shape = (2, 2, 2) + dt = numpy.int32 + count = 3 + data = numpy.reshape(numpy.arange(numpy.prod(shape)), shape) + data_in = [] + data_out = [] + + datasets = [] + mspaces = [] + fspaces = [] + types = [] + + for i in range(count): + # Write data + data_in.append(data + i) + dataset = filehandle.create_dataset("data" + str(i), shape, + dtype=dt, data=data_in[i]) + + self.assertTrue(numpy.array_equal(data_in[i], dataset[...])) + + datasets.append(dataset.id) + mspaces.append(h5py.h5s.create_simple(shape)) + fspaces.append(h5py.h5s.create_simple(shape)) + types.append(dataset.id.get_type()) + + data_out.append(numpy.zeros(shape=shape, dtype=dt)) + + dataset_ids = [d.id for d in datasets] + mspace_ids = [m.id for m in mspaces] + fspace_ids = [f.id for f in fspaces] + type_ids = [t.id for t in types] + + # Read back data and verify + h5py.h5d.read_multi(count, dataset_ids, mspace_ids, + fspace_ids, type_ids, data_out, + None) + + for i in range(count): + 
+                self.assertTrue(numpy.array_equal(data_in[i], data_out[i]))
diff --git a/news/dset_multi.rst b/news/dset_multi.rst
new file mode 100644
index 000000000..a66d7e50f
--- /dev/null
+++ b/news/dset_multi.rst
@@ -0,0 +1,4 @@
+Exposing HDF5 functions
+-----------------------
+
+* Implemented the low-level ``h5py.h5d.read_multi`` call, exposing ``H5Dread_multi`` (requires HDF5 1.14 or newer)
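
A minimal usage sketch for the new ``h5py.h5d.read_multi`` call, mirroring the test added above. It assumes an h5py build against HDF5 1.14 or newer; the file name, shapes and values are illustrative only:

    import numpy as np
    import h5py

    # H5Dread_multi is only wrapped when h5py is built against HDF5 >= 1.14.
    assert h5py.version.hdf5_version_tuple >= (1, 14, 0)

    shape = (4, 4)
    count = 3

    with h5py.File("multi_demo.h5", "w") as f:
        # Create a few small datasets with distinct, easy-to-check contents.
        dsets = [f.create_dataset("data%d" % i, shape, dtype=np.int32,
                                  data=np.full(shape, i, dtype=np.int32))
                 for i in range(count)]

        # Keep the h5py ID wrappers referenced so the raw hid_t values stay valid.
        mspaces = [h5py.h5s.create_simple(shape) for _ in range(count)]
        fspaces = [h5py.h5s.create_simple(shape) for _ in range(count)]
        types = [d.id.get_type() for d in dsets]
        bufs = [np.zeros(shape, dtype=np.int32) for _ in range(count)]

        # One call reads every dataset; with matching types this maps to a
        # single H5Dread_multi underneath.
        h5py.h5d.read_multi(count,
                            [d.id.id for d in dsets],    # raw dataset ids
                            [m.id for m in mspaces],     # memory dataspace ids
                            [s.id for s in fspaces],     # file dataspace ids
                            [t.id for t in types],       # datatype ids
                            bufs,                        # one buffer per dataset
                            None)                        # default transfer plist

        for i, buf in enumerate(bufs):
            assert np.all(buf == i)

Because the raw ``hid_t`` values are taken from the h5py identifier wrappers, the wrapper objects (datasets, dataspaces, datatypes) must stay referenced until the call returns, as the test does by holding them in lists. The sketch also assumes each buffer is a C-contiguous NumPy array matching the corresponding memory datatype, since ``read_multi`` passes the raw buffer pointers straight through without the contiguity checks performed by ``DatasetID.read``.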