Skip to content

Commit

Permalink
Merge pull request #642 from duncanmmacleod/hdf5-T1800014
Browse files Browse the repository at this point in the history
Improved HDF5 writing to follow T1800014 proposals
  • Loading branch information
Duncan Macleod committed Jan 18, 2018
2 parents 88ae495 + 51bcd67 commit 46e10d8
Show file tree
Hide file tree
Showing 5 changed files with 201 additions and 101 deletions.
12 changes: 11 additions & 1 deletion .codeclimate.yml
@@ -1,4 +1,11 @@
engines:
version: "2"

checks:
file-lines:
config:
threshold: 500

plugins:
duplication:
enabled: true
config:
Expand All @@ -14,6 +21,9 @@ engines:
enabled: true
radon:
enabled: true
config:
threshold: "C"

ratings:
paths:
- "**.py"
Expand Down
57 changes: 25 additions & 32 deletions gwpy/io/hdf5.py
Expand Up @@ -131,46 +131,39 @@ def decorated_func(obj, fobj, *args, **kwargs):
return decorated_func


def create_dataset(parent, path, overwrite=False, **kwargs):
    """Create a new dataset inside the parent HDF5 object

    Parameters
    ----------
    parent : `h5py.Group`, `h5py.File`
        the object in which to create a new dataset

    path : `str`
        the path at which to create the new dataset

    overwrite : `bool`
        if `True`, delete any existing dataset at the desired path,
        default: `False`

    **kwargs
        other arguments are passed directly to
        :meth:`h5py.Group.create_dataset`

    Returns
    -------
    dataset : `h5py.Dataset`
        the newly created dataset

    Raises
    ------
    RuntimeError
        if a dataset already exists at ``path`` and ``overwrite=False``
    """
    # force deletion of existing dataset
    if path in parent and overwrite:
        del parent[path]

    # create new dataset; on a name collision annotate the error with a
    # hint that overwrite=True will replace the existing dataset
    try:
        return parent.create_dataset(path, **kwargs)
    except RuntimeError as exc:
        if str(exc) == 'Unable to create link (Name already exists)':
            exc.args = ('{0}: {1!r}, pass overwrite=True '
                        'to ignore existing datasets'.format(str(exc), path),)
        raise
9 changes: 4 additions & 5 deletions gwpy/spectrogram/io/__init__.py
Expand Up @@ -19,9 +19,8 @@
"""Input/Output routines for the Spectrogram.
"""

__author__ = "Duncan Macleod <duncan.macleod@ligo.org>"
from . import (
hdf5
)

try:
from . import hdf5
except ImportError:
pass
__author__ = "Duncan Macleod <duncan.macleod@ligo.org>"
18 changes: 13 additions & 5 deletions gwpy/timeseries/io/hdf5.py
Expand Up @@ -19,12 +19,16 @@
"""This module attaches the HDF5 input output methods to the TimeSeries.
"""

from astropy import units

from ...io import registry as io_registry
from ...io.hdf5 import (identify_hdf5, with_read_hdf5, with_write_hdf5)
from ...types.io.hdf5 import (read_hdf5_array, write_hdf5_array)
from ...types.io.hdf5 import (read_hdf5_array, write_hdf5_series)
from .. import (TimeSeries, TimeSeriesDict,
StateVector, StateVectorDict)

SEC_UNIT = units.second

__author__ = 'Duncan Macleod <duncan.macleod@ligo.org>'


Expand All @@ -42,6 +46,12 @@ def read_hdf5_timeseries(h5f, path=None, start=None, end=None, **kwargs):
return series


def _is_timeseries_dataset(dataset):
    """Return `True` if ``dataset`` appears to hold `TimeSeries` data

    A dataset is treated as time-series-like when its ``xunit``
    attribute names a unit convertible to seconds; datasets with no
    ``xunit`` attribute are rejected.
    """
    xunit = dataset.attrs.get('xunit', 'undef')
    return SEC_UNIT.is_equivalent(xunit)


@with_read_hdf5
def read_hdf5_dict(h5f, names=None, group=None, **kwargs):
"""Read a `TimeSeriesDict` from HDF5
Expand All @@ -54,9 +64,7 @@ def read_hdf5_dict(h5f, names=None, group=None, **kwargs):

# find list of names to read
if names is None:
# TODO: improve the TimeSeries -> HDF5 format to make detecting
# a TimeSeries easier
names = [key for key in h5g if 'dx' in h5g[key]]
names = [key for key in h5g if _is_timeseries_dataset(h5g[key])]

# read names
out = kwargs.pop('dict_type', TimeSeriesDict)()
Expand Down Expand Up @@ -108,7 +116,7 @@ def write_hdf5_dict(tsdict, h5f, group=None, **kwargs):
# register read/write/identify routines for each series class;
# note: exactly one writer per (format, class) pair — registering a
# second writer for 'hdf5' would raise in the astropy io registry
for series_class in (TimeSeries, StateVector):
    reader = read_hdf5_factory(series_class)
    io_registry.register_reader('hdf5', series_class, reader)
    io_registry.register_writer('hdf5', series_class, write_hdf5_series)
    io_registry.register_identifier('hdf5', series_class, identify_hdf5)

# dict classes
Expand Down

0 comments on commit 46e10d8

Please sign in to comment.