From eb8c20ed5b491d99d35486bf234265a05de43e71 Mon Sep 17 00:00:00 2001
From: darothen
Date: Thu, 4 May 2017 13:49:55 -0400
Subject: [PATCH] Add hotfix for issue with overwriting encoding attributes

---
 doc/usage.rst   | 27 +++++++++++++++++++++++----
 xbpch/common.py | 44 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/doc/usage.rst b/doc/usage.rst
index 73d7bec..5b3f8f6 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -126,7 +126,7 @@ data may look something like:
 This graph illustrates that dask is expected to process 12 chunks of data - one
 for each month (timestep) in the dataset. The graph shows the operations for
 reading the data, casting it to the correct data type, and re-scaling, which are
-applied automatically by **xbpch** and xarray. 
+applied automatically by **xbpch** and xarray.
 
 At this point, the data has only been processed in such a way that it fits
 the numpy.ndarray memory model, and thus can be used to construct xarray
@@ -146,13 +146,13 @@ which produces the computational graph
    :alt: Normalization calculation on monthly data
 
    Computational graph for normalizing monthly data
-   
+
 
 A second key function of ``dask`` is to analyze and parse these computational
 graphs into a simplified form. In practice, the resulting graph will be much
 simpler, which can dramatically speed up your analysis. For instance, if you
 sub-sample the variables and timesteps used in your analysis, **xbpch**
 (through dask) will avoid reading extra, unused data from the input files you passed
-it. 
+it.
 
 .. note::
@@ -222,7 +222,7 @@ encoded:
 
 .. ipython:: python
    :verbatim:
-   
+
 
    import matplotlib.pyplot as plt
    import cartopy.crs as ccrs
@@ -348,3 +348,22 @@ They can then be read back in via xarray
 
    import xarray as xr
    ds = xr.open_dataset("my_bpch_data.nc")
+
+.. note::
+
+   As of v0.2.0, immediately writing to netCDF may not work, due to the way
+   variable units and scaling factors are encoded when **xbpch** reads the
+   data. This will be fixed once some upstream issues with xarray are
+   patched. If you run into the following ``ValueError``::
+
+       ValueError: Failed hard to prevent overwriting key 'scale_factor'
+
+   then, before saving, process your dataset with the
+   :func:`xbpch.common.fix_attr_encoding` function:
+
+   .. ipython:: python
+      :verbatim:
+
+      my_ds = xbpch.common.fix_attr_encoding(my_ds)
+
+      my_ds.to_netcdf("my_data.nc")
diff --git a/xbpch/common.py b/xbpch/common.py
index 8284740..c6476c2 100644
--- a/xbpch/common.py
+++ b/xbpch/common.py
@@ -46,4 +46,46 @@ def get_timestamp(time=True, date=True, fmt=None):
     else:
         raise ValueError("One of `date` or `time` must be True!")
 
-    return datetime.now().strftime(fmt)
\ No newline at end of file
+    return datetime.now().strftime(fmt)
+
+
+def fix_attr_encoding(ds):
+    """ This is a temporary hot-fix to handle the way metadata is encoded
+    when we read data directly from bpch files. It removes the 'scale_factor'
+    and 'units' attributes we encode with the data we ingest, converts the
+    'hydrocarbon' and 'chemical' attributes to binary integers instead of
+    booleans, and removes the 'units' attribute from the "time" dimension
+    since that too is implicitly encoded.
+
+    In future versions of this library, when upstream issues in decoding
+    data wrapped in dask arrays are fixed, this won't be necessary and will
+    be removed.
+
+    """
+
+    def _maybe_del_attr(da, attr):
+        """ Possibly delete an attribute on a DataArray if it's present. """
+        if attr in da.attrs:
+            del da.attrs[attr]
+        return da
+
+    def _maybe_decode_attr(da, attr):
+        """ Possibly coerce an attribute on a DataArray to a type that is
+        easier to write to disk. """
+        # TODO: Remove this workaround once bools can be written as netCDF
+        # attributes directly; for now, coerce bool -> int.
+        if (attr in da.attrs) and isinstance(da.attrs[attr], bool):
+            da.attrs[attr] = int(da.attrs[attr])
+        return da
+
+    for v in ds.data_vars:
+        da = ds[v]
+        da = _maybe_del_attr(da, 'scale_factor')
+        da = _maybe_del_attr(da, 'units')
+        da = _maybe_decode_attr(da, 'hydrocarbon')
+        da = _maybe_decode_attr(da, 'chemical')
+    # Also delete the implicitly-encoded 'units' attribute on "time".
+    times = ds.time
+    times = _maybe_del_attr(times, 'units')
+
+    return ds
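
For reference, a minimal sketch of the new helper end to end. The dataset,
the variable name ("O3"), and the attribute values below are made up for
illustration; it assumes numpy and xarray are available alongside xbpch::

    import numpy as np
    import xarray as xr

    from xbpch.common import fix_attr_encoding

    # Toy dataset mimicking the attributes xbpch attaches when it ingests
    # bpch data (hypothetical values, not real model output)
    ds = xr.Dataset(
        {"O3": (("time", "lon"), np.random.rand(2, 4))},
        coords={"time": [0, 1], "lon": [0.0, 90.0, 180.0, 270.0]},
    )
    ds["O3"].attrs.update(scale_factor=1e-9, units="ppbv",
                          hydrocarbon=False, chemical=True)
    ds["time"].attrs["units"] = "hours since 1985-01-01 00:00:00"

    ds = fix_attr_encoding(ds)

    # 'scale_factor' and 'units' are gone; the bools are now 0/1 integers
    print(ds["O3"].attrs)       # {'hydrocarbon': 0, 'chemical': 1}
    ds.to_netcdf("my_data.nc")  # no longer trips the 'scale_factor' ValueError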