From eb8c20ed5b491d99d35486bf234265a05de43e71 Mon Sep 17 00:00:00 2001
From: darothen
Date: Thu, 4 May 2017 13:49:55 -0400
Subject: [PATCH] Add hotfix for issue with overwriting encoding attributes

---
 doc/usage.rst   | 27 +++++++++++++++++++++++----
 xbpch/common.py | 44 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/doc/usage.rst b/doc/usage.rst
index 73d7bec..5b3f8f6 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -126,7 +126,7 @@ data may look something like:
 This graph illustrates that dask is expected to process 12 chunks of data - one
 for each month (timestep) in the dataset. The graph shows the operations for
 reading the data, casting it to the correct data type, and re-scaling, which are
-applied automatically by **xbpch** and xarray. 
+applied automatically by **xbpch** and xarray.
 
 At this point, the data has only been processed in such a way that it fits
 the numpy.ndarray memory model, and thus can be used to construct xarray
@@ -146,13 +146,13 @@ which produces the computational graph
    :alt: Normalization calculation on monthly data
 
    Computational graph for normalizing monthly data
-   
+
 
 A second key function of ``dask`` is to analyze and parse these computational
 graphs into a simplified form. In practice, the resulting graph will be much
 simpler, which can dramatically speed up your analysis. For instance, if you
 sub-sample the variables and timesteps used in your analysis, **xbpch**
 (through dask) will avoid reading extra, unused data from the input files you passed
-it. 
+it.
 
 .. note::
@@ -222,7 +222,7 @@ encoded:
 
 .. ipython:: python
    :verbatim:
-   
+
 
    import matplotlib.pyplot as plt
    import cartopy.crs as ccrs
@@ -348,3 +348,22 @@ They can then be read back in via xarray
 
    import xarray as xr
    ds = xr.open_dataset("my_bpch_data.nc")
+
+.. note::
+
+   As of v0.2.0, immediately writing to netCDF may not work, due to the way
+   variable units and scaling factors are encoded when **xbpch** reads the
+   data. This will be fixed once some upstream issues with xarray are
+   patched. If you run into the following ``ValueError``::
+
+       ValueError: Failed hard to prevent overwriting key 'scale_factor'
+
+   then, before saving, process your dataset with the
+   :func:`xbpch.common.fix_attr_encoding` function:
+
+   .. ipython:: python
+      :verbatim:
+
+      my_ds = xbpch.common.fix_attr_encoding(my_ds)
+
+      my_ds.to_netcdf("my_data.nc")
diff --git a/xbpch/common.py b/xbpch/common.py
index 8284740..c6476c2 100644
--- a/xbpch/common.py
+++ b/xbpch/common.py
@@ -46,4 +46,46 @@ def get_timestamp(time=True, date=True, fmt=None):
     else:
         raise ValueError("One of `date` or `time` must be True!")
 
-    return datetime.now().strftime(fmt)
\ No newline at end of file
+    return datetime.now().strftime(fmt)
+
+
+def fix_attr_encoding(ds):
+    """ This is a temporary hot-fix to handle the way metadata is encoded
+    when we read data directly from bpch files. It removes the 'scale_factor'
+    and 'units' attributes we encode with the data we ingest, converts the
+    'hydrocarbon' and 'chemical' attributes to binary integers instead of
+    booleans, and removes the 'units' attribute from the "time" dimension
+    since that too is implicitly encoded.
+
+    In future versions of this library, when upstream issues in decoding
+    data wrapped in dask arrays are fixed, this won't be necessary and will
+    be removed.
+
+    """
+
+    def _maybe_del_attr(da, attr):
+        """ Possibly delete an attribute on a DataArray if it's present. """
+        if attr in da.attrs:
+            del da.attrs[attr]
+        return da
+
+    def _maybe_decode_attr(da, attr):
+        """ Possibly coerce an attribute on a DataArray to a type that is
+        easier to write to disk. """
+        # TODO: Remove this workaround once bools can be written as netCDF
+        # attributes directly; for now, coerce bool -> int.
+        if (attr in da.attrs) and isinstance(da.attrs[attr], bool):
+            da.attrs[attr] = int(da.attrs[attr])
+        return da
+
+    for v in ds.data_vars:
+        da = ds[v]
+        da = _maybe_del_attr(da, 'scale_factor')
+        da = _maybe_del_attr(da, 'units')
+        da = _maybe_decode_attr(da, 'hydrocarbon')
+        da = _maybe_decode_attr(da, 'chemical')
+    # Also delete the implicitly-encoded 'units' attribute on "time".
+    times = ds.time
+    times = _maybe_del_attr(times, 'units')
+
+    return ds
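
For reference, a minimal sketch of the new helper end to end. The dataset,
the variable name ("O3"), and the attribute values below are made up for
illustration; it assumes numpy and xarray are available alongside xbpch::

    import numpy as np
    import xarray as xr

    from xbpch.common import fix_attr_encoding

    # Toy dataset mimicking the attributes xbpch attaches when it ingests
    # bpch data (hypothetical values, not real model output)
    ds = xr.Dataset(
        {"O3": (("time", "lon"), np.random.rand(2, 4))},
        coords={"time": [0, 1], "lon": [0.0, 90.0, 180.0, 270.0]},
    )
    ds["O3"].attrs.update(scale_factor=1e-9, units="ppbv",
                          hydrocarbon=False, chemical=True)
    ds["time"].attrs["units"] = "hours since 1985-01-01 00:00:00"

    ds = fix_attr_encoding(ds)

    # 'scale_factor' and 'units' are gone; the bools are now 0/1 integers
    print(ds["O3"].attrs)       # {'hydrocarbon': 0, 'chemical': 1}
    ds.to_netcdf("my_data.nc")  # no longer trips the 'scale_factor' ValueError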