Fix to_npz (#12)

If the frame has a MultiIndex, it is restored when loading. This requires saving metadata alongside the arrays. If the metadata is not available because the file was saved with a previous version, the index class is inferred from the array values.
datarevenue-berlin · Jul 11, 2017 · 1c10bed · 1c10bed
1 parent ed6ae45
commit 1c10bed
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 1 deletion.
diff --git a/sparsity/io.py b/sparsity/io.py
@@ -26,17 +26,21 @@ def traildb_to_coo(db, fieldname):
 
 def to_npz(sf, filename):
+    """Write a sparse frame to ``filename`` in NumPy ``.npz`` format.
+
+    The archive holds the entries produced by ``_csr_to_dict(sf.data)``,
+    the raw index and column values, and a ``metadata`` dict recording
+    whether the index is a ``pd.MultiIndex`` so that the reader can rebuild
+    it.
+
+    :param sf: frame exposing ``data``, ``index`` and ``columns``.
+    :param filename: target handed straight to ``np.savez``.
+    """
     data = _csr_to_dict(sf.data)
+    # isinstance() already yields a bool; no conditional expression needed.
+    data['metadata'] = {'multiindex': isinstance(sf.index, pd.MultiIndex)}
     data['frame_index'] = sf.index.values
     data['frame_columns'] = sf.columns.values
     np.savez(filename, **data)
 
+
 def read_npz(filename):
+    """Read an ``.npz`` archive and return ``(csr_mat, idx, cols)``.
+
+    ``csr_mat`` comes from ``_load_csr``, ``idx`` from
+    ``_load_idx_from_npz`` (which restores a MultiIndex where applicable)
+    and ``cols`` is the stored ``frame_columns`` array.
+
+    :param filename: path or file object accepted by ``np.load``.
+    """
-    loader = np.load(filename)
-    csr_mat = _load_csr(loader)
-    idx = loader['frame_index']
-    cols = loader['frame_columns']
+    # The archive contains object arrays (the metadata dict and, for a
+    # MultiIndex, tuples), which NumPy stores pickled; recent NumPy versions
+    # refuse to load those unless allow_pickle is set explicitly.
+    # NOTE(security): unpickling can execute arbitrary code -- only read
+    # archives from trusted sources.
+    # The context manager closes the underlying zip file once every array
+    # has been materialised.
+    with np.load(filename, allow_pickle=True) as loader:
+        csr_mat = _load_csr(loader)
+        idx = _load_idx_from_npz(loader)
+        cols = loader['frame_columns']
     return (csr_mat, idx, cols)
 
+
 def _csr_to_dict(array):
-    return dict(data = array.data ,indices=array.indices,
-                indptr =array.indptr, shape=array.shape)
+    """Collect the ``data``, ``indices``, ``indptr`` and ``shape``
+    attributes of ``array`` into a dict keyed by attribute name."""
+    fields = ('data', 'indices', 'indptr', 'shape')
+    return {name: getattr(array, name) for name in fields}
@@ -48,6 +52,17 @@ def _load_csr(loader):
                              shape=loader['shape'])
 
 
+def _load_idx_from_npz(loader):
+    """Return the frame index stored in ``loader``, restoring a MultiIndex.
+
+    If the archive carries a ``metadata`` entry, its ``'multiindex'`` flag
+    decides whether the stored values are turned back into a
+    ``pd.MultiIndex``. Archives without that information (written before
+    metadata was saved) fall back to a heuristic: the index is treated as a
+    MultiIndex when every stored value is a tuple.
+
+    An empty index is always returned as the raw array, because
+    ``pd.MultiIndex.from_tuples`` cannot infer the number of levels from an
+    empty sequence and would raise.
+
+    :param loader: mapping-like object (e.g. the result of ``np.load``)
+        providing ``'frame_index'`` and optionally ``'metadata'``.
+    :return: a ``pd.MultiIndex`` or the stored index array unchanged.
+    """
+    idx = loader['frame_index']
+    try:
+        # [()] unwraps the 0-d object array that holds the metadata dict.
+        is_multi = loader['metadata'][()]['multiindex']
+    except KeyError:
+        # Legacy archive without (complete) metadata: infer from the values.
+        is_multi = all(isinstance(entry, tuple) for entry in idx)
+    # all() is vacuously True for an empty index, hence the length guard.
+    if is_multi and len(idx) > 0:
+        idx = pd.MultiIndex.from_tuples(idx)
+    return idx
+
+
 def _just_read_array(path):
     if path.endswith('hdf') or path.endswith('hdf5'):
         return pd.read_hdf(path, '/df').values

diff --git a/sparsity/test/test_sparse_frame.py b/sparsity/test/test_sparse_frame.py
@@ -9,6 +9,7 @@
 from scipy import sparse
 
 from sparsity import SparseFrame, sparse_one_hot
+from sparsity.io import _csr_to_dict
 
 from .conftest import tmpdir
 
@@ -230,6 +231,26 @@ def test_set_index(sf_midx):
     # assert np.all(sf.loc[[4, 5]].data.todense() == np.identity(5)[[3, 4]])
 
 
+def test_save_load_multiindex(sf_midx):
+    """A MultiIndex must survive an npz round trip, both for archives
+    written by the current ``to_npz`` and for archives lacking metadata."""
+
+    def _write_without_metadata(sf, filename):
+        # Writes the arrays only, with no 'metadata' entry, to emulate an
+        # archive produced before metadata was saved.
+        payload = _csr_to_dict(sf.data)
+        payload['frame_index'] = sf.index.values
+        payload['frame_columns'] = sf.columns.values
+        np.savez(filename, **payload)
+
+    with tmpdir() as workdir:
+        target = os.path.join(workdir, 'sf.npz')
+
+        # Archive written by the current writer.
+        sf_midx.to_npz(target)
+        loaded = SparseFrame.read_npz(target)
+        assert isinstance(loaded.index, pd.MultiIndex)
+
+        # Backwards compatibility: same path, overwritten without metadata.
+        _write_without_metadata(sf_midx, target)
+        loaded = SparseFrame.read_npz(target)
+        assert isinstance(loaded.index, pd.MultiIndex)
+
+
 def test_new_column_assign_array():
     sf = SparseFrame(np.identity(5))
     sf[6] = np.ones(5)