Skip to content

Commit

Permalink
Fix to_npz (#12)
Browse files Browse the repository at this point in the history
If the frame contains a MultiIndex, it is restored when loading. This requires saving metadata. If the metadata is not available because the file was saved with a previous version, the index class is inferred from the array values.
  • Loading branch information
kayibal committed Jul 11, 2017
1 parent ed6ae45 commit 1c10bed
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 1 deletion.
17 changes: 16 additions & 1 deletion sparsity/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,21 @@ def traildb_to_coo(db, fieldname):

def to_npz(sf, filename):
    """Write ``sf`` to ``filename`` as a NumPy ``.npz`` archive.

    The archive contains the entries returned by ``_csr_to_dict(sf.data)``,
    the raw index and column values (``frame_index`` / ``frame_columns``)
    and a ``metadata`` dict whose ``multiindex`` flag records whether
    ``sf.index`` is a ``pd.MultiIndex``.
    """
    archive = _csr_to_dict(sf.data)
    archive.update(
        # Stored so the loader knows it has to rebuild a MultiIndex.
        metadata={'multiindex': isinstance(sf.index, pd.MultiIndex)},
        frame_index=sf.index.values,
        frame_columns=sf.columns.values,
    )
    np.savez(filename, **archive)


def read_npz(filename):
    """Load the components of a sparse frame from an ``.npz`` archive.

    Parameters
    ----------
    filename
        Archive location, handed unchanged to ``np.load``.

    Returns
    -------
    tuple
        ``(csr_mat, idx, cols)``: the matrix built by ``_load_csr``, the
        index restored by ``_load_idx_from_npz`` and the stored
        ``frame_columns`` array.
    """
    # SECURITY: the archive stores Python objects (the ``metadata`` dict and,
    # for a MultiIndex, an object array of tuples). NumPy can only restore
    # those by unpickling, which may execute arbitrary code -- load trusted
    # files only.
    # The context manager closes the underlying file once every array has
    # been read into memory.
    with np.load(filename, allow_pickle=True) as loader:
        csr_mat = _load_csr(loader)
        idx = _load_idx_from_npz(loader)
        cols = loader['frame_columns']
    return (csr_mat, idx, cols)


def _csr_to_dict(array):
return dict(data = array.data ,indices=array.indices,
indptr =array.indptr, shape=array.shape)
Expand All @@ -48,6 +52,17 @@ def _load_csr(loader):
shape=loader['shape'])


def _load_idx_from_npz(loader):
idx = loader['frame_index']
try:
if loader['metadata'][()]['multiindex']:
idx = pd.MultiIndex.from_tuples(idx)
except KeyError:
if all(map(lambda x: isinstance(x, tuple), idx)):
idx = pd.MultiIndex.from_tuples(idx)
return idx


def _just_read_array(path):
if path.endswith('hdf') or path.endswith('hdf5'):
return pd.read_hdf(path, '/df').values
Expand Down
21 changes: 21 additions & 0 deletions sparsity/test/test_sparse_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from scipy import sparse

from sparsity import SparseFrame, sparse_one_hot
from sparsity.io import _csr_to_dict

from .conftest import tmpdir

Expand Down Expand Up @@ -230,6 +231,26 @@ def test_set_index(sf_midx):
# assert np.all(sf.loc[[4, 5]].data.todense() == np.identity(5)[[3, 4]])


def test_save_load_multiindex(sf_midx):
    """A MultiIndex survives an npz round trip, with and without metadata."""

    def _to_npz_legacy(sf, filename):
        # Mimics the previous file layout: same arrays, no ``metadata`` entry.
        payload = _csr_to_dict(sf.data)
        payload['frame_index'] = sf.index.values
        payload['frame_columns'] = sf.columns.values
        np.savez(filename, **payload)

    with tmpdir() as tmp:
        path = os.path.join(tmp, 'sf.npz')

        # Current format: the index class is taken from the saved metadata.
        sf_midx.to_npz(path)
        loaded = SparseFrame.read_npz(path)
        assert isinstance(loaded.index, pd.MultiIndex)

        # Backwards compatibility: the index class is inferred from values.
        _to_npz_legacy(sf_midx, path)
        loaded = SparseFrame.read_npz(path)
        assert isinstance(loaded.index, pd.MultiIndex)


def test_new_column_assign_array():
sf = SparseFrame(np.identity(5))
sf[6] = np.ones(5)
Expand Down

0 comments on commit 1c10bed

Please sign in to comment.