In [1]:
import numpy as np
import sparse
import scipy.sparse as ss
from GCRS2 import CSR,CSC

The csr/csc arrays shown here are loosely based on the GCRS/GCCS formats presented in [Shaikh et al. 2015](https://ieeexplore.ieee.org/document/7237032). However, I've used a different linearization function, in which the first half of the axes represent the rows of the underlying sparse matrix and the remaining axes represent the columns. This is in line with numpy's reshape method. In general, the compression ratio of csr/csc does not change much as additional dimensions are added, and consequently csr/csc offers much better compression than coo. In principle, it should be possible to use these arrays anywhere that expects the numpy ndarray API, and also with anything that works with scipy.sparse matrices. Dask, scikit-learn, and xarray are all good candidates for this. Currently, csr/csc is much faster than coo for indexing 2d arrays, as should be the case. For arrays with more dimensions, the runtime is a bit longer because there are the additional steps of transforming nd-coords to 2d-coords and, sometimes, a last step of transforming coordinates afterwards. With a few algorithmic improvements, and possibly by compiling these other steps with numba, I suspect that csr/csc will be faster than coo. The csc indexing still has some bugs that I'm working out, but csr should mostly work. 1d arrays don't make a ton of sense for csr/csc, and it might be best to return a 1d coo array when returning anything 1d. I'm not sure about that though. This codebase is very young and almost everything is likely to change. I'm hoping that when it is ready, this code might be merged with pydata/sparse.

In [2]:
from GCRS2.convert2 import uncompress_dimension
def assert_eq(csr,coo):
    """Assert that a compressed array holds the same entries as a COO array.

    The first coordinate row is rebuilt from ``csr.indptr`` and
    ``csr.indices`` with ``uncompress_dimension`` and stacked on top of
    ``csr.indices``. The result must equal the ``coords`` of ``coo``
    reshaped to ``csr.compressed_shape``, and ``csr.data`` must equal
    ``coo.data``.

    Raises ``AssertionError`` if either comparison fails.
    """
    expanded_axis = uncompress_dimension(csr.indptr, csr.indices)
    csr_coords = np.vstack((expanded_axis, csr.indices))
    coo_2d = coo.reshape(csr.compressed_shape)
    assert np.array_equal(csr_coords, coo_2d.coords)
    assert np.array_equal(csr.data, coo.data)

In [3]:
# Seeded so the same random 3-D array is checked on every run.
coo = sparse.random((10,10,10),density=.2,random_state=0)
csr = CSR(coo)
# The CSR built from coo must carry the same coordinates and data as coo.
assert_eq(csr,coo)

# Indexing
I think there are still a few bugs here and there, but a fair amount works.

- For 2d arrays, scipy is still much faster than this csr implementation (see the timings below).

In [4]:
# Each line applies the same index to the csr array and to the coo array
# and checks that the two results agree.
# Slices on all three axes.
assert_eq(csr[:5,:5,:5],coo[:5,:5,:5])
# Integer index on the first axis, slices on the others.
assert_eq(csr[0,:5,:5],coo[0,:5,:5])
# Integer index on the middle axis.
assert_eq(csr[:5,0,:5],coo[:5,0,:5])
# Integer index on the last axis.
assert_eq(csr[:5,:5,0],coo[:5,:5,0])
# Mixed: slice, integer-array index, and integer index.
assert_eq(csr[:5,np.arange(5),0],coo[:5,np.arange(5),0])

In [5]:
# Seeded so every run benchmarks the same 1000x1000 input.
coo = sparse.random((1000,1000),density=.2,random_state=0)
csr = CSR(coo)
# Result of coo.tocsr(), timed below as the scipy reference.
scipy_test = coo.tocsr()

In [6]:
# Reference timing: slice the object returned by coo.tocsr().
%timeit scipy_test[:800,:800]

370 µs ± 8.82 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
# Same 2d slice on the CSR array built from coo.
%timeit csr[:800,:800]

1.6 ms ± 125 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Same 2d slice on the original coo array.
%timeit coo[:800,:800]

6.96 ms ± 169 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# 3D

In [9]:
# Seeded so every run benchmarks the same 3-D input.
coo = sparse.random((100,100,100),density=.2,random_state=0)
csr = CSR(coo)

In [10]:
# Time a 3-axis slice on the CSR array.
%timeit csr[:80,:50,:90]

6.09 ms ± 104 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
# Same 3-axis slice on the coo array for comparison.
%timeit coo[:80,:50,:90]

5.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# 4D

In [12]:
# Seeded so every run benchmarks the same 4-D input.
coo = sparse.random((100,100,100,100),density=.2,random_state=0)
csr = CSR(coo)

In [13]:
# Time a 4-axis slice on the CSR array.
%timeit csr[:80,:80,:80,:80]

1.14 s ± 3.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Same 4-axis slice on the coo array for comparison.
%timeit coo[:80,:80,:80,:80]

863 ms ± 2.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Compression
# 2D Density of .2 and .01

In [15]:
# Create a seeded random sparse array (same input on every run) and build
# the dense, csr and csc representations of it.
coo = sparse.random((100,100),density=.2,random_state=0)
dense = coo.todense()
csr = CSR(coo)
csc = CSC(coo)
# One line per format: its size in bytes and that size relative to dense.
for label, arr in (('dense', dense), ('coo', coo), ('csr', csr), ('csc', csc)):
    print('no. bytes ' + label + ': ', arr.nbytes, ' storage ratio: ', arr.nbytes/dense.nbytes)

no. bytes dense:  80000  storage ratio:  1.0
no. bytes coo:  48000  storage ratio:  0.6
no. bytes csr:  32808  storage ratio:  0.4101
no. bytes csc:  32808  storage ratio:  0.4101


In [16]:
# Create a seeded random sparse array (same input on every run) and build
# the dense, csr and csc representations of it.
coo = sparse.random((100,100),density=.01,random_state=0)
dense = coo.todense()
csr = CSR(coo)
csc = CSC(coo)
# One line per format: its size in bytes and that size relative to dense.
for label, arr in (('dense', dense), ('coo', coo), ('csr', csr), ('csc', csc)):
    print('no. bytes ' + label + ': ', arr.nbytes, ' storage ratio: ', arr.nbytes/dense.nbytes)

no. bytes dense:  80000  storage ratio:  1.0
no. bytes coo:  2400  storage ratio:  0.03
no. bytes csr:  2408  storage ratio:  0.0301
no. bytes csc:  2408  storage ratio:  0.0301


# 3D Density of .2 and .01

In [17]:
# Create a seeded random sparse array (same input on every run) and build
# the dense, csr and csc representations of it.
coo = sparse.random((100,100,100),density=.2,random_state=0)
dense = coo.todense()
csr = CSR(coo)
csc = CSC(coo)
# One line per format: its size in bytes and that size relative to dense.
for label, arr in (('dense', dense), ('coo', coo), ('csr', csr), ('csc', csc)):
    print('no. bytes ' + label + ': ', arr.nbytes, ' storage ratio: ', arr.nbytes/dense.nbytes)

no. bytes dense:  8000000  storage ratio:  1.0
no. bytes coo:  6400000  storage ratio:  0.8
no. bytes csr:  3280008  storage ratio:  0.410001
no. bytes csc:  3200808  storage ratio:  0.400101


In [18]:
# Create a seeded random sparse array (same input on every run) and build
# the dense, csr and csc representations of it.
coo = sparse.random((100,100,100),density=.01,random_state=0)
dense = coo.todense()
csr = CSR(coo)
csc = CSC(coo)
# One line per format: its size in bytes and that size relative to dense.
for label, arr in (('dense', dense), ('coo', coo), ('csr', csr), ('csc', csc)):
    print('no. bytes ' + label + ': ', arr.nbytes, ' storage ratio: ', arr.nbytes/dense.nbytes)

no. bytes dense:  8000000  storage ratio:  1.0
no. bytes coo:  320000  storage ratio:  0.04
no. bytes csr:  240008  storage ratio:  0.030001
no. bytes csc:  160808  storage ratio:  0.020101


# 4D Density of .2 and .01

In [19]:
# Create a seeded random sparse array (same input on every run) and build
# the dense, csr and csc representations of it.
coo = sparse.random((50,50,50,50),density=.2,random_state=0)
dense = coo.todense()
csr = CSR(coo)
csc = CSC(coo)
# One line per format: its size in bytes and that size relative to dense.
for label, arr in (('dense', dense), ('coo', coo), ('csr', csr), ('csc', csc)):
    print('no. bytes ' + label + ': ', arr.nbytes, ' storage ratio: ', arr.nbytes/dense.nbytes)

no. bytes dense:  50000000  storage ratio:  1.0
no. bytes coo:  50000000  storage ratio:  1.0
no. bytes csr:  20020008  storage ratio:  0.40040016
no. bytes csc:  20020008  storage ratio:  0.40040016


In [20]:
# Create a seeded random sparse array (same input on every run) and build
# the dense, csr and csc representations of it.
coo = sparse.random((50,50,50,50),density=.01,random_state=0)
dense = coo.todense()
csr = CSR(coo)
csc = CSC(coo)
# One line per format: its size in bytes and that size relative to dense.
for label, arr in (('dense', dense), ('coo', coo), ('csr', csr), ('csc', csc)):
    print('no. bytes ' + label + ': ', arr.nbytes, ' storage ratio: ', arr.nbytes/dense.nbytes)

no. bytes dense:  50000000  storage ratio:  1.0
no. bytes coo:  2500000  storage ratio:  0.05
no. bytes csr:  1020008  storage ratio:  0.02040016
no. bytes csc:  1020008  storage ratio:  0.02040016


# 5D Density of .2 and .01

In [21]:
# Create a seeded random sparse array (same input on every run) and build
# the dense, csr and csc representations of it.
coo = sparse.random((15,15,15,15,15),density=.2,random_state=0)
dense = coo.todense()
csr = CSR(coo)
csc = CSC(coo)
# One line per format: its size in bytes and that size relative to dense.
for label, arr in (('dense', dense), ('coo', coo), ('csr', csr), ('csc', csc)):
    print('no. bytes ' + label + ': ', arr.nbytes, ' storage ratio: ', arr.nbytes/dense.nbytes)

no. bytes dense:  6075000  storage ratio:  1.0
no. bytes coo:  7290000  storage ratio:  1.2
no. bytes csr:  2457008  storage ratio:  0.40444576131687243
no. bytes csc:  2431808  storage ratio:  0.40029761316872425


In [22]:
# Create a seeded random sparse array (same input on every run) and build
# the dense, csr and csc representations of it.
coo = sparse.random((15,15,15,15,15),density=.01,random_state=0)
dense = coo.todense()
csr = CSR(coo)
csc = CSC(coo)
# One line per format: its size in bytes and that size relative to dense.
for label, arr in (('dense', dense), ('coo', coo), ('csr', csr), ('csc', csc)):
    print('no. bytes ' + label + ': ', arr.nbytes, ' storage ratio: ', arr.nbytes/dense.nbytes)

no. bytes dense:  6075000  storage ratio:  1.0
no. bytes coo:  364464  storage ratio:  0.059994074074074075
no. bytes csr:  148496  storage ratio:  0.024443786008230453
no. bytes csc:  123296  storage ratio:  0.020295637860082306
