In [1]:
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
# Passing only a (rows, cols) shape tuple builds an empty matrix:
# nothing is stored, and every entry is an implicit zero.
n_rows, n_cols = 3, 4
mat = csr_matrix((n_rows, n_cols), dtype=np.int8)
mat

<3x4 sparse matrix of type '<class 'numpy.int8'>'
	with 0 stored elements in Compressed Sparse Row format>

In [3]:
# Convert to a dense NumPy array so every entry, including the implicit zeros, is visible.
mat.toarray()

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int8)

In [7]:
# Coordinate (triplet) form: (data, (row_idx, col_idx)).
# Entry k is placed so that m[row_indices[k], col_indices[k]] = data[k].
row_indices = np.array([0, 0, 1, 2, 2, 2])
col_indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
coords = (row_indices, col_indices)
mat = csr_matrix((data, coords), shape=(3, 3))
mat

<3x3 sparse matrix of type '<class 'numpy.longlong'>'
	with 6 stored elements in Compressed Sparse Row format>

In [8]:
# Dense view of the matrix, to check where each value landed.
mat.toarray()

array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]], dtype=int64)

In [9]:
# Standard Compressed Row Storage (CRS) layout, supplied as (data, indices, indptr).
# For row i, the slice indptr[i]:indptr[i+1] selects
#   - that row's column indices from `indices`, and
#   - the corresponding values from `data`.
indptr = np.array([0, 2, 3, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
csr_arrays = (data, indices, indptr)
mat = csr_matrix(csr_arrays, shape=(3, 3))
mat.toarray()

array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]])

In [16]:
# A 4x4 example whose first row is empty: ROW_PTR[0] == ROW_PTR[1],
# so row 0 owns no entries of V.
V = np.array([5, 8, 3, 6])            # stored values, row after row
COL_INDEX = np.array([0, 1, 2, 1])    # column of each stored value
ROW_PTR = np.array([0, 0, 2, 3, 4])   # row i spans V[ROW_PTR[i]:ROW_PTR[i+1]]
side = ROW_PTR.size - 1               # one pointer per row plus a closing one -> 4 rows
mat = csr_matrix((V, COL_INDEX, ROW_PTR), shape=(side, side))
mat.toarray()

array([[0, 0, 0, 0],
       [5, 8, 0, 0],
       [0, 0, 3, 0],
       [0, 6, 0, 0]])

In [19]:
# Extract the stored values of Row 0 (1st row) and Row 1 (2nd row) directly from V.
# Consecutive ROW_PTR entries give the half-open slice [start, end) for each row.
for row in (0, 1):
    row_start, row_end = ROW_PTR[row], ROW_PTR[row + 1]
    print('Row {} start: {}, end: {}'.format(row, row_start, row_end))
    print(V[row_start:row_end])

Row 0 start: 0, end: 0
[]
Row 1 start: 0, end: 2
[5 8]


In [25]:
# A 3x3 matrix in which every cell is stored: each row holds three values,
# so the row pointers advance in steps of three.
DATA = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
COL_INDEX = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2])
ROW_PTR = np.array([0, 3, 6, 9])
mat = csr_matrix((DATA, COL_INDEX, ROW_PTR), shape=(3, 3))
print(mat.toarray())

# Pair each row pointer with its successor to get the [start, end) bounds of every row.
for i, (row_start, row_end) in enumerate(zip(ROW_PTR[:-1], ROW_PTR[1:])):
    print('Row {} start: {}, end: {}'.format(i, row_start, row_end))


[[1 2 3]
 [4 5 6]
 [7 8 9]]
Row 0 start: 0, end: 3
Row 1 start: 3, end: 6
Row 2 start: 6, end: 9


In [26]:
# Duplicate entries are summed together rather than overwritten:
# position (0, 0) is listed twice below, with values 1 and 8.
data = np.array([1, 2, 4, 8])
row = np.array([0, 1, 2, 0])
col = np.array([0, 1, 1, 0])
coords = (row, col)
csr_matrix((data, coords), shape=(3, 3)).toarray()

array([[9, 0, 0],
       [0, 2, 0],
       [0, 4, 0]], dtype=int64)

In [31]:
# Building a CSR matrix incrementally, one document (matrix row) at a time.
#
# Target document-term matrix:
#   hello, world, goodbye, cruel
#   2,     1,     0,       0
#   0,     1,     1,       1

docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"]]

vocabulary = {}  # word -> column index, assigned in order of first appearance
indices = []     # column index of every token, document after document
indptr = [0]     # indptr[i] marks where document i starts inside `indices`
for row in docs:
    for word in row:
        # Only an unseen word gets a new column; a known word keeps its index.
        if word not in vocabulary:
            vocabulary[word] = len(vocabulary)
        index = vocabulary[word]
        indices.append(index)
    # Close the current document by recording how many tokens exist so far.
    indptr.append(len(indices))

# Every token contributes a 1; a word repeated within a document shows up as a
# larger count in the resulting matrix (see "hello" in the first row).
data = [1] * len(indices)

print(vocabulary)
print(data)
print(indices)
print(indptr)
print(csr_matrix((data, indices, indptr)).toarray())

{'hello': 0, 'world': 1, 'goodbye': 2, 'cruel': 3}
[1, 1, 1, 1, 1, 1]
[0, 1, 0, 2, 3, 1]
[0, 3, 6]
[[2 1 0 0]
 [0 1 1 1]]


In [39]:
# Reading from and writing to a CSR matrix.
DATA = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
COL_INDEX = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2])
ROW_PTR = np.array([0, 3, 6, 9])
mat = csr_matrix((DATA, COL_INDEX, ROW_PTR), shape=(3, 3))
print(mat.toarray())

# Row 0, densified for display
first_row = mat.getrow(0)
print(first_row.toarray())

# Column 1, densified for display
second_col = mat.getcol(1)
print(second_col.toarray())

# A single cell, addressed as [row, col]
print(mat[0, 2])

# Overwrite one cell in place
mat[0, 2] = 100
print(mat.toarray())

# Overwrite a whole row in place
replacement_row = np.array([40, 50, 60])
mat[1] = replacement_row
print(mat.toarray())

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1 2 3]]
[[2]
 [5]
 [8]]
3
[[  1   2 100]
 [  4   5   6]
 [  7   8   9]]
[[  1   2 100]
 [ 40  50  60]
 [  7   8   9]]
