In [1]:
from tinygrad.densetensor import DenseTensor
from tinygrad.sparsetensor import SparseTensor
import numpy as np

%load_ext autoreload
%autoreload 2

DEVICE:GPU


In [2]:
x_init = np.random.randn(2,6).astype(np.float32)
x2_init = np.random.randn(3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(6,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)

x = DenseTensor(x_init)
W = DenseTensor(W_init)
m = DenseTensor(m_init)
out = x.dot(W).relu()
out = out.logsoftmax()
out = out.mul(m).add(m).sum()
out.backward()

out.cpu().data, x

x2 = DenseTensor(x2_init)#.gpu()
W = SparseTensor(W_init)
out = W.dot(x2).relu().sum()

out.backward()

out.cpu().data, x

In [3]:
import numpy as np
import pyopencl as cl

mf = cl.mem_flags

In [4]:
dim1 = 8
dim2 = 16
dim3 = 5
topkx = 5
topky = 8
topk  = topkx
bs = dim3

np.random.seed(9)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.4

a = np.zeros((dim1,dim2))
b = np.zeros((dim2,dim3)).astype(np.float32)

a.shape, b.shape

((8, 16), (16, 5))

In [5]:
x_init = np.random.randn(dim1,dim3).astype(np.float32)
w_init = np.random.randn(dim2,dim3).astype(np.float32)

In [6]:
w_init

array([[-6.56452596e-01, -5.62572964e-02, -4.99902606e-01,
         4.36419368e-01, -3.75813037e-01],
       [-9.23061609e-01,  1.91725028e+00, -1.50302842e-01,
        -6.38729751e-01,  8.24770331e-01],
       [-1.21083879e+00, -5.03405392e-01, -7.01915681e-01,
        -1.97427106e+00, -2.65573215e+00],
       [-5.76822497e-02, -6.56186581e-01, -6.61706686e-01,
         7.69348443e-01, -8.99004877e-01],
       [ 1.69363797e+00, -1.69733524e+00, -2.79337025e+00,
        -2.26150647e-01,  3.97428840e-01],
       [ 1.65970361e+00, -4.93746817e-01, -3.76097679e-01,
        -1.69739768e-01,  2.41710639e+00],
       [-1.80884051e+00,  3.39751154e-01, -2.27297600e-02,
        -9.59997058e-01, -3.83114427e-01],
       [ 1.09529994e-01, -8.55162859e-01,  2.21606664e-04,
         6.63855076e-01,  7.49480963e-01],
       [-4.65818375e-01, -2.77439266e-01,  3.54995355e-02,
         8.48221183e-01,  1.62998557e-01],
       [ 1.20862365e+00,  5.02520800e-01, -1.58382213e+00,
         1.02303350e+00

In [7]:
def fill_sparse(mat, sparsity=0.5):
    indices = np.array(range(mat.shape[1]))
    nrows = int(mat.shape[1]*sparsity)
    for row in range(mat.shape[0]):
        lim = nrows #+ int(np.random.random()*3)
        mat[row][np.random.permutation(indices)[:lim]] = np.random.random(lim)
    return mat

a = fill_sparse(a, sparsity).astype(np.float32)
b = fill_sparse(b, sparsity).astype(np.float32)

In [8]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [9]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [10]:
x2_init.T

array([-1.6508101 ,  0.20557435, -0.4799411 ], dtype=float32)

In [11]:
mult = a.dot(b)
mult.shape

(8, 5)

In [12]:
mult.shape

(8, 5)

In [13]:
def to_data(mat):
    all_rows = []
    all_idxs = []
    all_nnzs = []
    for row in range(mat.shape[0]):
        rowdata = []
        colidxs = []
        all_nnzs.append(0)
        for col in range(mat.shape[1]):
            val = mat[row][col]
            if val != 0:
                rowdata.append(val)
                colidxs.append(col)
                all_nnzs[-1] += 1
        all_rows.append(rowdata)
        all_idxs.append(colidxs)
    
    ellwidth = min(int(np.sqrt(np.max(all_nnzs))+1)**2, mat.shape[1])
    ellwidth = mat.shape[1]
    #all_rows = np.array(all_rows)#.astype(np.float32).flatten()           
    for row in range(mat.shape[0]):
        #print(row, all_rows)
        all_rows[row] = np.array(all_rows[row])
        all_rows[row].resize(ellwidth)
        all_idxs[row] = np.array(all_idxs[row])
        all_idxs[row].resize(ellwidth)
        #print(all_idxs[row])
    all_rows = np.array(all_rows)
    all_idxs = np.array(all_idxs)
    all_nnzs = np.array(all_nnzs)
    
#     while (not all_rows[:,-1].any()):
#         all_rows = all_rows[:,:-1]
#         all_idxs = all_idxs[:,:-1]
#         ellwidth -= 1
        
    
    all_rows = np.array(all_rows).astype(np.float32).flatten()
    all_idxs = np.array(all_idxs).astype(np.uint32).flatten()
    
    all_nnzs = np.array(all_nnzs).astype(np.uint32)
    
    
    return all_rows, all_idxs, all_nnzs, ellwidth

In [14]:
def to_dense(data, cols, nnzs, ellw, shape):
    out = np.zeros(shape)
    for row in range(shape[0]):
        for icol in range(nnzs[row]):
            out[row,cols[row*ellw+icol]] = data[row*ellw+icol]
    return out

In [15]:
wdata, wcols, wnnz, ellww = to_data(w_init)
wdata, wcols, wnnz, ellww

(array([-6.56452596e-01, -5.62572964e-02, -4.99902606e-01,  4.36419368e-01,
        -3.75813037e-01, -9.23061609e-01,  1.91725028e+00, -1.50302842e-01,
        -6.38729751e-01,  8.24770331e-01, -1.21083879e+00, -5.03405392e-01,
        -7.01915681e-01, -1.97427106e+00, -2.65573215e+00, -5.76822497e-02,
        -6.56186581e-01, -6.61706686e-01,  7.69348443e-01, -8.99004877e-01,
         1.69363797e+00, -1.69733524e+00, -2.79337025e+00, -2.26150647e-01,
         3.97428840e-01,  1.65970361e+00, -4.93746817e-01, -3.76097679e-01,
        -1.69739768e-01,  2.41710639e+00, -1.80884051e+00,  3.39751154e-01,
        -2.27297600e-02, -9.59997058e-01, -3.83114427e-01,  1.09529994e-01,
        -8.55162859e-01,  2.21606664e-04,  6.63855076e-01,  7.49480963e-01,
        -4.65818375e-01, -2.77439266e-01,  3.54995355e-02,  8.48221183e-01,
         1.62998557e-01,  1.20862365e+00,  5.02520800e-01, -1.58382213e+00,
         1.02303350e+00, -6.53017402e-01,  5.37045121e-01, -7.97706190e-03,
         9.2

In [16]:
wdatat, wcolst, wnnzt, ellwwt = to_data(w_init.T)
wdatat, wcolst, wnnzt, ellwwt

(array([-6.56452596e-01, -9.23061609e-01, -1.21083879e+00, -5.76822497e-02,
         1.69363797e+00,  1.65970361e+00, -1.80884051e+00,  1.09529994e-01,
        -4.65818375e-01,  1.20862365e+00,  5.37045121e-01, -1.65366137e+00,
         5.61277032e-01, -1.27321005e+00,  9.70861197e-01, -5.30199170e-01,
        -5.62572964e-02,  1.91725028e+00, -5.03405392e-01, -6.56186581e-01,
        -1.69733524e+00, -4.93746817e-01,  3.39751154e-01, -8.55162859e-01,
        -2.77439266e-01,  5.02520800e-01, -7.97706190e-03,  1.36799216e+00,
         2.39725500e-01,  9.20076132e-01,  1.15472412e+00,  8.14134836e-01,
        -4.99902606e-01, -1.50302842e-01, -7.01915681e-01, -6.61706686e-01,
        -2.79337025e+00, -3.76097679e-01, -2.27297600e-02,  2.21606664e-04,
         3.54995355e-02, -1.58382213e+00,  9.24784184e-01,  2.51062457e-02,
         4.79899108e-01,  1.05491853e+00,  4.22608823e-01, -1.01988423e+00,
         4.36419368e-01, -6.38729751e-01, -1.97427106e+00,  7.69348443e-01,
        -2.2

In [17]:
adata, acols, annz, ellwa = to_data(a)
adata, acols, annz, ellwa

(array([0.68347037, 0.8035886 , 0.08023349, 0.8858316 , 0.17861794,
        0.46201044, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.9562663 , 0.5531271 , 0.5375557 , 0.20370705,
        0.35530138, 0.9186601 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.20103766, 0.74341315, 0.5787496 ,
        0.05387774, 0.5917517 , 0.17797045, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.71459514, 0.7448136 , 0.66635793, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.59893256,
        0.9217044 , 0.16333483, 0.17167741, 0.7605819 , 0.48675168,
        0.        , 0.        , 0.        , 0.  

In [18]:
adatat, acolst, annzt, ellwat = to_data(a.T)
adatat, acolst, annzt, ellwat

(array([0.9562663 , 0.11415514, 0.28872353, 0.67626965, 0.        ,
        0.        , 0.        , 0.        , 0.68347037, 0.5531271 ,
        0.20103766, 0.82574075, 0.04889954, 0.        , 0.        ,
        0.        , 0.8035886 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.14296253,
        0.59893256, 0.6493426 , 0.35563806, 0.        , 0.        ,
        0.        , 0.        , 0.5375557 , 0.9103574 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.74341315, 0.03242496, 0.27978534, 0.33329687, 0.        ,
        0.        , 0.        , 0.        , 0.69630474, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.08023349, 0.71459514, 0.9217044 , 0.25018668,
        0.        , 0.        , 0.        , 0.        , 0.8858316 ,
        0.6753613 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.5787496 , 0.16

In [19]:
bdata, bcols, bnnz, ellwb = to_data(b)
bdata, bcols, bnnz, ellwb

(array([0.49395892, 0.46451807, 0.        , 0.        , 0.        ,
        0.65864426, 0.9786625 , 0.        , 0.        , 0.        ,
        0.31044865, 0.02502257, 0.        , 0.        , 0.        ,
        0.46765924, 0.03160795, 0.        , 0.        , 0.        ,
        0.71082693, 0.58997136, 0.        , 0.        , 0.        ,
        0.84622455, 0.5916316 , 0.        , 0.        , 0.        ,
        0.46450275, 0.9429863 , 0.        , 0.        , 0.        ,
        0.34248227, 0.7672639 , 0.        , 0.        , 0.        ,
        0.9855889 , 0.2535647 , 0.        , 0.        , 0.        ,
        0.7705159 , 0.31522992, 0.        , 0.        , 0.        ,
        0.7167741 , 0.8238369 , 0.        , 0.        , 0.        ,
        0.91970533, 0.7873889 , 0.        , 0.        , 0.        ,
        0.404568  , 0.841323  , 0.        , 0.        , 0.        ,
        0.674503  , 0.48820347, 0.        , 0.        , 0.        ,
        0.18725161, 0.11607183, 0.        , 0.  

In [20]:
bdatat, bcolst, bnnzt, ellwbt = to_data(b.T)
adatat, bcolst, bnnzt, ellwbt

(array([0.9562663 , 0.11415514, 0.28872353, 0.67626965, 0.        ,
        0.        , 0.        , 0.        , 0.68347037, 0.5531271 ,
        0.20103766, 0.82574075, 0.04889954, 0.        , 0.        ,
        0.        , 0.8035886 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.14296253,
        0.59893256, 0.6493426 , 0.35563806, 0.        , 0.        ,
        0.        , 0.        , 0.5375557 , 0.9103574 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.74341315, 0.03242496, 0.27978534, 0.33329687, 0.        ,
        0.        , 0.        , 0.        , 0.69630474, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.08023349, 0.71459514, 0.9217044 , 0.25018668,
        0.        , 0.        , 0.        , 0.        , 0.8858316 ,
        0.6753613 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.5787496 , 0.16

In [21]:
adense = to_dense(adata, acols, annz, ellwa, a.shape)

In [22]:
adenset = to_dense(adatat, acolst, annzt, ellwat, a.T.shape)

In [23]:
bdense = to_dense(bdata, bcols, bnnz, ellwb, b.shape)

In [24]:
bdenset = to_dense(bdatat, bcolst, bnnzt, ellwbt, b.T.shape)

In [25]:
adense

array([[0.        , 0.68347037, 0.80358863, 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.88583159, 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.95626628, 0.55312711, 0.        , 0.        , 0.53755569,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.59175169, 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.91035742,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.74481362, 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [26]:
adenset.T == adense

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  Tru

In [27]:
bdenset.T == bdense

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [28]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [29]:
a == adense

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  Tru

In [30]:
a.shape

(8, 16)

In [31]:
adata.shape, acols.shape, annz.shape, ellwa

((128,), (128,), (8,), 16)

In [32]:
#acols = acols.astype(np.uint32)
#annz = annz.astype(np.uint32)

In [33]:
adata, acols, annz, b

(array([0.68347037, 0.8035886 , 0.08023349, 0.8858316 , 0.17861794,
        0.46201044, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.9562663 , 0.5531271 , 0.5375557 , 0.20370705,
        0.35530138, 0.9186601 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.20103766, 0.74341315, 0.5787496 ,
        0.05387774, 0.5917517 , 0.17797045, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.71459514, 0.7448136 , 0.66635793, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.59893256,
        0.9217044 , 0.16333483, 0.17167741, 0.7605819 , 0.48675168,
        0.        , 0.        , 0.        , 0.  

## MatMul (Sparse-Dense)

adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // SPARSE x DENSE
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      

      uint nnz    = rowNnz[gid];
      float sum = 0;
      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[col*ncols+gid2];
          //if (gid==0 && gid2==2)
          //  printf("aval, xval: %.2f,%.2f: (%i,%i) \\n", aval, xval, col, index);
          sum  += aval * xval;
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [34]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // SPARSE x DENSE
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      

      uint nnz    = rowNnz[gid];
      
      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          uint xidx = col*ncols+gid2;
          float xval  = vector_x[xidx];
          if (gid==0 && gid2==1)
            printf("aval, xval: %.2f,%.2f: (%i,%i) - %i \\n", aval, xval, col, index, xidx);
          sum  += aval * xval;
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [35]:
a.shape, b.shape

((8, 16), (16, 5))

In [36]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [37]:
rows = a.shape[0]

In [38]:
mult = mult.astype(np.float32)

In [39]:
outshape = (a.shape[0], b.shape[1])
outshape

(8, 5)

In [40]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul2  # Use this Kernel object for repeated calls
knl(queue, [outshape[0]], None, adata_buf, acols_buf, annzs_buf, np.uint32(ellwa), np.uint32(outshape[1]), b_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

aval, xval: 0.68,0.00: (1,0) - 6 
aval, xval: 0.80,0.00: (2,1) - 11 
aval, xval: 0.08,0.34: (7,2) - 36 
aval, xval: 0.89,0.00: (8,3) - 41 
aval, xval: 0.18,0.72: (10,4) - 51 
aval, xval: 0.46,0.00: (12,5) - 61 


<pyopencl._cl.NannyEvent at 0x7f5a6439e4f0>

In [41]:
(res_np-mult).sum()

0.0

In [42]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [43]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [44]:
res_buf

<pyopencl._cl.Buffer at 0x7f5a645f7cc0>

In [45]:
res_np

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [46]:
mult

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [47]:
res_np==mult

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [48]:
res_np.shape

(8, 5)

In [49]:
mult.shape

(8, 5)

## MatMul (dense * sparse)

In [50]:
bdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdata)
bcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcols)
bnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnz)
bdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdatat)
bcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcolst)
bnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnzt)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // DENSE x SPARSE
    __kernel void matmul(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            uint   ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        uint nnz = rowNnz[gid2];
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid2 * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[gid*mwidth+col];
          sum  += aval * xval;
          if (gid==0 && gid2==0)
            printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [51]:
a.shape, b.shape

((8, 16), (16, 5))

In [52]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [53]:
rows = a.shape[0]

In [54]:
a.shape, b.shape

((8, 16), (16, 5))

In [55]:
mult = a.dot(b)
mult = mult.astype(np.float32)

In [56]:
outshape = np.array([a.shape[0], b.shape[1]])
outshape

array([8, 5])

In [57]:
b.T

array([[0.49395892, 0.65864426, 0.31044865, 0.        , 0.71082693,
        0.        , 0.46450275, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.18725161,
        0.7250254 ],
       [0.        , 0.        , 0.        , 0.46765924, 0.        ,
        0.        , 0.        , 0.34248227, 0.        , 0.7705159 ,
        0.7167741 , 0.91970533, 0.        , 0.        , 0.11607183,
        0.        ],
       [0.46451807, 0.9786625 , 0.02502257, 0.        , 0.58997136,
        0.84622455, 0.        , 0.7672639 , 0.9855889 , 0.31522992,
        0.        , 0.        , 0.        , 0.674503  , 0.        ,
        0.12346577],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5916316 , 0.        , 0.        , 0.        , 0.        ,
        0.8238369 , 0.7873889 , 0.404568  , 0.48820347, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.03160795, 0.        ,
        0.        , 0.9429863 , 

In [58]:
a.T

array([[0.        , 0.9562663 , 0.        , 0.        , 0.        ,
        0.11415514, 0.28872353, 0.67626965],
       [0.68347037, 0.5531271 , 0.20103766, 0.        , 0.        ,
        0.82574075, 0.04889954, 0.        ],
       [0.8035886 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.59893256,
        0.        , 0.6493426 , 0.35563806],
       [0.        , 0.5375557 , 0.        , 0.9103574 , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.74341315, 0.03242496, 0.        ,
        0.27978534, 0.33329687, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.69630474],
       [0.08023349, 0.        , 0.        , 0.71459514, 0.9217044 ,
        0.25018668, 0.        , 0.        ],
       [0.8858316 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.      

In [59]:
outshape.T

array([8, 5])

In [60]:
b.shape, outshape

((16, 5), array([8, 5]))

In [61]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul  # Use this Kernel object for repeated calls
knl(queue, [outshape.T[0]], None, bdatat_buf, bcolst_buf, bnnzst_buf, np.uint32(ellwbt), np.uint32(b.shape[0]), np.uint32(outshape.T[1]), a_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

aval, xval: 0.49,0.00 - 0.00: (0,0) 
aval, xval: 0.66,0.68 - 0.45: (1,1) 
aval, xval: 0.31,0.80 - 0.70: (2,2) 
aval, xval: 0.71,0.00 - 0.70: (4,3) 
aval, xval: 0.46,0.00 - 0.70: (6,4) 
aval, xval: 0.19,0.00 - 0.70: (14,5) 
aval, xval: 0.73,0.00 - 0.70: (15,6) 


<pyopencl._cl.NannyEvent at 0x7f5a643b2db0>

In [62]:
(res_np-mult).sum()

0.0

In [63]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [64]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [65]:
res_buf

<pyopencl._cl.Buffer at 0x7f5a6439e8b0>

In [66]:
res_np

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [67]:
mult

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [68]:
res_np==mult

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [69]:
res_np-mult

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]], dtype=float32)

In [70]:
res_np.shape

(8, 5)

In [71]:
mult.shape

(8, 5)

## MatMul2 (dense * sparse)

wdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wdata)
wcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wcols)
wnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wnnz)
wdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wdatat)
wcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wcolst)
wnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wnnzt)
x_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=x_init)

prg = cl.Program(ctx, """
    // DENSE x SPARSE-T
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            uint   ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      uint nnz = rowNnz[gid];

      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[gid2*ncols+col];
          sum  += aval * xval;
          if (gid==0 && gid2==1)
            printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid2*ncols+gid] = sum;
      }
    }""").build()

In [72]:
wdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wdata)
wcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wcols)
wnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wnnz)
wdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wdatat)
wcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wcolst)
wnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wnnzt)
x_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=x_init)

prg = cl.Program(ctx, """
    // DENSE x SPARSE-T
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            uint   ncols0,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      uint nnz = rowNnz[gid];
      uint gid2 = get_global_id(1);
      uint ncols = get_global_size(1);

      float sum = 0;
      for (uint i = 0; i < nnz; i++) {
        uint index   = (gid2 * ellwidth) + i;
        uint col     = colIdx[index];
        float aval  = matData[index];
        float xval  = vector_x[gid*mwidth+col];
        sum  += aval * xval;
        if (gid==1 && gid2==0) {
          printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        }
      }
      //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
      vector_y[gid*ncols+gid2] = sum;
    }""").build()

In [73]:
outshape

array([8, 5])

In [74]:
w_init.shape, x_init.shape
w = w_init
x = x_init

In [75]:
res = np.zeros(w.shape[0]).astype(np.float32)
#res

In [76]:
rows = w.shape[0]

In [77]:
mult = mult.astype(np.float32)

In [78]:
outshape = np.array([x.shape[0], w.shape[0]])
outshape

array([ 8, 16])

In [79]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul2  # Use this Kernel object for repeated calls
knl(queue, outshape, None, wdata_buf, wcols_buf, wnnzs_buf, np.uint32(ellww), np.uint32(w.shape[1]), np.uint32(x.shape[1]), x_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

aval, xval: -0.66,-0.48 - 0.32: (0,0) 
aval, xval: -0.06,-1.52 - 0.40: (1,1) 
aval, xval: -0.50,-0.49 - 0.65: (2,2) 
aval, xval: 0.44,-0.24 - 0.54: (3,3) 
aval, xval: -0.38,-0.65 - 0.79: (4,4) 


<pyopencl._cl.NannyEvent at 0x7f5a643b20e0>

In [80]:
mult = x.dot(w_init.T)
mult.shape

(8, 16)

In [81]:
mult

array([[ 0.71005666, -0.6922365 ,  1.958061  ,  1.258677  ,  3.463459  ,
        -0.3478019 ,  0.08231293, -0.04464556, -0.03240513,  1.8573855 ,
        -1.0835508 , -0.9068153 , -0.20014307, -1.5690712 , -1.1520894 ,
         0.6737664 ],
       [ 0.7850611 , -2.7718844 ,  3.8869042 ,  1.7455592 ,  2.9286547 ,
        -1.3900536 ,  0.8452302 ,  0.59935635,  0.31789836, -0.3896514 ,
        -0.80347735, -2.0294168 , -0.19058926, -1.6315975 , -3.2396262 ,
        -0.37664318],
       [-1.0399084 ,  3.7561824 , -8.091907  , -2.4692414 , -2.1408994 ,
         4.37048   , -1.9433106 ,  0.41747692,  0.1287814 ,  0.7065758 ,
         0.89383984,  3.3977637 , -0.99121714,  2.0500147 ,  5.0695367 ,
         0.4186813 ],
       [-0.46678752,  3.8533442 , -3.8412805 , -1.7131798 ,  1.1737914 ,
         3.4516523 , -0.6000567 , -0.39082292, -0.56545794,  1.7348484 ,
        -0.43604594,  3.096307  , -1.2181851 ,  0.24772911,  2.8284123 ,
         3.1381838 ],
       [-0.93206084, -0.5172032 , -3

In [82]:
res_np

array([[ 0.71005666, -0.6922365 ,  1.958061  ,  1.258677  ,  3.463459  ,
        -0.3478019 ,  0.08231293, -0.04464556, -0.03240513,  1.8573855 ,
        -1.0835508 , -0.9068153 , -0.20014307, -1.5690712 , -1.1520894 ,
         0.6737664 ],
       [ 0.7850611 , -2.7718844 ,  3.8869042 ,  1.7455592 ,  2.9286547 ,
        -1.3900536 ,  0.8452302 ,  0.59935635,  0.31789836, -0.3896514 ,
        -0.80347735, -2.0294168 , -0.19058926, -1.6315975 , -3.2396262 ,
        -0.37664318],
       [-1.0399084 ,  3.7561824 , -8.091907  , -2.4692414 , -2.1408994 ,
         4.37048   , -1.9433106 ,  0.41747692,  0.1287814 ,  0.7065758 ,
         0.89383984,  3.3977637 , -0.99121714,  2.0500147 ,  5.0695367 ,
         0.4186813 ],
       [-0.46678752,  3.8533442 , -3.8412805 , -1.7131798 ,  1.1737914 ,
         3.4516523 , -0.6000567 , -0.39082292, -0.56545794,  1.7348484 ,
        -0.43604594,  3.096307  , -1.2181851 ,  0.24772911,  2.8284123 ,
         3.1381838 ],
       [-0.93206084, -0.5172032 , -3

In [83]:
x

array([[ 1.1085547e-03, -2.8954408e-01, -1.1160663e+00, -1.2882757e-02,
        -3.7836146e-01],
       [-4.8113537e-01, -1.5173311e+00, -4.9087200e-01, -2.4068058e-01,
        -6.4794743e-01],
       [ 6.3589108e-01,  1.7401173e+00,  2.9668221e-01,  7.0750368e-01,
         1.8228158e+00],
       [ 4.3076903e-01,  1.5427296e+00, -9.0072119e-01, -1.3712502e-01,
         1.2975791e+00],
       [ 6.7527115e-01,  3.1958118e-02,  9.1814590e-01,  3.8050947e-01,
         5.1636750e-01],
       [-3.5523945e-01,  2.0877700e-01,  3.2841107e-01, -4.9822477e-01,
        -2.0917768e+00],
       [-8.2587741e-02,  2.4551826e+00, -2.6721101e+00, -9.1327929e-01,
        -2.2731435e-01],
       [ 2.6931539e-01,  1.1304612e+00,  1.0423975e+00,  1.3038105e+00,
         1.3894007e+00]], dtype=float32)

In [84]:
w

array([[-6.56452596e-01, -5.62572964e-02, -4.99902606e-01,
         4.36419368e-01, -3.75813037e-01],
       [-9.23061609e-01,  1.91725028e+00, -1.50302842e-01,
        -6.38729751e-01,  8.24770331e-01],
       [-1.21083879e+00, -5.03405392e-01, -7.01915681e-01,
        -1.97427106e+00, -2.65573215e+00],
       [-5.76822497e-02, -6.56186581e-01, -6.61706686e-01,
         7.69348443e-01, -8.99004877e-01],
       [ 1.69363797e+00, -1.69733524e+00, -2.79337025e+00,
        -2.26150647e-01,  3.97428840e-01],
       [ 1.65970361e+00, -4.93746817e-01, -3.76097679e-01,
        -1.69739768e-01,  2.41710639e+00],
       [-1.80884051e+00,  3.39751154e-01, -2.27297600e-02,
        -9.59997058e-01, -3.83114427e-01],
       [ 1.09529994e-01, -8.55162859e-01,  2.21606664e-04,
         6.63855076e-01,  7.49480963e-01],
       [-4.65818375e-01, -2.77439266e-01,  3.54995355e-02,
         8.48221183e-01,  1.62998557e-01],
       [ 1.20862365e+00,  5.02520800e-01, -1.58382213e+00,
         1.02303350e+00

In [85]:
(res_np-mult).sum()

0.0

In [86]:
mult

array([[ 0.71005666, -0.6922365 ,  1.958061  ,  1.258677  ,  3.463459  ,
        -0.3478019 ,  0.08231293, -0.04464556, -0.03240513,  1.8573855 ,
        -1.0835508 , -0.9068153 , -0.20014307, -1.5690712 , -1.1520894 ,
         0.6737664 ],
       [ 0.7850611 , -2.7718844 ,  3.8869042 ,  1.7455592 ,  2.9286547 ,
        -1.3900536 ,  0.8452302 ,  0.59935635,  0.31789836, -0.3896514 ,
        -0.80347735, -2.0294168 , -0.19058926, -1.6315975 , -3.2396262 ,
        -0.37664318],
       [-1.0399084 ,  3.7561824 , -8.091907  , -2.4692414 , -2.1408994 ,
         4.37048   , -1.9433106 ,  0.41747692,  0.1287814 ,  0.7065758 ,
         0.89383984,  3.3977637 , -0.99121714,  2.0500147 ,  5.0695367 ,
         0.4186813 ],
       [-0.46678752,  3.8533442 , -3.8412805 , -1.7131798 ,  1.1737914 ,
         3.4516523 , -0.6000567 , -0.39082292, -0.56545794,  1.7348484 ,
        -0.43604594,  3.096307  , -1.2181851 ,  0.24772911,  2.8284123 ,
         3.1381838 ],
       [-0.93206084, -0.5172032 , -3

In [87]:
res_np==mult

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  Tru

In [88]:
res_np-mult

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)

In [89]:
res_np.shape

(8, 16)

In [90]:
mult.shape

(8, 16)

In [91]:
asdf

NameError: name 'asdf' is not defined

## MatMul (dense * sparse) NEW

bdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdata)
bcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcols)
bnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnz)
bdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdatat)
bcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcolst)
bnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnzt)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // DENSE x SPARSE
    __kernel void matmulnew(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      uint gid2 = get_global_id(1);
      uint ncols = get_global_size(1);
      uint nnz = rowNnz[gid2];
      float sum = 0;
      for (uint i = 0; i < nnz; i++) {
        uint index   = (gid2 * ellwidth) + i;
        uint col     = colIdx[index];
        float aval  = matData[index];
        float xval  = vector_x[gid*mwidth+col];
        vector_y[gid2*nrows+gid] += aval * xval;
        if (gid==0 && gid2==0)
          printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        
      }
      
    }""").build()

In [None]:
mult = a.dot(b)
mult

In [None]:
bdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data)
bcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcols)
bnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnz)
bdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdatat)
bcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcolst)
bnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnzt)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // DENSE x SPARSE
    __kernel void matmulnew(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      uint gid2 = get_global_id(1);
      uint ncols = get_global_size(1);
      uint nnz = rowNnz[gid2];
      float sum = 0;
      for (uint i = 0; i < nnz; i++) {
        uint index   = (gid2 * ellwidth) + i;
        uint col     = colIdx[index];
        float aval  = matData[index];
        float xval  = vector_x[gid*mwidth+col];
        sum  += aval * xval;
        if (gid==1 && gid2==0)
          printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
      }
      vector_y[gid2*ncols+gid] = sum;
    }""").build()

In [92]:
a.shape, b.shape

((8, 16), (16, 5))

In [93]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [94]:
rows = a.shape[0]

In [95]:
mult = mult.astype(np.float32)

In [96]:
outshape = np.array([a.shape[0], b.shape[1]])
outshape

array([8, 5])

In [97]:
b.T

array([[0.49395892, 0.65864426, 0.31044865, 0.        , 0.71082693,
        0.        , 0.46450275, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.18725161,
        0.7250254 ],
       [0.        , 0.        , 0.        , 0.46765924, 0.        ,
        0.        , 0.        , 0.34248227, 0.        , 0.7705159 ,
        0.7167741 , 0.91970533, 0.        , 0.        , 0.11607183,
        0.        ],
       [0.46451807, 0.9786625 , 0.02502257, 0.        , 0.58997136,
        0.84622455, 0.        , 0.7672639 , 0.9855889 , 0.31522992,
        0.        , 0.        , 0.        , 0.674503  , 0.        ,
        0.12346577],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5916316 , 0.        , 0.        , 0.        , 0.        ,
        0.8238369 , 0.7873889 , 0.404568  , 0.48820347, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.03160795, 0.        ,
        0.        , 0.9429863 , 

In [98]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [99]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmulnew  # Use this Kernel object for repeated calls
knl(queue, outshape, None, bdatat_buf, bcolst_buf, bnnzst_buf, np.uint32(ellwbt), np.uint32(b.shape[0]), a_buf, res_buf)

res_np = np.zeros((outshape[0],)).astype(np.float32)
print(res_np.shape)
cl.enqueue_copy(queue, res_np, res_buf)

AttributeError: 'matmulnew' was not found as a program info attribute or as a kernel name

In [100]:
(res_np-mult.T).sum()

ValueError: operands could not be broadcast together with shapes (8,16) (16,8) 

In [101]:
res_buf

<pyopencl._cl.Buffer at 0x7f5a64307f90>

In [102]:
res_np.T

array([[ 0.71005666,  0.7850611 , -1.0399084 , -0.46678752, -0.93206084,
         0.6259611 ,  0.93874276, -0.71463346],
       [-0.6922365 , -2.7718844 ,  3.7561824 ,  3.8533442 , -0.5172032 ,
        -0.72817993,  5.580915  ,  2.0752609 ],
       [ 1.958061  ,  3.8869042 , -8.091907  , -3.8412805 , -3.6007562 ,
         6.6333513 ,  3.1463914 , -7.8908043 ],
       [ 1.258677  ,  1.7455592 , -2.4692414 , -1.7131798 , -0.83893746,
         1.1633917 , -0.33641425, -1.693083  ],
       [ 3.463459  ,  2.9286547 , -2.1408994 ,  1.1737914 , -1.3561333 ,
        -2.5920439 ,  3.2732484 , -4.1171207 ],
       [-0.3478019 , -1.3900536 ,  4.37048   ,  3.4516523 ,  1.9431857 ,
        -5.787668  , -0.73875856,  2.6337998 ],
       [ 0.08231293,  0.8452302 , -1.9433106 , -0.6000567 , -1.7945851 ,
         1.9857233 ,  2.0081086 , -1.9107202 ],
       [-0.04464556,  0.59935635,  0.41747692, -0.39082292,  0.68644726,
        -2.115871  , -2.885872  ,  0.9698713 ],
       [-0.03240513,  0.31789836

In [103]:
mult

array([[ 0.71005666, -0.6922365 ,  1.958061  ,  1.258677  ,  3.463459  ,
        -0.3478019 ,  0.08231293, -0.04464556, -0.03240513,  1.8573855 ,
        -1.0835508 , -0.9068153 , -0.20014307, -1.5690712 , -1.1520894 ,
         0.6737664 ],
       [ 0.7850611 , -2.7718844 ,  3.8869042 ,  1.7455592 ,  2.9286547 ,
        -1.3900536 ,  0.8452302 ,  0.59935635,  0.31789836, -0.3896514 ,
        -0.80347735, -2.0294168 , -0.19058926, -1.6315975 , -3.2396262 ,
        -0.37664318],
       [-1.0399084 ,  3.7561824 , -8.091907  , -2.4692414 , -2.1408994 ,
         4.37048   , -1.9433106 ,  0.41747692,  0.1287814 ,  0.7065758 ,
         0.89383984,  3.3977637 , -0.99121714,  2.0500147 ,  5.0695367 ,
         0.4186813 ],
       [-0.46678752,  3.8533442 , -3.8412805 , -1.7131798 ,  1.1737914 ,
         3.4516523 , -0.6000567 , -0.39082292, -0.56545794,  1.7348484 ,
        -0.43604594,  3.096307  , -1.2181851 ,  0.24772911,  2.8284123 ,
         3.1381838 ],
       [-0.93206084, -0.5172032 , -3

In [104]:
res_np-mult.T

ValueError: operands could not be broadcast together with shapes (8,16) (16,8) 

In [105]:
res_np.shape

(8, 16)

In [106]:
mult.shape

(8, 16)

In [107]:
asdf

NameError: name 'asdf' is not defined

# Matmult Dense Dense

In [108]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      uint osize = get_global_size(1);
      int gidx = get_global_id(0); // row
      int gidy = get_global_id(1); // col

      float ret = 0.0;
      for (int i = 0; i < msize; i++) {
        uint xidx = gidx*msize+i; 
        float xval = x[xidx];
        uint yidx = osize*i+gidy;
        float yval = y[yidx];
        ret += xval*yval;
        if (gidx==0 && gidy==0)
          printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, res, xidx, yidx);
      }

      //if (gidx==0&&gidy==0)
      //  printf("\\nsum:%.2f", ret);
      res[gidx * osize + gidy] = ret;
    }""").build()

In [109]:
a.shape, b.shape

((8, 16), (16, 5))

In [110]:
rows = a.shape[0]

In [111]:
mult = mult.astype(np.float32)

In [112]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows,b.shape[1]], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)


mult: 0.00 x 0.49 - 0.00  -- 0/0
mult: 0.68 x 0.66 - 0.00  -- 1/5
mult: 0.80 x 0.31 - 0.00  -- 2/10
mult: 0.00 x 0.00 - 0.00  -- 3/15
mult: 0.00 x 0.71 - 0.00  -- 4/20
mult: 0.00 x 0.00 - 0.00  -- 5/25
mult: 0.00 x 0.46 - 0.00  -- 6/30
mult: 0.08 x 0.00 - 0.00  -- 7/35
mult: 0.89 x 0.00 - 0.00  -- 8/40
mult: 0.00 x 0.00 - 0.00  -- 9/45
mult: 0.18 x 0.00 - 0.00  -- 10/50
mult: 0.00 x 0.00 - 0.00  -- 11/55
mult: 0.46 x 0.00 - 0.00  -- 12/60
mult: 0.00 x 0.00 - 0.00  -- 13/65
mult: 0.00 x 0.19 - 0.00  -- 14/70
mult: 0.00 x 0.73 - 0.00  -- 15/75

<pyopencl._cl.NannyEvent at 0x7f5a642ed6d0>

In [113]:
(res_np-mult.T).sum()

ValueError: operands could not be broadcast together with shapes (8,5) (16,8) 

In [114]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [115]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [116]:
res_np

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [117]:
a.dot(b)

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [118]:
res_np==mult

  res_np==mult


False

In [119]:
res_np.shape

(8, 5)

In [120]:
mult.shape

(8, 16)

# Matmult Dense Transposed

In [121]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [122]:
c=np.zeros(b.T.shape)
bt = b.T
for row in range(bt.shape[0]):
    for col in range(bt.shape[1]):
        c[row][col] = bt[row][col]

In [123]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      uint osize = get_global_size(1);
      int gidx = get_global_id(0); // row
      int gidy = get_global_id(1); // col

      float ret = 0.0;
      for (int i = 0; i < msize; i++) {
        uint xidx = gidx*msize+i;
        float xval = x[xidx];
        uint yidx = msize*gidy+i;
        float yval = y[yidx];
        ret += xval*yval;
        if (gidx==0 && gidy==0)
          printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, res, xidx, yidx);
      }

      //if (gidx==0&&gidy==0)
      //  printf("\\nsum:%.2f", ret);
      res[gidx * osize + gidy] = ret;
    }""").build()

In [124]:
a.shape, b.T.shape

((8, 16), (5, 16))

In [125]:
rows = a.shape[0]

In [126]:
mult = mult.astype(np.float32)

In [127]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows,b.shape[1]], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)


mult: 0.00 x 0.49 - 0.00  -- 0/0
mult: 0.68 x 0.66 - 0.00  -- 1/1
mult: 0.80 x 0.31 - 0.00  -- 2/2
mult: 0.00 x 0.00 - 0.00  -- 3/3
mult: 0.00 x 0.71 - 0.00  -- 4/4
mult: 0.00 x 0.00 - 0.00  -- 5/5
mult: 0.00 x 0.46 - 0.00  -- 6/6
mult: 0.08 x 0.00 - 0.00  -- 7/7
mult: 0.89 x 0.00 - 0.00  -- 8/8
mult: 0.00 x 0.00 - 0.00  -- 9/9
mult: 0.18 x 0.00 - 0.00  -- 10/10
mult: 0.00 x 0.00 - 0.00  -- 11/11
mult: 0.46 x 0.00 - 0.00  -- 12/12
mult: 0.00 x 0.00 - 0.00  -- 13/13
mult: 0.00 x 0.19 - 0.00  -- 14/14
mult: 0.00 x 0.73 - 0.00  -- 15/15

<pyopencl._cl.NannyEvent at 0x7f5a642ed400>

In [128]:
(res_np-mult).sum()

ValueError: operands could not be broadcast together with shapes (8,5) (8,16) 

In [129]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [130]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [131]:
res_np

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [132]:
a.dot(b)

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [133]:
res_np==mult

  res_np==mult


False

In [134]:
res_np.shape

(8, 5)

In [135]:
mult.shape

(8, 16)

# Matmult Transposed Dense

In [136]:
mult = a.dot(b)

In [137]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [138]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // multilplies x TRANSPOSED by y (dense-dense)
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize,
                          uint isize
                          ) { // LOCAL SHARED BUFFER
      uint osize = get_global_size(0);
      int gidy = get_global_id(0); // row
      
      for (uint gidx = 0; gidx < isize; gidx++) {
        float ret = 0.0;
        for (uint i = 0; i < msize; i++) {
          uint xidx = i*isize+gidx;
          float xval = x[xidx];
          uint yidx = osize*i+gidy;
          float yval = y[yidx];
          ret += xval*yval;
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, ret, xidx, yidx);
        }
        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        res[gidx * osize + gidy] = ret;
      }
    }""").build()

In [139]:
a.shape, b.shape

((8, 16), (16, 5))

In [140]:
rows = a.shape[0]

In [141]:
mult = mult.astype(np.float32)
mult.shape

(8, 5)

In [142]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [b.shape[1]], None, a_buf, b_buf, res_buf, np.uint32(a.shape[1]), np.uint32(rows))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)


mult: 0.00 x 0.49 - 0.00  -- 0/0
mult: 0.68 x 0.66 - 0.45  -- 8/5
mult: 0.80 x 0.31 - 0.70  -- 16/10
mult: 0.00 x 0.00 - 0.70  -- 24/15
mult: 0.00 x 0.71 - 0.70  -- 32/20
mult: 0.00 x 0.00 - 0.70  -- 40/25
mult: 0.00 x 0.46 - 0.70  -- 48/30
mult: 0.08 x 0.00 - 0.70  -- 56/35
mult: 0.89 x 0.00 - 0.70  -- 64/40
mult: 0.00 x 0.00 - 0.70  -- 72/45
mult: 0.18 x 0.00 - 0.70  -- 80/50
mult: 0.00 x 0.00 - 0.70  -- 88/55
mult: 0.46 x 0.00 - 0.70  -- 96/60
mult: 0.00 x 0.00 - 0.70  -- 104/65
mult: 0.00 x 0.19 - 0.70  -- 112/70
mult: 0.00 x 0.73 - 0.70  -- 120/75

<pyopencl._cl.NannyEvent at 0x7f5a643319f0>

In [143]:
(res_np-mult).sum()

0.0

In [144]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [145]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [146]:
res_np

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [147]:
a.dot(b)

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [148]:
res_np==mult

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [149]:
res_np.shape

(8, 5)

In [150]:
mult.shape

(8, 5)

# Matmult Transposed Dense (SPR) - NEW

In [151]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [152]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topkx*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topky*4)
sdata_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topkx*4)
sidxs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topkx*4)
snnzs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*4)
sdatat_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topky*4)
sidxst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topky*4)
snnzst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate4(__global  float* x,     // INPUT MATRIX DATA
                              __global  float* y,    // INPUT
                              __global  float* xsum,    // INPUT
                              __global  float* ysum,    // INPUT
                              uint isize,
                              uint msize,
                              uint osize,
                              uint topkx,
                              uint topky,
                              __global  uint*  xoutidx,
                              __global  uint*  youtidx,
                              __global  float* matData,     // OUTPUT MATRIX DATA
                              __global  uint*  colIdx,
                              __global  uint*  rowNnz,
                              __global  float* matDatat,    // OUTPUT MATRIX DATA
                              __global  uint*  colIdxt,
                              __global  uint*  rowNnzt
                              ) {
      uint gid = get_global_id(0);

      // get for a: sum axis0  b: sum axis1 then get topk
      ///////////////////////////////////////////////////
      if (gid < isize) {
        xsum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = x[i*isize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
          //}
          xsum[gid] += val;
        }

        float valx = xsum[gid];
        uint posx = 0;
        for (uint i = 0; i < isize; i++) {
          float tempval = fabs(xsum[i]);
          bool larger = tempval > fabs(valx);
          posx += (larger)?1:0;
        }
        if (posx < topky) {
          youtidx[posx] = gid;
        }
      }

      if (gid < osize) {
        ysum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = y[i*osize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, gid*osize+i);
          //}
          ysum[gid] += val;
        }

        float valy = ysum[gid];
        uint posy = 0;
        for (uint i = 0; i < osize; i++) {
          float tempval = fabs(ysum[i]);
          bool larger = tempval > fabs(valy);
          posy += (larger)?1:0;
        }

        if (posy < topkx) {
          xoutidx[posy] = gid;
        }
      }

      if (gid < topkx) {
        float valx = xoutidx[gid];
        uint posx = 0;
        for (uint i = 0; i < topkx; i++) {
          float tempval = xoutidx[i];
          bool larger = tempval < valx;
          posx += (larger)?1:0;
        }
        xoutidx[gid] = gid;
      }

      if (gid < topky) {
        float valy = youtidx[gid];
        uint posy = 0;
        for (uint i = 0; i < topky; i++) {
          float tempval = youtidx[i];
          bool larger = tempval < valy;
          posy += (larger)?1:0;
        }
        youtidx[gid] = gid;
      }

      // only calc matrix multiplications for used grads
      ///////////////////////////////////////////////////
      if (gid < isize) {
        for (uint i=0; i<topkx; i++) {
          matData[gid*topkx+i] = 0;
          colIdx[gid*topkx+i] = 0;
        }
        rowNnz[gid] = 0;
      }
      if (gid < osize) {
        for (uint i=0; i<topky; i++) {
          matDatat[gid*topky+i] = 0;
          colIdxt[gid*topky+i] = 0;
        }
        rowNnzt[gid] = 0;
      }


      if (gid < topkx) {
        uint idxx = xoutidx[gid];
        for (uint j=0; j<topky; j++) {
          uint idxy = youtidx[j];
          //printf("\\nIDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxy;
            uint yidx2 = osize*k+idxx;
            uint colidx = idxy;
            matDatat[idxx*topky+j] += x[xidx2] * y[yidx2];
            colIdxt[idxx*topky+j] = idxy;
            if (gid == 0)
              printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnzt[idxx] += 1;
        }
      }
      if (gid < topky) {
        uint idxx = youtidx[gid];
        for (uint j=0; j<topkx; j++) {
          uint idxy = xoutidx[j];
          //printf("\\nIDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            uint colidx = idxy;
            matData[idxx*topkx+j] += x[xidx2] * y[yidx2];
            colIdx[idxx*topkx+j] = idxy;
            if (gid == 0)
              printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnz[idxx] += 1;
        }
      }
    }""").build()

In [464]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topkx*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topky*4)

prg = cl.Program(ctx, """
    // multilplies x TRANSPOSED by y (dense-dense)
    __kernel void matmul0(__global  float* x,      // INPUT MATRIX DATA
                          __global  float* y,      // INPUT
                          __global  float* xsum,    // INPUT
                          __global  float* ysum,    // INPUT
                          __global  uint*  xoutidx, // OUT
                          __global  uint*  youtidx, // OUT
                          __global  float* resdata, // OUT
                          __global  uint*  rescols, // OUT
                          __global  uint*  resnnzs, // OUT
                          uint topkx,
                          uint topky,
                          uint ellw,
                          uint msize,
                          uint osize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      int gidx = get_global_id(0); // row
      
      resnnzs[gidx] = 0;
      
      // get topk
      xsum[gidx] = 0;
      for (uint i=0; i<msize; i++) {
        float val = x[i*isize+gidx];
        //if (gid == 0) {
        //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
        //}
        xsum[gidx] += val;
      }
      float valx = xsum[gidx];
      uint posx = 0;
      for (uint i = 0; i < isize; i++) {
        float tempval = fabs(xsum[i]);
        bool larger = tempval > fabs(valx);
        posx += (larger)?1:0;
      }
      if (posx < topky) {
        youtidx[posx] = gidx;
      }
      
      for (uint k = 0; k < topky; k++) {
        uint gidy = youtidx[k];
        float ret = 0.0;
        uint i;
        for (i = 0; i < msize; i++) {
          uint xidx = i*isize+gidx;
          float xval = x[xidx];
          uint yidx = osize*i+gidy;
          float yval = y[yidx];
          ret += xval*yval;
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, ret, xidx, yidx);
        }
        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        
        // add for 
        uint nnz = resnnzs[gidx];
        for (i = 0; i < nnz; i++) {
          if (rescols[i] >= gidy) {
            break;
          }
          for (uint j = nnz; j >= i; j--) {
            //resdata[j+1] = resdata[j];
          }
        }
        resdata[gidx * ellw + i] = ret;
        rescols[gidx * ellw + i] = gidy;
        resnnzs[gidx] += 1;
      }
    }""").build()

In [504]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topkx*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topky*4)

prg = cl.Program(ctx, """
    // multilplies x TRANSPOSED by y (dense-dense)
    __kernel void gettopk(__global  float* x,      // INPUT MATRIX DATA
                          __global  float* y,      // INPUT
                          __global  float* xsum,    // INPUT
                          __global  float* ysum,    // INPUT
                          __global  uint*  xoutidx, // OUT
                          __global  uint*  youtidx, // OUT
                          uint topkx,
                          uint topky,
                          uint ellw,
                          uint msize,
                          uint osize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      int gidx = get_global_id(0); // row
      
      resnnzs[gidx] = 0;
      
      // get topk
      xsum[gidx] = 0;
      for (uint i=0; i<msize; i++) {
        float val = x[i*isize+gidx];
        //if (gid == 0) {
        //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
        //}
        xsum[gidx] += val;
      }
      float valx = xsum[gidx];
      uint posx = 0;
      for (uint i = 0; i < isize; i++) {
        float tempval = fabs(xsum[i]);
        bool larger = tempval > fabs(valx);
        posx += (larger)?1:0;
      }
      if (posx < topky) {
        youtidx[posx] = gidx;
      }
      
      
    }""").build()

In [505]:
a.shape, b.shape

((8, 16), (16, 5))

In [506]:
topkx, topky

(5, 8)

In [507]:
rows = a.shape[0]

In [508]:
mult = mult.astype(np.float32)
mult.shape

(8, 5)

In [509]:
resdata_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
rescols_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
resnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows])*4)

knl = prg.gettopk  # Use this Kernel object for repeated calls
knl(queue, [rows,cols], None, a_buf, b_buf, x_sum_buf, y_sum_buf, x_idx_buf, y_idx_buf, np.uint32(topkx), np.uint32(topky), np.uint32(b.shape[1]), np.uint32(a.shape[1]), np.uint32(b.shape[1]))

xidxcols = np.zeros(topkx).astype(np.uint32)
yidxcols = np.zeros(topky).astype(np.uint32)
resdata  = np.zeros(a.shape[0]*b.shape[1]).astype(np.float32)
rescols  = np.zeros(a.shape[0]*b.shape[1]).astype(np.uint32)
resnnzs  = np.zeros(a.shape[0]).astype(np.uint32)

cl.enqueue_copy(queue, resdata, resdata_buf)
cl.enqueue_copy(queue, rescols, rescols_buf)
cl.enqueue_copy(queue, resnnzs, resnnzs_buf)
cl.enqueue_copy(queue, xidxcols, x_idx_buf)
cl.enqueue_copy(queue, yidxcols, y_idx_buf)


mult: 0.00 x 0.49 - 0.00  -- 0/0
mult: 0.68 x 0.66 - 0.45  -- 8/5
mult: 0.80 x 0.31 - 0.70  -- 16/10
mult: 0.00 x 0.00 - 0.70  -- 24/15
mult: 0.00 x 0.71 - 0.70  -- 32/20
mult: 0.00 x 0.00 - 0.70  -- 40/25
mult: 0.00 x 0.46 - 0.70  -- 48/30
mult: 0.08 x 0.00 - 0.70  -- 56/35
mult: 0.89 x 0.00 - 0.70  -- 64/40
mult: 0.00 x 0.00 - 0.70  -- 72/45
mult: 0.18 x 0.00 - 0.70  -- 80/50
mult: 0.00 x 0.00 - 0.70  -- 88/55
mult: 0.46 x 0.00 - 0.70  -- 96/60
mult: 0.00 x 0.00 - 0.70  -- 104/65
mult: 0.00 x 0.19 - 0.70  -- 112/70
mult: 0.00 x 0.73 - 0.70  -- 120/75

<pyopencl._cl.NannyEvent at 0x7f5a64269310>

In [510]:
resdata

array([6.9963682e-01, 1.6236207e+00, 6.1331564e-01, 2.1218245e-01,
       1.2226288e+00, 1.8848312e+00, 1.4160932e+00, 2.9892322e-01,
       8.0155689e-01, 0.0000000e+00, 1.6573755e-01, 1.4074200e+00,
       0.0000000e+00, 6.4756888e-01, 4.8351842e-01, 1.1302330e+00,
       1.1950791e+00, 6.3114762e-01, 1.1668310e-01, 0.0000000e+00,
       4.9532753e-01, 8.1877571e-01, 1.8931029e-02, 9.7717863e-01,
       1.1707418e-01, 6.0025734e-01, 1.2898692e+00, 0.0000000e+00,
       4.6149877e-01, 7.6452947e-01, 1.7482497e-01, 4.7157714e-01,
       7.3094618e-01, 8.2383305e-01, 1.3009151e-03, 7.2407621e-01,
       9.9110806e-01, 8.3909458e-01, 6.9821823e-01, 7.5884867e-01],
      dtype=float32)

In [511]:
xidxcols

array([2, 7, 6, 3, 5], dtype=uint32)

In [512]:
yidxcols

array([1, 3, 4, 0, 7, 5, 2, 6], dtype=uint32)

In [513]:
rescols

array([0, 2, 4, 5, 6, 0, 2, 4, 5, 6, 0, 2, 4, 5, 6, 0, 2, 4, 5, 6, 0, 2,
       4, 5, 6, 0, 2, 4, 5, 6, 0, 2, 4, 5, 6, 0, 2, 4, 5, 6], dtype=uint32)

In [514]:
resnnzs

array([8, 8, 8, 8, 8, 8, 8, 8], dtype=uint32)

In [475]:
res_np = to_dense(resdata, rescols, resnnzs, b.shape[1], mult.shape)

IndexError: index 5 is out of bounds for axis 1 with size 5

In [164]:
(res_np-mult).sum()

0.0

In [165]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [166]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [167]:
res_np

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [168]:
a.dot(b)

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [169]:
res_np==mult

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [170]:
res_np.shape

(8, 5)

In [171]:
mult.shape

(8, 5)

# Matmult Transposed Dense (SPR-T OUT)

In [172]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [173]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // multilplies x TRANSPOSED by y (dense-dense)
    __kernel void matmul0t(__global  float* x,      // INPUT MATRIX DATA
                          __global  float* y,      // INPUT
                          __global  float* resdata,// OUT
                          __global  uint*  rescols,
                          __global  uint*  resnnzs,
                          uint ellw,
                          uint msize,
                          uint isize
                          ) { // LOCAL SHARED BUFFER
      uint osize = get_global_size(0);
      int gidy = get_global_id(0); // row
      
      resnnzs[gidy] = 0;
      for (uint gidx = 0; gidx < isize; gidx++) {
        float ret = 0.0;
        uint i;
        for (i = 0; i < msize; i++) {
          uint xidx = i*isize+gidx;
          float xval = x[xidx];
          uint yidx = osize*i+gidy;
          float yval = y[yidx];
          ret += xval*yval;
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, ret, xidx, yidx);
        }
        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        
        // add for 
        uint nnz = resnnzs[gidy];
        for (i = 0; i < nnz; i++) {
          if (rescols[i] >= gidx) {
            break;
          }
          for (uint j = nnz; j >= i; j--) {
            //resdata[j+1] = resdata[j];
          }
        }
        resdata[gidy * ellw + i] = ret;
        rescols[gidy * ellw + i] = gidx;
        resnnzs[gidy] += 1;
      }
    }""").build()

In [174]:
a.shape, b.shape

((8, 16), (16, 5))

In [175]:
rows = a.shape[0]
cols = b.shape[1]

In [176]:
mult = mult.astype(np.float32)
mult.shape

(8, 5)

In [177]:
resdatat_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([cols,rows])*4)
rescolst_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([cols,rows])*4)
resnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE, cols*4)

knl = prg.matmul0t  # Use this Kernel object for repeated calls
knl(queue, [cols], None, a_buf, b_buf, resdatat_buf, rescolst_buf, resnnzst_buf, np.uint32(rows), np.uint32(a.shape[1]), np.uint32(rows))

resdatat = np.zeros(cols*rows).astype(np.float32)
rescolst = np.zeros(cols*rows).astype(np.uint32)
resnnzst = np.zeros(cols).astype(np.uint32)
cl.enqueue_copy(queue, resdatat, resdatat_buf)
cl.enqueue_copy(queue, rescolst, rescolst_buf)
cl.enqueue_copy(queue, resnnzst, resnnzst_buf)


mult: 0.00 x 0.49 - 0.00  -- 0/0
mult: 0.68 x 0.66 - 0.45  -- 8/5
mult: 0.80 x 0.31 - 0.70  -- 16/10
mult: 0.00 x 0.00 - 0.70  -- 24/15
mult: 0.00 x 0.71 - 0.70  -- 32/20
mult: 0.00 x 0.00 - 0.70  -- 40/25
mult: 0.00 x 0.46 - 0.70  -- 48/30
mult: 0.08 x 0.00 - 0.70  -- 56/35
mult: 0.89 x 0.00 - 0.70  -- 64/40
mult: 0.00 x 0.00 - 0.70  -- 72/45
mult: 0.18 x 0.00 - 0.70  -- 80/50
mult: 0.00 x 0.00 - 0.70  -- 88/55
mult: 0.46 x 0.00 - 0.70  -- 96/60
mult: 0.00 x 0.00 - 0.70  -- 104/65
mult: 0.00 x 0.19 - 0.70  -- 112/70
mult: 0.00 x 0.73 - 0.70  -- 120/75

<pyopencl._cl.NannyEvent at 0x7f5a642851d0>

In [178]:
resdatat

array([0.6996368 , 1.8848312 , 0.16573755, 1.130233  , 0.49532753,
       0.60025734, 0.17482497, 0.7240762 , 0.15550727, 0.18735047,
       0.51614475, 0.31159392, 0.9677906 , 0.7728439 , 0.30367106,
       0.27932334, 1.6236207 , 1.4160932 , 1.40742   , 1.1950791 ,
       0.8187757 , 1.2898692 , 0.47157714, 0.99110806, 0.3340667 ,
       0.30414024, 0.7711447 , 0.32051137, 0.1351769 , 0.9285497 ,
       0.544282  , 0.09674796, 0.61331564, 0.29892322, 0.        ,
       0.6311476 , 0.01893103, 0.        , 0.7309462 , 0.8390946 ],
      dtype=float32)

In [179]:
rescolst

array([0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5,
       6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7], dtype=uint32)

In [180]:
resnnzst

array([8, 8, 8, 8, 8], dtype=uint32)

In [181]:
res_np = to_dense(resdatat, rescolst, resnnzst, a.shape[0], mult.T.shape)
res_np.T

array([[0.69963682, 0.15550727, 1.62362075, 0.33406669, 0.61331564],
       [1.88483119, 0.18735047, 1.41609323, 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742004, 0.77114469, 0.        ],
       [1.13023305, 0.31159392, 1.19507909, 0.32051137, 0.63114762],
       [0.49532753, 0.9677906 , 0.81877571, 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.28986919, 0.92854971, 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.54428202, 0.73094618],
       [0.72407621, 0.27932334, 0.99110806, 0.09674796, 0.83909458]])

In [182]:
(res_np.T-mult).sum()

0.0

In [183]:
res_np.shape

(5, 8)

In [184]:
a.dot(b)

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [185]:
mult - res_np.T

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [186]:
res_np.T==mult

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

# Matmult Transposed Dense (SPR)

In [187]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [188]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // multilplies x TRANSPOSED by y (dense-dense)
    __kernel void matmul0(__global  float* x,      // INPUT MATRIX DATA
                          __global  float* y,      // INPUT
                          __global  float* resdata,// OUT
                          __global  uint*  rescols,
                          __global  uint*  resnnzs,
                          uint ellw,
                          uint msize,
                          uint osize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      int gidx = get_global_id(0); // row
      
      resnnzs[gidx] = 0;
      
      for (uint gidy = 0; gidy < osize; gidy++) {
        float ret = 0.0;
        uint i;
        for (i = 0; i < msize; i++) {
          uint xidx = i*isize+gidx;
          float xval = x[xidx];
          uint yidx = osize*i+gidy;
          float yval = y[yidx];
          ret += xval*yval;
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, ret, xidx, yidx);
        }
        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        
        // add for 
        uint nnz = resnnzs[gidx];
        for (i = 0; i < nnz; i++) {
          if (rescols[i] >= gidy) {
            break;
          }
          for (uint j = nnz; j >= i; j--) {
            //resdata[j+1] = resdata[j];
          }
        }
        resdata[gidx * ellw + i] = ret;
        rescols[gidx * ellw + i] = gidy;
        resnnzs[gidx] += 1;
      }
    }""").build()

In [189]:
a.shape, b.shape

((8, 16), (16, 5))

In [190]:
rows = a.shape[0]

In [191]:
mult = mult.astype(np.float32)
mult.shape

(8, 5)

In [192]:
resdata_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
rescols_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
resnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows])*4)

knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows], None, a_buf, b_buf, resdata_buf, rescols_buf, resnnzs_buf, np.uint32(b.shape[1]), np.uint32(a.shape[1]), np.uint32(b.shape[1]))

resdata = np.zeros(a.shape[0]*b.shape[1]).astype(np.float32)
rescols = np.zeros(a.shape[0]*b.shape[1]).astype(np.uint32)
resnnzs = np.zeros(a.shape[0]).astype(np.uint32)
cl.enqueue_copy(queue, resdata, resdata_buf)
cl.enqueue_copy(queue, rescols, rescols_buf)
cl.enqueue_copy(queue, resnnzs, resnnzs_buf)

<pyopencl._cl.NannyEvent at 0x7f5a64285040>


mult: 0.00 x 0.49 - 0.00  -- 0/0
mult: 0.68 x 0.66 - 0.45  -- 8/5
mult: 0.80 x 0.31 - 0.70  -- 16/10
mult: 0.00 x 0.00 - 0.70  -- 24/15
mult: 0.00 x 0.71 - 0.70  -- 32/20
mult: 0.00 x 0.00 - 0.70  -- 40/25
mult: 0.00 x 0.46 - 0.70  -- 48/30
mult: 0.08 x 0.00 - 0.70  -- 56/35
mult: 0.89 x 0.00 - 0.70  -- 64/40
mult: 0.00 x 0.00 - 0.70  -- 72/45
mult: 0.18 x 0.00 - 0.70  -- 80/50
mult: 0.00 x 0.00 - 0.70  -- 88/55
mult: 0.46 x 0.00 - 0.70  -- 96/60
mult: 0.00 x 0.00 - 0.70  -- 104/65
mult: 0.00 x 0.19 - 0.70  -- 112/70
mult: 0.00 x 0.73 - 0.70  -- 120/75

In [193]:
resdata

array([0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564,
       1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322,
       0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ,
       1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ,
       0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103,
       0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ,
       0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ,
       0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ],
      dtype=float32)

In [194]:
rescols

array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1,
       2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4], dtype=uint32)

In [195]:
resnnzs

array([5, 5, 5, 5, 5, 5, 5, 5], dtype=uint32)

In [196]:
res_np = to_dense(resdata, rescols, resnnzs, b.shape[1], mult.shape)

In [197]:
(res_np-mult).sum()

0.0

In [198]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [199]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [200]:
res_np

array([[0.69963682, 0.15550727, 1.62362075, 0.33406669, 0.61331564],
       [1.88483119, 0.18735047, 1.41609323, 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742004, 0.77114469, 0.        ],
       [1.13023305, 0.31159392, 1.19507909, 0.32051137, 0.63114762],
       [0.49532753, 0.9677906 , 0.81877571, 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.28986919, 0.92854971, 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.54428202, 0.73094618],
       [0.72407621, 0.27932334, 0.99110806, 0.09674796, 0.83909458]])

In [201]:
a.dot(b)

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [202]:
res_np==mult

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [203]:
res_np.shape

(8, 5)

In [204]:
mult.shape

(8, 5)

# Matmult Transposed Dense (SPR-T OUT)

In [205]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [206]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // multilplies x TRANSPOSED by y (dense-dense)
    __kernel void matmul0t(__global  float* x,      // INPUT MATRIX DATA
                          __global  float* y,      // INPUT
                          __global  float* resdata,// OUT
                          __global  uint*  rescols,
                          __global  uint*  resnnzs,
                          uint ellw,
                          uint msize,
                          uint isize
                          ) { // LOCAL SHARED BUFFER
      uint osize = get_global_size(0);
      int gidy = get_global_id(0); // row
      
      resnnzs[gidy] = 0;
      for (uint gidx = 0; gidx < isize; gidx++) {
        float ret = 0.0;
        uint i;
        for (i = 0; i < msize; i++) {
          uint xidx = i*isize+gidx;
          float xval = x[xidx];
          uint yidx = osize*i+gidy;
          float yval = y[yidx];
          ret += xval*yval;
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, ret, xidx, yidx);
        }
        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        
        // add for 
        uint nnz = resnnzs[gidy];
        for (i = 0; i < nnz; i++) {
          if (rescols[i] >= gidx) {
            break;
          }
          for (uint j = nnz; j >= i; j--) {
            //resdata[j+1] = resdata[j];
          }
        }
        resdata[gidy * ellw + i] = ret;
        rescols[gidy * ellw + i] = gidx;
        resnnzs[gidy] += 1;
      }
    }""").build()

In [207]:
a.shape, b.shape

((8, 16), (16, 5))

In [208]:
rows = a.shape[0]
cols = b.shape[1]

In [209]:
mult = mult.astype(np.float32)
mult.shape

(8, 5)

In [210]:
resdatat_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([cols,rows])*4)
rescolst_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([cols,rows])*4)
resnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE, cols*4)

knl = prg.matmul0t  # Use this Kernel object for repeated calls
knl(queue, [cols], None, a_buf, b_buf, resdatat_buf, rescolst_buf, resnnzst_buf, np.uint32(rows), np.uint32(a.shape[1]), np.uint32(rows))

resdatat = np.zeros(cols*rows).astype(np.float32)
rescolst = np.zeros(cols*rows).astype(np.uint32)
resnnzst = np.zeros(cols).astype(np.uint32)
cl.enqueue_copy(queue, resdatat, resdatat_buf)
cl.enqueue_copy(queue, rescolst, rescolst_buf)
cl.enqueue_copy(queue, resnnzst, resnnzst_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642f5a90>


mult: 0.00 x 0.49 - 0.00  -- 0/0
mult: 0.68 x 0.66 - 0.45  -- 8/5
mult: 0.80 x 0.31 - 0.70  -- 16/10
mult: 0.00 x 0.00 - 0.70  -- 24/15
mult: 0.00 x 0.71 - 0.70  -- 32/20
mult: 0.00 x 0.00 - 0.70  -- 40/25
mult: 0.00 x 0.46 - 0.70  -- 48/30
mult: 0.08 x 0.00 - 0.70  -- 56/35
mult: 0.89 x 0.00 - 0.70  -- 64/40
mult: 0.00 x 0.00 - 0.70  -- 72/45
mult: 0.18 x 0.00 - 0.70  -- 80/50
mult: 0.00 x 0.00 - 0.70  -- 88/55
mult: 0.46 x 0.00 - 0.70  -- 96/60
mult: 0.00 x 0.00 - 0.70  -- 104/65
mult: 0.00 x 0.19 - 0.70  -- 112/70
mult: 0.00 x 0.73 - 0.70  -- 120/75

In [211]:
resdatat

array([0.6996368 , 1.8848312 , 0.16573755, 1.130233  , 0.49532753,
       0.60025734, 0.17482497, 0.7240762 , 0.15550727, 0.18735047,
       0.51614475, 0.31159392, 0.9677906 , 0.7728439 , 0.30367106,
       0.27932334, 1.6236207 , 1.4160932 , 1.40742   , 1.1950791 ,
       0.8187757 , 1.2898692 , 0.47157714, 0.99110806, 0.3340667 ,
       0.30414024, 0.7711447 , 0.32051137, 0.1351769 , 0.9285497 ,
       0.544282  , 0.09674796, 0.61331564, 0.29892322, 0.        ,
       0.6311476 , 0.01893103, 0.        , 0.7309462 , 0.8390946 ],
      dtype=float32)

In [212]:
rescolst

array([0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5,
       6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7], dtype=uint32)

In [213]:
resnnzst

array([8, 8, 8, 8, 8], dtype=uint32)

In [214]:
res_np = to_dense(resdatat, rescolst, resnnzst, a.shape[0], mult.T.shape)
res_np.T

array([[0.69963682, 0.15550727, 1.62362075, 0.33406669, 0.61331564],
       [1.88483119, 0.18735047, 1.41609323, 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742004, 0.77114469, 0.        ],
       [1.13023305, 0.31159392, 1.19507909, 0.32051137, 0.63114762],
       [0.49532753, 0.9677906 , 0.81877571, 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.28986919, 0.92854971, 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.54428202, 0.73094618],
       [0.72407621, 0.27932334, 0.99110806, 0.09674796, 0.83909458]])

In [215]:
(res_np.T-mult).sum()

0.0

In [216]:
res_np.shape

(5, 8)

In [217]:
a.dot(b)

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [218]:
mult - res_np.T

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [219]:
res_np.T==mult

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

# Matmult Dense Transposed2

In [220]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize,
                          uint osize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      // osize = get_global_size(1);
      int gidx = get_global_id(0); // col
      // int gidy = get_global_id(1); // row

      for (uint gidy = 0; gidy < osize; gidy++) {
        float ret = 0.0;
        for (uint i = 0; i < msize; i++) {
          ret += x[gidx*msize+i]*y[i*osize+gidy];
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f", x[gidx*msize+i],y[i*msize+gidy], ret);
        }

        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        res[gidx * osize + gidy] = ret;
      }
    }""").build()

In [221]:
a.shape, b.shape

((8, 16), (16, 5))

In [222]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [223]:
rows = a.shape[0]

In [224]:
mult = mult.astype(np.float32)

In [225]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]), np.uint32(b.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642f5ef0>


mult: 0.00 x 0.49 - 0.00
mult: 0.68 x 0.47 - 0.45
mult: 0.80 x 0.00 - 0.70
mult: 0.00 x 0.00 - 0.70
mult: 0.00 x 0.84 - 0.70
mult: 0.00 x 0.63 - 0.70
mult: 0.00 x 0.94 - 0.70
mult: 0.08 x -0.71 - 0.70
mult: 0.89 x 0.70 - 0.70
mult: 0.00 x 0.31 - 0.70
mult: 0.18 x 0.47 - 0.70
mult: 0.00 x 0.00 - 0.70
mult: 0.46 x 0.00 - 0.70
mult: 0.00 x 0.00 - 0.70
mult: 0.00 x 0.00 - 0.70
mult: 0.00 x 0.00 - 0.70

In [226]:
(res_np-mult).sum()

0.0

In [227]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [228]:
b.T

array([[0.49395892, 0.65864426, 0.31044865, 0.        , 0.71082693,
        0.        , 0.46450275, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.18725161,
        0.7250254 ],
       [0.        , 0.        , 0.        , 0.46765924, 0.        ,
        0.        , 0.        , 0.34248227, 0.        , 0.7705159 ,
        0.7167741 , 0.91970533, 0.        , 0.        , 0.11607183,
        0.        ],
       [0.46451807, 0.9786625 , 0.02502257, 0.        , 0.58997136,
        0.84622455, 0.        , 0.7672639 , 0.9855889 , 0.31522992,
        0.        , 0.        , 0.        , 0.674503  , 0.        ,
        0.12346577],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5916316 , 0.        , 0.        , 0.        , 0.        ,
        0.8238369 , 0.7873889 , 0.404568  , 0.48820347, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.03160795, 0.        ,
        0.        , 0.9429863 , 

In [229]:
a[0]

array([0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
       0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
       0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
       0.        ], dtype=float32)

In [230]:
b.T[0]

array([0.49395892, 0.65864426, 0.31044865, 0.        , 0.71082693,
       0.        , 0.46450275, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.18725161,
       0.7250254 ], dtype=float32)

In [231]:
res_buf

<pyopencl._cl.Buffer at 0x7f5a642edef0>

In [232]:
res_np

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [233]:
a.dot(b)

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [234]:
res_np==mult

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [235]:
res_np.shape

(8, 5)

In [236]:
mult.shape

(8, 5)

## Weight update kernel

In [237]:
bs = 4

In [238]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [239]:
dim = 8

x = np.random.rand(bs,dim).astype(np.float32)
y = np.random.rand(bs,dim).astype(np.float32)
x.shape,y.shape, topk

((4, 8), (4, 8), 5)

x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             __global  float* xout,    // INPUT
                             uint topk,
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      uint bs = get_global_size(1);
      uint gid2 = get_global_id(1);

      uint idx = n*gid2+gid;

      float valx = x[idx];
      float valy = y[idx];
      uint posx = 0;
      uint posy = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval = x[idx2];
        float tempval2 = y[idx2];
        bool larger = tempval > valx;
        bool larger2 = tempval2 > valy;

        barrier(CLK_GLOBAL_MEM_FENCE);
        posx += (larger)?1:0;
        posy += (larger2)?1:0;
        barrier(CLK_GLOBAL_MEM_FENCE);
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
      //printf("posx:%i", posx);
      if (posx < topk) {
        xoutidx[posx+topk*gid2] = gid;
      }
      if (posy < topk) {
        youtidx[posy+topk*gid2] = gid;
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
      if (gid < topk) {
        for (uint j=0; j<topk; j++) {
          float res = x[xoutidx[gid+topk*gid2]+gid2*n] * y[youtidx[j+topk*gid2]+gid2*n];
          //printf("\\nJ:%i  gid:%i", j, gid);
          //printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
          barrier(CLK_GLOBAL_MEM_FENCE);
          xout[gid2*topk*topk+j*topk+gid] = res;
          barrier(CLK_GLOBAL_MEM_FENCE);
          
        }
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
    }""").build()

In [240]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
#x_cp_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
#x_idx_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
#y_idx_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             __global  float* xout,    // INPUT
                             uint topk,
                             uint bs,
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      //uint bs = get_global_size(1);
      //uint gid2 = get_global_id(1);

      for (uint gid2=0; gid2<bs; gid2++){
        uint idx = n*gid2+gid;

        float valx = x[idx];
        float valy = y[idx];
        uint posx = 0;
        uint posy = 0;
        for (uint i = 0; i < n; i++) {
          uint idx2 = n*gid2+i;
          float tempval = x[idx2];
          float tempval2 = y[idx2];
          bool larger = tempval > valx;
          bool larger2 = tempval2 > valy;

          posx += (larger)?1:0;
          posy += (larger2)?1:0;
        }
        //printf("posx:%i", posx);
        if (posx < topk) {
        xoutidx[posx+topk*gid2] = gid;
        }
        if (posy < topk) {
          youtidx[posy+topk*gid2] = gid;
        }
      }
      for (uint gid2=0; gid2<bs; gid2++){
        if (gid < topk) {
          for (uint j=0; j<topk; j++) {
            float res = x[xoutidx[gid+topk*gid2]+gid2*n] * y[youtidx[j+topk*gid2]+gid2*n];
            //printf("\\nJ:%i  gid:%i", j, gid);
            //printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
            //barrier(CLK_GLOBAL_MEM_FENCE);
            xout[gid2*topk*topk+j*topk+gid] = res;
          }
        }
      }
    }""").build()

In [241]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
evt = knl(queue, [dim], None, x_buf, y_buf, x_cp_buf, np.uint32(topk), np.uint32(bs), x_idx_buf, y_idx_buf)

#evt.wait()
resx = np.zeros(bs*topk*topk).astype(np.float32)
resxidx = np.zeros(bs*topk).astype(np.uint32)
resyidx = np.zeros(bs*topk).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642e2400>

knl(queue, [dim], None, y_buf, x_buf, x_cp_buft, np.uint32(topk), np.uint32(bs), x_idx_buft, y_idx_buft)

#evt.wait()
resx = np.zeros(bs*topk*topk).astype(np.float32)
resxidx = np.zeros(bs*topk).astype(np.uint32)
resyidx = np.zeros(bs*topk).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

In [242]:
x

array([[0.17703491, 0.9082271 , 0.844115  , 0.13836125, 0.37077376,
        0.14187126, 0.2554239 , 0.74123585],
       [0.5749973 , 0.58092856, 0.245505  , 0.37022075, 0.28529382,
        0.04638312, 0.89031464, 0.4462557 ],
       [0.01870731, 0.49408567, 0.4352037 , 0.23138313, 0.523213  ,
        0.98933345, 0.6775894 , 0.9670713 ],
       [0.24639843, 0.7003624 , 0.7240451 , 0.5082079 , 0.23762804,
        0.50561154, 0.88468575, 0.9135105 ]], dtype=float32)

In [243]:
y

array([[0.22866099, 0.52384585, 0.11538133, 0.09020234, 0.2828013 ,
        0.9686888 , 0.04652401, 0.40306655],
       [0.8626351 , 0.16699363, 0.9405066 , 0.85600585, 0.98426616,
        0.19067204, 0.9464574 , 0.79394555],
       [0.35402992, 0.49338102, 0.22329962, 0.7806627 , 0.3408471 ,
        0.22262034, 0.72643334, 0.1980827 ],
       [0.65394187, 0.3903598 , 0.88434243, 0.9210825 , 0.52586645,
        0.60021836, 0.9936902 , 0.52363884]], dtype=float32)

In [244]:
x.shape, y.shape

((4, 8), (4, 8))

In [245]:
resx

array([0.8797894 , 0.81768477, 0.7180269 , 0.3591644 , 0.24742627,
       0.47577098, 0.44218615, 0.38829333, 0.19422829, 0.13380276,
       0.36607596, 0.34023452, 0.2987674 , 0.1494465 , 0.10295283,
       0.2568478 , 0.23871683, 0.20962246, 0.1048553 , 0.07223421,
       0.2076761 , 0.19301617, 0.16949172, 0.0847815 , 0.05840548,
       0.8763066 , 0.5717883 , 0.5659504 , 0.4392344 , 0.36439577,
       0.8426449 , 0.5498241 , 0.54421043, 0.42236203, 0.35039815,
       0.8373468 , 0.5463671 , 0.54078877, 0.41970643, 0.34819505,
       0.76801664, 0.5011293 , 0.49601284, 0.38495582, 0.3193654 ,
       0.7621145 , 0.49727824, 0.49220106, 0.3819975 , 0.31691113,
       0.77233577, 0.7549565 , 0.5289688 , 0.4084529 , 0.38571426,
       0.7186848 , 0.7025128 , 0.49222353, 0.3800794 , 0.3589203 ,
       0.48811835, 0.47713462, 0.33430976, 0.25814337, 0.24377249,
       0.35025364, 0.34237218, 0.23988692, 0.18523307, 0.17492111,
       0.33721143, 0.32962346, 0.2309544 , 0.17833565, 0.16840

In [246]:
resx.reshape(bs,topk,topk)

array([[[0.8797894 , 0.81768477, 0.7180269 , 0.3591644 , 0.24742627],
        [0.47577098, 0.44218615, 0.38829333, 0.19422829, 0.13380276],
        [0.36607596, 0.34023452, 0.2987674 , 0.1494465 , 0.10295283],
        [0.2568478 , 0.23871683, 0.20962246, 0.1048553 , 0.07223421],
        [0.2076761 , 0.19301617, 0.16949172, 0.0847815 , 0.05840548]],

       [[0.8763066 , 0.5717883 , 0.5659504 , 0.4392344 , 0.36439577],
        [0.8426449 , 0.5498241 , 0.54421043, 0.42236203, 0.35039815],
        [0.8373468 , 0.5463671 , 0.54078877, 0.41970643, 0.34819505],
        [0.76801664, 0.5011293 , 0.49601284, 0.38495582, 0.3193654 ],
        [0.7621145 , 0.49727824, 0.49220106, 0.3819975 , 0.31691113]],

       [[0.77233577, 0.7549565 , 0.5289688 , 0.4084529 , 0.38571426],
        [0.7186848 , 0.7025128 , 0.49222353, 0.3800794 , 0.3589203 ],
        [0.48811835, 0.47713462, 0.33430976, 0.25814337, 0.24377249],
        [0.35025364, 0.34237218, 0.23988692, 0.18523307, 0.17492111],
        [0.33721

In [247]:
resxidx

array([1, 2, 7, 4, 6, 6, 1, 0, 7, 3, 5, 7, 6, 4, 1, 7, 6, 2, 1, 3],
      dtype=uint32)

In [248]:
resyidx

array([5, 1, 7, 4, 0, 4, 6, 2, 0, 3, 3, 6, 1, 0, 4, 6, 3, 2, 0, 5],
      dtype=uint32)

In [249]:
idx = 1
xy0 = x[idx].reshape(dim,1)*y[idx]
xy0.shape

(8, 8)

In [250]:
xy0[3][7]

0.29393512

## Weight update kernel new

In [251]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [252]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [253]:
at

array([[0.        , 0.9562663 , 0.        , 0.        , 0.        ,
        0.11415514, 0.28872353, 0.67626965],
       [0.68347037, 0.5531271 , 0.20103766, 0.        , 0.        ,
        0.82574075, 0.04889954, 0.        ],
       [0.8035886 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.59893256,
        0.        , 0.6493426 , 0.35563806],
       [0.        , 0.5375557 , 0.        , 0.9103574 , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.74341315, 0.03242496, 0.        ,
        0.27978534, 0.33329687, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.69630474],
       [0.08023349, 0.        , 0.        , 0.71459514, 0.9217044 ,
        0.25018668, 0.        , 0.        ],
       [0.8858316 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.      

In [254]:
c

array([[0.        , 0.95626628, 0.        , 0.        , 0.        ,
        0.11415514, 0.28872353, 0.67626965],
       [0.68347037, 0.55312711, 0.20103766, 0.        , 0.        ,
        0.82574075, 0.04889954, 0.        ],
       [0.80358863, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.59893256,
        0.        , 0.6493426 , 0.35563806],
       [0.        , 0.53755569, 0.        , 0.91035742, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.74341315, 0.03242496, 0.        ,
        0.27978534, 0.33329687, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.69630474],
       [0.08023349, 0.        , 0.        , 0.71459514, 0.92170441,
        0.25018668, 0.        , 0.        ],
       [0.88583159, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.      

a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate3(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             uint topk,
                             uint msize,
                             __global  float* xout,    // INPUT
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      uint bs = get_global_size(1);
      uint gid2 = get_global_id(1);



      uint idx = n*gid2+gid;
      float valx = x[idx];
      uint posx = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval = x[idx2];
        bool larger = tempval > valx;
        posx += (larger)?1:0;
      }
      
      uint idxy = n*gid2+gid;
      float valy = y[idx];
      uint posy = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval2 = y[idx2];
        bool larger2 = tempval2 > valy;
        posy += (larger2)?1:0;
      }
      
      if (posx < topk) {
        xoutidx[posx+topk*gid2] = idx;
      }
      if (posy < topk) {
        youtidx[posy+topk*gid2] = idxy;
      }
      return;
      if (gid < topk) {
        for (uint j=0; j<topk; j++) {
          float res = x[xoutidx[gid+topk*gid2]+gid2*msize] * y[youtidx[j+topk*gid2]+gid2*msize];
          printf("\\nJ:%i  gid:(%i,%i)", j, gid, gid2);
          printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
          //barrier(CLK_GLOBAL_MEM_FENCE);
          xout[gid2*topk*topk+j*topk+gid] = res;
        }
      }
    }""").build()

In [255]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate3(__global  float* x,     // INPUT MATRIX DATA
                              __global  float* y,    // INPUT
                              __global  float* xsum,    // INPUT
                              __global  float* ysum,    // INPUT
                              uint isize,
                              uint msize,
                              uint osize,
                              uint topk,
                              __global  float* xout,
                              __global  uint* xoutidx,   
                              __global  uint* youtidx    
                              ) { 
      uint gid = get_global_id(0);
      
      // get for a: sum axis0  b: sum axis1 then get topk
      ///////////////////////////////////////////////////
      if (gid < isize) {
        xsum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = x[i*isize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
          //}
          xsum[gid] += val;
        }
        
        float valx = xsum[gid];
        uint posx = 0;
        for (uint i = 0; i < isize; i++) {
          float tempval = xsum[i];
          bool larger = tempval > valx;
          posx += (larger)?1:0;
        }
        if (posx < topk) {
          xoutidx[posx] = gid;
        }
      }
      
      if (gid < osize) {
        ysum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = y[i*osize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, gid*osize+i);
          //}
          ysum[gid] += val;
        }
        
        float valy = ysum[gid];
        uint posy = 0;
        for (uint i = 0; i < osize; i++) {
          float tempval = ysum[i];
          bool larger = tempval > valy;
          posy += (larger)?1:0;
        }
      
        if (posy < topk) {
          youtidx[posy] = gid;
        }
      }
      
      if (gid < topk) {
        float valx = xoutidx[gid];
        uint posx = 0;
        for (uint i = 0; i < topk; i++) {
          float tempval = xoutidx[i];
          bool larger = tempval < valx;
          posx += (larger)?1:0;
        }
        xoutidx[posx] = valx;
        
        float valy = youtidx[gid];
        uint posy = 0;
        for (uint i = 0; i < topk; i++) {
          float tempval = youtidx[i];
          bool larger = tempval < valy;
          posy += (larger)?1:0;
        }
        youtidx[posy] = valy;
      }
      
      // only calc matrix multiplications for used grads
      ///////////////////////////////////////////////////
      if (gid < topk) {
        uint idxx = xoutidx[gid];
        for (uint j=0; j<topk; j++) {
          uint idxy = youtidx[j];
          xout[j*topk+gid] = 0;
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            xout[j*topk+gid] += x[xidx2] * y[yidx2];
            //if (gid == 0 && j == 1)
            //  printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
        }
      }
    }""").build()

In [256]:
a.shape, b.shape

((8, 16), (16, 5))

In [257]:
rows = a.shape[0]
msize = a.shape[1]

In [258]:
cols = b.shape[1]

In [259]:
mult = a.dot(b)

In [260]:
mult = mult.astype(np.float32)

In [261]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.genwupdate3  # Use this Kernel object for repeated calls
evt = knl(queue, [max(rows,cols)], None, a_buf, b_buf, x_sum_buf, y_sum_buf, np.uint32(rows), np.uint32(msize),np.uint32(cols), np.uint32(topk), x_cp_buf, x_idx_buf, y_idx_buf)

resxsum = np.zeros(a.shape[0]).astype(np.float32)
resysum = np.zeros(b.shape[1]).astype(np.float32)
resx = np.zeros(topk*topk).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)

cl.enqueue_copy(queue, resxsum, x_sum_buf)
cl.enqueue_copy(queue, resysum, y_sum_buf)
cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642edd60>

In [262]:
resx.reshape(topk,topk)

array([[0.6996368 , 1.8848312 , 1.130233  , 0.49532753, 0.7240762 ],
       [0.15550727, 0.18735047, 0.31159392, 0.9677906 , 0.27932334],
       [1.6236207 , 1.4160932 , 1.1950791 , 0.8187757 , 0.99110806],
       [0.3340667 , 0.30414024, 0.32051137, 0.1351769 , 0.09674796],
       [0.61331564, 0.29892322, 0.6311476 , 0.01893103, 0.8390946 ]],
      dtype=float32)

In [263]:
resxsum

array([3.0937524, 3.5246177, 2.3468003, 3.2115116, 3.102983 , 2.4004393,
       2.1758807, 2.6182923], dtype=float32)

In [264]:
resysum

array([3.5506585, 3.3332088, 5.77045  , 3.095629 , 2.0694818],
      dtype=float32)

In [265]:
a.sum(axis=1)

array([3.0937524, 3.5246177, 2.3468003, 3.2115116, 3.102983 , 2.4004393,
       2.1758807, 2.6182923], dtype=float32)

In [266]:
b.sum(axis=0)

array([3.5506585, 3.3332088, 5.77045  , 3.095629 , 2.0694818],
      dtype=float32)

In [267]:
mult

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [268]:
resxidx

array([0, 1, 3, 4, 7], dtype=uint32)

In [269]:
resyidx

array([0, 1, 2, 3, 4], dtype=uint32)

In [270]:
idx = 1
xy0 = x[idx].reshape(dim,1)*y[idx]
xy0.shape

(8, 8)

In [271]:
xy0[0][0]

0.49601284

## Weight update kernel new2 (sparse ouput)

In [272]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [273]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topkx*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topky*4)
sdata_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topkx*4)
sidxs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topkx*4)
snnzs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*4)
sdatat_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topky*4)
sidxst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topky*4)
snnzst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate4(__global  float* x,     // INPUT MATRIX DATA
                              __global  float* y,    // INPUT
                              __global  float* xsum,    // INPUT
                              __global  float* ysum,    // INPUT
                              uint isize,
                              uint msize,
                              uint osize,
                              uint topkx,
                              uint topky,
                              __global  uint*  xoutidx,
                              __global  uint*  youtidx,
                              __global  float* matData,     // OUTPUT MATRIX DATA
                              __global  uint*  colIdx,
                              __global  uint*  rowNnz,
                              __global  float* matDatat,    // OUTPUT MATRIX DATA
                              __global  uint*  colIdxt,
                              __global  uint*  rowNnzt
                              ) {
      uint gid = get_global_id(0);

      // get for a: sum axis0  b: sum axis1 then get topk
      ///////////////////////////////////////////////////
      if (gid < isize) {
        xsum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = x[i*isize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
          //}
          xsum[gid] += val;
        }

        float valx = xsum[gid];
        uint posx = 0;
        for (uint i = 0; i < isize; i++) {
          float tempval = fabs(xsum[i]);
          bool larger = tempval > fabs(valx);
          posx += (larger)?1:0;
        }
        if (posx < topky) {
          youtidx[posx] = gid;
        }
      }

      if (gid < osize) {
        ysum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = y[i*osize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, gid*osize+i);
          //}
          ysum[gid] += val;
        }

        float valy = ysum[gid];
        uint posy = 0;
        for (uint i = 0; i < osize; i++) {
          float tempval = fabs(ysum[i]);
          bool larger = tempval > fabs(valy);
          posy += (larger)?1:0;
        }

        if (posy < topkx) {
          xoutidx[posy] = gid;
        }
      }

      if (gid < topkx) {
        float valx = xoutidx[gid];
        uint posx = 0;
        for (uint i = 0; i < topkx; i++) {
          float tempval = xoutidx[i];
          bool larger = tempval < valx;
          posx += (larger)?1:0;
        }
        xoutidx[gid] = gid;
      }

      if (gid < topky) {
        float valy = youtidx[gid];
        uint posy = 0;
        for (uint i = 0; i < topky; i++) {
          float tempval = youtidx[i];
          bool larger = tempval < valy;
          posy += (larger)?1:0;
        }
        youtidx[gid] = gid;
      }

      // only calc matrix multiplications for used grads
      ///////////////////////////////////////////////////
      if (gid < isize) {
        for (uint i=0; i<topkx; i++) {
          matData[gid*topkx+i] = 0;
          colIdx[gid*topkx+i] = 0;
        }
        rowNnz[gid] = 0;
      }
      if (gid < osize) {
        for (uint i=0; i<topky; i++) {
          matDatat[gid*topky+i] = 0;
          colIdxt[gid*topky+i] = 0;
        }
        rowNnzt[gid] = 0;
      }


      if (gid < topkx) {
        uint idxx = xoutidx[gid];
        for (uint j=0; j<topky; j++) {
          uint idxy = youtidx[j];
          //printf("\\nIDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxy;
            uint yidx2 = osize*k+idxx;
            uint colidx = idxy;
            matDatat[idxx*topky+j] += x[xidx2] * y[yidx2];
            colIdxt[idxx*topky+j] = idxy;
            if (gid == 0)
              printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnzt[idxx] += 1;
        }
      }
      if (gid < topky) {
        uint idxx = youtidx[gid];
        for (uint j=0; j<topkx; j++) {
          uint idxy = xoutidx[j];
          //printf("\\nIDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            uint colidx = idxy;
            matData[idxx*topkx+j] += x[xidx2] * y[yidx2];
            colIdx[idxx*topkx+j] = idxy;
            if (gid == 0)
              printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnz[idxx] += 1;
        }
      }
    }""").build()

In [274]:
a.shape, b.shape

((8, 16), (16, 5))

In [275]:
rows = a.shape[0]
msize = a.shape[1]

In [276]:
cols = b.shape[1]

In [277]:
mult = a.dot(b)

In [278]:
mult = mult.astype(np.float32)

In [279]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.genwupdate4  # Use this Kernel object for repeate/duald calls
evt = knl(queue, [max(rows,cols)], None, a_buf, b_buf, x_sum_buf, y_sum_buf, np.uint32(rows), np.uint32(msize),np.uint32(cols), 
          np.uint32(topkx),np.uint32(topky), x_idx_buf, y_idx_buf, sdata_buf, sidxs_buf, snnzs_buf, sdatat_buf, sidxst_buf, snnzst_buf)


 ADD VAL:0.00,0.49 - (0,0) - (0,0,0)
 ADD VAL:0.68,0.66 - (0,0) - (0,0,1)
 ADD VAL:0.80,0.31 - (0,0) - (0,0,2)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,3)
 ADD VAL:0.00,0.71 - (0,0) - (0,0,4)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,5)
 ADD VAL:0.00,0.46 - (0,0) - (0,0,6)
 ADD VAL:0.08,0.00 - (0,0) - (0,0,7)
 ADD VAL:0.89,0.00 - (0,0) - (0,0,8)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,9)
 ADD VAL:0.18,0.00 - (0,0) - (0,0,10)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,11)
 ADD VAL:0.46,0.00 - (0,0) - (0,0,12)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,13)
 ADD VAL:0.00,0.19 - (0,0) - (0,0,14)
 ADD VAL:0.00,0.73 - (0,0) - (0,0,15)
 ADD VAL:0.96,0.49 - (0,1) - (0,1,0)
 ADD VAL:0.55,0.66 - (0,1) - (0,1,1)
 ADD VAL:0.00,0.31 - (0,1) - (0,1,2)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,3)
 ADD VAL:0.54,0.71 - (0,1) - (0,1,4)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,5)
 ADD VAL:0.00,0.46 - (0,1) - (0,1,6)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,7)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,8)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,9)
 ADD VAL:0.00,0.00 - (0,1) - (0

In [280]:
resxsum = np.zeros(a.shape[0]).astype(np.float32)
resysum = np.zeros(b.shape[1]).astype(np.float32)
resxidx = np.zeros(topkx).astype(np.uint32)
resyidx = np.zeros(topky).astype(np.uint32)
resxdat = np.zeros(a.shape[0]*topkx).astype(np.float32)
resxcol = np.zeros(a.shape[0]*topkx).astype(np.uint32)
resxnnz = np.zeros(a.shape[0]).astype(np.uint32)
resxdatt = np.zeros(b.shape[1]*topky).astype(np.float32)
resxcolt = np.zeros(b.shape[1]*topky).astype(np.uint32)
resxnnzt = np.zeros(b.shape[1]).astype(np.uint32)

cl.enqueue_copy(queue, resxsum, x_sum_buf)
cl.enqueue_copy(queue, resysum, y_sum_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)
cl.enqueue_copy(queue, resxdat, sdata_buf)
cl.enqueue_copy(queue, resxcol, sidxs_buf)
cl.enqueue_copy(queue, resxnnz, snnzs_buf)
cl.enqueue_copy(queue, resxdatt, sdatat_buf)
cl.enqueue_copy(queue, resxcolt, sidxst_buf)
cl.enqueue_copy(queue, resxnnzt, snnzst_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642f5c70>

## results

In [281]:
topkx, topky

(5, 8)

In [282]:
mult.shape

(8, 5)

In [283]:
mult

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [284]:
resxdatt.reshape(b.shape[1],topky).T

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [285]:
resxcol.reshape(a.shape[0],topkx)

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]], dtype=uint32)

In [286]:
resxnnz.reshape(a.shape[0])

array([5, 5, 5, 5, 5, 5, 5, 5], dtype=uint32)

In [287]:
resxdatt.reshape(b.shape[1],topky)

array([[0.6996368 , 1.8848312 , 0.16573755, 1.130233  , 0.49532753,
        0.60025734, 0.17482497, 0.7240762 ],
       [0.15550727, 0.18735047, 0.51614475, 0.31159392, 0.9677906 ,
        0.7728439 , 0.30367106, 0.27932334],
       [1.6236207 , 1.4160932 , 1.40742   , 1.1950791 , 0.8187757 ,
        1.2898692 , 0.47157714, 0.99110806],
       [0.3340667 , 0.30414024, 0.7711447 , 0.32051137, 0.1351769 ,
        0.9285497 , 0.544282  , 0.09674796],
       [0.61331564, 0.29892322, 0.        , 0.6311476 , 0.01893103,
        0.        , 0.7309462 , 0.8390946 ]], dtype=float32)

In [288]:
resxcolt.reshape(b.shape[1],topky)

array([[0, 1, 2, 3, 4, 5, 6, 7],
       [0, 1, 2, 3, 4, 5, 6, 7],
       [0, 1, 2, 3, 4, 5, 6, 7],
       [0, 1, 2, 3, 4, 5, 6, 7],
       [0, 1, 2, 3, 4, 5, 6, 7]], dtype=uint32)

In [289]:
resxnnzt.reshape(b.shape[1])

array([8, 8, 8, 8, 8], dtype=uint32)

In [290]:
resdense = to_dense(resxdat, resxcol, resxnnz, topkx, mult.shape)
resdense

array([[0.69963682, 0.15550727, 1.62362075, 0.33406669, 0.61331564],
       [1.88483119, 0.18735047, 1.41609323, 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742004, 0.77114469, 0.        ],
       [1.13023305, 0.31159392, 1.19507909, 0.32051137, 0.63114762],
       [0.49532753, 0.9677906 , 0.81877571, 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.28986919, 0.92854971, 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.54428202, 0.73094618],
       [0.72407621, 0.27932334, 0.99110806, 0.09674796, 0.83909458]])

In [291]:
resdenset = to_dense(resxdatt, resxcolt, resxnnzt, topky, mult.T.shape)
resdenset

array([[0.69963682, 1.88483119, 0.16573755, 1.13023305, 0.49532753,
        0.60025734, 0.17482497, 0.72407621],
       [0.15550727, 0.18735047, 0.51614475, 0.31159392, 0.9677906 ,
        0.7728439 , 0.30367106, 0.27932334],
       [1.62362075, 1.41609323, 1.40742004, 1.19507909, 0.81877571,
        1.28986919, 0.47157714, 0.99110806],
       [0.33406669, 0.30414024, 0.77114469, 0.32051137, 0.1351769 ,
        0.92854971, 0.54428202, 0.09674796],
       [0.61331564, 0.29892322, 0.        , 0.63114762, 0.01893103,
        0.        , 0.73094618, 0.83909458]])

In [292]:
resdense == resdenset.T

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

## comp

In [293]:
resxsum

array([3.0937524, 3.5246177, 2.3468003, 3.2115116, 3.102983 , 2.4004393,
       2.1758807, 2.6182923], dtype=float32)

In [294]:
a.sum(axis=1)

array([3.0937524, 3.5246177, 2.3468003, 3.2115116, 3.102983 , 2.4004393,
       2.1758807, 2.6182923], dtype=float32)

In [295]:
resysum

array([3.5506585, 3.3332088, 5.77045  , 3.095629 , 2.0694818],
      dtype=float32)

In [296]:
b.sum(axis=0)

array([3.5506585, 3.3332088, 5.77045  , 3.095629 , 2.0694818],
      dtype=float32)

In [297]:
mult

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [298]:
resxidx

array([0, 1, 2, 3, 4], dtype=uint32)

In [299]:
resyidx

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=uint32)

In [300]:
asdf

NameError: name 'asdf' is not defined

## Weight update kernel new3 (sparse ouput)

In [301]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [302]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topkx*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topky*4)
sdata_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topkx*4)
sidxs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topkx*4)
snnzs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*4)
sdatat_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topky*4)
sidxst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topky*4)
snnzst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate4(__global  float* x,     // INPUT MATRIX DATA
                              __global  float* y,    // INPUT
                              __global  float* xsum,    // INPUT
                              __global  float* ysum,    // INPUT
                              uint isize,
                              uint msize,
                              uint osize,
                              uint topkx,
                              uint topky,
                              __global  uint*  xoutidx,
                              __global  uint*  youtidx,
                              __global  float* matData,     // OUTPUT MATRIX DATA
                              __global  uint*  colIdx,
                              __global  uint*  rowNnz,
                              __global  float* matDatat,    // OUTPUT MATRIX DATA
                              __global  uint*  colIdxt,
                              __global  uint*  rowNnzt
                              ) {
      uint gid = get_global_id(0);

      // get for a: sum axis0  b: sum axis1 then get topk
      ///////////////////////////////////////////////////
      if (gid < isize) {
        xsum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = x[i*isize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
          //}
          xsum[gid] += val;
        }

        float valx = xsum[gid];
        uint posx = 0;
        for (uint i = 0; i < isize; i++) {
          float tempval = fabs(xsum[i]);
          bool larger = tempval > fabs(valx);
          posx += (larger)?1:0;
        }
        if (posx < topky) {
          youtidx[posx] = gid;
        }
      }

      if (gid < osize) {
        ysum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = y[i*osize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, gid*osize+i);
          //}
          ysum[gid] += val;
        }

        float valy = ysum[gid];
        uint posy = 0;
        for (uint i = 0; i < osize; i++) {
          float tempval = fabs(ysum[i]);
          bool larger = tempval > fabs(valy);
          posy += (larger)?1:0;
        }

        if (posy < topkx) {
          xoutidx[posy] = gid;
        }
      }

      if (gid < topkx) {
        float valx = xoutidx[gid];
        uint posx = 0;
        for (uint i = 0; i < topkx; i++) {
          float tempval = xoutidx[i];
          bool larger = tempval < valx;
          posx += (larger)?1:0;
        }
        xoutidx[gid] = gid;
      }

      if (gid < topky) {
        float valy = youtidx[gid];
        uint posy = 0;
        for (uint i = 0; i < topky; i++) {
          float tempval = youtidx[i];
          bool larger = tempval < valy;
          posy += (larger)?1:0;
        }
        youtidx[gid] = gid;
      }

      // only calc matrix multiplications for used grads
      ///////////////////////////////////////////////////
      if (gid < isize) {
        for (uint i=0; i<topkx; i++) {
          matData[gid*topkx+i] = 0;
          colIdx[gid*topkx+i] = 0;
        }
        rowNnz[gid] = 0;
      }
      if (gid < osize) {
        for (uint i=0; i<topky; i++) {
          matDatat[gid*topky+i] = 0;
          colIdxt[gid*topky+i] = 0;
        }
        rowNnzt[gid] = 0;
      }


      if (gid < topkx) {
        uint idxx = xoutidx[gid];
        for (uint j=0; j<topky; j++) {
          uint idxy = youtidx[j];
          //printf("\\nIDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxy;
            uint yidx2 = osize*k+idxx;
            uint colidx = idxy;
            matDatat[idxx*topky+j] += x[xidx2] * y[yidx2];
            colIdxt[idxx*topky+j] = idxy;
            if (gid == 0)
              printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnzt[idxx] += 1;
        }
      }
      if (gid < topky) {
        uint idxx = youtidx[gid];
        for (uint j=0; j<topkx; j++) {
          uint idxy = xoutidx[j];
          //printf("\\nIDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            uint colidx = idxy;
            matData[idxx*topkx+j] += x[xidx2] * y[yidx2];
            colIdx[idxx*topkx+j] = idxy;
            if (gid == 0)
              printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnz[idxx] += 1;
        }
      }
    }""").build()

In [303]:
a.shape, b.shape

((8, 16), (16, 5))

In [304]:
rows = a.shape[0]
msize = a.shape[1]

In [305]:
cols = b.shape[1]

In [306]:
mult = a.dot(b)

In [307]:
mult = mult.astype(np.float32)

In [308]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.genwupdate4  # Use this Kernel object for repeate/duald calls
evt = knl(queue, [max(rows,cols)], None, a_buf, b_buf, x_sum_buf, y_sum_buf, np.uint32(rows), np.uint32(msize),np.uint32(cols), 
          np.uint32(topkx),np.uint32(topky), x_idx_buf, y_idx_buf, sdata_buf, sidxs_buf, snnzs_buf, sdatat_buf, sidxst_buf, snnzst_buf)


 ADD VAL:0.00,0.49 - (0,0) - (0,0,0)
 ADD VAL:0.68,0.66 - (0,0) - (0,0,1)
 ADD VAL:0.80,0.31 - (0,0) - (0,0,2)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,3)
 ADD VAL:0.00,0.71 - (0,0) - (0,0,4)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,5)
 ADD VAL:0.00,0.46 - (0,0) - (0,0,6)
 ADD VAL:0.08,0.00 - (0,0) - (0,0,7)
 ADD VAL:0.89,0.00 - (0,0) - (0,0,8)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,9)
 ADD VAL:0.18,0.00 - (0,0) - (0,0,10)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,11)
 ADD VAL:0.46,0.00 - (0,0) - (0,0,12)
 ADD VAL:0.00,0.00 - (0,0) - (0,0,13)
 ADD VAL:0.00,0.19 - (0,0) - (0,0,14)
 ADD VAL:0.00,0.73 - (0,0) - (0,0,15)
 ADD VAL:0.96,0.49 - (0,1) - (0,1,0)
 ADD VAL:0.55,0.66 - (0,1) - (0,1,1)
 ADD VAL:0.00,0.31 - (0,1) - (0,1,2)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,3)
 ADD VAL:0.54,0.71 - (0,1) - (0,1,4)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,5)
 ADD VAL:0.00,0.46 - (0,1) - (0,1,6)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,7)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,8)
 ADD VAL:0.00,0.00 - (0,1) - (0,1,9)
 ADD VAL:0.00,0.00 - (0,1) - (0

In [309]:
resxsum = np.zeros(a.shape[0]).astype(np.float32)
resysum = np.zeros(b.shape[1]).astype(np.float32)
resxidx = np.zeros(topkx).astype(np.uint32)
resyidx = np.zeros(topky).astype(np.uint32)
resxdat = np.zeros(a.shape[0]*topkx).astype(np.float32)
resxcol = np.zeros(a.shape[0]*topkx).astype(np.uint32)
resxnnz = np.zeros(a.shape[0]).astype(np.uint32)
resxdatt = np.zeros(b.shape[1]*topky).astype(np.float32)
resxcolt = np.zeros(b.shape[1]*topky).astype(np.uint32)
resxnnzt = np.zeros(b.shape[1]).astype(np.uint32)

cl.enqueue_copy(queue, resxsum, x_sum_buf)
cl.enqueue_copy(queue, resysum, y_sum_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)
cl.enqueue_copy(queue, resxdat, sdata_buf)
cl.enqueue_copy(queue, resxcol, sidxs_buf)
cl.enqueue_copy(queue, resxnnz, snnzs_buf)
cl.enqueue_copy(queue, resxdatt, sdatat_buf)
cl.enqueue_copy(queue, resxcolt, sidxst_buf)
cl.enqueue_copy(queue, resxnnzt, snnzst_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642fa7c0>

## results

In [310]:
topkx, topky

(5, 8)

In [311]:
mult.shape

(8, 5)

In [312]:
mult

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [313]:
resxdatt.reshape(b.shape[1],topky).T

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [314]:
resxcol.reshape(a.shape[0],topkx)

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]], dtype=uint32)

In [315]:
resxnnz.reshape(a.shape[0])

array([5, 5, 5, 5, 5, 5, 5, 5], dtype=uint32)

In [316]:
resxdatt.reshape(b.shape[1],topky)

array([[0.6996368 , 1.8848312 , 0.16573755, 1.130233  , 0.49532753,
        0.60025734, 0.17482497, 0.7240762 ],
       [0.15550727, 0.18735047, 0.51614475, 0.31159392, 0.9677906 ,
        0.7728439 , 0.30367106, 0.27932334],
       [1.6236207 , 1.4160932 , 1.40742   , 1.1950791 , 0.8187757 ,
        1.2898692 , 0.47157714, 0.99110806],
       [0.3340667 , 0.30414024, 0.7711447 , 0.32051137, 0.1351769 ,
        0.9285497 , 0.544282  , 0.09674796],
       [0.61331564, 0.29892322, 0.        , 0.6311476 , 0.01893103,
        0.        , 0.7309462 , 0.8390946 ]], dtype=float32)

In [317]:
resxcolt.reshape(b.shape[1],topky)

array([[0, 1, 2, 3, 4, 5, 6, 7],
       [0, 1, 2, 3, 4, 5, 6, 7],
       [0, 1, 2, 3, 4, 5, 6, 7],
       [0, 1, 2, 3, 4, 5, 6, 7],
       [0, 1, 2, 3, 4, 5, 6, 7]], dtype=uint32)

In [318]:
resxnnzt.reshape(b.shape[1])

array([8, 8, 8, 8, 8], dtype=uint32)

In [319]:
resdense = to_dense(resxdat, resxcol, resxnnz, topkx, mult.shape)
resdense

array([[0.69963682, 0.15550727, 1.62362075, 0.33406669, 0.61331564],
       [1.88483119, 0.18735047, 1.41609323, 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742004, 0.77114469, 0.        ],
       [1.13023305, 0.31159392, 1.19507909, 0.32051137, 0.63114762],
       [0.49532753, 0.9677906 , 0.81877571, 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.28986919, 0.92854971, 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.54428202, 0.73094618],
       [0.72407621, 0.27932334, 0.99110806, 0.09674796, 0.83909458]])

In [320]:
resdenset = to_dense(resxdatt, resxcolt, resxnnzt, topky, mult.T.shape)
resdenset

array([[0.69963682, 1.88483119, 0.16573755, 1.13023305, 0.49532753,
        0.60025734, 0.17482497, 0.72407621],
       [0.15550727, 0.18735047, 0.51614475, 0.31159392, 0.9677906 ,
        0.7728439 , 0.30367106, 0.27932334],
       [1.62362075, 1.41609323, 1.40742004, 1.19507909, 0.81877571,
        1.28986919, 0.47157714, 0.99110806],
       [0.33406669, 0.30414024, 0.77114469, 0.32051137, 0.1351769 ,
        0.92854971, 0.54428202, 0.09674796],
       [0.61331564, 0.29892322, 0.        , 0.63114762, 0.01893103,
        0.        , 0.73094618, 0.83909458]])

In [321]:
resdense == resdenset.T

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

## comp

In [322]:
resxsum

array([3.0937524, 3.5246177, 2.3468003, 3.2115116, 3.102983 , 2.4004393,
       2.1758807, 2.6182923], dtype=float32)

In [323]:
a.sum(axis=1)

array([3.0937524, 3.5246177, 2.3468003, 3.2115116, 3.102983 , 2.4004393,
       2.1758807, 2.6182923], dtype=float32)

In [324]:
resysum

array([3.5506585, 3.3332088, 5.77045  , 3.095629 , 2.0694818],
      dtype=float32)

In [325]:
b.sum(axis=0)

array([3.5506585, 3.3332088, 5.77045  , 3.095629 , 2.0694818],
      dtype=float32)

In [326]:
mult

array([[0.6996368 , 0.15550727, 1.6236207 , 0.3340667 , 0.61331564],
       [1.8848312 , 0.18735047, 1.4160932 , 0.30414024, 0.29892322],
       [0.16573755, 0.51614475, 1.40742   , 0.7711447 , 0.        ],
       [1.130233  , 0.31159392, 1.1950791 , 0.32051137, 0.6311476 ],
       [0.49532753, 0.9677906 , 0.8187757 , 0.1351769 , 0.01893103],
       [0.60025734, 0.7728439 , 1.2898692 , 0.9285497 , 0.        ],
       [0.17482497, 0.30367106, 0.47157714, 0.544282  , 0.7309462 ],
       [0.7240762 , 0.27932334, 0.99110806, 0.09674796, 0.8390946 ]],
      dtype=float32)

In [327]:
resxidx

array([0, 1, 2, 3, 4], dtype=uint32)

In [328]:
resyidx

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=uint32)

In [329]:
asdf

NameError: name 'asdf' is not defined

## Prune Weights

In [330]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)

prg = cl.Program(ctx, """
    // prunes weights smaller than a constant C
    __kernel void prune(__global  float* matData,     // INPUT MATRIX DATA
                        __global  uint*  colIdx,
                        __global  uint*  rowNnz,
                        uint ellw,
                        float pruneval) { 
      uint gid = get_global_id(0);
      
      uint nnzs = rowNnz[gid];
      for (uint i=0; i<nnzs; i++) {
        uint idx = ellw * gid + i;
        float val = matData[idx];
        printf("\\nDATA:%.2f - %.2f", matData[idx], pruneval);
        if(fabs(val)<pruneval) {
          printf("\\nPRUNE(%i): %.2f", gid, matData[idx]);
          for (uint j=i; j<=nnzs-1; j++) {
            uint idx2 = ellw * gid + j;
            matData[idx2] = matData[idx2+1];
            colIdx[idx2] = colIdx[idx2+1];
          }
          matData[ellw*gid+nnzs] = 0;
          colIdx[ellw*gid+nnzs] = 0;
          rowNnz[gid] -= 1;
          nnzs = rowNnz[gid];
        }
      }
    }""").build()

In [331]:
a.shape

(8, 16)

In [332]:
rows = a.shape[0]
cols = a.shape[1]

pruneval = .35

In [333]:
knl = prg.prune  # Use this Kernel object for repeated calls
evt = knl(queue, [rows,], None, adata_buf, acols_buf, annzs_buf, np.uint32(ellwa), np.float32(pruneval))


DATA:0.68 - 0.35
DATA:0.96 - 0.35
DATA:0.20 - 0.35
DATA:0.14 - 0.35
DATA:0.60 - 0.35
DATA:0.11 - 0.35
DATA:0.29 - 0.35
DATA:0.68 - 0.35
PRUNE(2): 0.20
PRUNE(3): 0.14
PRUNE(5): 0.11
PRUNE(6): 0.29
DATA:0.80 - 0.35
DATA:0.55 - 0.35
DATA:0.58 - 0.35
DATA:0.03 - 0.35
DATA:0.92 - 0.35
DATA:0.28 - 0.35
DATA:0.65 - 0.35
DATA:0.36 - 0.35
PRUNE(3): 0.03
PRUNE(5): 0.28
DATA:0.08 - 0.35
DATA:0.54 - 0.35
DATA:0.05 - 0.35
DATA:0.74 - 0.35
DATA:0.16 - 0.35
DATA:0.83 - 0.35
DATA:0.33 - 0.35
DATA:0.70 - 0.35
PRUNE(0): 0.08
PRUNE(2): 0.05
PRUNE(4): 0.16
PRUNE(6): 0.33
DATA:0.18 - 0.35
DATA:0.20 - 0.35
DATA:0.18 - 0.35
DATA:0.67 - 0.35
DATA:0.76 - 0.35
DATA:0.10 - 0.35
DATA:0.01 - 0.35
DATA:0.68 - 0.35
PRUNE(0): 0.18
PRUNE(1): 0.20
PRUNE(2): 0.18
PRUNE(5): 0.10
PRUNE(6): 0.01
DATA:0.92 - 0.35
DATA:0.49 - 0.35
DATA:0.12 - 0.35
PRUNE(7): 0.12

In [334]:
resxdat = np.zeros(adata.shape).astype(np.float32)
resxcol = np.zeros(acols.shape).astype(np.uint32)
resxnnz = np.zeros(annz.shape).astype(np.uint32)

cl.enqueue_copy(queue, resxdat, adata_buf)
cl.enqueue_copy(queue, resxcol, acols_buf)
cl.enqueue_copy(queue, resxnnz, annzs_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642b7d60>

In [335]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [336]:
adata.reshape((4,-1))

array([[0.68347037, 0.8035886 , 0.08023349, 0.8858316 , 0.17861794,
        0.46201044, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.9562663 , 0.5531271 , 0.5375557 , 0.20370705,
        0.35530138, 0.9186601 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.20103766, 0.74341315, 0.5787496 , 0.05387774, 0.5917517 ,
        0.17797045, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.14296253, 0.9103574 , 0.03242496, 0.71459514,
        0.7448136 , 0.66635793, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.59893256, 0.9217044 , 0.16333483, 0.17167741, 0.7605819 ,
        0.48675168, 0.        , 0.        , 0.    

In [337]:
acols.reshape((4,-1))

array([[ 1,  2,  7,  8, 10, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  1,  4, 11, 12, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  5,  9, 11, 13, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         3,  4,  5,  7, 12, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  7,  9, 11, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  1,  5,  7, 10, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  3,  5, 12, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  3,  6,  8, 11, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=uint32)

In [338]:
resxdat.reshape((4,-1))

array([[0.68347037, 0.8035886 , 0.8858316 , 0.46201044, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.9562663 , 0.5531271 , 0.5375557 , 0.35530138,
        0.9186601 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.74341315, 0.5787496 , 0.5917517 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.9103574 , 0.71459514, 0.7448136 , 0.66635793,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.59893256, 0.9217044 , 0.17167741, 0.7605819 , 0.48675168,
        0.        , 0.        , 0.        , 0.    

In [339]:
resxcol.reshape((4,-1))

array([[ 1,  2,  8, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  1,  4, 12, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 5,  9, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         4,  7, 12, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  7, 11, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         1,  7, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  3, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  3,  6,  8, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=uint32)

In [340]:
resxnnz

array([4, 5, 3, 4, 5, 3, 3, 5], dtype=uint32)

## results

In [341]:
mult.T

array([[0.6996368 , 1.8848312 , 0.16573755, 1.130233  , 0.49532753,
        0.60025734, 0.17482497, 0.7240762 ],
       [0.15550727, 0.18735047, 0.51614475, 0.31159392, 0.9677906 ,
        0.7728439 , 0.30367106, 0.27932334],
       [1.6236207 , 1.4160932 , 1.40742   , 1.1950791 , 0.8187757 ,
        1.2898692 , 0.47157714, 0.99110806],
       [0.3340667 , 0.30414024, 0.7711447 , 0.32051137, 0.1351769 ,
        0.9285497 , 0.544282  , 0.09674796],
       [0.61331564, 0.29892322, 0.        , 0.6311476 , 0.01893103,
        0.        , 0.7309462 , 0.8390946 ]], dtype=float32)

In [342]:
resxdat.reshape(a.shape[0],topk)

ValueError: cannot reshape array of size 128 into shape (8,5)

In [343]:
resxcol.reshape(a.shape[0],topk)

ValueError: cannot reshape array of size 128 into shape (8,5)

In [344]:
resxnnz.reshape(a.shape[0])

array([4, 5, 3, 4, 5, 3, 3, 5], dtype=uint32)

In [345]:
resxdatt.reshape(b.shape[1],topk)

ValueError: cannot reshape array of size 40 into shape (5,5)

In [346]:
resxcolt.reshape(b.shape[1],topk)

ValueError: cannot reshape array of size 40 into shape (5,5)

In [347]:
resxnnzt.reshape(b.shape[1])

array([8, 8, 8, 8, 8], dtype=uint32)

### Update Vals (add sparse)

In [348]:
b.shape

(16, 5)

In [349]:
randadd = np.random.rand(*b.shape)

In [350]:
randdata, randcols, randnnz, randellw = to_data(randadd)
bdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdata)
bcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcols)
bnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnz)
randdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=randdata)
randcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=randcols)
randnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=randnnz)


prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddense(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            __global  float* matDataAdd,     // INPUT MATRIX DATA
                            __global  uint*  colIdxAdd,
                            __global  uint*  rowNnzAdd,
                            uint ellwidthAdd
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      uint nnz    = rowNnz[gid];

      uint baseidxs = gid*ellwidth;
      uint baseidxd = gid*ellwidthAdd;

      uint nnzadd = rowNnzAdd[gid];
      
      uint m = 0;
      for (uint i=0; i<nnzadd; i++) {
        float addval = matDataAdd[baseidxd+i] * lr;
        uint addcol = colIdxAdd[baseidxd+i];
        
        if (addval == 0.0) {
          //printf("\\nZERO VAL, CONT: %.2f - %i", addval, gid);
          continue;
        }

        uint refcol = colIdx[baseidxs+i];
        m = 0;
        while (refcol < addcol && (i+m) < nnz) {
          m += 1;
          refcol = colIdx[baseidxs+i+m];
        }

        //if (gid == 0)
        //  printf("\\nADD VAL:%.2f  ADDCOL:%i  ref:(%i)  gid/i/m:(%i/%i%i)", addval, addcol, refcol, gid,i,m);
        
        if (addcol == refcol) {
          matData[baseidxs+i+m] += addval;
          //if (gid == 0)
          //  printf("\\nINCREMENT: %.2f",addval);
          continue;
        } else {
          //if (gid == 0)
          //  printf("\\nADD: %.2f %i-%i",addval, addcol, refcol);
          if (rowNnz[gid] >= ellwidth) {
            break;
          }

          for (uint j=nnz; j>i+m; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
        
          //if (gid == 0)
          //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, addcol);
          matData[baseidxs+i+m] = addval;
          colIdx[baseidxs+i+m] = addcol;
        }
      }
    }""").build()

In [351]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [352]:
rows = b.shape[0]

In [353]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, bdata_buf, bcols_buf, bnnzs_buf, np.float32(1), np.uint32(ellwb), 
    randdata_buf, randcols_buf, randnnzs_buf, np.uint32(randellw))

<pyopencl._cl.Event at 0x7f5a642f19a0>

In [354]:
data_res = np.empty_like(bdata)
cols_res = np.empty_like(bcols)
nnzs_res = np.empty_like(bnnz)
cl.enqueue_copy(queue, data_res, bdata_buf, is_blocking=True)
cl.enqueue_copy(queue, cols_res, bcols_buf, is_blocking=True)
cl.enqueue_copy(queue, nnzs_res, bnnzs_buf, is_blocking=True)

<pyopencl._cl.NannyEvent at 0x7f5a642acc70>

In [355]:
b

array([[0.49395892, 0.        , 0.46451807, 0.        , 0.        ],
       [0.65864426, 0.        , 0.9786625 , 0.        , 0.        ],
       [0.31044865, 0.        , 0.02502257, 0.        , 0.        ],
       [0.        , 0.46765924, 0.        , 0.        , 0.03160795],
       [0.71082693, 0.        , 0.58997136, 0.        , 0.        ],
       [0.        , 0.        , 0.84622455, 0.5916316 , 0.        ],
       [0.46450275, 0.        , 0.        , 0.        , 0.9429863 ],
       [0.        , 0.34248227, 0.7672639 , 0.        , 0.        ],
       [0.        , 0.        , 0.9855889 , 0.        , 0.2535647 ],
       [0.        , 0.7705159 , 0.31522992, 0.        , 0.        ],
       [0.        , 0.7167741 , 0.        , 0.8238369 , 0.        ],
       [0.        , 0.91970533, 0.        , 0.7873889 , 0.        ],
       [0.        , 0.        , 0.        , 0.404568  , 0.841323  ],
       [0.        , 0.        , 0.674503  , 0.48820347, 0.        ],
       [0.18725161, 0.11607183, 0.

In [356]:
bcols

array([0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2,
       0, 0, 0, 2, 3, 0, 0, 0, 0, 4, 0, 0, 0, 1, 2, 0, 0, 0, 2, 4, 0, 0,
       0, 1, 2, 0, 0, 0, 1, 3, 0, 0, 0, 1, 3, 0, 0, 0, 3, 4, 0, 0, 0, 2,
       3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0], dtype=uint32)

In [357]:
data_res

array([0.59898084, 0.72692764, 1.413269  , 0.18923925, 0.60242975,
       0.66276884, 0.7435114 , 1.1969025 , 0.01742795, 0.73911434,
       1.2171143 , 0.60595757, 0.18582854, 0.53565234, 0.95624155,
       0.3272366 , 1.1077902 , 0.5512528 , 0.9750269 , 0.15299505,
       1.1938903 , 0.558434  , 0.82644945, 0.70988417, 0.99955094,
       0.9250974 , 0.00653369, 1.2793535 , 0.59819007, 0.9270997 ,
       1.2004621 , 0.20090504, 0.2725012 , 0.9116319 , 1.5838816 ,
       0.31735668, 1.1819086 , 1.0524116 , 0.754549  , 0.79547673,
       0.8382792 , 0.01663798, 1.2586385 , 0.41384438, 0.2883268 ,
       0.76330817, 1.0443709 , 0.89169025, 0.6034687 , 0.64717245,
       0.39072165, 1.3489455 , 0.3536333 , 1.8236    , 0.24869065,
       0.9900133 , 1.717294  , 0.4082332 , 0.9859982 , 0.12968084,
       0.7715938 , 0.5082965 , 0.7822947 , 0.6116313 , 0.88699055,
       0.10459811, 0.5772082 , 1.5592778 , 0.9416207 , 0.09933764,
       0.3348385 , 0.37105194, 0.26246974, 0.02615215, 0.20316

In [358]:
cols_res

array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1,
       2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3,
       4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
       1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4], dtype=uint32)

In [359]:
nnzs_res

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5], dtype=uint32)

In [360]:
randadd

array([[0.10502191, 0.72692763, 0.94875092, 0.18923926, 0.60242976],
       [0.00412456, 0.74351135, 0.21824009, 0.01742795, 0.73911432],
       [0.90666569, 0.60595756, 0.16080596, 0.53565232, 0.95624154],
       [0.3272366 , 0.64013108, 0.5512528 , 0.97502693, 0.12138711],
       [0.48306338, 0.55843403, 0.23647809, 0.70988416, 0.99955097],
       [0.92509741, 0.00653369, 0.43312893, 0.0065585 , 0.92709972],
       [0.73595936, 0.20090504, 0.27250121, 0.91163188, 0.64089539],
       [0.31735668, 0.83942634, 0.28514763, 0.75454901, 0.79547676],
       [0.8382792 , 0.01663799, 0.27304963, 0.41384439, 0.03476211],
       [0.76330818, 0.27385498, 0.57646036, 0.6034687 , 0.64717247],
       [0.39072165, 0.63217139, 0.35363332, 0.99976313, 0.24869065],
       [0.99001328, 0.7975886 , 0.40823321, 0.1986093 , 0.12968084],
       [0.77159383, 0.50829649, 0.78229467, 0.20706327, 0.0456675 ],
       [0.10459812, 0.5772082 , 0.88477483, 0.45341721, 0.09933764],
       [0.14758689, 0.25498012, 0.

In [361]:
adenseadd = to_dense(data_res, cols_res, nnzs_res, ellwb, b.shape)
adenseadd

array([[0.59898084, 0.72692764, 1.41326904, 0.18923925, 0.60242975],
       [0.66276884, 0.74351138, 1.19690251, 0.01742795, 0.73911434],
       [1.21711433, 0.60595757, 0.18582854, 0.53565234, 0.95624155],
       [0.32723659, 1.10779023, 0.55125278, 0.97502691, 0.15299505],
       [1.19389033, 0.55843401, 0.82644945, 0.70988417, 0.99955094],
       [0.92509741, 0.00653369, 1.2793535 , 0.59819007, 0.9270997 ],
       [1.2004621 , 0.20090504, 0.2725012 , 0.91163188, 1.58388162],
       [0.31735668, 1.18190861, 1.05241156, 0.75454903, 0.79547673],
       [0.83827919, 0.01663798, 1.2586385 , 0.41384438, 0.2883268 ],
       [0.76330817, 1.04437089, 0.89169025, 0.60346872, 0.64717245],
       [0.39072165, 1.3489455 , 0.35363331, 1.82360005, 0.24869065],
       [0.9900133 , 1.71729398, 0.4082332 , 0.98599821, 0.12968084],
       [0.77159381, 0.50829649, 0.78229469, 0.61163127, 0.88699055],
       [0.10459811, 0.57720822, 1.55927777, 0.94162071, 0.09933764],
       [0.33483851, 0.37105194, 0.

In [362]:
baseadd = (b+randadd)
baseadd

array([[0.59898083, 0.72692763, 1.41326899, 0.18923926, 0.60242976],
       [0.66276882, 0.74351135, 1.19690258, 0.01742795, 0.73911432],
       [1.21711433, 0.60595756, 0.18582853, 0.53565232, 0.95624154],
       [0.3272366 , 1.10779031, 0.5512528 , 0.97502693, 0.15299506],
       [1.19389031, 0.55843403, 0.82644945, 0.70988416, 0.99955097],
       [0.92509741, 0.00653369, 1.27935348, 0.59819009, 0.92709972],
       [1.20046211, 0.20090504, 0.27250121, 0.91163188, 1.5838817 ],
       [0.31735668, 1.18190861, 1.05241152, 0.75454901, 0.79547676],
       [0.8382792 , 0.01663799, 1.25863854, 0.41384439, 0.2883268 ],
       [0.76330818, 1.0443709 , 0.89169028, 0.6034687 , 0.64717247],
       [0.39072165, 1.3489455 , 0.35363332, 1.82360005, 0.24869065],
       [0.99001328, 1.71729394, 0.40823321, 0.98599822, 0.12968084],
       [0.77159383, 0.50829649, 0.78229467, 0.61163126, 0.88699052],
       [0.10459812, 0.5772082 , 1.55927785, 0.94162067, 0.09933764],
       [0.3348385 , 0.37105194, 0.

In [363]:
adenseadd - baseadd

array([[ 1.54027120e-08,  9.56472290e-09,  5.14008753e-08,
        -6.51070109e-09, -9.59787139e-09],
       [ 1.93356010e-08,  2.51855449e-08, -6.32907975e-08,
        -2.64986588e-10,  2.27917963e-08],
       [-2.88382052e-09,  1.10168039e-08,  6.01101946e-09,
         2.21009436e-08,  3.57082952e-09],
       [-8.19521695e-09, -7.95521116e-08, -1.66340128e-08,
        -2.61371355e-08, -6.07570516e-09],
       [ 2.46283216e-08, -1.80848190e-08,  2.37064823e-09,
         5.05839981e-09, -2.91283488e-08],
       [-2.31012320e-09,  7.05647762e-11,  1.90398737e-08,
        -2.27391351e-08, -1.04726119e-08],
       [-1.18419576e-08, -3.52715623e-09, -6.98318958e-09,
        -9.54711865e-10, -8.01039124e-08],
       [-4.64041106e-09, -2.64914934e-09,  3.31328849e-08,
         1.39339376e-08, -2.67710514e-08],
       [-1.59260511e-08, -6.67406574e-10, -3.59990335e-08,
        -1.04928971e-08, -4.27748725e-10],
       [-9.11933251e-09, -1.16793284e-08, -2.69303090e-08,
         1.52864228e-08

In [364]:
(adenseadd - baseadd).sum()

-2.9250741062636365e-07

### Update Vals (add sparset)

In [365]:
multt=np.zeros(mult.T.shape)

for row in range(multt.shape[0]):
    for col in range(multt.shape[1]):
        multt[row][col] = mult[col][row]

In [366]:
multdata, multcols, multnnz, multellw = to_data(multt)
multdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multdata)
multcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multcols)
multnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multnnz)

In [367]:
a.shape, b.shape

((8, 16), (16, 5))

In [368]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [369]:
rows = mult.T.shape[0]

In [370]:
mult = mult.astype(np.float32)

In [371]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, multdata_buf, multcols_buf, multnnzs_buf, np.float32(1), np.uint32(multellw), 
    sdatat_buf, sidxst_buf, snnzst_buf, np.uint32(topk))

<pyopencl._cl.Event at 0x7f5a642fa090>

In [372]:
mult.T

array([[0.6996368 , 1.8848312 , 0.16573755, 1.130233  , 0.49532753,
        0.60025734, 0.17482497, 0.7240762 ],
       [0.15550727, 0.18735047, 0.51614475, 0.31159392, 0.9677906 ,
        0.7728439 , 0.30367106, 0.27932334],
       [1.6236207 , 1.4160932 , 1.40742   , 1.1950791 , 0.8187757 ,
        1.2898692 , 0.47157714, 0.99110806],
       [0.3340667 , 0.30414024, 0.7711447 , 0.32051137, 0.1351769 ,
        0.9285497 , 0.544282  , 0.09674796],
       [0.61331564, 0.29892322, 0.        , 0.6311476 , 0.01893103,
        0.        , 0.7309462 , 0.8390946 ]], dtype=float32)

In [373]:
data_res = np.empty_like(multdata)
cols_res = np.empty_like(multcols)
nnzs_res = np.empty_like(multnnz)
cl.enqueue_copy(queue, data_res, multdata_buf, is_blocking=True)
cl.enqueue_copy(queue, cols_res, multcols_buf, is_blocking=True)
cl.enqueue_copy(queue, nnzs_res, multnnzs_buf, is_blocking=True)

<pyopencl._cl.NannyEvent at 0x7f5a642b2270>

In [374]:
multt-data_res.reshape(multt.shape)

array([[-0.69963682, -1.88483119, -0.16573755, -1.13023305, -0.49532753,
        -0.60025734, -0.17482497, -0.72407621],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.60025734, -0.17482495, -0.72407627],
       [ 0.        ,  0.        , -0.51614475, -0.31159389, -0.96779054,
        -0.77284396, -0.30367103, -0.27932334],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , -0.27932334],
       [ 0.        ,  0.        , -0.63114762, -0.20655912, -0.31513566,
        -1.28986919, -0.47157717, -0.991108  ]])

In [375]:
nnzs_res

array([8, 8, 8, 8, 8], dtype=uint32)

In [376]:
adenseaddt = to_dense(data_res, cols_res, nnzs_res, multellw, multt.shape)
adenseaddt

array([[1.39927363, 3.76966238, 0.33147511, 2.2604661 , 0.99065506,
        1.20051467, 0.34964994, 1.44815242],
       [0.15550727, 0.18735047, 0.51614475, 0.31159392, 0.9677906 ,
        1.37310123, 0.47849602, 1.00339961],
       [1.62362075, 1.41609323, 1.92356479, 1.50667298, 1.78656626,
        2.06271315, 0.77524817, 1.2704314 ],
       [0.33406669, 0.30414024, 0.77114469, 0.32051137, 0.1351769 ,
        0.92854971, 0.54428202, 0.3760713 ],
       [0.33406669, 0.29892322, 0.        , 0.63114762, 0.83770674,
        1.28986919, 1.20252335, 1.83020258]])

In [377]:
multt-adenseaddt

array([[-0.69963682, -1.88483119, -0.16573755, -1.13023305, -0.49532753,
        -0.60025734, -0.17482497, -0.72407621],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.60025734, -0.17482495, -0.72407627],
       [ 0.        ,  0.        , -0.51614475, -0.31159389, -0.96779054,
        -0.77284396, -0.30367103, -0.27932334],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , -0.27932334],
       [ 0.27924895,  0.        ,  0.        ,  0.        , -0.81877572,
        -1.28986919, -0.47157717, -0.991108  ]])

In [378]:
adenseaddt

array([[1.39927363, 3.76966238, 0.33147511, 2.2604661 , 0.99065506,
        1.20051467, 0.34964994, 1.44815242],
       [0.15550727, 0.18735047, 0.51614475, 0.31159392, 0.9677906 ,
        1.37310123, 0.47849602, 1.00339961],
       [1.62362075, 1.41609323, 1.92356479, 1.50667298, 1.78656626,
        2.06271315, 0.77524817, 1.2704314 ],
       [0.33406669, 0.30414024, 0.77114469, 0.32051137, 0.1351769 ,
        0.92854971, 0.54428202, 0.3760713 ],
       [0.33406669, 0.29892322, 0.        , 0.63114762, 0.83770674,
        1.28986919, 1.20252335, 1.83020258]])

In [379]:
adenseadd.T == adenseaddt

  adenseadd.T == adenseaddt


False

### Update Vals (add topk to sparse)

In [380]:
matadd = np.random.randn(*a.shape).astype(np.float32)
matadd

array([[-5.22632837e-01,  1.31274581e+00,  6.26713991e-01,
         3.13637584e-01, -3.76596689e-01,  4.88761859e-03,
        -5.55174649e-01,  6.39165103e-01, -2.10940933e+00,
         3.30693685e-02, -2.93532342e-01,  6.84451699e-01,
        -6.47347689e-01,  7.27873564e-01,  7.91488886e-01,
        -1.49035931e-01],
       [ 8.47880363e-01,  8.92729044e-01,  6.21262550e-01,
        -1.14085293e+00, -1.10533178e+00, -1.21122408e+00,
        -8.00590575e-01,  5.91763318e-01, -4.87312883e-01,
        -6.34929419e-01,  2.46132702e-01,  8.31072927e-01,
         5.23058414e-01,  1.22928667e+00, -4.17523742e-01,
         4.02868688e-01],
       [ 8.76122355e-01,  1.19081818e-01, -9.71816957e-01,
        -1.31140959e+00,  3.68975103e-01,  7.99282610e-01,
         8.34740400e-01, -1.62523425e+00,  9.95305061e-01,
         7.06632584e-02, -4.54083085e-01, -1.23369254e-01,
        -1.52907693e+00, -8.85128319e-01, -1.95014015e-01,
        -4.75803852e-01],
       [-1.44459438e+00, -1.63728487e

In [381]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [382]:
a_added = a + matadd

In [383]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
add_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=matadd)

prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddense(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            uint   awidth,
                            __global  float* vector_x    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      uint nnz    = rowNnz[gid];
      uint baseidxs = gid*ellwidth;
      uint baseidxd = gid*awidth;
      
      for (uint i=0; i<awidth; i++) {
        float addval = vector_x[baseidxd+i];
        //if (gid==1)
        //  printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[baseidxs+i]);
        if (addval == 0) {
          continue;
        }
        if (i == colIdx[baseidxs+i]) {
          matData[baseidxs+i] += addval;
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (i > colIdx[baseidxs+i]) {
            rowNnz[gid] += 1;
            //if (gid==1)
            //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i] = addval;
            colIdx[baseidxs+i] = i;
            continue;
          }
          for (uint j=nnz; j>i; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          //if (gid==1)
          //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] = addval;
          colIdx[baseidxs+i] = i;
          if (nnz >= ellwidth)
            break;
        }
      }
    }""").build()

In [384]:
a.shape, b.shape

((8, 16), (16, 5))

In [385]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [386]:
rows = a.shape[0]

In [387]:
mult = mult.astype(np.float32)

In [388]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, adata_buf, acols_buf, annzs_buf, np.float32(1), np.uint32(ellwa),np.uint32(a.shape[1]), add_buf)

<pyopencl._cl.Event at 0x7f5a642f2c20>

In [389]:
matadd[0][0]

-0.52263284

In [390]:
data_res = np.empty_like(adata)
cols_res = np.empty_like(acols)
nnzs_res = np.empty_like(annz)
cl.enqueue_copy(queue, data_res, adata_buf)
cl.enqueue_copy(queue, cols_res, acols_buf)
cl.enqueue_copy(queue, nnzs_res, annzs_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642475e0>

In [391]:
adenseadd = to_dense(data_res, cols_res, nnzs_res, ellwa, a.shape)
adenseadd

array([[-0.52263284,  1.99621618,  1.43030262,  0.31363758, -0.37659669,
         0.00488762, -0.55517465,  0.71939862, -1.22357774,  0.03306937,
        -0.1149144 ,  0.6844517 , -0.18533725,  0.72787356,  0.79148889,
        -0.14903593],
       [ 1.80414665,  1.44585609,  0.62126255, -1.14085293, -0.56777608,
        -1.21122408, -0.80059057,  0.59176332, -0.48731288, -0.63492942,
         0.2461327 ,  1.03478003,  0.87835979,  1.22928667, -0.41752374,
         0.9186601 ],
       [ 0.87612236,  0.32011947, -0.97181696, -1.31140959,  0.3689751 ,
         1.54269576,  0.8347404 , -1.62523425,  0.99530506,  0.64941287,
        -0.45408309, -0.06949151, -1.52907693, -0.29337662, -0.01704356,
        -0.47580385],
       [-1.44459438, -1.63728487, -0.01160916, -0.0574351 , -0.46375471,
        -0.71150678, -0.54578453,  2.46945143,  0.39729461,  0.37571013,
         0.18259938, -0.01686918,  1.66961479,  0.71831405,  0.3580009 ,
         0.66635793],
       [ 1.39721274, -0.65629166, -0

In [392]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [393]:
matadd

array([[-5.22632837e-01,  1.31274581e+00,  6.26713991e-01,
         3.13637584e-01, -3.76596689e-01,  4.88761859e-03,
        -5.55174649e-01,  6.39165103e-01, -2.10940933e+00,
         3.30693685e-02, -2.93532342e-01,  6.84451699e-01,
        -6.47347689e-01,  7.27873564e-01,  7.91488886e-01,
        -1.49035931e-01],
       [ 8.47880363e-01,  8.92729044e-01,  6.21262550e-01,
        -1.14085293e+00, -1.10533178e+00, -1.21122408e+00,
        -8.00590575e-01,  5.91763318e-01, -4.87312883e-01,
        -6.34929419e-01,  2.46132702e-01,  8.31072927e-01,
         5.23058414e-01,  1.22928667e+00, -4.17523742e-01,
         4.02868688e-01],
       [ 8.76122355e-01,  1.19081818e-01, -9.71816957e-01,
        -1.31140959e+00,  3.68975103e-01,  7.99282610e-01,
         8.34740400e-01, -1.62523425e+00,  9.95305061e-01,
         7.06632584e-02, -4.54083085e-01, -1.23369254e-01,
        -1.52907693e+00, -8.85128319e-01, -1.95014015e-01,
        -4.75803852e-01],
       [-1.44459438e+00, -1.63728487e

In [394]:
a_added

array([[-0.52263284,  1.9962162 ,  1.4303026 ,  0.31363758, -0.3765967 ,
         0.00488762, -0.55517465,  0.7193986 , -1.2235777 ,  0.03306937,
        -0.1149144 ,  0.6844517 , -0.18533725,  0.72787356,  0.7914889 ,
        -0.14903593],
       [ 1.8041466 ,  1.4458561 ,  0.62126255, -1.1408529 , -0.5677761 ,
        -1.2112241 , -0.8005906 ,  0.5917633 , -0.48731288, -0.6349294 ,
         0.2461327 ,  1.03478   ,  0.8783598 ,  1.2292867 , -0.41752374,
         1.3215288 ],
       [ 0.87612236,  0.32011947, -0.97181696, -1.3114096 ,  0.3689751 ,
         1.5426958 ,  0.8347404 , -1.6252342 ,  0.99530506,  0.6494129 ,
        -0.45408309, -0.06949151, -1.5290769 , -0.29337662, -0.01704356,
        -0.47580385],
       [-1.4445944 , -1.6372849 , -0.01160916, -0.0574351 , -0.4637547 ,
        -0.7115068 , -0.54578453,  2.4694514 ,  0.3972946 ,  0.37571013,
         0.18259938, -0.01686918,  1.6696148 ,  0.71831405,  0.3580009 ,
        -0.5677931 ],
       [ 1.3972127 , -0.65629166, -0

In [395]:
adenseadd

array([[-0.52263284,  1.99621618,  1.43030262,  0.31363758, -0.37659669,
         0.00488762, -0.55517465,  0.71939862, -1.22357774,  0.03306937,
        -0.1149144 ,  0.6844517 , -0.18533725,  0.72787356,  0.79148889,
        -0.14903593],
       [ 1.80414665,  1.44585609,  0.62126255, -1.14085293, -0.56777608,
        -1.21122408, -0.80059057,  0.59176332, -0.48731288, -0.63492942,
         0.2461327 ,  1.03478003,  0.87835979,  1.22928667, -0.41752374,
         0.9186601 ],
       [ 0.87612236,  0.32011947, -0.97181696, -1.31140959,  0.3689751 ,
         1.54269576,  0.8347404 , -1.62523425,  0.99530506,  0.64941287,
        -0.45408309, -0.06949151, -1.52907693, -0.29337662, -0.01704356,
        -0.47580385],
       [-1.44459438, -1.63728487, -0.01160916, -0.0574351 , -0.46375471,
        -0.71150678, -0.54578453,  2.46945143,  0.39729461,  0.37571013,
         0.18259938, -0.01686918,  1.66961479,  0.71831405,  0.3580009 ,
         0.66635793],
       [ 1.39721274, -0.65629166, -0

In [396]:
adenseadd == a_added

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True, False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True, False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True, False, False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  Tru

### update vals

In [397]:
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
add_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=matadd)

prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddenset(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            uint   aheight,
                            __global  float* vector_x    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint ncols = get_global_size(0);

      uint nnz    = rowNnz[gid];
      uint baseidxs = gid*ellwidth;
      
      for (uint i=0; i<aheight; i++) {
        if (nnz > ellwidth)
            break;
        uint baseidxd = i*ncols+gid;
        float addval = vector_x[baseidxd];
        //if (gid==1)
        //  printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[baseidxs+i]);
        if (addval == 0) {
          continue;
        }
        if (i == colIdx[baseidxs+i]) {
          printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] += addval;
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (i > colIdx[baseidxs+i]) {
            rowNnz[gid] += 1;
            //if (gid==1)
            //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i] = addval;
            colIdx[baseidxs+i] = i;
            continue;
          }
          for (uint j=nnz; j>i; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          //if (gid==1)
          //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] = addval;
          colIdx[baseidxs+i] = i;
        }
      }
    }""").build()

In [398]:
a.shape, b.shape

((8, 16), (16, 5))

In [399]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [400]:
cols = a.shape[1]

In [401]:
mult = mult.astype(np.float32)

In [402]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddenset  # Use this Kernel object for repeated calls
knl(queue, [cols], None, adatat_buf, acolst_buf, annzst_buf, np.float32(1), np.uint32(ellwat),np.uint32(a.T.shape[1]), add_buf)

<pyopencl._cl.Event at 0x7f5ab43785e0>


ADD VAL:1.31 idx:8/1  col:1
ADD VAL:0.63 idx:16/2  col:1
ADD VAL:0.64 idx:56/7  col:1
ADD VAL:-2.11 idx:64/8  col:1
ADD VAL:-0.29 idx:80/10  col:1
ADD VAL:-0.65 idx:96/12  col:1
ADD VAL:0.85 idx:1/17  col:1
ADD VAL:0.89 idx:9/18  col:1
ADD VAL:-1.11 idx:33/21  col:1
ADD VAL:0.83 idx:89/28  col:1
ADD VAL:0.52 idx:97/29  col:1
ADD VAL:0.40 idx:121/32  col:1
ADD VAL:0.12 idx:10/35  col:5
ADD VAL:0.80 idx:42/39  col:5
ADD VAL:0.07 idx:74/43  col:5
ADD VAL:-0.12 idx:90/45  col:5
ADD VAL:-0.89 idx:106/47  col:5
ADD VAL:-0.20 idx:114/48  col:5
ADD VAL:-0.20 idx:27/54  col:5
ADD VAL:-1.37 idx:35/55  col:5
ADD VAL:-0.74 idx:43/56  col:5
ADD VAL:1.75 idx:59/58  col:5
ADD VAL:0.92 idx:99/63  col:5
ADD VAL:-1.23 idx:123/66  col:5
ADD VAL:1.15 idx:28/71  col:5
ADD VAL:-2.08 idx:60/75  col:5
ADD VAL:-1.94 idx:76/77  col:5
ADD VAL:-0.25 idx:92/79  col:5
ADD VAL:0.38 idx:116/82  col:5
ADD VAL:0.33 idx:124/83  col:5
ADD VAL:-1.46 idx:5/85  col:5
ADD VAL:-1.09 idx:13/86  col:5
ADD VAL:0.00 idx:45/90  c

In [403]:
matadd[0][0]

-0.52263284

In [404]:
datat_res = np.empty_like(adatat)
colst_res = np.empty_like(acolst)
nnzst_res = np.empty_like(annzt)
cl.enqueue_copy(queue, datat_res, adatat_buf)
cl.enqueue_copy(queue, colst_res, acolst_buf)
cl.enqueue_copy(queue, nnzst_res, annzst_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642dd9a0>

In [405]:
adenseaddt = to_dense(datat_res, colst_res, nnzst_res, ellwat, a.T.shape).T
adenseaddt

array([[-0.52263284,  1.99621618,  1.43030262,  0.31363758, -0.37659669,
         0.00488762, -0.55517465,  0.71939862, -1.22357774,  0.03306937,
        -0.1149144 ,  0.6844517 , -0.18533725,  0.72787356,  0.79148889,
        -0.14903593],
       [ 1.80414665,  1.44585609,  0.62126255, -1.14085293, -0.56777608,
        -1.21122408, -0.80059057,  0.59176332, -0.48731288, -0.63492942,
         0.2461327 ,  1.03478003,  0.87835979,  1.22928667, -0.41752374,
         1.32152879],
       [ 0.87612236,  0.32011947, -0.97181696, -1.31140959,  0.3689751 ,
         1.54269576,  0.8347404 , -1.62523425,  0.99530506,  0.64941287,
        -0.45408309, -0.06949151, -1.52907693, -0.29337662, -0.01704356,
        -0.47580385],
       [-1.44459438, -1.63728487, -0.01160916, -0.0574351 , -0.46375471,
        -0.71150678, -0.54578453,  2.46945143,  0.39729461,  0.37571013,
         0.18259938, -0.01686918,  1.66961479,  0.71831405,  0.3580009 ,
        -0.56779307],
       [ 1.39721274, -0.65629166, -0

In [406]:
a

array([[0.        , 0.68347037, 0.8035886 , 0.        , 0.        ,
        0.        , 0.        , 0.08023349, 0.8858316 , 0.        ,
        0.17861794, 0.        , 0.46201044, 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.        , 0.        , 0.5375557 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.20370705, 0.35530138, 0.        , 0.        ,
        0.9186601 ],
       [0.        , 0.20103766, 0.        , 0.        , 0.        ,
        0.74341315, 0.        , 0.        , 0.        , 0.5787496 ,
        0.        , 0.05387774, 0.        , 0.5917517 , 0.17797045,
        0.        ],
       [0.        , 0.        , 0.        , 0.14296253, 0.9103574 ,
        0.03242496, 0.        , 0.71459514, 0.        , 0.        ,
        0.        , 0.        , 0.7448136 , 0.        , 0.        ,
        0.66635793],
       [0.        , 0.        , 0.        , 0.59893256, 0.        ,
        0.        , 0.        , 

In [407]:
matadd

array([[-5.22632837e-01,  1.31274581e+00,  6.26713991e-01,
         3.13637584e-01, -3.76596689e-01,  4.88761859e-03,
        -5.55174649e-01,  6.39165103e-01, -2.10940933e+00,
         3.30693685e-02, -2.93532342e-01,  6.84451699e-01,
        -6.47347689e-01,  7.27873564e-01,  7.91488886e-01,
        -1.49035931e-01],
       [ 8.47880363e-01,  8.92729044e-01,  6.21262550e-01,
        -1.14085293e+00, -1.10533178e+00, -1.21122408e+00,
        -8.00590575e-01,  5.91763318e-01, -4.87312883e-01,
        -6.34929419e-01,  2.46132702e-01,  8.31072927e-01,
         5.23058414e-01,  1.22928667e+00, -4.17523742e-01,
         4.02868688e-01],
       [ 8.76122355e-01,  1.19081818e-01, -9.71816957e-01,
        -1.31140959e+00,  3.68975103e-01,  7.99282610e-01,
         8.34740400e-01, -1.62523425e+00,  9.95305061e-01,
         7.06632584e-02, -4.54083085e-01, -1.23369254e-01,
        -1.52907693e+00, -8.85128319e-01, -1.95014015e-01,
        -4.75803852e-01],
       [-1.44459438e+00, -1.63728487e

In [408]:
a_added

array([[-0.52263284,  1.9962162 ,  1.4303026 ,  0.31363758, -0.3765967 ,
         0.00488762, -0.55517465,  0.7193986 , -1.2235777 ,  0.03306937,
        -0.1149144 ,  0.6844517 , -0.18533725,  0.72787356,  0.7914889 ,
        -0.14903593],
       [ 1.8041466 ,  1.4458561 ,  0.62126255, -1.1408529 , -0.5677761 ,
        -1.2112241 , -0.8005906 ,  0.5917633 , -0.48731288, -0.6349294 ,
         0.2461327 ,  1.03478   ,  0.8783598 ,  1.2292867 , -0.41752374,
         1.3215288 ],
       [ 0.87612236,  0.32011947, -0.97181696, -1.3114096 ,  0.3689751 ,
         1.5426958 ,  0.8347404 , -1.6252342 ,  0.99530506,  0.6494129 ,
        -0.45408309, -0.06949151, -1.5290769 , -0.29337662, -0.01704356,
        -0.47580385],
       [-1.4445944 , -1.6372849 , -0.01160916, -0.0574351 , -0.4637547 ,
        -0.7115068 , -0.54578453,  2.4694514 ,  0.3972946 ,  0.37571013,
         0.18259938, -0.01686918,  1.6696148 ,  0.71831405,  0.3580009 ,
        -0.5677931 ],
       [ 1.3972127 , -0.65629166, -0

In [409]:
adenseaddt == a_added

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  Tru

### Make Random

In [410]:
rand = SparseTensor.uniform(2,4)
rand

<SparseTensor <GPUBuffer with shape (8,)> with grad None>

In [411]:
rand.to_numpy()

array([[6.44682732e-05, 6.43153617e-05, 6.76935379e-05, 4.03048944e-05],
       [8.65994152e-05, 8.07224751e-06, 6.99069133e-05, 1.57707436e-05]])

In [412]:
rand.data

<GPUBuffer with shape (8,)>

### update vals

In [413]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)

In [414]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint gid2 = get_global_id(1);
      uint topk = get_global_size(0);
      uint bs = get_global_size(1);
      uint baseupdateidx = topk*topk*gid2;
      uint baseidxidx = topk*gid2;
      uint col = updateyidx[baseidxidx+gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint row = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

In [415]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint topk = get_global_size(0);
      uint col = updateyidx[gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint row = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];m
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

RuntimeError: clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE

Build on <pyopencl.Device 'GeForce GTX 1080 Ti' on 'NVIDIA CUDA' at 0x564624e64990>:

<kernel>:17:32: error: use of undeclared identifier 'baseupdateidx'
        float val = updatevals[baseupdateidx+gid*topk+i];
                               ^
<kernel>:18:31: error: use of undeclared identifier 'baseidxidx'
        uint row = updatexidx[baseidxidx+i];
                              ^
<kernel>:32:49: error: use of undeclared identifier 'm'
                matData[idx2] = matData[idx2-1];m
                                                ^

(options: -I /home/fpaboim/.conda/envs/tinygrad/lib/python3.8/site-packages/pyopencl/cl)
(source saved as /tmp/tmpmh2k4zha.cl)

In [None]:
knl = prg.addvals  # Use this Kernel object for repeated calls
knl(queue, [topk,1], None, adata_buf, acols_buf, annzs_buf, np.float32(1), np.uint32(ellwa), x_cp_buf, x_idx_buf, y_idx_buf)

resa = np.empty_like(adata)
resaidx = np.zeros(acols.shape).astype(np.uint32)
resannz = np.zeros(annz.shape).astype(np.uint32)

cl.enqueue_copy(queue, resa, adata_buf)
cl.enqueue_copy(queue, resaidx, acols_buf)
cl.enqueue_copy(queue, resannz, annzs_buf)

In [416]:
resa.shape, resaidx.shape, resannz.shape, ellwa, a.T.shape

NameError: name 'resa' is not defined

In [417]:
adenseadd = to_dense(resa, resaidx, resannz, ellwa, a.shape)
adenseadd

NameError: name 'resa' is not defined

In [418]:
adenseadd - adense

array([[-5.22632837e-01,  1.31274581e+00,  6.26713991e-01,
         3.13637584e-01, -3.76596689e-01,  4.88761859e-03,
        -5.55174649e-01,  6.39165126e-01, -2.10940933e+00,
         3.30693685e-02, -2.93532342e-01,  6.84451699e-01,
        -6.47347689e-01,  7.27873564e-01,  7.91488886e-01,
        -1.49035931e-01],
       [ 8.47880363e-01,  8.92728984e-01,  6.21262550e-01,
        -1.14085293e+00, -1.10533178e+00, -1.21122408e+00,
        -8.00590575e-01,  5.91763318e-01, -4.87312883e-01,
        -6.34929419e-01,  2.46132702e-01,  8.31072971e-01,
         5.23058414e-01,  1.22928667e+00, -4.17523742e-01,
         0.00000000e+00],
       [ 8.76122355e-01,  1.19081810e-01, -9.71816957e-01,
        -1.31140959e+00,  3.68975103e-01,  7.99282610e-01,
         8.34740400e-01, -1.62523425e+00,  9.95305061e-01,
         7.06632733e-02, -4.54083085e-01, -1.23369250e-01,
        -1.52907693e+00, -8.85128319e-01, -1.95014015e-01,
        -4.75803852e-01],
       [-1.44459438e+00, -1.63728487e

In [419]:
adenseadd == adense

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False,  True,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, Fals

In [420]:
ellwa

16

In [421]:
adata2 = adata.reshape(-1, ellwa)
adata2

array([[0.68347037, 0.8035886 , 0.08023349, 0.8858316 , 0.17861794,
        0.46201044, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.9562663 , 0.5531271 , 0.5375557 , 0.20370705, 0.35530138,
        0.9186601 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.20103766, 0.74341315, 0.5787496 , 0.05387774, 0.5917517 ,
        0.17797045, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.14296253, 0.9103574 , 0.03242496, 0.71459514, 0.7448136 ,
        0.66635793, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.59893256, 0.9217044 , 0.16333483, 0.17167741, 0.7605819 ,
        0.48675168, 0.        , 

In [422]:
resa = resa.reshape(-1, ellwa)
resa

NameError: name 'resa' is not defined

In [423]:
resa - adata2

NameError: name 'resa' is not defined

In [424]:
acols

array([ 1,  2,  7,  8, 10, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        1,  4, 11, 12, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  5,
        9, 11, 13, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  4,  5,
        7, 12, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  7,  9, 11,
       14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  5,  7, 10,
       11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  5, 12, 13,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  6,  8, 11, 15,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=uint32)

In [425]:
resaidx

NameError: name 'resaidx' is not defined

In [426]:
resannz

NameError: name 'resannz' is not defined

In [427]:
annz

array([6, 6, 6, 6, 6, 6, 6, 6], dtype=uint32)

### update vals2

In [428]:
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)

In [429]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint gid2 = get_global_id(1);
      uint topk = get_global_size(0);
      uint bs = get_global_size(1);
      uint baseupdateidx = topk*topk*gid2;
      uint baseidxidx = topk*gid2;
      uint row = updateyidx[baseidxidx+gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint col = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

In [430]:
knl = prg.addvals  # Use this Kernel object for repeated calls
knl(queue, [topk,bs], None, adatat_buf, acolst_buf, annzst_buf, np.float32(1), np.uint32(ellwat), x_cp_buf, x_idx_buf, y_idx_buf)

resat = np.empty_like(adatat)
resaidxt = np.zeros(acolst.shape).astype(np.uint32)
resannzt = np.zeros(annzt.shape).astype(np.uint32)

cl.enqueue_copy(queue, resat, adatat_buf)
cl.enqueue_copy(queue, resaidxt, acolst_buf)
cl.enqueue_copy(queue, resannzt, annzst_buf)


UPDATE[6,7]: 0.000000
UPDATE[1,0]: 0.155507
UPDATE[2,0]: 1.623621
INSERT[6,5]: 0.00
INSERT[6,6]: 0.00
UPDATE[3,7]: 0.000000
UPDATE[0,7]: 0.000000
INSERT[0,0]: 0.70
INSERT[3,0]: 0.33
INSERT[4,0]: 0.61
UPDATE[0,6]: 0.000000
UPDATE[3,6]: 0.000000
UPDATE[0,5]: 0.000000
UPDATE[3,6]: 0.000000
UPDATE[0,6]: 0.000000
UPDATE[5,6]: 0.000000
INSERT[3,5]: 0.00
UPDATE[0,1]: 1.884831
UPDATE[1,1]: 0.187350
UPDATE[4,1]: 0.298923
UPDATE[5,6]: 0.000000
INSERT[5,1]: 0.00
INSERT[6,1]: 0.00
INSERT[3,1]: 0.30
UPDATE[1,5]: 0.000000
UPDATE[5,2]: 0.000000
UPDATE[0,1]: 0.000000
UPDATE[3,7]: 0.000000
UPDATE[1,2]: 0.311594
INSERT[6,2]: 0.00
INSERT[7,1]: 0.00
INSERT[3,1]: 0.00
UPDATE[6,6]: 0.000000
INSERT[0,2]: 1.13
INSERT[3,2]: 0.32
INSERT[4,2]: 0.63
INSERT[3,2]: 0.00
INSERT[0,2]: 0.00
UPDATE[0,6]: 0.000000
UPDATE[7,0]: 0.000000
UPDATE[0,0]: 0.000000
INSERT[4,3]: 0.02
UPDATE[5,1]: 0.000000
UPDATE[1,6]: 0.000000
INSERT[5,0]: 0.00
INSERT[6,0]: 0.00
INSERT[1,3]: 0.97
INSERT[6,1]: 0.00
INSERT[4,4]: 0.00
UPDATE[6,7]: 

<pyopencl._cl.NannyEvent at 0x7f5a642b2040>

In [431]:
ellwa

16

In [432]:
resat.shape, resaidxt.shape, resannzt.shape

((128,), (128,), (16,))

In [433]:
adenseaddt = to_dense(resat, resaidxt, resannzt, ellwat, a.T.shape)
adenseaddt

array([[-0.69963682, -0.        , -1.13023305, -0.49532753, -0.        ,
         0.11415514,  0.28872353,  0.        ],
       [ 0.        , -0.        , -0.11055626, -0.9677906 , -0.27932334,
         0.82574075,  0.04889954,  0.67626965],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.33406669, -0.        , -0.32051137, -0.01893103,  0.59893256,
        -0.        ,  0.        ,  0.        ],
       [ 0.        , -0.        ,  0.        ,  0.91035742, -0.83909458,
         0.        ,  0.6493426 ,  0.        ],
       [-0.        , -0.        ,  0.74341315,  0.03242496,  0.        ,
         0.27978534,  0.33329687,  0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -0.        , -0.        ,  0.69630474],
       [ 0.        ,  0.        ,  0.        ,  0.71459514,  0.92170441,
         0.25018668,  0.        ,  0.69630474],
       [ 0.88583159,  0.        

In [434]:
adenseadd == adenseaddt.T

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False,  True,  True],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, Fals

In [435]:
adata2t = adatat.reshape(-1, ellwat)
adata2t

array([[0.9562663 , 0.11415514, 0.28872353, 0.67626965, 0.        ,
        0.        , 0.        , 0.        ],
       [0.68347037, 0.5531271 , 0.20103766, 0.82574075, 0.04889954,
        0.        , 0.        , 0.        ],
       [0.8035886 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.14296253, 0.59893256, 0.6493426 , 0.35563806, 0.        ,
        0.        , 0.        , 0.        ],
       [0.5375557 , 0.9103574 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.74341315, 0.03242496, 0.27978534, 0.33329687, 0.        ,
        0.        , 0.        , 0.        ],
       [0.69630474, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.08023349, 0.71459514, 0.9217044 , 0.25018668, 0.        ,
        0.        , 0.        , 0.        ],
       [0.8858316 , 0.6753613 , 0.        , 0.        , 0.        ,
        0.        , 0.      

In [436]:
resat = resat.reshape(-1, ellwat)
resat

array([[-0.6996368 , -0.9285649 , -0.        , -1.130233  , -0.        ,
        -0.49532753,  0.11415514,  0.28872353],
       [-0.        ,  0.67626965,  0.        , -0.11055626, -0.9677906 ,
        -0.27932334,  0.82574075,  0.04889954],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.3340667 , -0.        , -0.        , -0.32051137, -0.30414024,
         0.14296253, -0.        , -0.        ],
       [-0.        ,  0.59893256, -0.01893103, -0.8390946 ,  0.6493426 ,
         0.        ,  0.9103574 ,  0.        ],
       [-0.        , -0.        ,  0.74341315,  0.03242496,  0.27978534,
         0.33329687,  0.        ,  0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -0.        , -0.        , -0.        ],
       [ 0.69630474,  0.        ,  0.71459514,  0.9217044 ,  0.25018668,
         0.        ,  0.        ,  0.        ],
       [ 0.8858316 ,  0.6753613 

In [437]:
resat - adata2t

array([[-1.6559031 , -1.0427201 , -0.28872353, -1.8065027 , -0.        ,
        -0.49532753,  0.11415514,  0.28872353],
       [-0.68347037,  0.12314254, -0.20103766, -0.936297  , -1.0166901 ,
        -0.27932334,  0.82574075,  0.04889954],
       [-0.8035886 ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.4770292 , -0.59893256, -0.6493426 , -0.6761494 , -0.30414024,
         0.14296253, -0.        , -0.        ],
       [-0.5375557 , -0.31142485, -0.01893103, -0.8390946 ,  0.6493426 ,
         0.        ,  0.9103574 ,  0.        ],
       [-0.74341315, -0.03242496,  0.46362782, -0.3008719 ,  0.27978534,
         0.33329687,  0.        ,  0.        ],
       [-0.69630474, -0.        , -0.        , -0.        , -0.        ,
        -0.        , -0.        , -0.        ],
       [ 0.6160712 , -0.71459514, -0.20710927,  0.6715177 ,  0.25018668,
         0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        

In [438]:
acols

array([ 1,  2,  7,  8, 10, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        1,  4, 11, 12, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  5,
        9, 11, 13, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  4,  5,
        7, 12, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  7,  9, 11,
       14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  5,  7, 10,
       11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  5, 12, 13,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  6,  8, 11, 15,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=uint32)

In [439]:
resaidx

NameError: name 'resaidx' is not defined

In [440]:
resannz

NameError: name 'resannz' is not defined

In [441]:
annz

array([6, 6, 6, 6, 6, 6, 6, 6], dtype=uint32)

# OTHER

import numpy as np
import pyopencl as cl

mf = cl.mem_flags

dim = 16
topk = 4

x = np.random.rand(dim).astype(np.float32)
y = np.random.rand(dim).astype(np.float32)
x.shape,y.shape

dim1 = 4
dim2 = 8
dim3 = 1

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.2

a = np.zeros((dim1,dim2))
b = np.random.rand(dim2,dim3).flatten().astype(np.float32)

a.shape, b.shape

In [442]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
val_out_buf = cl.Buffer(ctx, mf.READ_WRITE, 4*topk*topk)
x_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)

prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                         __global  float* y,    // INPUT
                         __global  float* xout,    // INPUT
                         uint topk,
                         __global  uint* xoutidx,    // INPUT
                         __global  uint* youtidx    // INPUT
                        ) { // LOCAL SHARED BUFFER
  uint gid = get_global_id(0);
  uint n = get_global_size(0);
  
  xout[gid] = x[gid];
  xoutidx[gid] = gid;
  youtidx[gid] = gid;
  
  float valx = x[gid];
  float valy = y[gid];
  uint posx = 0;
  uint posy = 0;
  for (uint i = 0; i < n; i++) {
    float tempval = x[i];
    float tempval2 = y[i];
    bool larger = tempval > valx;
    bool larger2 = tempval2 > valy;
      
    posx += (larger)?1:0;
    posy += (larger2)?1:0;
  }
  //printf("posx:%i", posx);
  if (posx < topk) {
    xoutidx[posx] = gid;
  }
  if (posy < topk) {
    youtidx[posy] = gid;
  }
  if (gid < topk) {
    uint i = gid;
    for (uint j=0; j<topk; j++) {
      xout[gid*topk+j] = x[xoutidx[gid]] * y[youtidx[j]];
    }
  }
}""").build()

In [443]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
event = knl(queue, [dim,], None, x_buf, y_buf, val_out_buf, np.uint32(topk), x_idx_buf, y_idx_buf)

#event.wait()
val_out = np.zeros(topk*topk).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)

cl.enqueue_copy(queue, val_out, val_out_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf, wait_for=[event])
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7f5a642b73b0>

In [444]:
val_out

array([0.8797894 , 0.47577098, 0.36607596, 0.2568478 , 0.2076761 ,
       0.81768477, 0.44218615, 0.34023452, 0.23871683, 0.19301617,
       0.7180269 , 0.38829333, 0.2987674 , 0.20962246, 0.16949172,
       0.3591644 , 0.19422829, 0.1494465 , 0.1048553 , 0.0847815 ,
       0.24742627, 0.13380276, 0.10295283, 0.07223421, 0.05840548],
      dtype=float32)

In [445]:
resxidx

array([1, 2, 7, 4, 6], dtype=uint32)

In [446]:
resyidx

array([5, 1, 7, 4, 0], dtype=uint32)

In [447]:
asdf

NameError: name 'asdf' is not defined

In [None]:
from __future__ import division

KERNEL_CODE = """
// Thread block size
#define BLOCK_SIZE %(block_size)d
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA %(w_a)d // Matrix A width
#define HA %(h_a)d // Matrix A height
#define WB %(w_b)d // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
/* Matrix multiplication: C = A * B.
 * Device code.
 */
#define AS(j, i) As[i + j * BLOCK_SIZE]
#define BS(j, i) Bs[i + j * BLOCK_SIZE]
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! WA is A's width and WB is B's width
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(16,16,1))) 
void
matrixMul( __global float* C, __global float* A, __global float* B)
{
    __local float As[BLOCK_SIZE*BLOCK_SIZE];
    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);
    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);
    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + WA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * WB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0.0f;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + WA * ty + tx];
        BS(ty, tx) = B[b + WB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        barrier(CLK_LOCAL_MEM_FENCE);
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
"""


In [448]:
a2 = np.random.rand(4,4)

In [449]:
a2

array([[0.19795601, 0.05292882, 0.43989334, 0.1150279 ],
       [0.44728966, 0.16383854, 0.63395522, 0.85326287],
       [0.50154382, 0.29188574, 0.42603948, 0.3408554 ],
       [0.27129721, 0.27592893, 0.81850571, 0.12830064]])

In [450]:
a2.sum(axis=1)

array([0.80580607, 2.0983463 , 1.56032445, 1.4940325 ])

In [451]:
b2 = np.random.rand(4,4)

In [452]:
b2

array([[0.08568472, 0.12980955, 0.30394046, 0.36267546],
       [0.03442971, 0.921941  , 0.47491669, 0.46738538],
       [0.04905791, 0.6028401 , 0.39008585, 0.68239085],
       [0.43891714, 0.53169592, 0.31297645, 0.32155413]])

In [453]:
b2.sum(axis=0)

array([0.60808948, 2.18628657, 1.48191944, 1.83400583])

In [454]:
matmul = a2.dot(b2)
matmul

array([[0.09085209, 0.40083904, 0.29290081, 0.43369883],
       [0.44957902, 1.04496195, 0.72810723, 0.94577217],
       [0.22353207, 0.77227172, 0.56393256, 0.71864967],
       [0.12921371, 0.85125216, 0.57294403, 0.8271544 ]])