In [1]:
from tinygrad.densetensor import DenseTensor
from tinygrad.sparsetensor import SparseTensor
import numpy as np

%load_ext autoreload
%autoreload 2

DEVICE:GPU


In [2]:
x_init = np.random.randn(2,6).astype(np.float32)
x2_init = np.random.randn(3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(6,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)

x = DenseTensor(x_init)
W = DenseTensor(W_init)
m = DenseTensor(m_init)
out = x.dot(W).relu()
out = out.logsoftmax()
out = out.mul(m).add(m).sum()
out.backward()

out.cpu().data, x

x2 = DenseTensor(x2_init)#.gpu()
W = SparseTensor(W_init)
out = W.dot(x2).relu().sum()

out.backward()

out.cpu().data, x

In [3]:
import numpy as np
import pyopencl as cl

mf = cl.mem_flags

In [4]:
dim1 = 16
dim2 = 32
dim3 = 24
bs = dim3

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.4

a = np.zeros((dim1,dim2))
b = np.zeros((dim2,dim3)).astype(np.float32)

a.shape, b.shape

((16, 32), (32, 24))

In [5]:
def fill_sparse(mat, sparsity=0.1):
    indices = np.array(range(mat.shape[1]))
    nrows = int(mat.shape[1]*sparsity)
    for row in range(mat.shape[0]):
        lim = nrows #+ int(np.random.random()*3)
        mat[row][np.random.permutation(indices)[:lim]] = np.random.random(lim)
    return mat

a = fill_sparse(a, sparsity).astype(np.float32)
b = fill_sparse(b, sparsity).astype(np.float32)

In [6]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [7]:
b

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.79806799e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80055600e-01,
        7.96413183e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50798619e-01, 0.00000000e+00, 2.20665541e-02, 0.00000000e+00,
        2.88000137e-01, 0.00000000e+00, 0.00000000e+00, 1.40719712e-01,
        0.00000000e+00, 3.51273656e-01, 3.27207237e-01, 0.00000000e+00],
       [8.35044205e-01, 3.38727891e-01, 0.00000000e+00, 3.54924470e-01,
        1.99968800e-01, 5.70148349e-01, 0.00000000e+00, 2.30531722e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.21350515e-01, 3.68743688e-01, 4.29615289e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.99728179e-01, 7.84764469e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.0620

In [8]:
x2_init.T

array([-0.4575035 ,  0.82980406,  0.43716195], dtype=float32)

In [9]:
mult = a.dot(b)
mult.shape

(16, 24)

In [10]:
mult.shape

(16, 24)

In [11]:
def to_data(mat):
    all_rows = []
    all_idxs = []
    all_nnzs = []
    for row in range(mat.shape[0]):
        rowdata = []
        colidxs = []
        all_nnzs.append(0)
        for col in range(mat.shape[1]):
            val = mat[row][col]
            if val != 0:
                rowdata.append(val)
                colidxs.append(col)
                all_nnzs[-1] += 1
        all_rows.append(rowdata)
        all_idxs.append(colidxs)
    
    ellwidth = min(int(np.sqrt(np.max(all_nnzs))+1)**2, mat.shape[1])
    #all_rows = np.array(all_rows)#.astype(np.float32).flatten()           
    for row in range(mat.shape[0]):
        #print(row, all_rows)
        all_rows[row] = np.array(all_rows[row])
        all_rows[row].resize(ellwidth)
        all_idxs[row] = np.array(all_idxs[row])
        all_idxs[row].resize(ellwidth)
        #print(all_idxs[row])
    all_rows = np.array(all_rows)
    all_idxs = np.array(all_idxs)
    all_nnzs = np.array(all_nnzs)
    
#     while (not all_rows[:,-1].any()):
#         all_rows = all_rows[:,:-1]
#         all_idxs = all_idxs[:,:-1]
#         ellwidth -= 1
        
    
    all_rows = np.array(all_rows).astype(np.float32).flatten()
    all_idxs = np.array(all_idxs).astype(np.uint32).flatten()
    
    all_nnzs = np.array(all_nnzs).astype(np.uint32)
    
    
    return all_rows, all_idxs, all_nnzs, ellwidth

In [12]:
def to_dense(data, cols, nnzs, ellw, shape):
    out = np.zeros(shape)
    for row in range(shape[0]):
        for icol in range(nnzs[row]):
            out[row,cols[row*ellw+icol]] = data[row*ellw+icol]
    return out

In [13]:
adata, acols, annz, ellwa = to_data(a)
adata, acols, annz, ellwa

(array([0.52390957, 0.28779745, 0.5066265 , 0.02850537, 0.8833858 ,
        0.7214018 , 0.53931004, 0.7078977 , 0.8825997 , 0.94836694,
        0.03330261, 0.50521904, 0.        , 0.        , 0.        ,
        0.        , 0.3209946 , 0.37998632, 0.35968715, 0.8757173 ,
        0.29720682, 0.71653605, 0.9328499 , 0.16859464, 0.955795  ,
        0.5710225 , 0.22954714, 0.15922078, 0.        , 0.        ,
        0.        , 0.        , 0.28137836, 0.80891   , 0.4850459 ,
        0.6968132 , 0.20988134, 0.3480211 , 0.25583476, 0.75034386,
        0.24273469, 0.57196736, 0.9955581 , 0.70779353, 0.        ,
        0.        , 0.        , 0.        , 0.83257234, 0.3710736 ,
        0.179964  , 0.13894022, 0.23567294, 0.88615716, 0.40681642,
        0.997986  , 0.1362593 , 0.8067922 , 0.44255948, 0.7508757 ,
        0.        , 0.        , 0.        , 0.        , 0.8348913 ,
        0.4491425 , 0.9540574 , 0.35131943, 0.01093002, 0.9366681 ,
        0.6727683 , 0.29138318, 0.65162927, 0.44

In [14]:
adatat, acolst, annzt, ellwat = to_data(a.T)
adatat, acolst, annzt, ellwat

(array([0.52390957, 0.3209946 , 0.28137836, 0.51713663, 0.02384352,
        0.64305884, 0.97078615, 0.27334523, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.80891   , 0.17786872, 0.69861746, 0.57698685,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.28779745, 0.37998632, 0.8348913 ,
        0.02476312, 0.11436905, 0.61044616, 0.4466196 , 0.12572199,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.83257234, 0.23834811,
        0.09053238, 0.73914844, 0.5414916 , 0.6565848 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.5066265 ,
        0.35968715, 0.4850459 , 0.37790823, 0.11510975, 0.38609546,
        0.75785327, 0.00313647, 0.9551985 , 0.  

In [15]:
bdata, bcols, bnnz, ellwb = to_data(b)
bdata, bcols, bnnz, ellwb

(array([1.79806799e-01, 4.80055600e-01, 7.96413183e-01, 6.50798619e-01,
        2.20665541e-02, 2.88000137e-01, 1.40719712e-01, 3.51273656e-01,
        3.27207237e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        8.35044205e-01, 3.38727891e-01, 3.54924470e-01, 1.99968800e-01,
        5.70148349e-01, 2.30531722e-01, 3.21350515e-01, 3.68743688e-01,
        4.29615289e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        8.99728179e-01, 7.84764469e-01, 1.06208868e-01, 6.07245803e-01,
        4.86730039e-02, 8.16224337e-01, 9.05490816e-01, 4.10150796e-01,
        2.94395033e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        7.72912621e-01, 5.01399815e-01, 2.64131010e-01, 5.01486957e-01,
        8.58053938e-02, 3.64440799e-01, 2.23705173e-01, 2.736397

In [16]:
bdatat, bcolst, bnnzt, ellwbt = to_data(b.T)
adatat, bcolst, bnnzt, ellwbt

(array([0.52390957, 0.3209946 , 0.28137836, 0.51713663, 0.02384352,
        0.64305884, 0.97078615, 0.27334523, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.80891   , 0.17786872, 0.69861746, 0.57698685,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.28779745, 0.37998632, 0.8348913 ,
        0.02476312, 0.11436905, 0.61044616, 0.4466196 , 0.12572199,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.83257234, 0.23834811,
        0.09053238, 0.73914844, 0.5414916 , 0.6565848 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.5066265 ,
        0.35968715, 0.4850459 , 0.37790823, 0.11510975, 0.38609546,
        0.75785327, 0.00313647, 0.9551985 , 0.  

In [17]:
adense = to_dense(adata, acols, annz, ellwa, a.shape)

In [18]:
adenset = to_dense(adatat, acolst, annzt, ellwat, a.T.shape)

In [19]:
bdense = to_dense(bdata, bcols, bnnz, ellwb, b.shape)

In [20]:
bdenset = to_dense(bdatat, bcolst, bnnzt, ellwbt, b.T.shape)

In [21]:
adense

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.50662649,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.88338578,
        0.        , 0.72140181, 0.        , 0.53931004, 0.        ,
        0.70789772, 0.88259971, 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.32099459, 0.        , 0.37998632, 0.        , 0.35968715,
        0.87571728, 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.93284988, 0.        ,
        0.        , 0.16859464, 0.95579499, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.57102251, 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891001, 0.        , 0.        , 0.48504591,
        0.        , 0.        , 0.69681323, 0.2098

In [22]:
adenset.T == adense

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  

In [23]:
bdenset.T == bdense

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,

In [24]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [25]:
a == adense

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  

In [26]:
a.shape

(16, 32)

In [27]:
adata.shape, acols.shape, annz.shape, ellwa

((256,), (256,), (16,), 16)

In [28]:
#acols = acols.astype(np.uint32)
#annz = annz.astype(np.uint32)

In [29]:
adata, acols, annz, b

(array([0.52390957, 0.28779745, 0.5066265 , 0.02850537, 0.8833858 ,
        0.7214018 , 0.53931004, 0.7078977 , 0.8825997 , 0.94836694,
        0.03330261, 0.50521904, 0.        , 0.        , 0.        ,
        0.        , 0.3209946 , 0.37998632, 0.35968715, 0.8757173 ,
        0.29720682, 0.71653605, 0.9328499 , 0.16859464, 0.955795  ,
        0.5710225 , 0.22954714, 0.15922078, 0.        , 0.        ,
        0.        , 0.        , 0.28137836, 0.80891   , 0.4850459 ,
        0.6968132 , 0.20988134, 0.3480211 , 0.25583476, 0.75034386,
        0.24273469, 0.57196736, 0.9955581 , 0.70779353, 0.        ,
        0.        , 0.        , 0.        , 0.83257234, 0.3710736 ,
        0.179964  , 0.13894022, 0.23567294, 0.88615716, 0.40681642,
        0.997986  , 0.1362593 , 0.8067922 , 0.44255948, 0.7508757 ,
        0.        , 0.        , 0.        , 0.        , 0.8348913 ,
        0.4491425 , 0.9540574 , 0.35131943, 0.01093002, 0.9366681 ,
        0.6727683 , 0.29138318, 0.65162927, 0.44

## MatMul (Sparse-Dense)

adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // SPARSE x DENSE
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      

      uint nnz    = rowNnz[gid];
      float sum = 0;
      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[col*ncols+gid2];
          //if (gid==0 && gid2==2)
          //  printf("aval, xval: %.2f,%.2f: (%i,%i) \\n", aval, xval, col, index);
          sum  += aval * xval;
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [30]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // SPARSE x DENSE
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      

      uint nnz    = rowNnz[gid];
      
      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          uint xidx = col*ncols+gid2;
          float xval  = vector_x[xidx];
          if (gid==0 && gid2==1)
            printf("aval, xval: %.2f,%.2f: (%i,%i) - %i \\n", aval, xval, col, index, xidx);
          sum  += aval * xval;
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [31]:
a.shape, b.shape

((16, 32), (32, 24))

In [32]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [33]:
rows = a.shape[0]

In [34]:
mult = mult.astype(np.float32)

In [35]:
outshape = (a.shape[0], b.shape[1])
outshape

(16, 24)

In [36]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul2  # Use this Kernel object for repeated calls
knl(queue, [outshape[0]], None, adata_buf, acols_buf, annzs_buf, np.uint32(ellwa), np.uint32(outshape[1]), b_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

aval, xval: 0.52,0.00: (0,0) - 1 
aval, xval: 0.29,0.00: (2,1) - 49 
aval, xval: 0.51,0.18: (4,2) - 97 
aval, xval: 0.03,0.61: (5,3) - 121 
aval, xval: 0.88,0.00: (14,4) - 337 
aval, xval: 0.72,0.00: (16,5) - 385 
aval, xval: 0.54,0.00: (18,6) - 433 
aval, xval: 0.71,0.00: (20,7) - 481 
aval, xval: 0.88,0.00: (21,8) - 505 
aval, xval: 0.95,0.00: (22,9) - 529 
aval, xval: 0.03,0.00: (26,10) - 625 
aval, xval: 0.51,0.51: (30,11) - 721 


<pyopencl._cl.NannyEvent at 0x7fccfc4f60e0>

In [37]:
(res_np-mult).sum()

-1.2665987e-06

In [38]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [39]:
b

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.79806799e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80055600e-01,
        7.96413183e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50798619e-01, 0.00000000e+00, 2.20665541e-02, 0.00000000e+00,
        2.88000137e-01, 0.00000000e+00, 0.00000000e+00, 1.40719712e-01,
        0.00000000e+00, 3.51273656e-01, 3.27207237e-01, 0.00000000e+00],
       [8.35044205e-01, 3.38727891e-01, 0.00000000e+00, 3.54924470e-01,
        1.99968800e-01, 5.70148349e-01, 0.00000000e+00, 2.30531722e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.21350515e-01, 3.68743688e-01, 4.29615289e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.99728179e-01, 7.84764469e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.0620

In [40]:
res_buf

<pyopencl._cl.Buffer at 0x7fccfc4da680>

In [41]:
res_np

array([[1.6294171 , 0.36525553, 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791255 , 1.4661919 ,
        2.3058922 , 0.81397647, 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [42]:
mult

array([[1.6294171 , 0.3652555 , 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791257 , 1.466192  ,
        2.3058922 , 0.8139765 , 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [43]:
res_np==mult

array([[ True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False, False,  True, False,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [False, False,  True,  True,  True,  True,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False,

In [44]:
res_np.shape

(16, 24)

In [45]:
mult.shape

(16, 24)

## MatMul (dense * sparse)

In [46]:
bdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdata)
bcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcols)
bnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnz)
bdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdatat)
bcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcolst)
bnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnzt)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // DENSE x SPARSE
    __kernel void matmul(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            uint   ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        uint nnz = rowNnz[gid2];
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid2 * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[gid*mwidth+col];
          sum  += aval * xval;
          if (gid==0 && gid2==0)
            printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [47]:
a.shape, b.shape

((16, 32), (32, 24))

In [48]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [49]:
rows = a.shape[0]

In [50]:
mult = mult.astype(np.float32)

In [51]:
outshape = np.array([a.shape[0], b.shape[1]])
outshape

array([16, 24])

In [52]:
b.T

array([[0.00000000e+00, 8.35044205e-01, 0.00000000e+00, 7.72912621e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 6.04525506e-01, 0.00000000e+00, 2.04175711e-02,
        0.00000000e+00, 0.00000000e+00, 5.55010617e-01, 8.11873853e-01,
        1.16767600e-01, 0.00000000e+00, 0.00000000e+00, 5.30311882e-01,
        7.69208670e-01, 1.91230401e-01, 3.60188931e-01, 2.91794538e-01,
        0.00000000e+00, 2.58038431e-01, 0.00000000e+00, 0.00000000e+00,
        1.59830544e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.38727891e-01, 0.00000000e+00, 5.01399815e-01,
        1.78162962e-01, 6.14967883e-01, 0.00000000e+00, 4.79711026e-01,
        9.56434608e-01, 0.00000000e+00, 8.76694322e-01, 9.48271155e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.74078238e-01,
        0.00000000e+00, 9.01507556e-01, 0.00000000e+00, 1.86964422e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000

In [53]:
a.T

array([[0.52390957, 0.3209946 , 0.28137836, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.51713663, 0.        ,
        0.02384352, 0.64305884, 0.97078615, 0.        , 0.27334523,
        0.        ],
       [0.        , 0.        , 0.80891   , 0.        , 0.        ,
        0.        , 0.17786872, 0.        , 0.        , 0.        ,
        0.69861746, 0.        , 0.57698685, 0.        , 0.        ,
        0.        ],
       [0.28779745, 0.37998632, 0.        , 0.        , 0.8348913 ,
        0.02476312, 0.        , 0.        , 0.11436905, 0.        ,
        0.61044616, 0.        , 0.4466196 , 0.        , 0.        ,
        0.12572199],
       [0.        , 0.        , 0.        , 0.83257234, 0.        ,
        0.23834811, 0.        , 0.        , 0.        , 0.09053238,
        0.73914844, 0.        , 0.        , 0.        , 0.5414916 ,
        0.6565848 ],
       [0.5066265 , 0.35968715, 0.4850459 , 0.        , 0.        ,
        0.        , 0.        , 

In [54]:
outshape.T

array([16, 24])

In [55]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul  # Use this Kernel object for repeated calls
knl(queue, [outshape.T[0]], None, bdatat_buf, bcolst_buf, bnnzst_buf, np.uint32(ellwbt), np.uint32(b.shape[0]), np.uint32(outshape.T[1]), a_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

<pyopencl._cl.NannyEvent at 0x7fccfc4b5720>

aval, xval: 0.84,0.00 - 0.00: (1,0) 
aval, xval: 0.77,0.00 - 0.00: (3,1) 
aval, xval: 0.60,0.00 - 0.00: (9,2) 
aval, xval: 0.02,0.00 - 0.00: (11,3) 
aval, xval: 0.56,0.88 - 0.49: (14,4) 
aval, xval: 0.81,0.00 - 0.49: (15,5) 
aval, xval: 0.12,0.72 - 0.57: (16,6) 
aval, xval: 0.53,0.00 - 0.57: (19,7) 
aval, xval: 0.77,0.71 - 1.12: (20,8) 
aval, xval: 0.19,0.88 - 1.29: (21,9) 
aval, xval: 0.36,0.95 - 1.63: (22,10) 
aval, xval: 0.29,0.00 - 1.63: (23,11) 
aval, xval: 0.26,0.00 - 1.63: (25,12) 
aval, xval: 0.02,0.00 - 1.63: (28,13) 


In [56]:
(res_np-mult).sum()

-1.2665987e-06

In [57]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [58]:
b

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.79806799e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80055600e-01,
        7.96413183e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50798619e-01, 0.00000000e+00, 2.20665541e-02, 0.00000000e+00,
        2.88000137e-01, 0.00000000e+00, 0.00000000e+00, 1.40719712e-01,
        0.00000000e+00, 3.51273656e-01, 3.27207237e-01, 0.00000000e+00],
       [8.35044205e-01, 3.38727891e-01, 0.00000000e+00, 3.54924470e-01,
        1.99968800e-01, 5.70148349e-01, 0.00000000e+00, 2.30531722e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.21350515e-01, 3.68743688e-01, 4.29615289e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.99728179e-01, 7.84764469e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.0620

In [59]:
res_buf

<pyopencl._cl.Buffer at 0x7fccfc4f6c70>

In [60]:
res_np

array([[1.6294171 , 0.36525553, 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791255 , 1.4661919 ,
        2.3058922 , 0.81397647, 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [61]:
mult

array([[1.6294171 , 0.3652555 , 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791257 , 1.466192  ,
        2.3058922 , 0.8139765 , 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [62]:
res_np==mult

array([[ True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False, False,  True, False,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [False, False,  True,  True,  True,  True,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False,

In [63]:
res_np-mult

array([[ 0.0000000e+00,  2.9802322e-08,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, -1.1920929e-07,
        -1.1920929e-07,  0.0000000e+00, -5.9604645e-08,  0.000

In [64]:
res_np.shape

(16, 24)

In [65]:
mult.shape

(16, 24)

# Matmult Dense Dense

In [66]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      uint osize = get_global_size(1);
      int gidx = get_global_id(0); // row
      int gidy = get_global_id(1); // col

      float ret = 0.0;
      for (int i = 0; i < msize; i++) {
        uint xidx = gidx*msize+i; 
        float xval = x[xidx];
        uint yidx = osize*i+gidy;
        float yval = y[yidx];
        ret += xval*yval;
        if (gidx==0 && gidy==0)
          printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, res, xidx, yidx);
      }

      //if (gidx==0&&gidy==0)
      //  printf("\\nsum:%.2f", ret);
      res[gidx * osize + gidy] = ret;
    }""").build()

In [67]:
a.shape, b.shape

((16, 32), (32, 24))

In [68]:
rows = a.shape[0]

In [69]:
mult = mult.astype(np.float32)

In [70]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows,b.shape[1]], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)


mult: 0.52 x 0.00 - 0.00  -- 0/0
mult: 0.00 x 0.84 - 0.00  -- 1/24
mult: 0.29 x 0.00 - 0.00  -- 2/48
mult: 0.00 x 0.77 - 0.00  -- 3/72
mult: 0.51 x 0.00 - 0.00  -- 4/96
mult: 0.03 x 0.00 - 0.00  -- 5/120
mult: 0.00 x 0.00 - 0.00  -- 6/144
mult: 0.00 x 0.00 - 0.00  -- 7/168
mult: 0.00 x 0.00 - 0.00  -- 8/192
mult: 0.00 x 0.60 - 0.00  -- 9/216
mult: 0.00 x 0.00 - 0.00  -- 10/240
mult: 0.00 x 0.02 - 0.00  -- 11/264
mult: 0.00 x 0.00 - 0.00  -- 12/288
mult: 0.00 x 0.00 - 0.00  -- 13/312
mult: 0.88 x 0.56 - 0.00  -- 14/336
mult: 0.00 x 0.81 - 0.00  -- 15/360
mult: 0.72 x 0.12 - 0.00  -- 16/384
mult: 0.00 x 0.00 - 0.00  -- 17/408
mult: 0.54 x 0.00 - 0.00  -- 18/432
mult: 0.00 x 0.53 - 0.00  -- 19/456
mult: 0.71 x 0.77 - 0.00  -- 20/480
mult: 0.88 x 0.19 - 0.00  -- 21/504
mult: 0.95 x 0.36 - 0.00  -- 22/528
mult: 0.00 x 0.29 - 0.00  -- 23/552
mult: 0.00 x 0.00 - 0.00  -- 24/576
mult: 0.00 x 0.26 - 0.00  -- 25/600
mult: 0.03 x 0.00 - 0.00  -- 26/624
mult: 0.00 x 0.00 - 0.00  -- 27/648
mult: 0

<pyopencl._cl.NannyEvent at 0x7fccfc4b6f90>

.00 x 0.02 - 0.00  -- 28/672
mult: 0.00 x 0.00 - 0.00  -- 29/696
mult: 0.51 x 0.00 - 0.00  -- 30/720
mult: 0.00 x 0.00 - 0.00  -- 31/744

In [71]:
(res_np-mult).sum()

-1.2665987e-06

In [72]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [73]:
b

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.79806799e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80055600e-01,
        7.96413183e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50798619e-01, 0.00000000e+00, 2.20665541e-02, 0.00000000e+00,
        2.88000137e-01, 0.00000000e+00, 0.00000000e+00, 1.40719712e-01,
        0.00000000e+00, 3.51273656e-01, 3.27207237e-01, 0.00000000e+00],
       [8.35044205e-01, 3.38727891e-01, 0.00000000e+00, 3.54924470e-01,
        1.99968800e-01, 5.70148349e-01, 0.00000000e+00, 2.30531722e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.21350515e-01, 3.68743688e-01, 4.29615289e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.99728179e-01, 7.84764469e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.0620

In [74]:
res_np

array([[1.6294171 , 0.36525553, 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791255 , 1.4661919 ,
        2.3058922 , 0.81397647, 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [75]:
a.dot(b)

array([[1.6294171 , 0.3652555 , 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791257 , 1.466192  ,
        2.3058922 , 0.8139765 , 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [76]:
res_np==mult

array([[ True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False, False,  True, False,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [False, False,  True,  True,  True,  True,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False,

In [77]:
res_np.shape

(16, 24)

In [78]:
mult.shape

(16, 24)

# Matmult Dense Transposed

In [79]:
b

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.79806799e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80055600e-01,
        7.96413183e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50798619e-01, 0.00000000e+00, 2.20665541e-02, 0.00000000e+00,
        2.88000137e-01, 0.00000000e+00, 0.00000000e+00, 1.40719712e-01,
        0.00000000e+00, 3.51273656e-01, 3.27207237e-01, 0.00000000e+00],
       [8.35044205e-01, 3.38727891e-01, 0.00000000e+00, 3.54924470e-01,
        1.99968800e-01, 5.70148349e-01, 0.00000000e+00, 2.30531722e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.21350515e-01, 3.68743688e-01, 4.29615289e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.99728179e-01, 7.84764469e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.0620

In [80]:
c=np.zeros(b.T.shape)
bt = b.T
for row in range(bt.shape[0]):
    for col in range(bt.shape[1]):
        c[row][col] = bt[row][col]

In [81]:
bt

array([[0.00000000e+00, 8.35044205e-01, 0.00000000e+00, 7.72912621e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 6.04525506e-01, 0.00000000e+00, 2.04175711e-02,
        0.00000000e+00, 0.00000000e+00, 5.55010617e-01, 8.11873853e-01,
        1.16767600e-01, 0.00000000e+00, 0.00000000e+00, 5.30311882e-01,
        7.69208670e-01, 1.91230401e-01, 3.60188931e-01, 2.91794538e-01,
        0.00000000e+00, 2.58038431e-01, 0.00000000e+00, 0.00000000e+00,
        1.59830544e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.38727891e-01, 0.00000000e+00, 5.01399815e-01,
        1.78162962e-01, 6.14967883e-01, 0.00000000e+00, 4.79711026e-01,
        9.56434608e-01, 0.00000000e+00, 8.76694322e-01, 9.48271155e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.74078238e-01,
        0.00000000e+00, 9.01507556e-01, 0.00000000e+00, 1.86964422e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000

In [82]:
c

array([[0.00000000e+00, 8.35044205e-01, 0.00000000e+00, 7.72912621e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 6.04525506e-01, 0.00000000e+00, 2.04175711e-02,
        0.00000000e+00, 0.00000000e+00, 5.55010617e-01, 8.11873853e-01,
        1.16767600e-01, 0.00000000e+00, 0.00000000e+00, 5.30311882e-01,
        7.69208670e-01, 1.91230401e-01, 3.60188931e-01, 2.91794538e-01,
        0.00000000e+00, 2.58038431e-01, 0.00000000e+00, 0.00000000e+00,
        1.59830544e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.38727891e-01, 0.00000000e+00, 5.01399815e-01,
        1.78162962e-01, 6.14967883e-01, 0.00000000e+00, 4.79711026e-01,
        9.56434608e-01, 0.00000000e+00, 8.76694322e-01, 9.48271155e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.74078238e-01,
        0.00000000e+00, 9.01507556e-01, 0.00000000e+00, 1.86964422e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000

In [83]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      uint osize = get_global_size(1);
      int gidx = get_global_id(0); // row
      int gidy = get_global_id(1); // col

      float ret = 0.0;
      for (int i = 0; i < msize; i++) {
        uint xidx = gidx*msize+i;
        float xval = x[xidx];
        uint yidx = msize*gidy+i;
        float yval = y[yidx];
        ret += xval*yval;
        if (gidx==0 && gidy==0)
          printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, res, xidx, yidx);
      }

      //if (gidx==0&&gidy==0)
      //  printf("\\nsum:%.2f", ret);
      res[gidx * osize + gidy] = ret;
    }""").build()

In [84]:
a.shape, b.T.shape

((16, 32), (24, 32))

In [85]:
rows = a.shape[0]

In [86]:
mult = mult.astype(np.float32)

In [87]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows,b.shape[1]], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

<pyopencl._cl.NannyEvent at 0x7fccfc4b6b30>


mult: 0.52 x 0.00 - 0.00  -- 0/0
mult: 0.00 x 0.84 - 0.00  -- 1/1
mult: 0.29 x 0.00 - 0.00  -- 2/2
mult: 0.00 x 0.77 - 0.00  -- 3/3
mult: 0.51 x 0.00 - 0.00  -- 4/4
mult: 0.03 x 0.00 - 0.00  -- 5/5
mult: 0.00 x 0.00 - 0.00  -- 6/6
mult: 0.00 x 0.00 - 0.00  -- 7/7
mult: 0.00 x 0.00 - 0.00  -- 8/8
mult: 0.00 x 0.60 - 0.00  -- 9/9
mult: 0.00 x 0.00 - 0.00  -- 10/10
mult: 0.00 x 0.02 - 0.00  -- 11/11
mult: 0.00 x 0.00 - 0.00  -- 12/12
mult: 0.00 x 0.00 - 0.00  -- 13/13
mult: 0.88 x 0.56 - 0.00  -- 14/14
mult: 0.00 x 0.81 - 0.00  -- 15/15
mult: 0.72 x 0.12 - 0.00  -- 16/16
mult: 0.00 x 0.00 - 0.00  -- 17/17
mult: 0.54 x 0.00 - 0.00  -- 18/18
mult: 0.00 x 0.53 - 0.00  -- 19/19
mult: 0.71 x 0.77 - 0.00  -- 20/20
mult: 0.88 x 0.19 - 0.00  -- 21/21
mult: 0.95 x 0.36 - 0.00  -- 22/22
mult: 0.00 x 0.29 - 0.00  -- 23/23
mult: 0.00 x 0.00 - 0.00  -- 24/24
mult: 0.00 x 0.26 - 0.00  -- 25/25
mult: 0.03 x 0.00 - 0.00  -- 26/26
mult: 0.00 x 0.00 - 0.00  -- 27/27
mult: 0.00 x 0.02 - 0.00  -- 28/28
mult

In [88]:
(res_np-mult).sum()

-1.2665987e-06

In [89]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [90]:
b

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.79806799e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80055600e-01,
        7.96413183e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50798619e-01, 0.00000000e+00, 2.20665541e-02, 0.00000000e+00,
        2.88000137e-01, 0.00000000e+00, 0.00000000e+00, 1.40719712e-01,
        0.00000000e+00, 3.51273656e-01, 3.27207237e-01, 0.00000000e+00],
       [8.35044205e-01, 3.38727891e-01, 0.00000000e+00, 3.54924470e-01,
        1.99968800e-01, 5.70148349e-01, 0.00000000e+00, 2.30531722e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.21350515e-01, 3.68743688e-01, 4.29615289e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.99728179e-01, 7.84764469e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.0620

In [91]:
res_np

array([[1.6294171 , 0.36525553, 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791255 , 1.4661919 ,
        2.3058922 , 0.81397647, 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [92]:
a.dot(b)

array([[1.6294171 , 0.3652555 , 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791257 , 1.466192  ,
        2.3058922 , 0.8139765 , 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [93]:
res_np==mult

array([[ True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False, False,  True, False,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [False, False,  True,  True,  True,  True,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False,

In [94]:
res_np.shape

(16, 24)

In [95]:
mult.shape

(16, 24)

# Matmult Transposed Dense

In [96]:
b

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.79806799e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80055600e-01,
        7.96413183e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50798619e-01, 0.00000000e+00, 2.20665541e-02, 0.00000000e+00,
        2.88000137e-01, 0.00000000e+00, 0.00000000e+00, 1.40719712e-01,
        0.00000000e+00, 3.51273656e-01, 3.27207237e-01, 0.00000000e+00],
       [8.35044205e-01, 3.38727891e-01, 0.00000000e+00, 3.54924470e-01,
        1.99968800e-01, 5.70148349e-01, 0.00000000e+00, 2.30531722e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.21350515e-01, 3.68743688e-01, 4.29615289e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.99728179e-01, 7.84764469e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.0620

In [97]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [98]:
at

array([[0.52390957, 0.3209946 , 0.28137836, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.51713663, 0.        ,
        0.02384352, 0.64305884, 0.97078615, 0.        , 0.27334523,
        0.        ],
       [0.        , 0.        , 0.80891   , 0.        , 0.        ,
        0.        , 0.17786872, 0.        , 0.        , 0.        ,
        0.69861746, 0.        , 0.57698685, 0.        , 0.        ,
        0.        ],
       [0.28779745, 0.37998632, 0.        , 0.        , 0.8348913 ,
        0.02476312, 0.        , 0.        , 0.11436905, 0.        ,
        0.61044616, 0.        , 0.4466196 , 0.        , 0.        ,
        0.12572199],
       [0.        , 0.        , 0.        , 0.83257234, 0.        ,
        0.23834811, 0.        , 0.        , 0.        , 0.09053238,
        0.73914844, 0.        , 0.        , 0.        , 0.5414916 ,
        0.6565848 ],
       [0.5066265 , 0.35968715, 0.4850459 , 0.        , 0.        ,
        0.        , 0.        , 

In [99]:
c

array([[0.52390957, 0.32099459, 0.28137836, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.51713663, 0.        ,
        0.02384352, 0.64305884, 0.97078615, 0.        , 0.27334523,
        0.        ],
       [0.        , 0.        , 0.80891001, 0.        , 0.        ,
        0.        , 0.17786872, 0.        , 0.        , 0.        ,
        0.69861746, 0.        , 0.57698685, 0.        , 0.        ,
        0.        ],
       [0.28779745, 0.37998632, 0.        , 0.        , 0.83489132,
        0.02476312, 0.        , 0.        , 0.11436905, 0.        ,
        0.61044616, 0.        , 0.4466196 , 0.        , 0.        ,
        0.12572199],
       [0.        , 0.        , 0.        , 0.83257234, 0.        ,
        0.23834811, 0.        , 0.        , 0.        , 0.09053238,
        0.73914844, 0.        , 0.        , 0.        , 0.54149163,
        0.6565848 ],
       [0.50662649, 0.35968715, 0.48504591, 0.        , 0.        ,
        0.        , 0.        , 

In [100]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // multilplies x TRANSPOSED by y (dense-dense)
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize,
                          uint isize
                          ) { // LOCAL SHARED BUFFER
      uint osize = get_global_size(0);
      int gidy = get_global_id(0); // row
      
      for (uint gidx = 0; gidx < isize; gidx++) {
        float ret = 0.0;
        for (uint i = 0; i < msize; i++) {
          uint xidx = i*isize+gidx;
          float xval = x[xidx];
          uint yidx = osize*i+gidy;
          float yval = y[yidx];
          ret += xval*yval;
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, ret, xidx, yidx);
        }
        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        res[gidx * osize + gidy] = ret;
      }
    }""").build()

In [101]:
a.shape, b.T.shape

((16, 32), (24, 32))

In [102]:
rows = a.shape[0]

In [103]:
mult = mult.astype(np.float32)

In [104]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [b.shape[1]], None, a_buf, b_buf, res_buf, np.uint32(a.shape[1]), np.uint32(rows))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)


mult: 0.52 x 0.00 - 0.00  -- 0/0
mult: 0.00 x 0.84 - 0.00  -- 16/24
mult: 0.29 x 0.00 - 0.00  -- 32/48
mult: 0.00 x 0.77 - 0.00  -- 48/72
mult: 0.51 x 0.00 - 0.00  -- 64/96
mult: 0.03 x 0.00 - 0.00  -- 80/120
mult: 0.00 x 0.00 - 0.00  -- 96/144
mult: 0.00 x 0.00 - 0.00  -- 112/168
mult: 0.00 x 0.00 - 0.00  -- 128/192
mult: 0.00 x 0.60 - 0.00  -- 144/216
mult: 0.00 x 0.00 - 0.00  -- 160/240
mult: 0.00 x 0.02 - 0.00  -- 176/264
mult: 0.00 x 0.00 - 0.00  -- 192/288
mult: 0.00 x 0.00 - 0.00  -- 208/312
mult: 0.88 x 0.56 - 0.49  -- 224/336
mult: 0.00 x 0.81 - 0.49  -- 240/360
mult: 0.72 x 0.12 - 0.57  -- 256/384
mult: 0.00 x 0.00 - 0.57  -- 272/408
mult: 0.54 x 0.00 - 0.57  -- 288/432
mult: 0.00 x 0.53 - 0.57  -- 304/456
mult: 0.71 x 0.77 - 1.12  -- 320/480
mult: 0.88 x 0.19 - 1.29  -- 336/504
mult: 0.95 x 0.36 - 1.63  -- 352/528
mult: 0.00 x 0.29 - 1.63  -- 368/552
mult: 0.00 x 0.00 - 1.63  -- 384/576
mult: 0.00 x 0.26 - 1.63  -- 400/600
mult: 0.03 x 0.00 - 1.63  -- 416/624
mult: 0.00 x 0

<pyopencl._cl.NannyEvent at 0x7fccfc4b6a40>

.00 - 1.63  -- 432/648
mult: 0.00 x 0.02 - 1.63  -- 448/672
mult: 0.00 x 0.00 - 1.63  -- 464/696
mult: 0.51 x 0.00 - 1.63  -- 480/720
mult: 0.00 x 0.00 - 1.63  -- 496/744

In [105]:
(res_np-mult).sum()

-1.2665987e-06

In [106]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [107]:
b

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.79806799e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80055600e-01,
        7.96413183e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50798619e-01, 0.00000000e+00, 2.20665541e-02, 0.00000000e+00,
        2.88000137e-01, 0.00000000e+00, 0.00000000e+00, 1.40719712e-01,
        0.00000000e+00, 3.51273656e-01, 3.27207237e-01, 0.00000000e+00],
       [8.35044205e-01, 3.38727891e-01, 0.00000000e+00, 3.54924470e-01,
        1.99968800e-01, 5.70148349e-01, 0.00000000e+00, 2.30531722e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.21350515e-01, 3.68743688e-01, 4.29615289e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.99728179e-01, 7.84764469e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.0620

In [108]:
res_np

array([[1.6294171 , 0.36525553, 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791255 , 1.4661919 ,
        2.3058922 , 0.81397647, 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [109]:
a.dot(b)

array([[1.6294171 , 0.3652555 , 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791257 , 1.466192  ,
        2.3058922 , 0.8139765 , 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [110]:
res_np==mult

array([[ True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False, False,  True, False,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [False, False,  True,  True,  True,  True,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False,

In [111]:
res_np.shape

(16, 24)

In [112]:
mult.shape

(16, 24)

# Matmult Dense Transposed2

In [113]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize,
                          uint osize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      // osize = get_global_size(1);
      int gidx = get_global_id(0); // col
      // int gidy = get_global_id(1); // row

      for (uint gidy = 0; gidy < osize; gidy++) {
        float ret = 0.0;
        for (uint i = 0; i < msize; i++) {
          ret += x[gidx*msize+i]*y[i*osize+gidy];
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f", x[gidx*msize+i],y[i*msize+gidy], ret);
        }

        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        res[gidx * osize + gidy] = ret;
      }
    }""").build()

In [114]:
a.shape, b.shape

((16, 32), (32, 24))

In [115]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [116]:
rows = a.shape[0]

In [117]:
mult = mult.astype(np.float32)

In [118]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]), np.uint32(b.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)


mult: 0.52 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.29 x 0.82 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.51 x 0.25 - 0.00
mult: 0.03 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.12 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.88 x 0.00 - 0.49
mult: 0.00 x 0.77 - 0.49
mult: 0.72 x 0.00 - 0.57
mult: 0.00 x 0.91 - 0.57
mult: 0.54 x 0.00 - 0.57
mult: 0.00 x 0.73 - 0.57
mult: 0.71 x 0.25 - 1.12
mult: 0.88 x 0.02 - 1.29
mult: 0.95 x 0.00 - 1.63
mult: 0.00 x 0.44 - 1.63
mult: 0.00 x 1.63 - 1.63
mult: 0.00 x 1.35 - 1.63
mult: 0.03 x 1.33 - 1.63
mult: 0.00 x 1.05 - 1.63
mult: 0.00 x 0.14 - 1.63
mult: 0.00 x 1.99 - 1.63
mult: 0.51 x 0.44 - 1.63
mult: 0.00 x 0.72 - 1.63

<pyopencl._cl.NannyEvent at 0x7fccfc495f40>

In [119]:
(res_np-mult).sum()

-1.2665987e-06

In [120]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [121]:
b.T

array([[0.00000000e+00, 8.35044205e-01, 0.00000000e+00, 7.72912621e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 6.04525506e-01, 0.00000000e+00, 2.04175711e-02,
        0.00000000e+00, 0.00000000e+00, 5.55010617e-01, 8.11873853e-01,
        1.16767600e-01, 0.00000000e+00, 0.00000000e+00, 5.30311882e-01,
        7.69208670e-01, 1.91230401e-01, 3.60188931e-01, 2.91794538e-01,
        0.00000000e+00, 2.58038431e-01, 0.00000000e+00, 0.00000000e+00,
        1.59830544e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.38727891e-01, 0.00000000e+00, 5.01399815e-01,
        1.78162962e-01, 6.14967883e-01, 0.00000000e+00, 4.79711026e-01,
        9.56434608e-01, 0.00000000e+00, 8.76694322e-01, 9.48271155e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.74078238e-01,
        0.00000000e+00, 9.01507556e-01, 0.00000000e+00, 1.86964422e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000

In [122]:
a[0]

array([0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
       0.02850537, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.8833858 ,
       0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
       0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
       0.        , 0.03330261, 0.        , 0.        , 0.        ,
       0.50521904, 0.        ], dtype=float32)

In [123]:
b.T[0]

array([0.        , 0.8350442 , 0.        , 0.7729126 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.6045255 ,
       0.        , 0.02041757, 0.        , 0.        , 0.5550106 ,
       0.81187385, 0.1167676 , 0.        , 0.        , 0.5303119 ,
       0.76920867, 0.1912304 , 0.36018893, 0.29179454, 0.        ,
       0.25803843, 0.        , 0.        , 0.01598305, 0.        ,
       0.        , 0.        ], dtype=float32)

In [124]:
res_buf

<pyopencl._cl.Buffer at 0x7fccfc495e50>

In [125]:
res_np

array([[1.6294171 , 0.36525553, 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791255 , 1.4661919 ,
        2.3058922 , 0.81397647, 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [126]:
a.dot(b)

array([[1.6294171 , 0.3652555 , 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791257 , 1.466192  ,
        2.3058922 , 0.8139765 , 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [127]:
res_np==mult

array([[ True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False, False,  True, False,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [False, False,  True,  True,  True,  True,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False,

In [128]:
res_np.shape

(16, 24)

In [129]:
mult.shape

(16, 24)

## Weight update kernel

In [130]:
bs = 4

In [131]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [132]:
dim = 8
topk = 2

x = np.random.rand(bs,dim).astype(np.float32)
y = np.random.rand(bs,dim).astype(np.float32)
x.shape,y.shape, topk

((4, 8), (4, 8), 2)

x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             __global  float* xout,    // INPUT
                             uint topk,
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      uint bs = get_global_size(1);
      uint gid2 = get_global_id(1);

      uint idx = n*gid2+gid;

      float valx = x[idx];
      float valy = y[idx];
      uint posx = 0;
      uint posy = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval = x[idx2];
        float tempval2 = y[idx2];
        bool larger = tempval > valx;
        bool larger2 = tempval2 > valy;

        barrier(CLK_GLOBAL_MEM_FENCE);
        posx += (larger)?1:0;
        posy += (larger2)?1:0;
        barrier(CLK_GLOBAL_MEM_FENCE);
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
      //printf("posx:%i", posx);
      if (posx < topk) {
        xoutidx[posx+topk*gid2] = gid;
      }
      if (posy < topk) {
        youtidx[posy+topk*gid2] = gid;
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
      if (gid < topk) {
        for (uint j=0; j<topk; j++) {
          float res = x[xoutidx[gid+topk*gid2]+gid2*n] * y[youtidx[j+topk*gid2]+gid2*n];
          //printf("\\nJ:%i  gid:%i", j, gid);
          //printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
          barrier(CLK_GLOBAL_MEM_FENCE);
          xout[gid2*topk*topk+j*topk+gid] = res;
          barrier(CLK_GLOBAL_MEM_FENCE);
          
        }
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
    }""").build()

In [133]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
#x_cp_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
#x_idx_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
#y_idx_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             __global  float* xout,    // INPUT
                             uint topk,
                             uint bs,
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      //uint bs = get_global_size(1);
      //uint gid2 = get_global_id(1);

      for (uint gid2=0; gid2<bs; gid2++){
        uint idx = n*gid2+gid;

        float valx = x[idx];
        float valy = y[idx];
        uint posx = 0;
        uint posy = 0;
        for (uint i = 0; i < n; i++) {
          uint idx2 = n*gid2+i;
          float tempval = x[idx2];
          float tempval2 = y[idx2];
          bool larger = tempval > valx;
          bool larger2 = tempval2 > valy;

          posx += (larger)?1:0;
          posy += (larger2)?1:0;
        }
        //printf("posx:%i", posx);
        if (posx < topk) {
        xoutidx[posx+topk*gid2] = gid;
        }
        if (posy < topk) {
          youtidx[posy+topk*gid2] = gid;
        }
      }
      for (uint gid2=0; gid2<bs; gid2++){
        if (gid < topk) {
          for (uint j=0; j<topk; j++) {
            float res = x[xoutidx[gid+topk*gid2]+gid2*n] * y[youtidx[j+topk*gid2]+gid2*n];
            //printf("\\nJ:%i  gid:%i", j, gid);
            //printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
            //barrier(CLK_GLOBAL_MEM_FENCE);
            xout[gid2*topk*topk+j*topk+gid] = res;
          }
        }
      }
    }""").build()

In [134]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
evt = knl(queue, [dim], None, x_buf, y_buf, x_cp_buf, np.uint32(topk), np.uint32(bs), x_idx_buf, y_idx_buf)

#evt.wait()
resx = np.zeros(bs*topk*topk).astype(np.float32)
resxidx = np.zeros(bs*topk).astype(np.uint32)
resyidx = np.zeros(bs*topk).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7fccfc42ef40>

knl(queue, [dim], None, y_buf, x_buf, x_cp_buft, np.uint32(topk), np.uint32(bs), x_idx_buft, y_idx_buft)

#evt.wait()
resx = np.zeros(bs*topk*topk).astype(np.float32)
resxidx = np.zeros(bs*topk).astype(np.uint32)
resyidx = np.zeros(bs*topk).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

In [135]:
x

array([[0.23743789, 0.84827596, 0.07081494, 0.47835743, 0.6848672 ,
        0.47584036, 0.2269264 , 0.10315062],
       [0.6238609 , 0.94381785, 0.5939783 , 0.29716107, 0.7217284 ,
        0.96478915, 0.98836243, 0.8964571 ],
       [0.87454623, 0.16888846, 0.28714436, 0.21900587, 0.02899518,
        0.7505003 , 0.7352158 , 0.60820794],
       [0.42916727, 0.26459584, 0.8398711 , 0.73737067, 0.4224287 ,
        0.1721209 , 0.23654433, 0.17798318]], dtype=float32)

In [136]:
y

array([[6.3968706e-05, 4.8997143e-01, 7.9112959e-01, 8.6863035e-01,
        5.6950593e-01, 7.5062472e-01, 8.5974175e-01, 4.9627638e-01],
       [6.4751649e-01, 5.8691168e-01, 9.7601062e-01, 8.5458517e-02,
        8.4434569e-01, 8.0659783e-01, 9.8088032e-01, 8.1364113e-01],
       [9.0899855e-01, 2.3904254e-01, 6.7353302e-01, 1.4023180e-01,
        1.7416324e-01, 7.3531561e-02, 9.2328030e-01, 9.8882484e-01],
       [7.3330247e-01, 5.0152808e-01, 5.9703894e-02, 6.8768448e-01,
        3.4756634e-01, 6.7819726e-01, 4.1137290e-01, 6.4349568e-01]],
      dtype=float32)

In [137]:
x.shape, y.shape

((4, 8), (4, 8))

In [138]:
resx

array([0.7368382 , 0.59489644, 0.72929823, 0.58880895, 0.96946526,
       0.9463427 , 0.96465224, 0.94164443, 0.86477304, 0.74211335,
       0.8074513 , 0.6929222 , 0.61587954, 0.54071575, 0.5775663 ,
       0.50707835], dtype=float32)

In [139]:
resx.reshape(bs,topk,topk)

array([[[0.7368382 , 0.59489644],
        [0.72929823, 0.58880895]],

       [[0.96946526, 0.9463427 ],
        [0.96465224, 0.94164443]],

       [[0.86477304, 0.74211335],
        [0.8074513 , 0.6929222 ]],

       [[0.61587954, 0.54071575],
        [0.5775663 , 0.50707835]]], dtype=float32)

In [140]:
resxidx

array([1, 4, 6, 5, 0, 5, 2, 3], dtype=uint32)

In [141]:
resyidx

array([3, 6, 6, 2, 7, 6, 0, 3], dtype=uint32)

In [142]:
idx = 1
xy0 = x[idx].reshape(dim,1)*y[idx]
xy0.shape

(8, 8)

In [143]:
xy0[3][7]

0.24178247

## Weight update kernel new

In [144]:
b

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.79806799e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80055600e-01,
        7.96413183e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.50798619e-01, 0.00000000e+00, 2.20665541e-02, 0.00000000e+00,
        2.88000137e-01, 0.00000000e+00, 0.00000000e+00, 1.40719712e-01,
        0.00000000e+00, 3.51273656e-01, 3.27207237e-01, 0.00000000e+00],
       [8.35044205e-01, 3.38727891e-01, 0.00000000e+00, 3.54924470e-01,
        1.99968800e-01, 5.70148349e-01, 0.00000000e+00, 2.30531722e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.21350515e-01, 3.68743688e-01, 4.29615289e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.99728179e-01, 7.84764469e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.0620

In [145]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [146]:
at

array([[0.52390957, 0.3209946 , 0.28137836, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.51713663, 0.        ,
        0.02384352, 0.64305884, 0.97078615, 0.        , 0.27334523,
        0.        ],
       [0.        , 0.        , 0.80891   , 0.        , 0.        ,
        0.        , 0.17786872, 0.        , 0.        , 0.        ,
        0.69861746, 0.        , 0.57698685, 0.        , 0.        ,
        0.        ],
       [0.28779745, 0.37998632, 0.        , 0.        , 0.8348913 ,
        0.02476312, 0.        , 0.        , 0.11436905, 0.        ,
        0.61044616, 0.        , 0.4466196 , 0.        , 0.        ,
        0.12572199],
       [0.        , 0.        , 0.        , 0.83257234, 0.        ,
        0.23834811, 0.        , 0.        , 0.        , 0.09053238,
        0.73914844, 0.        , 0.        , 0.        , 0.5414916 ,
        0.6565848 ],
       [0.5066265 , 0.35968715, 0.4850459 , 0.        , 0.        ,
        0.        , 0.        , 

In [147]:
c

array([[0.52390957, 0.32099459, 0.28137836, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.51713663, 0.        ,
        0.02384352, 0.64305884, 0.97078615, 0.        , 0.27334523,
        0.        ],
       [0.        , 0.        , 0.80891001, 0.        , 0.        ,
        0.        , 0.17786872, 0.        , 0.        , 0.        ,
        0.69861746, 0.        , 0.57698685, 0.        , 0.        ,
        0.        ],
       [0.28779745, 0.37998632, 0.        , 0.        , 0.83489132,
        0.02476312, 0.        , 0.        , 0.11436905, 0.        ,
        0.61044616, 0.        , 0.4466196 , 0.        , 0.        ,
        0.12572199],
       [0.        , 0.        , 0.        , 0.83257234, 0.        ,
        0.23834811, 0.        , 0.        , 0.        , 0.09053238,
        0.73914844, 0.        , 0.        , 0.        , 0.54149163,
        0.6565848 ],
       [0.50662649, 0.35968715, 0.48504591, 0.        , 0.        ,
        0.        , 0.        , 

a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate3(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             uint topk,
                             uint msize,
                             __global  float* xout,    // INPUT
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      uint bs = get_global_size(1);
      uint gid2 = get_global_id(1);



      uint idx = n*gid2+gid;
      float valx = x[idx];
      uint posx = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval = x[idx2];
        bool larger = tempval > valx;
        posx += (larger)?1:0;
      }
      
      uint idxy = n*gid2+gid;
      float valy = y[idx];
      uint posy = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval2 = y[idx2];
        bool larger2 = tempval2 > valy;
        posy += (larger2)?1:0;
      }
      
      if (posx < topk) {
        xoutidx[posx+topk*gid2] = idx;
      }
      if (posy < topk) {
        youtidx[posy+topk*gid2] = idxy;
      }
      return;
      if (gid < topk) {
        for (uint j=0; j<topk; j++) {
          float res = x[xoutidx[gid+topk*gid2]+gid2*msize] * y[youtidx[j+topk*gid2]+gid2*msize];
          printf("\\nJ:%i  gid:(%i,%i)", j, gid, gid2);
          printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
          //barrier(CLK_GLOBAL_MEM_FENCE);
          xout[gid2*topk*topk+j*topk+gid] = res;
        }
      }
    }""").build()

In [148]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate3(__global  float* x,     // INPUT MATRIX DATA
                              __global  float* y,    // INPUT
                              __global  float* xsum,    // INPUT
                              __global  float* ysum,    // INPUT
                              uint isize,
                              uint msize,
                              uint osize,
                              uint topk,
                              __global  float* xout,
                              __global  uint* xoutidx,   
                              __global  uint* youtidx    
                              ) { 
      uint gid = get_global_id(0);
      
      // get for a: sum axis0  b: sum axis1 then get topk
      ///////////////////////////////////////////////////
      if (gid < isize) {
        xsum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = x[i*isize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
          //}
          xsum[gid] += val;
        }
        
        float valx = xsum[gid];
        uint posx = 0;
        for (uint i = 0; i < isize; i++) {
          float tempval = xsum[i];
          bool larger = tempval > valx;
          posx += (larger)?1:0;
        }
        if (posx < topk) {
          xoutidx[posx] = gid;
        }
      }
      
      if (gid < osize) {
        ysum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = y[i*osize+gid];
          if (gid == 0) {
            printf("\\nADD VALx: %.2f - %i", val, gid*osize+i);
          }
          ysum[gid] += val;
        }
        
        float valy = ysum[gid];
        uint posy = 0;
        for (uint i = 0; i < osize; i++) {
          float tempval = ysum[i];
          bool larger = tempval > valy;
          posy += (larger)?1:0;
        }
      
        if (posy < topk) {
          youtidx[posy] = gid;
        }
      }
      
      if (gid < topk) {
        float valx = xoutidx[gid];
        uint posx = 0;
        for (uint i = 0; i < topk; i++) {
          float tempval = xoutidx[i];
          bool larger = tempval < valx;
          posx += (larger)?1:0;
        }
        xoutidx[posx] = valx;
        
        float valy = youtidx[gid];
        uint posy = 0;
        for (uint i = 0; i < topk; i++) {
          float tempval = youtidx[i];
          bool larger = tempval < valy;
          posy += (larger)?1:0;
        }
        youtidx[posy] = valy;
      }
      
      // only calc matrix multiplications for used grads
      ///////////////////////////////////////////////////
      if (gid < topk) {
        uint idxx = xoutidx[gid];
        for (uint j=0; j<topk; j++) {
          uint idxy = youtidx[j];
          xout[j*topk+gid] = 0;
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            xout[j*topk+gid] += x[xidx2] * y[yidx2];
            //if (gid == 0 && j == 1)
            //  printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
        }
      }
    }""").build()

In [149]:
a.shape, b.shape

((16, 32), (32, 24))

In [150]:
rows = a.shape[0]
msize = a.shape[1]

In [151]:
cols = b.shape[1]

In [152]:
mult = a.dot(b)

In [153]:
mult = mult.astype(np.float32)

In [154]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.genwupdate3  # Use this Kernel object for repeated calls
evt = knl(queue, [max(rows,cols)], None, a_buf, b_buf, x_sum_buf, y_sum_buf, np.uint32(rows), np.uint32(msize),np.uint32(cols), np.uint32(topk), x_cp_buf, x_idx_buf, y_idx_buf)

resxsum = np.zeros(a.shape[0]).astype(np.float32)
resysum = np.zeros(b.shape[1]).astype(np.float32)
resx = np.zeros(topk*topk).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)

cl.enqueue_copy(queue, resxsum, x_sum_buf)
cl.enqueue_copy(queue, resysum, y_sum_buf)
cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)


ADD VALx: 0.00 - 0
ADD VALx: 0.84 - 1
ADD VALx: 0.00 - 2
ADD VALx: 0.77 - 3
ADD VALx: 0.00 - 4
ADD VALx: 0.00 - 5
ADD VALx: 0.00 - 6
ADD VALx: 0.00 - 7
ADD VALx: 0.00 - 8
ADD VALx: 0.60 - 9
ADD VALx: 0.00 - 10
ADD VALx: 0.02 - 11
ADD VALx: 0.00 - 12
ADD VALx: 0.00 - 13
ADD VALx: 0.56 - 14
ADD VALx: 0.81 - 15
ADD VALx: 0.12 - 16
ADD VALx: 0.00 - 17
ADD VALx: 0.00 - 18
ADD VALx: 0.53 - 19
ADD VALx: 0.77 - 20
ADD VALx: 0.19 - 21
ADD VALx: 0.36 - 22
ADD VALx: 0.29 - 23
ADD VALx: 0.00 - 24
ADD VALx: 0.26 - 25
ADD VALx: 0.00 - 26
ADD VALx: 0.00 - 27
ADD VALx: 0.02 - 28
ADD VALx: 0.00 - 29
ADD VALx: 0.00 - 30
ADD VALx: 0.00 - 31

<pyopencl._cl.NannyEvent at 0x7fccfc43aef0>

In [155]:
resx.reshape(topk,topk)

array([[2.657633 , 2.039695 ],
       [2.5944054, 1.9188594]], dtype=float32)

In [156]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [157]:
b.T

array([[0.00000000e+00, 8.35044205e-01, 0.00000000e+00, 7.72912621e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 6.04525506e-01, 0.00000000e+00, 2.04175711e-02,
        0.00000000e+00, 0.00000000e+00, 5.55010617e-01, 8.11873853e-01,
        1.16767600e-01, 0.00000000e+00, 0.00000000e+00, 5.30311882e-01,
        7.69208670e-01, 1.91230401e-01, 3.60188931e-01, 2.91794538e-01,
        0.00000000e+00, 2.58038431e-01, 0.00000000e+00, 0.00000000e+00,
        1.59830544e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.38727891e-01, 0.00000000e+00, 5.01399815e-01,
        1.78162962e-01, 6.14967883e-01, 0.00000000e+00, 4.79711026e-01,
        9.56434608e-01, 0.00000000e+00, 8.76694322e-01, 9.48271155e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.74078238e-01,
        0.00000000e+00, 9.01507556e-01, 0.00000000e+00, 1.86964422e-01,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000

In [158]:
resxsum

array([6.5683227, 5.967158 , 6.3542824, 6.185669 , 6.443426 , 6.5472393,
       7.110329 , 4.9253597, 4.0015106, 7.1791635, 5.3589845, 6.7283797,
       6.3908844, 5.90038  , 6.876007 , 5.044566 ], dtype=float32)

In [159]:
resysum

array([6.133309 , 9.435176 , 3.9057505, 6.389828 , 5.8149047, 6.49338  ,
       5.4250383, 6.988711 , 3.6132226, 7.40563  , 2.1524422, 5.816851 ,
       6.396004 , 6.123145 , 3.6306083, 6.3584247, 5.05509  , 5.551777 ,
       3.923165 , 8.899167 , 2.741568 , 5.112133 , 7.0280957, 7.638768 ],
      dtype=float32)

In [160]:
a.sum(axis=1)

array([6.5683227, 5.9671583, 6.3542824, 6.1856694, 6.4434257, 6.5472393,
       7.1103287, 4.9253597, 4.0015106, 7.1791635, 5.358984 , 6.7283792,
       6.3908844, 5.90038  , 6.876007 , 5.044566 ], dtype=float32)

In [161]:
b.sum(axis=0)

array([6.133309 , 9.435176 , 3.9057505, 6.389828 , 5.8149047, 6.49338  ,
       5.4250383, 6.988711 , 3.6132226, 7.40563  , 2.1524422, 5.816851 ,
       6.396004 , 6.123145 , 3.6306083, 6.3584247, 5.05509  , 5.551777 ,
       3.923165 , 8.899167 , 2.741568 , 5.112133 , 7.0280957, 7.638768 ],
      dtype=float32)

In [162]:
mult

array([[1.6294171 , 0.3652555 , 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791257 , 1.466192  ,
        2.3058922 , 0.8139765 , 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [163]:
resxidx

array([6, 9], dtype=uint32)

In [164]:
resyidx

array([ 1, 19], dtype=uint32)

In [165]:
idx = 1
xy0 = x[idx].reshape(dim,1)*y[idx]
xy0.shape

(8, 8)

In [166]:
xy0[0][0]

0.40396023

## Weight update kernel new2 (sparse ouput)

In [167]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [168]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)
sdata_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topk*4)
sidxs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topk*4)
snnzs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*4)
sdatat_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topk*4)
sidxst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topk*4)
snnzst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate4(__global  float* x,     // INPUT MATRIX DATA
                              __global  float* y,    // INPUT
                              __global  float* xsum,    // INPUT
                              __global  float* ysum,    // INPUT
                              uint isize,
                              uint msize,
                              uint osize,
                              uint topk,
                              __global  uint*  xoutidx,
                              __global  uint*  youtidx,
                              __global  float* matData,     // INPUT MATRIX DATA
                              __global  uint*  colIdx,
                              __global  uint*  rowNnz,
                              __global  float* matDatat,     // INPUT MATRIX DATA
                              __global  uint*  colIdxt,
                              __global  uint*  rowNnzt
                              ) { 
      uint gid = get_global_id(0);
      
      // get for a: sum axis0  b: sum axis1 then get topk
      ///////////////////////////////////////////////////
      if (gid < isize) {
        xsum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = x[i*isize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
          //}
          xsum[gid] += val;
        }
        
        float valx = xsum[gid];
        uint posx = 0;
        for (uint i = 0; i < isize; i++) {
          float tempval = xsum[i];
          bool larger = tempval > valx;
          posx += (larger)?1:0;
        }
        if (posx < topk) {
          xoutidx[posx] = gid;
        }
      }
      
      if (gid < osize) {
        ysum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = y[i*osize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, gid*osize+i);
          //}
          ysum[gid] += val;
        }
        
        float valy = ysum[gid];
        uint posy = 0;
        for (uint i = 0; i < osize; i++) {
          float tempval = ysum[i];
          bool larger = tempval > valy;
          posy += (larger)?1:0;
        }
      
        if (posy < topk) {
          youtidx[posy] = gid;
        }
      }
      
      if (gid < topk) {
        float valx = xoutidx[gid];
        uint posx = 0;
        for (uint i = 0; i < topk; i++) {
          float tempval = xoutidx[i];
          bool larger = tempval < valx;
          posx += (larger)?1:0;
        }
        xoutidx[posx] = valx;
        
        float valy = youtidx[gid];
        uint posy = 0;
        for (uint i = 0; i < topk; i++) {
          float tempval = youtidx[i];
          bool larger = tempval < valy;
          posy += (larger)?1:0;
        }
        youtidx[posy] = valy;
      }
      
      // only calc matrix multiplications for used grads
      ///////////////////////////////////////////////////
      if (gid < isize) {
        for (uint i=0; i<topk; i++) {
          matData[gid*topk+i] = 0;
          colIdx[gid*topk+i] = 0;
        }
        rowNnz[gid] = 0;  
      }
      if (gid < osize) {
        for (uint i=0; i<topk; i++) {
          matDatat[gid*topk+i] = 0;
          colIdxt[gid*topk+i] = 0;
        }
        rowNnzt[gid] = 0;  
      }
      
      
      if (gid < topk) {
        uint idxx = xoutidx[gid];
        for (uint j=0; j<topk; j++) {
          uint idxy = youtidx[j];
          //printf("\\nIDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            uint colidx = idxy;
            matData[idxx*topk+j] += x[xidx2] * y[yidx2];
            colIdx[idxx*topk+j] = idxy;
            //if (gid == 0)
            //  printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnz[idxx] += 1;
        }
      }
      
      if (gid < topk) {
        uint idxy = youtidx[gid];
        for (uint j=0; j<topk; j++) {
          uint idxx = xoutidx[j];
          printf("\\nB-IDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            uint colidx = idxy;
            float addval = x[xidx2] * y[yidx2];
            matDatat[idxy*topk+j] += addval;
            colIdxt[idxy*topk+j] = idxx;
            if (gid == 0)
              printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnzt[idxy] += 1;
          printf("\\nAdd NNz:%i - %i", idxy, rowNnzt[idxy]);
        }
      }
    }""").build()

In [169]:
a.shape, b.shape

((16, 32), (32, 24))

In [170]:
rows = a.shape[0]
msize = a.shape[1]

In [171]:
cols = b.shape[1]

In [172]:
mult = a.dot(b)

In [173]:
mult = mult.astype(np.float32)

In [174]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.genwupdate4  # Use this Kernel object for repeated calls
evt = knl(queue, [max(rows,cols)], None, a_buf, b_buf, x_sum_buf, y_sum_buf, np.uint32(rows), np.uint32(msize),np.uint32(cols), 
          np.uint32(topk), x_idx_buf, y_idx_buf, sdata_buf, sidxs_buf, snnzs_buf, sdatat_buf, sidxst_buf, snnzst_buf)


B-IDXX:6  IDXY:1
B-IDXX:6  IDXY:19
 ADD VAL:0.00,0.00 - (6,1) - (0,0,0)
 ADD VAL:0.18,0.34 - (6,1) - (0,0,1)
 ADD VAL:0.00,0.00 - (6,1) - (0,0,2)
 ADD VAL:0.00,0.50 - (6,1) - (0,0,3)
 ADD VAL:0.00,0.18 - (6,1) - (0,0,4)
 ADD VAL:0.00,0.61 - (6,1) - (0,0,5)
 ADD VAL:0.68,0.00 - (6,1) - (0,0,6)
 ADD VAL:0.00,0.48 - (6,1) - (0,0,7)
 ADD VAL:0.00,0.96 - (6,1) - (0,0,8)
 ADD VAL:0.00,0.00 - (6,1) - (0,0,9)
 ADD VAL:0.35,0.88 - (6,1) - (0,0,10)
 ADD VAL:0.00,0.95 - (6,1) - (0,0,11)
 ADD VAL:0.00,0.00 - (6,1) - (0,0,12)
 ADD VAL:0.72,0.00 - (6,1) - (0,0,13)
 ADD VAL:0.00,0.00 - (6,1) - (0,0,14)
 ADD VAL:0.98,0.97 - (6,1) - (0,0,15)
 ADD VAL:0.00,0.00 - (6,1) - (0,0,16)
 ADD VAL:0.73,0.90 - (6,1) - (0,0,17)
 ADD VAL:0.00,0.00 - (6,1) - (0,0,18)
 ADD VAL:0.00,0.19 - (6,1) - (0,0,19)
 ADD VAL:0.00,0.00 - (6,1) - (0,0,20)
 ADD VAL:0.00,0.00 - (6,1) - (0,0,21)
 ADD VAL:0.25,0.00 - (6,1) - (0,0,22)
 ADD VAL:0.57,0.00 - (6,1) - (0,0,23)
 ADD VAL:0.89,0.00 - (6,1) - (0,0,24)
 ADD VAL:0.00,0.00 - (6,

In [175]:
resxsum = np.zeros(a.shape[0]).astype(np.float32)
resysum = np.zeros(b.shape[1]).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)
resxdat = np.zeros(a.shape[0]*topk).astype(np.float32)
resxcol = np.zeros(a.shape[0]*topk).astype(np.uint32)
resxnnz = np.zeros(a.shape[0]).astype(np.uint32)
resxdatt = np.zeros(b.shape[1]*topk).astype(np.float32)
resxcolt = np.zeros(b.shape[1]*topk).astype(np.uint32)
resxnnzt = np.zeros(b.shape[1]).astype(np.uint32)

cl.enqueue_copy(queue, resxsum, x_sum_buf)
cl.enqueue_copy(queue, resysum, y_sum_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)
cl.enqueue_copy(queue, resxdat, sdata_buf)
cl.enqueue_copy(queue, resxcol, sidxs_buf)
cl.enqueue_copy(queue, resxnnz, snnzs_buf)
cl.enqueue_copy(queue, resxdatt, sdatat_buf)
cl.enqueue_copy(queue, resxcolt, sidxst_buf)
cl.enqueue_copy(queue, resxnnzt, snnzst_buf)

<pyopencl._cl.NannyEvent at 0x7fccfc3df040>

## results

In [176]:
mult.T

array([[1.6294171 , 0.18166207, 1.1198102 , 2.3589926 , 1.0459102 ,
        1.3025382 , 1.2000593 , 0.9868014 , 0.4365524 , 1.6035323 ,
        1.5755153 , 0.9047202 , 0.8062944 , 1.1431439 , 0.8117372 ,
        1.9208248 ],
       [0.3652555 , 2.6036541 , 2.5431209 , 1.8276073 , 1.2633338 ,
        2.081274  , 2.657633  , 1.4677691 , 1.3792473 , 2.039695  ,
        1.6275084 , 2.4527457 , 1.4064335 , 2.009085  , 1.793393  ,
        1.691347  ],
       [0.42776406, 1.1487687 , 0.07278372, 0.47929585, 1.3355373 ,
        0.02237026, 1.0820868 , 0.95165986, 0.11597993, 1.3025286 ,
        1.4566172 , 0.5637467 , 1.2921878 , 1.1349369 , 0.4503973 ,
        0.8986734 ],
       [1.6028032 , 1.3172693 , 1.1791257 , 2.0679436 , 1.1406506 ,
        1.186177  , 0.89772534, 1.5635384 , 0.30894303, 0.7343426 ,
        1.354146  , 0.8724855 , 1.3893279 , 0.8441851 , 0.8408757 ,
        1.5497655 ],
       [1.0215062 , 1.0539525 , 1.466192  , 0.8810755 , 0.8951473 ,
        1.8429136 , 0.99431485, 

In [177]:
resxdat.reshape(a.shape[0],topk)

array([[0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [2.657633 , 2.5944054],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [2.039695 , 1.9188594],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ]], dtype=float32)

In [178]:
resxcol.reshape(a.shape[0],topk)

array([[ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 1, 19],
       [ 0,  0],
       [ 0,  0],
       [ 1, 19],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0]], dtype=uint32)

In [179]:
resxnnz.reshape(a.shape[0])

array([0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0], dtype=uint32)

In [180]:
resxdatt.reshape(b.shape[1],topk)

array([[0.       , 0.       ],
       [2.657633 , 2.039695 ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [2.5944054, 1.9188594],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ],
       [0.       , 0.       ]], dtype=float32)

In [181]:
resxcolt.reshape(b.shape[1],topk)

array([[0, 0],
       [6, 9],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [6, 9],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0]], dtype=uint32)

In [182]:
resxnnzt.reshape(b.shape[1])

array([0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       0, 0], dtype=uint32)

## comp

In [183]:
resxsum

array([6.5683227, 5.967158 , 6.3542824, 6.185669 , 6.443426 , 6.5472393,
       7.110329 , 4.9253597, 4.0015106, 7.1791635, 5.3589845, 6.7283797,
       6.3908844, 5.90038  , 6.876007 , 5.044566 ], dtype=float32)

In [184]:
a.sum(axis=1)

array([6.5683227, 5.9671583, 6.3542824, 6.1856694, 6.4434257, 6.5472393,
       7.1103287, 4.9253597, 4.0015106, 7.1791635, 5.358984 , 6.7283792,
       6.3908844, 5.90038  , 6.876007 , 5.044566 ], dtype=float32)

In [185]:
resysum

array([6.133309 , 9.435176 , 3.9057505, 6.389828 , 5.8149047, 6.49338  ,
       5.4250383, 6.988711 , 3.6132226, 7.40563  , 2.1524422, 5.816851 ,
       6.396004 , 6.123145 , 3.6306083, 6.3584247, 5.05509  , 5.551777 ,
       3.923165 , 8.899167 , 2.741568 , 5.112133 , 7.0280957, 7.638768 ],
      dtype=float32)

In [186]:
b.sum(axis=0)

array([6.133309 , 9.435176 , 3.9057505, 6.389828 , 5.8149047, 6.49338  ,
       5.4250383, 6.988711 , 3.6132226, 7.40563  , 2.1524422, 5.816851 ,
       6.396004 , 6.123145 , 3.6306083, 6.3584247, 5.05509  , 5.551777 ,
       3.923165 , 8.899167 , 2.741568 , 5.112133 , 7.0280957, 7.638768 ],
      dtype=float32)

In [187]:
mult

array([[1.6294171 , 0.3652555 , 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791257 , 1.466192  ,
        2.3058922 , 0.8139765 , 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [188]:
resxidx

array([6, 9], dtype=uint32)

In [189]:
resyidx

array([ 1, 19], dtype=uint32)

In [190]:
idx = 1
xy0 = x[idx].reshape(dim,1)*y[idx]
xy0.shape

(8, 8)

In [191]:
xy0[0][0]

0.40396023

### Update Vals (add sparse)

In [192]:
multdata, multcols, multnnz, multellw = to_data(mult)
multdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multdata)
multcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multcols)
multnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multnnz)


prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddense(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            __global  float* matDataAdd,     // INPUT MATRIX DATA
                            __global  uint*  colIdxAdd,
                            __global  uint*  rowNnzAdd,
                            uint ellwidthAdd
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      uint nnz    = rowNnz[gid];
      
      uint baseidxs = gid*ellwidth;
      uint baseidxd = gid*ellwidthAdd;
      
      uint nnzadd = rowNnzAdd[gid];
      printf("\\nNNZs: %i   GID:%i", nnzadd, gid);
      
      for (uint i=0; i<nnzadd; i++) {
        float addval = matDataAdd[baseidxd+i];
        uint addcol = colIdxAdd[baseidxd+i];
        
        uint refcol = colIdx[baseidxs+i];
        uint m = 0;
        while (addcol > refcol) {
          m += 1;
          refcol = colIdx[baseidxs+i+m];
        }
        
        //printf("\\nADD VAL:%.2f  ADDCOL:%i  idxs/d:(%i/%i)  gid/i:(%i/%i)", addval, addcol, baseidxs, baseidxd, gid,i);
        if (addval == 0.0) {
          //printf("\\nZERO VAL, CONT: %.2f - %i", addval, gid);
          continue;
        }
        if (addcol == refcol) {
          matData[baseidxs+i+m] += addval;
          printf("\\nINCREMENT: %.2f",addval);
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (addcol > refcol) {
            rowNnz[gid] += 1;
            printf("\\nSET VAL0:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i+m] = addval;
            colIdx[baseidxs+i+m] = addcol;
            continue;
          }
          for (uint j=nnz; j>i+m; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          
          printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i+m] = addval;
          colIdx[baseidxs+i+m] = addcol;
          if (nnz >= ellwidth)
            break;
        }
      }
    }""").build()

In [193]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [194]:
rows = a.shape[0]

In [195]:
mult = mult.astype(np.float32)

In [196]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, multdata_buf, multcols_buf, multnnzs_buf, np.float32(1), np.uint32(multellw), 
    sdata_buf, sidxs_buf, snnzs_buf, np.uint32(topk))

<pyopencl._cl.Event at 0x7fccfc448e00>


NNZs: 0   GID:0
NNZs: 0   GID:1
NNZs: 0   GID:2
NNZs: 0   GID:3
NNZs: 0   GID:4
NNZs: 0   GID:5
NNZs: 2   GID:6
NNZs: 0   GID:7
NNZs: 0   GID:8
NNZs: 2   GID:9
NNZs: 0   GID:10
NNZs: 0   GID:11
NNZs: 0   GID:12
NNZs: 0   GID:13
NNZs: 0   GID:14
NNZs: 0   GID:15
INCREMENT: 2.66
INCREMENT: 2.04
INCREMENT: 2.59
INCREMENT: 1.92

In [197]:
mult

array([[1.6294171 , 0.3652555 , 0.42776406, 1.6028032 , 1.0215062 ,
        0.12642185, 2.0452654 , 2.042922  , 0.9641465 , 1.0109627 ,
        0.9518984 , 1.0344464 , 1.2077466 , 0.82906187, 0.8158375 ,
        1.7826574 , 1.6191896 , 0.99687296, 0.6931111 , 1.9219018 ,
        1.3466228 , 1.2641569 , 1.7963077 , 1.736011  ],
       [0.18166207, 2.6036541 , 1.1487687 , 1.3172693 , 1.0539525 ,
        1.5085787 , 0.49086148, 0.5046138 , 1.3540616 , 2.1261594 ,
        0.753877  , 1.4574277 , 0.5758329 , 1.258477  , 0.5482328 ,
        1.0452143 , 1.6999059 , 0.26990923, 0.6382393 , 2.4425354 ,
        0.        , 0.68067676, 2.069395  , 0.26347846],
       [1.1198102 , 2.5431209 , 0.07278372, 1.1791257 , 1.466192  ,
        2.3058922 , 0.8139765 , 1.4870273 , 0.22409344, 1.109697  ,
        0.42799425, 1.2300221 , 1.3703419 , 1.8662343 , 0.67573184,
        1.0109664 , 1.333266  , 1.3154588 , 0.40825096, 1.1815734 ,
        0.586634  , 1.1518855 , 1.0570257 , 1.1085757 ],
       [2.358

In [198]:
data_res = np.empty_like(multdata)
cols_res = np.empty_like(multcols)
nnzs_res = np.empty_like(multnnz)
cl.enqueue_copy(queue, data_res, multdata_buf, is_blocking=True)
cl.enqueue_copy(queue, cols_res, multcols_buf, is_blocking=True)
cl.enqueue_copy(queue, nnzs_res, multnnzs_buf, is_blocking=True)

<pyopencl._cl.NannyEvent at 0x7fccfc3e9f90>

In [199]:
adenseadd = to_dense(data_res, cols_res, nnzs_res, multellw, mult.shape)
adenseadd.T

array([[1.62941706, 0.18166207, 1.11981022, 2.35899258, 1.04591024,
        1.30253816, 1.20005929, 0.98680139, 0.43655241, 1.60353231,
        1.57551527, 0.90472019, 0.80629438, 1.14314389, 0.81173718,
        1.92082477],
       [0.3652555 , 2.60365415, 2.54312086, 1.82760727, 1.2633338 ,
        2.08127403, 5.31526613, 1.46776915, 1.37924731, 4.07939005,
        1.6275084 , 2.45274568, 1.40643346, 2.00908494, 1.79339302,
        1.691347  ],
       [0.42776406, 1.14876866, 0.07278372, 0.47929585, 1.33553731,
        0.02237026, 1.0820868 , 0.95165986, 0.11597993, 1.30252862,
        1.45661724, 0.56374669, 1.29218781, 1.13493693, 0.45039731,
        0.89867342],
       [1.60280323, 1.31726933, 1.17912567, 2.06794357, 1.14065063,
        1.18617702, 0.89772534, 1.56353843, 0.30894303, 0.73434258,
        1.354146  , 0.87248552, 1.38932788, 0.84418511, 0.84087569,
        1.54976547],
       [1.02150619, 1.05395246, 1.46619201, 0.8810755 , 0.89514732,
        1.84291363, 0.99431485, 

In [200]:
mult-adenseadd

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
  

### Update Vals (add sparset)

In [254]:
multdata, multcols, multnnz, multellw = to_data(mult)
multdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multdata)
multcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multcols)
multnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multnnz)


prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddense(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            __global  float* matDataAdd,     // INPUT MATRIX DATA
                            __global  uint*  colIdxAdd,
                            __global  uint*  rowNnzAdd,
                            uint ellwidthAdd
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      uint nnz    = rowNnz[gid];
      
      uint baseidxs = gid*ellwidth;
      uint baseidxd = gid*ellwidthAdd;
      
      uint nnzadd = rowNnzAdd[gid];
      printf("\\nNNZs: %i   GID:%i", nnzadd, gid);
      
      
      for (uint i=0; i<nnzadd; i++) {
        float addval = 0;//matDataAdd[baseidxd+i];
        uint addcol = colIdxAdd[baseidxd+i];
        
        uint refcol = colIdx[baseidxs+i];
        uint m = 0;
        while (addcol > refcol) {
          m += 1;
          refcol = colIdx[baseidxs+i+m];
        }
        
        //printf("\\nADD VAL:%.2f  ADDCOL:%i  idxs/d:(%i/%i)  gid/i:(%i/%i)", addval, addcol, baseidxs, baseidxd, gid,i);
        if (addval == 0.0) {
          //printf("\\nZERO VAL, CONT: %.2f - %i", addval, gid);
          continue;
        }
        if (addcol == refcol) {
          matData[baseidxs+i+m] += addval;
          printf("\\nINCREMENT: %.2f",addval);
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (addcol > refcol) {
            rowNnz[gid] += 1;
            printf("\\nSET VAL0:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i+m] = addval;
            colIdx[baseidxs+i+m] = addcol;
            continue;
          }
          for (uint j=nnz; j>i+m; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          
          printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i+m] = addval;
          colIdx[baseidxs+i+m] = addcol;
          if (nnz >= ellwidth)
            break;
        }
      }
    }""").build()

In [255]:
multt=np.zeros(mult.T.shape)

for row in range(multt.shape[0]):
    for col in range(multt.shape[1]):
        multt[row][col] = mult[col][row]

In [256]:
multdata, multcols, multnnz, multellw = to_data(multt)
multdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multdata)
multcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multcols)
multnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multnnz)

In [257]:
a.shape, b.shape

((16, 32), (32, 24))

In [258]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [259]:
rows = mult.T.shape[0]

In [260]:
mult = mult.astype(np.float32)

In [261]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, multdata_buf, multcols_buf, multnnzs_buf, np.float32(1), np.uint32(multellw), 
    sdatat_buf, sidxst_buf, snnzst_buf, np.uint32(topk))

<pyopencl._cl.Event at 0x7fccfc3e7860>


NNZs: 0   GID:0
NNZs: 2   GID:1
NNZs: 0   GID:2
NNZs: 0   GID:3
NNZs: 0   GID:4
NNZs: 0   GID:5
NNZs: 0   GID:6
NNZs: 0   GID:7
NNZs: 0   GID:8
NNZs: 0   GID:9
NNZs: 0   GID:10
NNZs: 0   GID:11
NNZs: 0   GID:12
NNZs: 0   GID:13
NNZs: 0   GID:14
NNZs: 0   GID:15
NNZs: 0   GID:16
NNZs: 0   GID:17
NNZs: 0   GID:18
NNZs: 2   GID:19
NNZs: 0   GID:20
NNZs: 0   GID:21
NNZs: 0   GID:22
NNZs: 0   GID:23

In [262]:
mult.T

array([[1.6294171 , 0.18166207, 1.1198102 , 2.3589926 , 1.0459102 ,
        1.3025382 , 1.2000593 , 0.9868014 , 0.4365524 , 1.6035323 ,
        1.5755153 , 0.9047202 , 0.8062944 , 1.1431439 , 0.8117372 ,
        1.9208248 ],
       [0.3652555 , 2.6036541 , 2.5431209 , 1.8276073 , 1.2633338 ,
        2.081274  , 2.657633  , 1.4677691 , 1.3792473 , 2.039695  ,
        1.6275084 , 2.4527457 , 1.4064335 , 2.009085  , 1.793393  ,
        1.691347  ],
       [0.42776406, 1.1487687 , 0.07278372, 0.47929585, 1.3355373 ,
        0.02237026, 1.0820868 , 0.95165986, 0.11597993, 1.3025286 ,
        1.4566172 , 0.5637467 , 1.2921878 , 1.1349369 , 0.4503973 ,
        0.8986734 ],
       [1.6028032 , 1.3172693 , 1.1791257 , 2.0679436 , 1.1406506 ,
        1.186177  , 0.89772534, 1.5635384 , 0.30894303, 0.7343426 ,
        1.354146  , 0.8724855 , 1.3893279 , 0.8441851 , 0.8408757 ,
        1.5497655 ],
       [1.0215062 , 1.0539525 , 1.466192  , 0.8810755 , 0.8951473 ,
        1.8429136 , 0.99431485, 

In [263]:
data_res = np.empty_like(multdata)
cols_res = np.empty_like(multcols)
nnzs_res = np.empty_like(multnnz)
cl.enqueue_copy(queue, data_res, multdata_buf, is_blocking=True)
cl.enqueue_copy(queue, cols_res, multcols_buf, is_blocking=True)
cl.enqueue_copy(queue, nnzs_res, multnnzs_buf, is_blocking=True)

<pyopencl._cl.NannyEvent at 0x7fccfc400ef0>

In [264]:
multt-data_res.reshape(multt.shape)

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0

In [211]:
nnzs_res

array([16, 16, 16, 16, 16, 16, 16, 16, 15, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 13, 16, 16, 16], dtype=uint32)

In [212]:
adenseaddt = to_dense(data_res, cols_res, nnzs_res, multellw, multt.shape)
adenseaddt

array([[1.62941706, 0.18166207, 1.11981022, 2.35899258, 1.04591024,
        1.30253816, 1.20005929, 0.98680139, 0.43655241, 1.60353231,
        1.57551527, 0.90472019, 0.80629438, 1.14314389, 0.81173718,
        1.92082477],
       [0.3652555 , 2.60365415, 2.54312086, 1.82760727, 1.2633338 ,
        2.08127403, 5.31526613, 1.46776915, 1.37924731, 4.07939005,
        1.6275084 , 2.45274568, 1.40643346, 2.00908494, 1.79339302,
        1.691347  ],
       [0.42776406, 1.14876866, 0.07278372, 0.47929585, 1.33553731,
        0.02237026, 1.0820868 , 0.95165986, 0.11597993, 1.30252862,
        1.45661724, 0.56374669, 1.29218781, 1.13493693, 0.45039731,
        0.89867342],
       [1.60280323, 1.31726933, 1.17912567, 2.06794357, 1.14065063,
        1.18617702, 0.89772534, 1.56353843, 0.30894303, 0.73434258,
        1.354146  , 0.87248552, 1.38932788, 0.84418511, 0.84087569,
        1.54976547],
       [1.02150619, 1.05395246, 1.46619201, 0.8810755 , 0.89514732,
        1.84291363, 0.99431485, 

In [213]:
multt-adenseaddt

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , -2.65763307,  0.        ,  0.        , -2.03969502,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0

In [214]:
adenseaddt

array([[1.62941706, 0.18166207, 1.11981022, 2.35899258, 1.04591024,
        1.30253816, 1.20005929, 0.98680139, 0.43655241, 1.60353231,
        1.57551527, 0.90472019, 0.80629438, 1.14314389, 0.81173718,
        1.92082477],
       [0.3652555 , 2.60365415, 2.54312086, 1.82760727, 1.2633338 ,
        2.08127403, 5.31526613, 1.46776915, 1.37924731, 4.07939005,
        1.6275084 , 2.45274568, 1.40643346, 2.00908494, 1.79339302,
        1.691347  ],
       [0.42776406, 1.14876866, 0.07278372, 0.47929585, 1.33553731,
        0.02237026, 1.0820868 , 0.95165986, 0.11597993, 1.30252862,
        1.45661724, 0.56374669, 1.29218781, 1.13493693, 0.45039731,
        0.89867342],
       [1.60280323, 1.31726933, 1.17912567, 2.06794357, 1.14065063,
        1.18617702, 0.89772534, 1.56353843, 0.30894303, 0.73434258,
        1.354146  , 0.87248552, 1.38932788, 0.84418511, 0.84087569,
        1.54976547],
       [1.02150619, 1.05395246, 1.46619201, 0.8810755 , 0.89514732,
        1.84291363, 0.99431485, 

In [215]:
adenseadd.T == adenseaddt

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  Tru

### Update Vals (add dense)

In [216]:
matadd = np.random.randn(*a.shape).astype(np.float32)
matadd

array([[ 7.23428428e-01, -1.04790270e+00, -8.03970337e-01,
         1.35833299e+00, -7.94822723e-02, -1.24150507e-01,
        -4.10405278e-01, -2.73382515e-01,  2.11167812e-01,
         1.55882761e-01,  2.96572238e-01,  1.63084042e+00,
         2.58383298e+00, -9.18212533e-02, -7.84058124e-02,
         2.24612430e-01,  5.91340184e-01,  8.15700293e-02,
        -6.49575531e-01,  1.17962074e+00,  7.60069788e-01,
        -6.21060073e-01, -1.72183320e-01,  2.84396350e-01,
        -1.09678495e+00,  2.28973317e+00, -1.07332933e+00,
         1.55521059e+00, -1.42624712e+00, -1.13070333e+00,
        -1.51236224e+00, -1.25393927e+00],
       [-1.13348317e+00,  1.66563213e+00, -4.97895241e-01,
         1.84440285e-01,  1.31422043e+00,  6.09186172e-01,
         1.25180638e+00, -6.50567591e-01, -2.02861857e+00,
        -1.12695003e+00,  6.59119070e-01,  3.66337955e-01,
        -4.01594102e-01,  4.05708462e-01, -2.27029458e-01,
         1.63969755e+00,  8.55524778e-01, -4.23325449e-01,
        -7.10

In [217]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [218]:
a_added = a + matadd

In [219]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
add_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=matadd)

prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddense(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            uint   awidth,
                            __global  float* vector_x    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      uint nnz    = rowNnz[gid];
      uint baseidxs = gid*ellwidth;
      uint baseidxd = gid*awidth;
      
      for (uint i=0; i<awidth; i++) {
        float addval = vector_x[baseidxd+i];
        //if (gid==1)
        //  printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[baseidxs+i]);
        if (addval == 0) {
          continue;
        }
        if (i == colIdx[baseidxs+i]) {
          matData[baseidxs+i] += addval;
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (i > colIdx[baseidxs+i]) {
            rowNnz[gid] += 1;
            //if (gid==1)
            //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i] = addval;
            colIdx[baseidxs+i] = i;
            continue;
          }
          for (uint j=nnz; j>i; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          //if (gid==1)
          //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] = addval;
          colIdx[baseidxs+i] = i;
          if (nnz >= ellwidth)
            break;
        }
      }
    }""").build()

In [220]:
a.shape, b.shape

((16, 32), (32, 24))

In [221]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [222]:
rows = a.shape[0]

In [223]:
mult = mult.astype(np.float32)

In [224]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, adata_buf, acols_buf, annzs_buf, np.float32(1), np.uint32(ellwa),np.uint32(a.shape[1]), add_buf)

<pyopencl._cl.Event at 0x7fccfc483720>

In [225]:
matadd[0][0]

0.7234284

In [226]:
data_res = np.empty_like(adata)
cols_res = np.empty_like(acols)
nnzs_res = np.empty_like(annz)
cl.enqueue_copy(queue, data_res, adata_buf)
cl.enqueue_copy(queue, cols_res, acols_buf)
cl.enqueue_copy(queue, nnzs_res, annzs_buf)

<pyopencl._cl.NannyEvent at 0x7fccfc40b6d0>

In [227]:
adenseadd = to_dense(data_res, cols_res, nnzs_res, ellwa, a.shape)
adenseadd

array([[ 1.24733806, -1.0479027 , -0.51617289,  1.35833299,  0.42714423,
        -0.09564514, -0.41040528, -0.27338251,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.88338578,
         0.        ,  0.72140181,  0.        ,  0.53931004,  0.        ,
         0.70789772,  0.88259971,  0.94836694,  0.        ,  0.        ,
         0.        ,  0.03330261,  0.        ,  0.        ,  0.        ,
         0.50521904,  0.        ],
       [-0.81248856,  1.66563213, -0.11790892,  0.18444028,  1.67390752,
         1.48490345,  1.25180638, -0.65056759,  0.29720682,  0.        ,
         0.        ,  0.71653605,  0.        ,  0.93284988,  0.        ,
         0.        ,  0.16859464,  0.95579499,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.57102251,  0.        ,  0.22954714,  0.        ,  0.        ,
         0.15922078,  0.        ],
       [ 0.32170033, -0.37944043,  0.84287655,  0.2071

In [228]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [229]:
matadd

array([[ 7.23428428e-01, -1.04790270e+00, -8.03970337e-01,
         1.35833299e+00, -7.94822723e-02, -1.24150507e-01,
        -4.10405278e-01, -2.73382515e-01,  2.11167812e-01,
         1.55882761e-01,  2.96572238e-01,  1.63084042e+00,
         2.58383298e+00, -9.18212533e-02, -7.84058124e-02,
         2.24612430e-01,  5.91340184e-01,  8.15700293e-02,
        -6.49575531e-01,  1.17962074e+00,  7.60069788e-01,
        -6.21060073e-01, -1.72183320e-01,  2.84396350e-01,
        -1.09678495e+00,  2.28973317e+00, -1.07332933e+00,
         1.55521059e+00, -1.42624712e+00, -1.13070333e+00,
        -1.51236224e+00, -1.25393927e+00],
       [-1.13348317e+00,  1.66563213e+00, -4.97895241e-01,
         1.84440285e-01,  1.31422043e+00,  6.09186172e-01,
         1.25180638e+00, -6.50567591e-01, -2.02861857e+00,
        -1.12695003e+00,  6.59119070e-01,  3.66337955e-01,
        -4.01594102e-01,  4.05708462e-01, -2.27029458e-01,
         1.63969755e+00,  8.55524778e-01, -4.23325449e-01,
        -7.10

In [230]:
a_added

array([[ 1.247338  , -1.0479027 , -0.5161729 ,  1.358333  ,  0.42714423,
        -0.09564514, -0.41040528, -0.2733825 ,  0.21116781,  0.15588276,
         0.29657224,  1.6308404 ,  2.583833  , -0.09182125,  0.80498   ,
         0.22461243,  1.312742  ,  0.08157003, -0.11026549,  1.1796207 ,
         1.4679675 ,  0.26153964,  0.7761836 ,  0.28439635, -1.096785  ,
         2.2897332 , -1.0400267 ,  1.5552106 , -1.4262471 , -1.1307033 ,
        -1.0071433 , -1.2539393 ],
       [-0.81248856,  1.6656321 , -0.11790892,  0.18444028,  1.6739075 ,
         1.4849035 ,  1.2518064 , -0.6505676 , -1.7314117 , -1.12695   ,
         0.65911907,  1.0828741 , -0.4015941 ,  1.3385583 , -0.22702946,
         1.6396976 ,  1.0241194 ,  0.5324695 , -0.71097183, -0.393672  ,
        -1.964109  , -0.17796814, -0.04434384,  1.976612  , -0.1191747 ,
         1.6145726 , -0.4683473 ,  0.68993616,  0.5613338 ,  0.20387952,
        -0.30872938,  0.42492518],
       [ 0.32170033, -0.37944043,  0.84287655,  0.2071

In [231]:
adenseadd == a_added

array([[ True,  True,  True,  True,  True,  True,  True,  True, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False],
       [ True,  True,  True,  True,  True,  True,  True,  True, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False],
       [ True,  True,  True,  True,  True,  True,  True, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False],
       [ True,  True,  True,  True,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, F

### update vals

In [232]:
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
add_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=matadd)

prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddenset(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            uint   aheight,
                            __global  float* vector_x    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint ncols = get_global_size(0);

      uint nnz    = rowNnz[gid];
      uint baseidxs = gid*ellwidth;
      
      for (uint i=0; i<aheight; i++) {
        if (nnz > ellwidth)
            break;
        uint baseidxd = i*ncols+gid;
        float addval = vector_x[baseidxd];
        //if (gid==1)
        //  printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[baseidxs+i]);
        if (addval == 0) {
          continue;
        }
        if (i == colIdx[baseidxs+i]) {
          printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] += addval;
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (i > colIdx[baseidxs+i]) {
            rowNnz[gid] += 1;
            //if (gid==1)
            //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i] = addval;
            colIdx[baseidxs+i] = i;
            continue;
          }
          for (uint j=nnz; j>i; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          //if (gid==1)
          //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] = addval;
          colIdx[baseidxs+i] = i;
        }
      }
    }""").build()

In [233]:
a.shape, b.shape

((16, 32), (32, 24))

In [234]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [235]:
cols = a.shape[1]

In [236]:
mult = mult.astype(np.float32)

In [237]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddenset  # Use this Kernel object for repeated calls
knl(queue, [cols], None, adatat_buf, acolst_buf, annzst_buf, np.float32(1), np.uint32(ellwat),np.uint32(a.T.shape[1]), add_buf)

<pyopencl._cl.Event at 0x7fccfc4634a0>


ADD VAL:0.72 idx:0/0  col:0
ADD VAL:-0.80 idx:32/2  col:0
ADD VAL:-0.08 idx:64/4  col:0
ADD VAL:-0.12 idx:80/5  col:0
ADD VAL:-0.08 idx:224/14  col:0
ADD VAL:0.59 idx:256/16  col:0
ADD VAL:-0.65 idx:288/18  col:0
ADD VAL:0.76 idx:320/20  col:0
ADD VAL:-0.62 idx:336/21  col:0
ADD VAL:-0.17 idx:352/22  col:0
ADD VAL:-1.07 idx:416/26  col:0
ADD VAL:-1.51 idx:480/30  col:0
ADD VAL:-1.13 idx:1/33  col:1
ADD VAL:-0.50 idx:33/35  col:1
ADD VAL:1.31 idx:65/37  col:1
ADD VAL:0.61 idx:81/38  col:1
ADD VAL:-2.03 idx:129/41  col:1
ADD VAL:0.37 idx:177/44  col:1
ADD VAL:0.41 idx:209/46  col:1
ADD VAL:0.86 idx:257/49  col:1
ADD VAL:-0.42 idx:273/50  col:1
ADD VAL:1.04 idx:401/58  col:1
ADD VAL:0.46 idx:433/60  col:1
ADD VAL:-0.47 idx:481/63  col:1
ADD VAL:0.04 idx:2/66  col:2
ADD VAL:-1.19 idx:18/67  col:2
ADD VAL:-1.37 idx:66/70  col:2
ADD VAL:1.03 idx:114/73  col:2
ADD VAL:-0.48 idx:130/74  col:2
ADD VAL:-0.71 idx:194/78  col:2
ADD VAL:2.45 idx:274/83  col:2
ADD VAL:-0.52 idx:306/85  col:2
ADD VA

In [238]:
matadd[0][0]

0.7234284

In [239]:
datat_res = np.empty_like(adatat)
colst_res = np.empty_like(acolst)
nnzst_res = np.empty_like(annzt)
cl.enqueue_copy(queue, datat_res, adatat_buf)
cl.enqueue_copy(queue, colst_res, acolst_buf)
cl.enqueue_copy(queue, nnzst_res, annzst_buf)

<pyopencl._cl.NannyEvent at 0x7fccfc4639a0>

In [240]:
adenseaddt = to_dense(datat_res, colst_res, nnzst_res, ellwat, a.T.shape).T
adenseaddt

array([[ 1.24733806, -1.0479027 , -0.51617289,  1.35833299,  0.42714423,
        -0.09564514, -0.41040528, -0.27338251,  0.21116781,  0.15588276,
         0.29657224,  1.63084042,  2.58383298, -0.09182125,  0.80497998,
         0.22461243,  1.31274199,  0.08157003, -0.11026549,  1.17962074,
         1.46796751,  0.26153964,  0.77618361,  0.28439635, -1.09678495,
         2.28973317, -1.04002666,  1.55521059, -1.42624712, -1.13070333,
        -1.00714326, -1.25393927],
       [-0.81248856,  1.66563213, -0.11790892,  0.18444028,  1.67390752,
         1.48490345,  1.25180638, -0.65056759, -1.7314117 , -1.12695003,
         0.65911907,  1.08287406, -0.4015941 ,  1.33855832, -0.22702946,
         1.63969755,  1.02411938,  0.53246951, -0.71097183, -0.39367199,
        -1.96410894, -0.17796814, -0.04434384,  1.97661197, -0.1191747 ,
         1.61457264, -0.46834731,  0.68993616,  0.56133378,  0.20387952,
        -0.30872938,  0.42492518],
       [ 0.32170033, -0.37944043,  0.84287655,  0.2071

In [241]:
a

array([[0.52390957, 0.        , 0.28779745, 0.        , 0.5066265 ,
        0.02850537, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.8833858 ,
        0.        , 0.7214018 , 0.        , 0.53931004, 0.        ,
        0.7078977 , 0.8825997 , 0.94836694, 0.        , 0.        ,
        0.        , 0.03330261, 0.        , 0.        , 0.        ,
        0.50521904, 0.        ],
       [0.3209946 , 0.        , 0.37998632, 0.        , 0.35968715,
        0.8757173 , 0.        , 0.        , 0.29720682, 0.        ,
        0.        , 0.71653605, 0.        , 0.9328499 , 0.        ,
        0.        , 0.16859464, 0.955795  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5710225 , 0.        , 0.22954714, 0.        , 0.        ,
        0.15922078, 0.        ],
       [0.28137836, 0.80891   , 0.        , 0.        , 0.4850459 ,
        0.        , 0.        , 0.6968132 , 0.2098

In [242]:
matadd

array([[ 7.23428428e-01, -1.04790270e+00, -8.03970337e-01,
         1.35833299e+00, -7.94822723e-02, -1.24150507e-01,
        -4.10405278e-01, -2.73382515e-01,  2.11167812e-01,
         1.55882761e-01,  2.96572238e-01,  1.63084042e+00,
         2.58383298e+00, -9.18212533e-02, -7.84058124e-02,
         2.24612430e-01,  5.91340184e-01,  8.15700293e-02,
        -6.49575531e-01,  1.17962074e+00,  7.60069788e-01,
        -6.21060073e-01, -1.72183320e-01,  2.84396350e-01,
        -1.09678495e+00,  2.28973317e+00, -1.07332933e+00,
         1.55521059e+00, -1.42624712e+00, -1.13070333e+00,
        -1.51236224e+00, -1.25393927e+00],
       [-1.13348317e+00,  1.66563213e+00, -4.97895241e-01,
         1.84440285e-01,  1.31422043e+00,  6.09186172e-01,
         1.25180638e+00, -6.50567591e-01, -2.02861857e+00,
        -1.12695003e+00,  6.59119070e-01,  3.66337955e-01,
        -4.01594102e-01,  4.05708462e-01, -2.27029458e-01,
         1.63969755e+00,  8.55524778e-01, -4.23325449e-01,
        -7.10

In [243]:
a_added

array([[ 1.247338  , -1.0479027 , -0.5161729 ,  1.358333  ,  0.42714423,
        -0.09564514, -0.41040528, -0.2733825 ,  0.21116781,  0.15588276,
         0.29657224,  1.6308404 ,  2.583833  , -0.09182125,  0.80498   ,
         0.22461243,  1.312742  ,  0.08157003, -0.11026549,  1.1796207 ,
         1.4679675 ,  0.26153964,  0.7761836 ,  0.28439635, -1.096785  ,
         2.2897332 , -1.0400267 ,  1.5552106 , -1.4262471 , -1.1307033 ,
        -1.0071433 , -1.2539393 ],
       [-0.81248856,  1.6656321 , -0.11790892,  0.18444028,  1.6739075 ,
         1.4849035 ,  1.2518064 , -0.6505676 , -1.7314117 , -1.12695   ,
         0.65911907,  1.0828741 , -0.4015941 ,  1.3385583 , -0.22702946,
         1.6396976 ,  1.0241194 ,  0.5324695 , -0.71097183, -0.393672  ,
        -1.964109  , -0.17796814, -0.04434384,  1.976612  , -0.1191747 ,
         1.6145726 , -0.4683473 ,  0.68993616,  0.5613338 ,  0.20387952,
        -0.30872938,  0.42492518],
       [ 0.32170033, -0.37944043,  0.84287655,  0.2071

In [244]:
adenseaddt == a_added

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  

### Make Random

In [245]:
rand = SparseTensor.uniform(2,4)
rand

<SparseTensor <GPUBuffer with shape (8,)> with grad None>

In [246]:
rand.to_numpy()

array([[7.98264053e-03, 3.59408441e-03, 9.06711165e-03, 3.49083496e-03],
       [7.98628945e-03, 4.67222941e-04, 7.97804678e-05, 7.83341192e-03]])

In [247]:
rand.data

<GPUBuffer with shape (8,)>

### update vals

In [248]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)

In [249]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint gid2 = get_global_id(1)z;
      uint topk = get_global_size(0);
      uint bs = get_global_size(1);
      uint baseupdateidx = topk*topk*gid2;
      uint baseidxidx = topk*gid2;
      uint col = updateyidx[baseidxidx+gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint row = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

RuntimeError: clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE

Build on <pyopencl.Device 'GeForce GTX 1080 Ti' on 'NVIDIA CUDA' at 0x55f5613f6e40>:

<kernel>:13:35: error: expected ';' at end of declaration
      uint gid2 = get_global_id(1)z;
                                  ^
                                  ;

(options: -I /home/fpaboim/.conda/envs/tinygrad/lib/python3.8/site-packages/pyopencl/cl)
(source saved as /tmp/tmpu779vgmf.cl)

In [None]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint topk = get_global_size(0);
      uint col = updateyidx[gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint row = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

In [None]:
knl = prg.addvals  # Use this Kernel object for repeated calls
knl(queue, [topk,1], None, adata_buf, acols_buf, annzs_buf, np.float32(1), np.uint32(ellwa), x_cp_buf, x_idx_buf, y_idx_buf)

resa = np.empty_like(adata)
resaidx = np.zeros(acols.shape).astype(np.uint32)
resannz = np.zeros(annz.shape).astype(np.uint32)

cl.enqueue_copy(queue, resa, adata_buf)
cl.enqueue_copy(queue, resaidx, acols_buf)
cl.enqueue_copy(queue, resannz, annzs_buf)

In [None]:
resa.shape, resaidx.shape, resannz.shape, ellwa, a.T.shape

In [None]:
adenseadd = to_dense(resa, resaidx, resannz, ellwa, a.shape)
adenseadd

In [None]:
adenseadd - adense

In [None]:
adenseadd == adense

In [None]:
ellwa

In [None]:
adata2 = adata.reshape(-1, ellwa)
adata2

In [None]:
resa = resa.reshape(-1, ellwa)
resa

In [None]:
resa - adata2

In [None]:
acols

In [None]:
resaidx

In [None]:
resannz

In [None]:
annz

### update vals2

In [None]:
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)

In [None]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint gid2 = get_global_id(1);
      uint topk = get_global_size(0);
      uint bs = get_global_size(1);
      uint baseupdateidx = topk*topk*gid2;
      uint baseidxidx = topk*gid2;
      uint row = updateyidx[baseidxidx+gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint col = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

In [None]:
knl = prg.addvals  # Use this Kernel object for repeated calls
knl(queue, [topk,bs], None, adatat_buf, acolst_buf, annzst_buf, np.float32(1), np.uint32(ellwat), x_cp_buf, x_idx_buf, y_idx_buf)

resat = np.empty_like(adatat)
resaidxt = np.zeros(acolst.shape).astype(np.uint32)
resannzt = np.zeros(annzt.shape).astype(np.uint32)

cl.enqueue_copy(queue, resat, adatat_buf)
cl.enqueue_copy(queue, resaidxt, acolst_buf)
cl.enqueue_copy(queue, resannzt, annzst_buf)

In [None]:
ellwa

In [None]:
resat.shape, resaidxt.shape, resannzt.shape

In [None]:
adenseaddt = to_dense(resat, resaidxt, resannzt, ellwat, a.T.shape)
adenseaddt

In [None]:
adenseadd == adenseaddt.T

In [None]:
adata2t = adatat.reshape(-1, ellwat)
adata2t

In [None]:
resat = resat.reshape(-1, ellwat)
resat

In [None]:
resat - adata2t

In [None]:
acols

In [None]:
resaidx

In [None]:
resannz

In [None]:
annz

# OTHER

import numpy as np
import pyopencl as cl

mf = cl.mem_flags

dim = 16
topk = 4

x = np.random.rand(dim).astype(np.float32)
y = np.random.rand(dim).astype(np.float32)
x.shape,y.shape

dim1 = 4
dim2 = 8
dim3 = 1

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.2

a = np.zeros((dim1,dim2))
b = np.random.rand(dim2,dim3).flatten().astype(np.float32)

a.shape, b.shape

In [None]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
val_out_buf = cl.Buffer(ctx, mf.READ_WRITE, 4*topk*topk)
x_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)

prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                         __global  float* y,    // INPUT
                         __global  float* xout,    // INPUT
                         uint topk,
                         __global  uint* xoutidx,    // INPUT
                         __global  uint* youtidx    // INPUT
                        ) { // LOCAL SHARED BUFFER
  uint gid = get_global_id(0);
  uint n = get_global_size(0);
  
  xout[gid] = x[gid];
  xoutidx[gid] = gid;
  youtidx[gid] = gid;
  
  float valx = x[gid];
  float valy = y[gid];
  uint posx = 0;
  uint posy = 0;
  for (uint i = 0; i < n; i++) {
    float tempval = x[i];
    float tempval2 = y[i];
    bool larger = tempval > valx;
    bool larger2 = tempval2 > valy;
      
    posx += (larger)?1:0;
    posy += (larger2)?1:0;
  }
  //printf("posx:%i", posx);
  if (posx < topk) {
    xoutidx[posx] = gid;
  }
  if (posy < topk) {
    youtidx[posy] = gid;
  }
  if (gid < topk) {
    uint i = gid;
    for (uint j=0; j<topk; j++) {
      xout[gid*topk+j] = x[xoutidx[gid]] * y[youtidx[j]];
    }
  }
}""").build()

In [None]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
event = knl(queue, [dim,], None, x_buf, y_buf, val_out_buf, np.uint32(topk), x_idx_buf, y_idx_buf)

#event.wait()
val_out = np.zeros(topk*topk).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)

cl.enqueue_copy(queue, val_out, val_out_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf, wait_for=[event])
cl.enqueue_copy(queue, resyidx, y_idx_buf)

In [None]:
val_out

In [None]:
resxidx

In [None]:
resyidx

In [None]:
asdf

In [None]:
from __future__ import division

KERNEL_CODE = """
// Thread block size
#define BLOCK_SIZE %(block_size)d
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA %(w_a)d // Matrix A width
#define HA %(h_a)d // Matrix A height
#define WB %(w_b)d // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
/* Matrix multiplication: C = A * B.
 * Device code.
 */
#define AS(j, i) As[i + j * BLOCK_SIZE]
#define BS(j, i) Bs[i + j * BLOCK_SIZE]
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! WA is A's width and WB is B's width
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(16,16,1))) 
void
matrixMul( __global float* C, __global float* A, __global float* B)
{
    __local float As[BLOCK_SIZE*BLOCK_SIZE];
    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);
    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);
    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + WA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * WB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0.0f;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + WA * ty + tx];
        BS(ty, tx) = B[b + WB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        barrier(CLK_LOCAL_MEM_FENCE);
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
"""


In [None]:
a2 = np.random.rand(4,4)

In [None]:
a2

In [None]:
a2.sum(axis=1)

In [None]:
b2 = np.random.rand(4,4)

In [None]:
b2

In [None]:
b2.sum(axis=0)

In [None]:
matmul = a2.dot(b2)
matmul