In [1]:
from tinygrad.densetensor import DenseTensor
from tinygrad.sparsetensor import SparseTensor
import numpy as np

%load_ext autoreload
%autoreload 2

DEVICE:GPU


In [2]:
x_init = np.random.randn(2,6).astype(np.float32)
x2_init = np.random.randn(3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(6,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)

In [3]:
x = DenseTensor(x_init)
W = DenseTensor(W_init)
m = DenseTensor(m_init)
out = x.dot(W).relu()
out = out.logsoftmax()
out = out.mul(m).add(m).sum()
out.backward()

out.cpu().data, x

(array([5.4301043], dtype=float32),
 <DenseTensor <GPUBuffer with shape (2, 6)> with grad <GPUBuffer with shape (2, 6)>>)

x2 = DenseTensor(x2_init)#.gpu()
W = SparseTensor(W_init)
out = W.dot(x2).relu().sum()

out.backward()

out.cpu().data, x

In [4]:
import numpy as np
import pyopencl as cl

mf = cl.mem_flags

In [5]:
dim1 = 8
dim2 = 64
dim3 = 32
bs = dim3

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.3

a = np.zeros((dim1,dim2))
b = np.zeros((dim2,dim3)).astype(np.float32)

a.shape, b.shape

((8, 64), (64, 32))

In [6]:
def fill_sparse(mat, sparsity=0.1):
    indices = np.array(range(mat.shape[1]))
    nrows = int(mat.shape[1]*sparsity)
    for row in range(mat.shape[0]):
        lim = nrows #+ int(np.random.random()*3)
        mat[row][np.random.permutation(indices)[:lim]] = np.random.random(lim)
    return mat

a = fill_sparse(a, sparsity).astype(np.float32)
b = fill_sparse(b, sparsity).astype(np.float32)

In [7]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [8]:
b

array([[0.        , 0.        , 0.        , ..., 0.        , 0.8777981 ,
        0.        ],
       [0.71155494, 0.        , 0.06425007, ..., 0.        , 0.12909082,
        0.6933334 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.6790512 ,
        0.        ],
       ...,
       [0.        , 0.8216976 , 0.70057744, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.19453382, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.29924747,
        0.        ]], dtype=float32)

In [9]:
x2_init.T

array([-0.9929819 , -1.1527574 , -0.45470536], dtype=float32)

In [10]:
mult = a.dot(b)
mult.shape

(8, 32)

In [11]:
mult.shape

(8, 32)

In [12]:
def to_data(mat):
    all_rows = []
    all_idxs = []
    all_nnzs = []
    for row in range(mat.shape[0]):
        rowdata = []
        colidxs = []
        all_nnzs.append(0)
        for col in range(mat.shape[1]):
            val = mat[row][col]
            if val != 0:
                rowdata.append(val)
                colidxs.append(col)
                all_nnzs[-1] += 1
        all_rows.append(rowdata)
        all_idxs.append(colidxs)
    
    ellwidth = min(int(np.sqrt(np.max(all_nnzs))+1)**2, mat.shape[1])
    #all_rows = np.array(all_rows)#.astype(np.float32).flatten()           
    for row in range(mat.shape[0]):
        #print(row, all_rows)
        all_rows[row] = np.array(all_rows[row])
        all_rows[row].resize(ellwidth)
        all_idxs[row] = np.array(all_idxs[row])
        all_idxs[row].resize(ellwidth)
        #print(all_idxs[row])
    all_rows = np.array(all_rows)
    all_idxs = np.array(all_idxs)
    all_nnzs = np.array(all_nnzs)
    
    while (not all_rows[:,-1].any()):
        all_rows = all_rows[:,:-1]
        all_idxs = all_idxs[:,:-1]
        ellwidth -= 1
        
    
    all_rows = np.array(all_rows).astype(np.float32).flatten()
    all_idxs = np.array(all_idxs).astype(np.uint32).flatten()
    
    all_nnzs = np.array(all_nnzs).astype(np.uint32)
    
    
    return all_rows, all_idxs, all_nnzs, ellwidth

In [13]:
def to_dense(data, cols, nnzs, ellw, shape):
    out = np.zeros(shape)
    for row in range(shape[0]):
        for icol in range(nnzs[row]):
            out[row,cols[row*ellw+icol]] = data[row*ellw+icol]
    return out

In [14]:
adata, acols, annz, ellwa = to_data(a)
adata, acols, annz, ellwa

(array([0.8480498 , 0.24151757, 0.57852703, 0.32244816, 0.5725312 ,
        0.6063008 , 0.9894926 , 0.02636844, 0.6720787 , 0.98229825,
        0.3666549 , 0.9397504 , 0.11934852, 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.02024503, 0.14442326, 0.90089077,
        0.07546286, 0.03899236, 0.81757367, 0.35094827, 0.9109925 ,
        0.73281217, 0.26387694, 0.6998149 , 0.38586915, 0.47988915,
        0.05865214, 0.12562034, 0.7752392 , 0.79431444, 0.02824831,
        0.22504327, 0.7702183 , 0.1210502 , 0.77405983, 0.5966783 ,
        0.12070392, 0.3565511 , 0.13704483, 0.13205202, 0.89834917,
        0.01865679, 0.0462791 , 0.17046145, 0.11487772, 0.6445183 ,
        0.5508075 , 0.57453287, 0.76656425, 0.51505405, 0.19128856,
        0.28466925, 0.6991739 , 0.11944163, 0.72324735, 0.11644594,
        0.10038723, 0.29722813, 0.68166405, 0.34802416, 0.9030912 ,
        0.16123049, 0.61387503, 0.22525464, 0.9697108 , 0.16613238,
        0.20547062, 0.84905845, 0.5463404 , 0.18

In [15]:
adatat, acolst, annzt, ellwat = to_data(a.T)
adatat, acolst, annzt, ellwat

(array([0.8480498 , 0.77405983, 0.94465804, 0.        , 0.        ,
        0.11944163, 0.        , 0.        , 0.        , 0.        ,
        0.90089077, 0.72324735, 0.18000361, 0.        , 0.        ,
        0.07546286, 0.69285464, 0.9873242 , 0.        , 0.        ,
        0.11644594, 0.02384542, 0.        , 0.        , 0.        ,
        0.8868406 , 0.        , 0.        , 0.        , 0.        ,
        0.03899236, 0.10038723, 0.7634248 , 0.        , 0.        ,
        0.29722813, 0.81027746, 0.45962164, 0.        , 0.        ,
        0.17147203, 0.2784405 , 0.        , 0.        , 0.        ,
        0.5966783 , 0.68166405, 0.        , 0.        , 0.        ,
        0.12070392, 0.5139346 , 0.40541545, 0.        , 0.        ,
        0.81757367, 0.34802416, 0.46915936, 0.03892968, 0.51622564,
        0.35094827, 0.3565511 , 0.17208202, 0.        , 0.        ,
        0.6777007 , 0.        , 0.        , 0.        , 0.        ,
        0.24151757, 0.        , 0.        , 0.  

In [16]:
bdata, bcols, bnnz, ellwb = to_data(b)
bdata, bcols, bnnz, ellwb

(array([2.62002051e-01, 1.93622217e-01, 6.67813301e-01, 7.28292942e-01,
        1.15920804e-01, 2.32446626e-01, 2.17969924e-01, 5.06616473e-01,
        8.77798080e-01, 7.11554945e-01, 6.42500743e-02, 6.02539599e-01,
        5.35580099e-01, 9.93584514e-01, 1.12368517e-01, 7.78178513e-01,
        1.29090816e-01, 6.93333387e-01, 8.01957369e-01, 5.75487912e-02,
        2.10757196e-01, 1.25005290e-01, 5.37637651e-01, 2.36243993e-01,
        3.09089273e-01, 8.07502568e-01, 6.79051220e-01, 3.58957738e-01,
        2.03234240e-01, 6.39099479e-01, 7.58844852e-01, 3.78172845e-01,
        9.03413534e-01, 3.91220480e-01, 5.80873370e-01, 4.21397209e-01,
        4.67349368e-04, 6.48336887e-01, 2.56263524e-01, 5.21069944e-01,
        6.81902945e-01, 7.40234852e-01, 5.15490830e-01, 3.04551311e-02,
        8.95087540e-01, 5.91695070e-01, 6.68335795e-01, 7.62129426e-01,
        1.96754798e-01, 9.10652459e-01, 6.17356241e-01, 3.42009604e-01,
        7.00421989e-01, 8.69149387e-01, 8.07476282e-01, 8.136243

In [17]:
bdatat, bcolst, bnnzt, ellwbt = to_data(b.T)
adatat, bcolst, bnnzt, ellwbt

(array([0.8480498 , 0.77405983, 0.94465804, 0.        , 0.        ,
        0.11944163, 0.        , 0.        , 0.        , 0.        ,
        0.90089077, 0.72324735, 0.18000361, 0.        , 0.        ,
        0.07546286, 0.69285464, 0.9873242 , 0.        , 0.        ,
        0.11644594, 0.02384542, 0.        , 0.        , 0.        ,
        0.8868406 , 0.        , 0.        , 0.        , 0.        ,
        0.03899236, 0.10038723, 0.7634248 , 0.        , 0.        ,
        0.29722813, 0.81027746, 0.45962164, 0.        , 0.        ,
        0.17147203, 0.2784405 , 0.        , 0.        , 0.        ,
        0.5966783 , 0.68166405, 0.        , 0.        , 0.        ,
        0.12070392, 0.5139346 , 0.40541545, 0.        , 0.        ,
        0.81757367, 0.34802416, 0.46915936, 0.03892968, 0.51622564,
        0.35094827, 0.3565511 , 0.17208202, 0.        , 0.        ,
        0.6777007 , 0.        , 0.        , 0.        , 0.        ,
        0.24151757, 0.        , 0.        , 0.  

In [18]:
adense = to_dense(adata, acols, annz, ellwa, a.shape)

In [19]:
adenset = to_dense(adatat, acolst, annzt, ellwat, a.T.shape)

In [20]:
bdense = to_dense(bdata, bcols, bnnz, ellwb, b.shape)

In [21]:
bdenset = to_dense(bdatat, bcolst, bnnzt, ellwbt, b.T.shape)

In [22]:
adense

array([[0.84804982, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.57253122, 0.60630077,
        0.9894926 , 0.02636844, 0.        , 0.67207873, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.93975037, 0.11934852, 0.        , 0.49876079, 0.35858813,
        0.64539164, 0.99408501, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [23]:
adenset.T == adense

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  Tru

In [24]:
bdenset.T == bdense

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [25]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [26]:
a == adense

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  Tru

In [27]:
a.shape

(8, 64)

In [28]:
adata.shape, acols.shape, annz.shape, ellwa

((152,), (152,), (8,), 19)

In [29]:
#acols = acols.astype(np.uint32)
#annz = annz.astype(np.uint32)

In [30]:
adata, acols, annz, b

(array([0.8480498 , 0.24151757, 0.57852703, 0.32244816, 0.5725312 ,
        0.6063008 , 0.9894926 , 0.02636844, 0.6720787 , 0.98229825,
        0.3666549 , 0.9397504 , 0.11934852, 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.02024503, 0.14442326, 0.90089077,
        0.07546286, 0.03899236, 0.81757367, 0.35094827, 0.9109925 ,
        0.73281217, 0.26387694, 0.6998149 , 0.38586915, 0.47988915,
        0.05865214, 0.12562034, 0.7752392 , 0.79431444, 0.02824831,
        0.22504327, 0.7702183 , 0.1210502 , 0.77405983, 0.5966783 ,
        0.12070392, 0.3565511 , 0.13704483, 0.13205202, 0.89834917,
        0.01865679, 0.0462791 , 0.17046145, 0.11487772, 0.6445183 ,
        0.5508075 , 0.57453287, 0.76656425, 0.51505405, 0.19128856,
        0.28466925, 0.6991739 , 0.11944163, 0.72324735, 0.11644594,
        0.10038723, 0.29722813, 0.68166405, 0.34802416, 0.9030912 ,
        0.16123049, 0.61387503, 0.22525464, 0.9697108 , 0.16613238,
        0.20547062, 0.84905845, 0.5463404 , 0.18

## MatMul (Sparse-Dense)

adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // SPARSE x DENSE
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      

      uint nnz    = rowNnz[gid];
      float sum = 0;
      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[col*ncols+gid2];
          //if (gid==0 && gid2==2)
          //  printf("aval, xval: %.2f,%.2f: (%i,%i) \\n", aval, xval, col, index);
          sum  += aval * xval;
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [31]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // SPARSE x DENSE
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      

      uint nnz    = rowNnz[gid];
      
      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          uint xidx = col*ncols+gid2;
          float xval  = vector_x[xidx];
          if (gid==0 && gid2==1)
            printf("aval, xval: %.2f,%.2f: (%i,%i) - %i \\n", aval, xval, col, index, xidx);
          sum  += aval * xval;
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [32]:
a.shape, b.shape

((8, 64), (64, 32))

In [33]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [34]:
rows = a.shape[0]

In [35]:
mult = mult.astype(np.float32)

In [36]:
outshape = (a.shape[0], b.shape[1])
outshape

(8, 32)

In [37]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul2  # Use this Kernel object for repeated calls
knl(queue, [outshape[0]], None, adata_buf, acols_buf, annzs_buf, np.uint32(ellwa), np.uint32(outshape[1]), b_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

aval, xval: 0.85,0.00: (0,0) - 1 
aval, xval: 0.24,0.00: (14,1) - 449 
aval, xval: 0.58,0.92: (16,2) - 513 
aval, xval: 0.32,0.00: (20,3) - 641 
aval, xval: 0.57,0.00: (23,4) - 737 
aval, xval: 0.61,0.00: (24,5) - 769 
aval, xval: 0.99,0.00: (25,6) - 801 
aval, xval: 0.03,0.00: (26,7) - 833 
aval, xval: 0.67,0.60: (28,8) - 897 
aval, xval: 0.98,0.00: (34,9) - 1089 
aval, xval: 0.37,0.57: (35,10) - 1121 
aval, xval: 0.94,0.00: (40,11) - 1281 
aval, xval: 0.12,0.00: (41,12) - 1313 
aval, xval: 0.50,0.00: (43,13) - 1377 
aval, xval: 0.36,0.00: (44,14) - 1409 
aval, xval: 0.65,0.00: (45,15) - 1441 
aval, xval: 0.99,0.00: (46,16) - 1473 
aval, xval: 0.02,0.00: (62,17) - 1985 
aval, xval: 0.14,0.00: (63,18) - 2017 


<pyopencl._cl.NannyEvent at 0x7fa17834a680>

In [38]:
(res_np-mult).sum()

-3.8929284e-07

In [39]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [40]:
b

array([[0.        , 0.        , 0.        , ..., 0.        , 0.8777981 ,
        0.        ],
       [0.71155494, 0.        , 0.06425007, ..., 0.        , 0.12909082,
        0.6933334 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.6790512 ,
        0.        ],
       ...,
       [0.        , 0.8216976 , 0.70057744, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.19453382, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.29924747,
        0.        ]], dtype=float32)

In [41]:
res_buf

<pyopencl._cl.Buffer at 0x7fa178345d10>

In [42]:
res_np

array([[1.9616023 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.48470148, 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437804 , 1.1737825 , 0.998789  , 1.651209  ,
        0.8851915 , 3.3559442 , 1.6590875 , 0.42023018, 2.0140884 ,
        1.4066358 , 1.6796163 , 2.2332704 , 0.6536164 , 1.3619102 ,
        1.6098175 , 1.3137349 , 1.2370825 , 0.89148694, 1.2607467 ,
        2.4422865 , 1.6772664 ],
       [1.2752076 , 1.7259339 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716728 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.25563458, 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.5942491 , 0.04015317,
        0.1251733 , 2.1408708 , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831336 , 0.93258435, 1.2005807 , 0.7605144 ,
        1.8359319 , 1.0324435 ],
       [0.8095597 , 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [43]:
mult

array([[1.9616024 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.4847015 , 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437801 , 1.1737825 , 0.9987891 , 1.651209  ,
        0.8851915 , 3.3559437 , 1.6590874 , 0.42023015, 2.0140886 ,
        1.4066358 , 1.6796165 , 2.2332702 , 0.6536164 , 1.3619103 ,
        1.6098175 , 1.3137348 , 1.2370825 , 0.891487  , 1.2607467 ,
        2.4422865 , 1.6772662 ],
       [1.2752078 , 1.7259338 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716727 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.2556346 , 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.594249  , 0.04015317,
        0.1251733 , 2.140871  , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831335 , 0.9325843 , 1.2005808 , 0.7605144 ,
        1.8359318 , 1.0324435 ],
       [0.80955964, 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [44]:
res_np==mult

array([[False,  True,  True,  True, False, False,  True,  True,  True,
         True,  True, False,  True, False,  True,  True, False, False,
        False, False,  True, False, False,  True, False,  True, False,
         True, False,  True,  True, False],
       [False, False,  True,  True,  True, False,  True,  True,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,
        False,  True,  True, False,  True,  True,  True,  True, False,
        False, False,  True, False,  True],
       [False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False,  True, False,  True,  True,  True,  True,
        False, False,  True, False,  True, False,  True,  True,  True,
         True,  True, False, False, False],
       [ True, False,  True,  True, False,  True,  True,  True, False,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True, False, False,  True,  True, False, False, False,
         True,  

In [45]:
res_np.shape

(8, 32)

In [46]:
mult.shape

(8, 32)

## MatMul (dense * sparse)

In [47]:
bdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdata)
bcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcols)
bnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnz)
bdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdatat)
bcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcolst)
bnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnzt)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // DENSE x SPARSE
    __kernel void matmul(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            uint   ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        uint nnz = rowNnz[gid2];
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid2 * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[gid*mwidth+col];
          sum  += aval * xval;
          if (gid==0 && gid2==0)
            printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [48]:
a.shape, b.shape

((8, 64), (64, 32))

In [49]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [50]:
rows = a.shape[0]

In [51]:
mult = mult.astype(np.float32)

In [52]:
outshape = np.array([a.shape[0], b.shape[1]])
outshape

array([ 8, 32])

In [53]:
b.T

array([[0.        , 0.71155494, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.8216976 , 0.        ,
        0.        ],
       [0.        , 0.06425007, 0.        , ..., 0.70057744, 0.19453382,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.8777981 , 0.12909082, 0.6790512 , ..., 0.        , 0.        ,
        0.29924747],
       [0.        , 0.6933334 , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [54]:
a.T

array([[0.8480498 , 0.        , 0.77405983, 0.        , 0.        ,
        0.        , 0.94465804, 0.        ],
       [0.        , 0.        , 0.        , 0.11944163, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.90089077, 0.        , 0.72324735, 0.        ,
        0.18000361, 0.        , 0.        ],
       [0.        , 0.07546286, 0.        , 0.        , 0.69285464,
        0.9873242 , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.11644594, 0.        ,
        0.        , 0.02384542, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.8868406 ],
       [0.        , 0.03899236, 0.        , 0.10038723, 0.        ,
        0.        , 0.        , 0.7634248 ],
       [0.        , 0.        , 0.        , 0.29722813, 0.81027746,
        0.45962164, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.17147203,
        0.        , 0.      

In [55]:
outshape.T

array([ 8, 32])

In [56]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul  # Use this Kernel object for repeated calls
knl(queue, [outshape.T[0]], None, bdatat_buf, bcolst_buf, bnnzst_buf, np.uint32(ellwbt), np.uint32(b.shape[0]), np.uint32(outshape.T[1]), a_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

<pyopencl._cl.NannyEvent at 0x7fa178357b80>

aval, xval: 0.71,0.00 - 0.00: (1,0) 
aval, xval: 0.27,0.24 - 0.07: (14,1) 
aval, xval: 0.55,0.00 - 0.07: (15,2) 
aval, xval: 0.17,0.00 - 0.07: (18,3) 
aval, xval: 0.69,0.00 - 0.07: (21,4) 
aval, xval: 0.95,0.61 - 0.64: (24,5) 
aval, xval: 0.49,0.99 - 1.13: (25,6) 
aval, xval: 0.37,0.00 - 1.13: (27,7) 
aval, xval: 0.79,0.67 - 1.66: (28,8) 
aval, xval: 0.95,0.00 - 1.66: (32,9) 
aval, xval: 0.10,0.00 - 1.66: (38,10) 
aval, xval: 0.61,0.50 - 1.96: (43,11) 
aval, xval: 0.35,0.00 - 1.96: (48,12) 
aval, xval: 0.04,0.00 - 1.96: (52,13) 
aval, xval: 0.85,0.00 - 1.96: (57,14) 
aval, xval: 0.33,0.00 - 1.96: (60,15) 


In [57]:
(res_np-mult).sum()

-3.8929284e-07

In [58]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [59]:
b

array([[0.        , 0.        , 0.        , ..., 0.        , 0.8777981 ,
        0.        ],
       [0.71155494, 0.        , 0.06425007, ..., 0.        , 0.12909082,
        0.6933334 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.6790512 ,
        0.        ],
       ...,
       [0.        , 0.8216976 , 0.70057744, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.19453382, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.29924747,
        0.        ]], dtype=float32)

In [60]:
res_buf

<pyopencl._cl.Buffer at 0x7fa1783af540>

In [61]:
res_np

array([[1.9616023 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.48470148, 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437804 , 1.1737825 , 0.998789  , 1.651209  ,
        0.8851915 , 3.3559442 , 1.6590875 , 0.42023018, 2.0140884 ,
        1.4066358 , 1.6796163 , 2.2332704 , 0.6536164 , 1.3619102 ,
        1.6098175 , 1.3137349 , 1.2370825 , 0.89148694, 1.2607467 ,
        2.4422865 , 1.6772664 ],
       [1.2752076 , 1.7259339 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716728 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.25563458, 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.5942491 , 0.04015317,
        0.1251733 , 2.1408708 , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831336 , 0.93258435, 1.2005807 , 0.7605144 ,
        1.8359319 , 1.0324435 ],
       [0.8095597 , 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [62]:
mult

array([[1.9616024 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.4847015 , 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437801 , 1.1737825 , 0.9987891 , 1.651209  ,
        0.8851915 , 3.3559437 , 1.6590874 , 0.42023015, 2.0140886 ,
        1.4066358 , 1.6796165 , 2.2332702 , 0.6536164 , 1.3619103 ,
        1.6098175 , 1.3137348 , 1.2370825 , 0.891487  , 1.2607467 ,
        2.4422865 , 1.6772662 ],
       [1.2752078 , 1.7259338 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716727 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.2556346 , 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.594249  , 0.04015317,
        0.1251733 , 2.140871  , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831335 , 0.9325843 , 1.2005808 , 0.7605144 ,
        1.8359318 , 1.0324435 ],
       [0.80955964, 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [63]:
res_np==mult

array([[False,  True,  True,  True, False, False,  True,  True,  True,
         True,  True, False,  True, False,  True,  True, False, False,
        False, False,  True, False, False,  True, False,  True, False,
         True, False,  True,  True, False],
       [False, False,  True,  True,  True, False,  True,  True,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,
        False,  True,  True, False,  True,  True,  True,  True, False,
        False, False,  True, False,  True],
       [False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False,  True, False,  True,  True,  True,  True,
        False, False,  True, False,  True, False,  True,  True,  True,
         True,  True, False, False, False],
       [ True, False,  True,  True, False,  True,  True,  True, False,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True, False, False,  True,  True, False, False, False,
         True,  

In [64]:
res_np-mult

array([[-1.1920929e-07,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
        -1.8626451e-09, -2.9802322e-08,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  2.3841858e-07,
         0.0000000e+00, -5.9604645e-08,  0.0000000e+00,  0.0000000e+00,
         4.7683716e-07,  1.1920929e-07,  2.9802322e-08, -2.3841858e-07,
         0.0000000e+00, -1.1920929e-07,  2.3841858e-07,  0.0000000e+00,
        -1.1920929e-07,  0.0000000e+00,  1.1920929e-07,  0.0000000e+00,
        -5.9604645e-08,  0.0000000e+00,  0.0000000e+00,  1.1920929e-07],
       [-1.1920929e-07,  1.1920929e-07,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  1.1920929e-07,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00, -2.9802322e-08,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  1.1920929e-07,  0.0000000e+00,
         0.0000000e+00, -2.3841858e-07,  0.0000000e+00,  0.0000

In [65]:
res_np.shape

(8, 32)

In [66]:
mult.shape

(8, 32)

# Matmult Dense Dense

In [67]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      uint osize = get_global_size(1);
      int gidx = get_global_id(0); // row
      int gidy = get_global_id(1); // col

      float ret = 0.0;
      for (int i = 0; i < msize; i++) {
        uint xidx = gidx*msize+i; 
        float xval = x[xidx];
        uint yidx = osize*i+gidy;
        float yval = y[yidx];
        ret += xval*yval;
        if (gidx==0 && gidy==0)
          printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, res, xidx, yidx);
      }

      //if (gidx==0&&gidy==0)
      //  printf("\\nsum:%.2f", ret);
      res[gidx * osize + gidy] = ret;
    }""").build()

In [68]:
a.shape, b.shape

((8, 64), (64, 32))

In [69]:
rows = a.shape[0]

In [70]:
mult = mult.astype(np.float32)

In [71]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows,b.shape[1]], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)


mult: 0.85 x 0.00 - 0.00  -- 0/0
mult: 0.00 x 0.71 - 0.00  -- 1/32
mult: 0.00 x 0.00 - 0.00  -- 2/64
mult: 0.00 x 0.00 - 0.00  -- 3/96
mult: 0.00 x 0.00 - 0.00  -- 4/128
mult: 0.00 x 0.00 - 0.00  -- 5/160
mult: 0.00 x 0.00 - 0.00  -- 6/192
mult: 0.00 x 0.00 - 0.00  -- 7/224
mult: 0.00 x 0.00 - 0.00  -- 8/256
mult: 0.00 x 0.00 - 0.00  -- 9/288
mult: 0.00 x 0.00 - 0.00  -- 10/320
mult: 0.00 x 0.00 - 0.00  -- 11/352
mult: 0.00 x 0.00 - 0.00  -- 12/384
mult: 0.00 x 0.00 - 0.00  -- 13/416
mult: 0.24 x 0.27 - 0.00  -- 14/448
mult: 0.00 x 0.55 - 0.00  -- 15/480
mult: 0.58 x 0.00 - 0.00  -- 16/512
mult: 0.00 x 0.00 - 0.00  -- 17/544
mult: 0.00 x 0.17 - 0.00  -- 18/576
mult: 0.00 x 0.00 - 0.00  -- 19/608
mult: 0.32 x 0.00 - 0.00  -- 20/640
mult: 0.00 x 0.69 - 0.00  -- 21/672
mult: 0.00 x 0.00 - 0.00  -- 22/704
mult: 0.57 x 0.00 - 0.00  -- 23/736
mult: 0.61 x 0.95 - 0.00  -- 24/768
mult: 0.99 x 0.49 - 0.00  -- 25/800
mult: 0.03 x 0.00 - 0.00  -- 26/832
mult: 0.00 x 0.37 - 0.00  -- 27/864
mult: 

<pyopencl._cl.NannyEvent at 0x7fa17836b2c0>

0.67 x 0.79 - 0.00  -- 28/896
mult: 0.00 x 0.00 - 0.00  -- 29/928
mult: 0.00 x 0.00 - 0.00  -- 30/960
mult: 0.00 x 0.00 - 0.00  -- 31/992
mult: 0.00 x 0.95 - 0.00  -- 32/1024
mult: 0.00 x 0.00 - 0.00  -- 33/1056
mult: 0.98 x 0.00 - 0.00  -- 34/1088
mult: 0.37 x 0.00 - 0.00  -- 35/1120
mult: 0.00 x 0.00 - 0.00  -- 36/1152
mult: 0.00 x 0.00 - 0.00  -- 37/1184
mult: 0.00 x 0.10 - 0.00  -- 38/1216
mult: 0.00 x 0.00 - 0.00  -- 39/1248
mult: 0.94 x 0.00 - 0.00  -- 40/1280
mult: 0.12 x 0.00 - 0.00  -- 41/1312
mult: 0.00 x 0.00 - 0.00  -- 42/1344
mult: 0.50 x 0.61 - 0.00  -- 43/1376
mult: 0.36 x 0.00 - 0.00  -- 44/1408
mult: 0.65 x 0.00 - 0.00  -- 45/1440
mult: 0.99 x 0.00 - 0.00  -- 46/1472
mult: 0.00 x 0.00 - 0.00  -- 47/1504
mult: 0.00 x 0.35 - 0.00  -- 48/1536
mult: 0.00 x 0.00 - 0.00  -- 49/1568
mult: 0.00 x 0.00 - 0.00  -- 50/1600
mult: 0.00 x 0.00 - 0.00  -- 51/1632
mult: 0.00 x 0.04 - 0.00  -- 52/1664
mult: 0.00 x 0.00 - 0.00  -- 53/1696
mult: 0.00 x 0.00 - 0.00  -- 54/1728
mult: 0.00 

In [72]:
(res_np-mult).sum()

-3.8929284e-07

In [73]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [74]:
b

array([[0.        , 0.        , 0.        , ..., 0.        , 0.8777981 ,
        0.        ],
       [0.71155494, 0.        , 0.06425007, ..., 0.        , 0.12909082,
        0.6933334 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.6790512 ,
        0.        ],
       ...,
       [0.        , 0.8216976 , 0.70057744, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.19453382, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.29924747,
        0.        ]], dtype=float32)

In [75]:
res_np

array([[1.9616023 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.48470148, 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437804 , 1.1737825 , 0.998789  , 1.651209  ,
        0.8851915 , 3.3559442 , 1.6590875 , 0.42023018, 2.0140884 ,
        1.4066358 , 1.6796163 , 2.2332704 , 0.6536164 , 1.3619102 ,
        1.6098175 , 1.3137349 , 1.2370825 , 0.89148694, 1.2607467 ,
        2.4422865 , 1.6772664 ],
       [1.2752076 , 1.7259339 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716728 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.25563458, 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.5942491 , 0.04015317,
        0.1251733 , 2.1408708 , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831336 , 0.93258435, 1.2005807 , 0.7605144 ,
        1.8359319 , 1.0324435 ],
       [0.8095597 , 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [76]:
a.dot(b)

array([[1.9616024 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.4847015 , 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437801 , 1.1737825 , 0.9987891 , 1.651209  ,
        0.8851915 , 3.3559437 , 1.6590874 , 0.42023015, 2.0140886 ,
        1.4066358 , 1.6796165 , 2.2332702 , 0.6536164 , 1.3619103 ,
        1.6098175 , 1.3137348 , 1.2370825 , 0.891487  , 1.2607467 ,
        2.4422865 , 1.6772662 ],
       [1.2752078 , 1.7259338 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716727 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.2556346 , 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.594249  , 0.04015317,
        0.1251733 , 2.140871  , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831335 , 0.9325843 , 1.2005808 , 0.7605144 ,
        1.8359318 , 1.0324435 ],
       [0.80955964, 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [77]:
res_np==mult

array([[False,  True,  True,  True, False, False,  True,  True,  True,
         True,  True, False,  True, False,  True,  True, False, False,
        False, False,  True, False, False,  True, False,  True, False,
         True, False,  True,  True, False],
       [False, False,  True,  True,  True, False,  True,  True,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,
        False,  True,  True, False,  True,  True,  True,  True, False,
        False, False,  True, False,  True],
       [False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False,  True, False,  True,  True,  True,  True,
        False, False,  True, False,  True, False,  True,  True,  True,
         True,  True, False, False, False],
       [ True, False,  True,  True, False,  True,  True,  True, False,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True, False, False,  True,  True, False, False, False,
         True,  

In [78]:
res_np.shape

(8, 32)

In [79]:
mult.shape

(8, 32)

# Matmult Dense Transposed

In [80]:
b

array([[0.        , 0.        , 0.        , ..., 0.        , 0.8777981 ,
        0.        ],
       [0.71155494, 0.        , 0.06425007, ..., 0.        , 0.12909082,
        0.6933334 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.6790512 ,
        0.        ],
       ...,
       [0.        , 0.8216976 , 0.70057744, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.19453382, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.29924747,
        0.        ]], dtype=float32)

In [81]:
c=np.zeros(b.T.shape)
bt = b.T
for row in range(bt.shape[0]):
    for col in range(bt.shape[1]):
        c[row][col] = bt[row][col]

In [82]:
bt

array([[0.        , 0.71155494, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.8216976 , 0.        ,
        0.        ],
       [0.        , 0.06425007, 0.        , ..., 0.70057744, 0.19453382,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.8777981 , 0.12909082, 0.6790512 , ..., 0.        , 0.        ,
        0.29924747],
       [0.        , 0.6933334 , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [83]:
c

array([[0.        , 0.71155494, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.82169759, 0.        ,
        0.        ],
       [0.        , 0.06425007, 0.        , ..., 0.70057744, 0.19453382,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.87779808, 0.12909082, 0.67905122, ..., 0.        , 0.        ,
        0.29924747],
       [0.        , 0.69333339, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [84]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      uint osize = get_global_size(1);
      int gidx = get_global_id(0); // row
      int gidy = get_global_id(1); // col

      float ret = 0.0;
      for (int i = 0; i < msize; i++) {
        uint xidx = gidx*msize+i;
        float xval = x[xidx];
        uint yidx = msize*gidy+i;
        float yval = y[yidx];
        ret += xval*yval;
        if (gidx==0 && gidy==0)
          printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, res, xidx, yidx);
      }

      //if (gidx==0&&gidy==0)
      //  printf("\\nsum:%.2f", ret);
      res[gidx * osize + gidy] = ret;
    }""").build()

In [85]:
a.shape, b.T.shape

((8, 64), (32, 64))

In [86]:
rows = a.shape[0]

In [87]:
mult = mult.astype(np.float32)

In [88]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows,b.shape[1]], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

<pyopencl._cl.NannyEvent at 0x7fa17836edb0>


mult: 0.85 x 0.00 - 0.00  -- 0/0
mult: 0.00 x 0.71 - 0.00  -- 1/1
mult: 0.00 x 0.00 - 0.00  -- 2/2
mult: 0.00 x 0.00 - 0.00  -- 3/3
mult: 0.00 x 0.00 - 0.00  -- 4/4
mult: 0.00 x 0.00 - 0.00  -- 5/5
mult: 0.00 x 0.00 - 0.00  -- 6/6
mult: 0.00 x 0.00 - 0.00  -- 7/7
mult: 0.00 x 0.00 - 0.00  -- 8/8
mult: 0.00 x 0.00 - 0.00  -- 9/9
mult: 0.00 x 0.00 - 0.00  -- 10/10
mult: 0.00 x 0.00 - 0.00  -- 11/11
mult: 0.00 x 0.00 - 0.00  -- 12/12
mult: 0.00 x 0.00 - 0.00  -- 13/13
mult: 0.24 x 0.27 - 0.00  -- 14/14
mult: 0.00 x 0.55 - 0.00  -- 15/15
mult: 0.58 x 0.00 - 0.00  -- 16/16
mult: 0.00 x 0.00 - 0.00  -- 17/17
mult: 0.00 x 0.17 - 0.00  -- 18/18
mult: 0.00 x 0.00 - 0.00  -- 19/19
mult: 0.32 x 0.00 - 0.00  -- 20/20
mult: 0.00 x 0.69 - 0.00  -- 21/21
mult: 0.00 x 0.00 - 0.00  -- 22/22
mult: 0.57 x 0.00 - 0.00  -- 23/23
mult: 0.61 x 0.95 - 0.00  -- 24/24
mult: 0.99 x 0.49 - 0.00  -- 25/25
mult: 0.03 x 0.00 - 0.00  -- 26/26
mult: 0.00 x 0.37 - 0.00  -- 27/27
mult: 0.67 x 0.79 - 0.00  -- 28/28
mult

In [89]:
(res_np-mult).sum()

-3.8929284e-07

In [90]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [91]:
b

array([[0.        , 0.        , 0.        , ..., 0.        , 0.8777981 ,
        0.        ],
       [0.71155494, 0.        , 0.06425007, ..., 0.        , 0.12909082,
        0.6933334 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.6790512 ,
        0.        ],
       ...,
       [0.        , 0.8216976 , 0.70057744, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.19453382, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.29924747,
        0.        ]], dtype=float32)

In [92]:
res_np

array([[1.9616023 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.48470148, 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437804 , 1.1737825 , 0.998789  , 1.651209  ,
        0.8851915 , 3.3559442 , 1.6590875 , 0.42023018, 2.0140884 ,
        1.4066358 , 1.6796163 , 2.2332704 , 0.6536164 , 1.3619102 ,
        1.6098175 , 1.3137349 , 1.2370825 , 0.89148694, 1.2607467 ,
        2.4422865 , 1.6772664 ],
       [1.2752076 , 1.7259339 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716728 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.25563458, 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.5942491 , 0.04015317,
        0.1251733 , 2.1408708 , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831336 , 0.93258435, 1.2005807 , 0.7605144 ,
        1.8359319 , 1.0324435 ],
       [0.8095597 , 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [93]:
a.dot(b)

array([[1.9616024 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.4847015 , 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437801 , 1.1737825 , 0.9987891 , 1.651209  ,
        0.8851915 , 3.3559437 , 1.6590874 , 0.42023015, 2.0140886 ,
        1.4066358 , 1.6796165 , 2.2332702 , 0.6536164 , 1.3619103 ,
        1.6098175 , 1.3137348 , 1.2370825 , 0.891487  , 1.2607467 ,
        2.4422865 , 1.6772662 ],
       [1.2752078 , 1.7259338 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716727 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.2556346 , 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.594249  , 0.04015317,
        0.1251733 , 2.140871  , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831335 , 0.9325843 , 1.2005808 , 0.7605144 ,
        1.8359318 , 1.0324435 ],
       [0.80955964, 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [94]:
res_np==mult

array([[False,  True,  True,  True, False, False,  True,  True,  True,
         True,  True, False,  True, False,  True,  True, False, False,
        False, False,  True, False, False,  True, False,  True, False,
         True, False,  True,  True, False],
       [False, False,  True,  True,  True, False,  True,  True,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,
        False,  True,  True, False,  True,  True,  True,  True, False,
        False, False,  True, False,  True],
       [False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False,  True, False,  True,  True,  True,  True,
        False, False,  True, False,  True, False,  True,  True,  True,
         True,  True, False, False, False],
       [ True, False,  True,  True, False,  True,  True,  True, False,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True, False, False,  True,  True, False, False, False,
         True,  

In [95]:
res_np.shape

(8, 32)

In [96]:
mult.shape

(8, 32)

# Matmult Transposed Dense

In [97]:
b

array([[0.        , 0.        , 0.        , ..., 0.        , 0.8777981 ,
        0.        ],
       [0.71155494, 0.        , 0.06425007, ..., 0.        , 0.12909082,
        0.6933334 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.6790512 ,
        0.        ],
       ...,
       [0.        , 0.8216976 , 0.70057744, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.19453382, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.29924747,
        0.        ]], dtype=float32)

In [98]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [99]:
at

array([[0.8480498 , 0.        , 0.77405983, 0.        , 0.        ,
        0.        , 0.94465804, 0.        ],
       [0.        , 0.        , 0.        , 0.11944163, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.90089077, 0.        , 0.72324735, 0.        ,
        0.18000361, 0.        , 0.        ],
       [0.        , 0.07546286, 0.        , 0.        , 0.69285464,
        0.9873242 , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.11644594, 0.        ,
        0.        , 0.02384542, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.8868406 ],
       [0.        , 0.03899236, 0.        , 0.10038723, 0.        ,
        0.        , 0.        , 0.7634248 ],
       [0.        , 0.        , 0.        , 0.29722813, 0.81027746,
        0.45962164, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.17147203,
        0.        , 0.      

In [100]:
c

array([[0.84804982, 0.        , 0.77405983, 0.        , 0.        ,
        0.        , 0.94465804, 0.        ],
       [0.        , 0.        , 0.        , 0.11944163, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.90089077, 0.        , 0.72324735, 0.        ,
        0.18000361, 0.        , 0.        ],
       [0.        , 0.07546286, 0.        , 0.        , 0.69285464,
        0.98732418, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.11644594, 0.        ,
        0.        , 0.02384542, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.88684058],
       [0.        , 0.03899236, 0.        , 0.10038723, 0.        ,
        0.        , 0.        , 0.76342481],
       [0.        , 0.        , 0.        , 0.29722813, 0.81027746,
        0.45962164, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.17147203,
        0.        , 0.      

In [101]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // multilplies x TRANSPOSED by y (dense-dense)
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize,
                          uint isize
                          ) { // LOCAL SHARED BUFFER
      uint osize = get_global_size(0);
      int gidy = get_global_id(0); // row
      
      for (uint gidx = 0; gidx < isize; gidx++) {
        float ret = 0.0;
        for (uint i = 0; i < msize; i++) {
          uint xidx = i*isize+gidx;
          float xval = x[xidx];
          uint yidx = osize*i+gidy;
          float yval = y[yidx];
          ret += xval*yval;
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, ret, xidx, yidx);
        }
        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        res[gidx * osize + gidy] = ret;
      }
    }""").build()

In [102]:
a.shape, b.T.shape

((8, 64), (32, 64))

In [103]:
rows = a.shape[0]

In [104]:
mult = mult.astype(np.float32)

In [105]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [b.shape[1]], None, a_buf, b_buf, res_buf, np.uint32(a.shape[1]), np.uint32(rows))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

<pyopencl._cl.NannyEvent at 0x7fa178333f90>


mult: 0.85 x 0.00 - 0.00  -- 0/0
mult: 0.00 x 0.71 - 0.00  -- 8/32
mult: 0.00 x 0.00 - 0.00  -- 16/64
mult: 0.00 x 0.00 - 0.00  -- 24/96
mult: 0.00 x 0.00 - 0.00  -- 32/128
mult: 0.00 x 0.00 - 0.00  -- 40/160
mult: 0.00 x 0.00 - 0.00  -- 48/192
mult: 0.00 x 0.00 - 0.00  -- 56/224
mult: 0.00 x 0.00 - 0.00  -- 64/256
mult: 0.00 x 0.00 - 0.00  -- 72/288
mult: 0.00 x 0.00 - 0.00  -- 80/320
mult: 0.00 x 0.00 - 0.00  -- 88/352
mult: 0.00 x 0.00 - 0.00  -- 96/384
mult: 0.00 x 0.00 - 0.00  -- 104/416
mult: 0.24 x 0.27 - 0.07  -- 112/448
mult: 0.00 x 0.55 - 0.07  -- 120/480
mult: 0.58 x 0.00 - 0.07  -- 128/512
mult: 0.00 x 0.00 - 0.07  -- 136/544
mult: 0.00 x 0.17 - 0.07  -- 144/576
mult: 0.00 x 0.00 - 0.07  -- 152/608
mult: 0.32 x 0.00 - 0.07  -- 160/640
mult: 0.00 x 0.69 - 0.07  -- 168/672
mult: 0.00 x 0.00 - 0.07  -- 176/704
mult: 0.57 x 0.00 - 0.07  -- 184/736
mult: 0.61 x 0.95 - 0.64  -- 192/768
mult: 0.99 x 0.49 - 1.13  -- 200/800
mult: 0.03 x 0.00 - 1.13  -- 208/832
mult: 0.00 x 0.37 - 

In [106]:
(res_np-mult).sum()

-3.8929284e-07

In [107]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [108]:
b

array([[0.        , 0.        , 0.        , ..., 0.        , 0.8777981 ,
        0.        ],
       [0.71155494, 0.        , 0.06425007, ..., 0.        , 0.12909082,
        0.6933334 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.6790512 ,
        0.        ],
       ...,
       [0.        , 0.8216976 , 0.70057744, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.19453382, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.29924747,
        0.        ]], dtype=float32)

In [109]:
res_np

array([[1.9616023 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.48470148, 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437804 , 1.1737825 , 0.998789  , 1.651209  ,
        0.8851915 , 3.3559442 , 1.6590875 , 0.42023018, 2.0140884 ,
        1.4066358 , 1.6796163 , 2.2332704 , 0.6536164 , 1.3619102 ,
        1.6098175 , 1.3137349 , 1.2370825 , 0.89148694, 1.2607467 ,
        2.4422865 , 1.6772664 ],
       [1.2752076 , 1.7259339 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716728 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.25563458, 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.5942491 , 0.04015317,
        0.1251733 , 2.1408708 , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831336 , 0.93258435, 1.2005807 , 0.7605144 ,
        1.8359319 , 1.0324435 ],
       [0.8095597 , 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [110]:
a.dot(b)

array([[1.9616024 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.4847015 , 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437801 , 1.1737825 , 0.9987891 , 1.651209  ,
        0.8851915 , 3.3559437 , 1.6590874 , 0.42023015, 2.0140886 ,
        1.4066358 , 1.6796165 , 2.2332702 , 0.6536164 , 1.3619103 ,
        1.6098175 , 1.3137348 , 1.2370825 , 0.891487  , 1.2607467 ,
        2.4422865 , 1.6772662 ],
       [1.2752078 , 1.7259338 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716727 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.2556346 , 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.594249  , 0.04015317,
        0.1251733 , 2.140871  , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831335 , 0.9325843 , 1.2005808 , 0.7605144 ,
        1.8359318 , 1.0324435 ],
       [0.80955964, 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [111]:
res_np==mult

array([[False,  True,  True,  True, False, False,  True,  True,  True,
         True,  True, False,  True, False,  True,  True, False, False,
        False, False,  True, False, False,  True, False,  True, False,
         True, False,  True,  True, False],
       [False, False,  True,  True,  True, False,  True,  True,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,
        False,  True,  True, False,  True,  True,  True,  True, False,
        False, False,  True, False,  True],
       [False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False,  True, False,  True,  True,  True,  True,
        False, False,  True, False,  True, False,  True,  True,  True,
         True,  True, False, False, False],
       [ True, False,  True,  True, False,  True,  True,  True, False,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True, False, False,  True,  True, False, False, False,
         True,  

In [112]:
res_np.shape

(8, 32)

In [113]:
mult.shape

(8, 32)

# Matmult Dense Transposed2

In [114]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize,
                          uint osize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      // osize = get_global_size(1);
      int gidx = get_global_id(0); // col
      // int gidy = get_global_id(1); // row

      for (uint gidy = 0; gidy < osize; gidy++) {
        float ret = 0.0;
        for (uint i = 0; i < msize; i++) {
          ret += x[gidx*msize+i]*y[i*osize+gidy];
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f", x[gidx*msize+i],y[i*msize+gidy], ret);
        }

        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        res[gidx * osize + gidy] = ret;
      }
    }""").build()

In [115]:
a.shape, b.shape

((8, 64), (64, 32))

In [116]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [117]:
rows = a.shape[0]

In [118]:
mult = mult.astype(np.float32)

In [119]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]), np.uint32(b.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

<pyopencl._cl.NannyEvent at 0x7fa178326860>


mult: 0.85 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.27 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.17 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.95 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.24 x 0.79 - 0.07
mult: 0.00 x 0.00 - 0.07
mult: 0.58 x 0.95 - 0.07
mult: 0.00 x 0.00 - 0.07
mult: 0.00 x 0.00 - 0.07
mult: 0.00 x 0.10 - 0.07
mult: 0.32 x 0.00 - 0.07
mult: 0.00 x 0.00 - 0.07
mult: 0.00 x 0.00 - 0.07
mult: 0.57 x 0.00 - 0.07
mult: 0.61 x 0.35 - 0.64
mult: 0.99 x 0.00 - 1.13
mult: 0.03 x 0.04 - 1.13
mult: 0.00 x 0.00 - 1.13
mult: 0.67 x 0.00 - 1.66
mult: 0.00 x 0.00 - 1.66
mult: 0.00 x 0.33 - 1.66
mult: 0.00 x 0.00 - 1.66
mult: 0.00 x 1.96 - 1.66
mult: 0.00 x 0.81 - 1.66
mult: 0.98 x 0.85 - 1.66
mult: 0.37 x 2.39 - 1.66
mult: 0.00 x 0.00 - 1.66
mult: 0.00 x 0.00 - 1.66
mult: 0.00 x 0.00 - 1.66
mult: 0.00 x 0.00 - 1.66

In [120]:
(res_np-mult).sum()

-3.8929284e-07

In [121]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [122]:
b.T

array([[0.        , 0.71155494, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.8216976 , 0.        ,
        0.        ],
       [0.        , 0.06425007, 0.        , ..., 0.70057744, 0.19453382,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.8777981 , 0.12909082, 0.6790512 , ..., 0.        , 0.        ,
        0.29924747],
       [0.        , 0.6933334 , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [123]:
a[0]

array([0.8480498 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.24151757,
       0.        , 0.57852703, 0.        , 0.        , 0.        ,
       0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
       0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.98229825,
       0.3666549 , 0.        , 0.        , 0.        , 0.        ,
       0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
       0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02024503, 0.14442326], dtype=float32)

In [124]:
b.T[0]

array([0.        , 0.71155494, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.27366054,
       0.55052847, 0.        , 0.        , 0.16765107, 0.        ,
       0.        , 0.6886489 , 0.        , 0.        , 0.9504861 ,
       0.4929784 , 0.        , 0.36551505, 0.7865161 , 0.        ,
       0.        , 0.        , 0.94946414, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.10173589, 0.        ,
       0.        , 0.        , 0.        , 0.6071627 , 0.        ,
       0.        , 0.        , 0.        , 0.34772286, 0.        ,
       0.        , 0.        , 0.03734786, 0.        , 0.        ,
       0.        , 0.        , 0.8510094 , 0.        , 0.        ,
       0.3296328 , 0.        , 0.        , 0.        ], dtype=float32)

In [125]:
res_buf

<pyopencl._cl.Buffer at 0x7fa17832c4f0>

In [126]:
res_np

array([[1.9616023 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.48470148, 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437804 , 1.1737825 , 0.998789  , 1.651209  ,
        0.8851915 , 3.3559442 , 1.6590875 , 0.42023018, 2.0140884 ,
        1.4066358 , 1.6796163 , 2.2332704 , 0.6536164 , 1.3619102 ,
        1.6098175 , 1.3137349 , 1.2370825 , 0.89148694, 1.2607467 ,
        2.4422865 , 1.6772664 ],
       [1.2752076 , 1.7259339 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716728 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.25563458, 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.5942491 , 0.04015317,
        0.1251733 , 2.1408708 , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831336 , 0.93258435, 1.2005807 , 0.7605144 ,
        1.8359319 , 1.0324435 ],
       [0.8095597 , 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [127]:
a.dot(b)

array([[1.9616024 , 1.1421939 , 1.3346761 , 1.240076  , 0.02350759,
        0.4847015 , 1.8412888 , 1.1972191 , 1.5001533 , 1.4210726 ,
        0.5402745 , 1.7437801 , 1.1737825 , 0.9987891 , 1.651209  ,
        0.8851915 , 3.3559437 , 1.6590874 , 0.42023015, 2.0140886 ,
        1.4066358 , 1.6796165 , 2.2332702 , 0.6536164 , 1.3619103 ,
        1.6098175 , 1.3137348 , 1.2370825 , 0.891487  , 1.2607467 ,
        2.4422865 , 1.6772662 ],
       [1.2752078 , 1.7259338 , 1.0421345 , 0.8146857 , 1.1589297 ,
        1.7716727 , 2.392328  , 1.2461631 , 1.0769001 , 0.948504  ,
        1.6411772 , 0.2556346 , 1.339129  , 0.9742772 , 1.7138177 ,
        1.3025115 , 1.4571431 , 0.72237605, 1.594249  , 0.04015317,
        0.1251733 , 2.140871  , 1.2661698 , 1.4071221 , 1.2982895 ,
        0.9704528 , 1.2831335 , 0.9325843 , 1.2005808 , 0.7605144 ,
        1.8359318 , 1.0324435 ],
       [0.80955964, 0.77134174, 0.259071  , 1.0867811 , 0.9902814 ,
        0.9730427 , 0.5575609 , 1.710138  , 1.0925

In [128]:
res_np==mult

array([[False,  True,  True,  True, False, False,  True,  True,  True,
         True,  True, False,  True, False,  True,  True, False, False,
        False, False,  True, False, False,  True, False,  True, False,
         True, False,  True,  True, False],
       [False, False,  True,  True,  True, False,  True,  True,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,
        False,  True,  True, False,  True,  True,  True,  True, False,
        False, False,  True, False,  True],
       [False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False,  True, False,  True,  True,  True,  True,
        False, False,  True, False,  True, False,  True,  True,  True,
         True,  True, False, False, False],
       [ True, False,  True,  True, False,  True,  True,  True, False,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True, False, False,  True,  True, False, False, False,
         True,  

In [129]:
res_np.shape

(8, 32)

In [130]:
mult.shape

(8, 32)

## Weight update kernel

In [131]:
bs = 4

In [132]:
dim = 8
topk = 2

x = np.random.rand(bs,dim).astype(np.float32)
y = np.random.rand(bs,dim).astype(np.float32)
x.shape,y.shape, topk

((4, 8), (4, 8), 2)

x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             __global  float* xout,    // INPUT
                             uint topk,
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      uint bs = get_global_size(1);
      uint gid2 = get_global_id(1);

      uint idx = n*gid2+gid;

      float valx = x[idx];
      float valy = y[idx];
      uint posx = 0;
      uint posy = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval = x[idx2];
        float tempval2 = y[idx2];
        bool larger = tempval > valx;
        bool larger2 = tempval2 > valy;

        barrier(CLK_GLOBAL_MEM_FENCE);
        posx += (larger)?1:0;
        posy += (larger2)?1:0;
        barrier(CLK_GLOBAL_MEM_FENCE);
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
      //printf("posx:%i", posx);
      if (posx < topk) {
        xoutidx[posx+topk*gid2] = gid;
      }
      if (posy < topk) {
        youtidx[posy+topk*gid2] = gid;
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
      if (gid < topk) {
        for (uint j=0; j<topk; j++) {
          float res = x[xoutidx[gid+topk*gid2]+gid2*n] * y[youtidx[j+topk*gid2]+gid2*n];
          //printf("\\nJ:%i  gid:%i", j, gid);
          //printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
          barrier(CLK_GLOBAL_MEM_FENCE);
          xout[gid2*topk*topk+j*topk+gid] = res;
          barrier(CLK_GLOBAL_MEM_FENCE);
          
        }
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
    }""").build()

In [133]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
#x_cp_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
#x_idx_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
#y_idx_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             __global  float* xout,    // INPUT
                             uint topk,
                             uint bs,
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      //uint bs = get_global_size(1);
      //uint gid2 = get_global_id(1);

      for (uint gid2=0; gid2<bs; gid2++){
        uint idx = n*gid2+gid;

        float valx = x[idx];
        float valy = y[idx];
        uint posx = 0;
        uint posy = 0;
        for (uint i = 0; i < n; i++) {
          uint idx2 = n*gid2+i;
          float tempval = x[idx2];
          float tempval2 = y[idx2];
          bool larger = tempval > valx;
          bool larger2 = tempval2 > valy;

          posx += (larger)?1:0;
          posy += (larger2)?1:0;
        }
        //printf("posx:%i", posx);
        if (posx < topk) {
        xoutidx[posx+topk*gid2] = gid;
        }
        if (posy < topk) {
          youtidx[posy+topk*gid2] = gid;
        }
      }
      for (uint gid2=0; gid2<bs; gid2++){
        if (gid < topk) {
          for (uint j=0; j<topk; j++) {
            float res = x[xoutidx[gid+topk*gid2]+gid2*n] * y[youtidx[j+topk*gid2]+gid2*n];
            //printf("\\nJ:%i  gid:%i", j, gid);
            //printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
            //barrier(CLK_GLOBAL_MEM_FENCE);
            xout[gid2*topk*topk+j*topk+gid] = res;
          }
        }
      }
    }""").build()

In [134]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
evt = knl(queue, [dim], None, x_buf, y_buf, x_cp_buf, np.uint32(topk), np.uint32(bs), x_idx_buf, y_idx_buf)

#evt.wait()
resx = np.zeros(bs*topk*topk).astype(np.float32)
resxidx = np.zeros(bs*topk).astype(np.uint32)
resyidx = np.zeros(bs*topk).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7fa178332db0>

knl(queue, [dim], None, y_buf, x_buf, x_cp_buft, np.uint32(topk), np.uint32(bs), x_idx_buft, y_idx_buft)

#evt.wait()
resx = np.zeros(bs*topk*topk).astype(np.float32)
resxidx = np.zeros(bs*topk).astype(np.uint32)
resyidx = np.zeros(bs*topk).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

In [135]:
x

array([[0.4176165 , 0.87723774, 0.93799776, 0.4989376 , 0.53865486,
        0.99209   , 0.43981695, 0.9238172 ],
       [0.9643642 , 0.39408413, 0.05129792, 0.8836294 , 0.46156052,
        0.79162216, 0.8183572 , 0.7356259 ],
       [0.24731638, 0.527043  , 0.4360408 , 0.74492866, 0.97902703,
        0.7912716 , 0.3733227 , 0.70151573],
       [0.5769778 , 0.63135755, 0.12137668, 0.4770543 , 0.71375763,
        0.20705712, 0.31999564, 0.89218843]], dtype=float32)

In [136]:
y

array([[0.80792934, 0.62712705, 0.16275288, 0.8634472 , 0.32329696,
        0.69843024, 0.02926809, 0.01935733],
       [0.7276622 , 0.7565494 , 0.9753612 , 0.50341916, 0.39989227,
        0.08980402, 0.06077567, 0.63835126],
       [0.30189443, 0.43184277, 0.42384407, 0.00961545, 0.6057264 ,
        0.4530684 , 0.19571681, 0.67383313],
       [0.3370894 , 0.92448306, 0.70374525, 0.48769042, 0.68747973,
        0.4852812 , 0.36849618, 0.73591477]], dtype=float32)

In [137]:
x.shape, y.shape

((4, 8), (4, 8))

In [138]:
resx

array([0.85661733, 0.80991155, 0.8015386 , 0.7578359 , 0.94060344,
       0.86185783, 0.72958916, 0.6685093 , 0.6597009 , 0.53318506,
       0.5930225 , 0.47929412, 0.82481307, 0.65985686, 0.65657467,
       0.5252648 ], dtype=float32)

In [139]:
resx.reshape(bs,topk,topk)

array([[[0.85661733, 0.80991155],
        [0.8015386 , 0.7578359 ]],

       [[0.94060344, 0.86185783],
        [0.72958916, 0.6685093 ]],

       [[0.6597009 , 0.53318506],
        [0.5930225 , 0.47929412]],

       [[0.82481307, 0.65985686],
        [0.65657467, 0.5252648 ]]], dtype=float32)

In [140]:
resxidx

array([5, 2, 0, 3, 4, 5, 7, 4], dtype=uint32)

In [141]:
resyidx

array([3, 0, 2, 1, 7, 4, 1, 7], dtype=uint32)

In [142]:
idx = 1
xy0 = x[idx].reshape(dim,1)*y[idx]
xy0.shape

(8, 8)

In [143]:
xy0[3][7]

0.56406593

### update vals add dense

In [144]:
matadd = np.random.randn(*a.shape).astype(np.float32)
matadd

array([[ 0.05752929, -0.0509786 , -0.60213155,  1.0984399 , -0.9453555 ,
         0.15302877,  0.14521188, -0.49142304,  0.96433103,  0.44916302,
        -1.4808178 , -1.613814  , -0.05258876, -0.12159251,  0.08476306,
        -0.10924815, -0.17528397, -1.160786  ,  2.4249256 ,  0.77042574,
        -1.4372578 , -0.7746275 ,  0.62930965, -0.54892737, -0.98212427,
         0.22562088,  2.1874866 , -0.19054703,  1.9791809 ,  0.04097094,
        -0.26948825,  0.27872083, -1.3592024 ,  3.3965538 , -1.2980617 ,
         0.9647827 , -1.1351737 ,  0.67432976,  0.5838635 , -1.0670233 ,
        -1.6267926 ,  0.15372923,  0.8212984 ,  1.5532309 ,  1.4015162 ,
         1.4737281 ,  0.46314296, -0.0056166 , -0.7598129 , -1.4425282 ,
        -0.538515  , -1.0111481 , -0.24240252, -1.1558282 , -1.5719486 ,
        -0.00770444,  1.6872655 , -0.05218272,  0.17531519,  0.7683851 ,
         0.9194887 , -0.29241613, -0.02061458,  1.0297822 ],
       [ 0.7131262 , -0.8430804 , -1.1752378 , -0.9420021 , -1.

In [145]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [146]:
a_added = a + matadd

In [184]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
add_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=matadd)

prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddense(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            uint   awidth,
                            __global  float* vector_x    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      uint nnz    = rowNnz[gid];
      uint baseidxs = gid*ellwidth;
      uint baseidxd = gid*awidth;
      
      for (uint i=0; i<awidth; i++) {
        float addval = vector_x[baseidxd+i];
        //if (gid==1)
        //  printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[baseidxs+i]);
        if (addval == 0) {
          continue;
        }
        if (i == colIdx[baseidxs+i]) {
          matData[baseidxs+i] += addval;
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          for (uint j=nnz; j>i; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          //if (gid==1)
          //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] = addval;
          colIdx[baseidxs+i] = i;
        }
      }
    }""").build()

In [185]:
a.shape, b.shape

((8, 64), (64, 32))

In [186]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [187]:
rows = a.shape[0]

In [188]:
mult = mult.astype(np.float32)

In [189]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, adata_buf, acols_buf, annzs_buf, np.float32(1), np.uint32(ellwa),np.uint32(a.shape[1]), add_buf)

<pyopencl._cl.Event at 0x7fa178257e00>

In [190]:
matadd[0][0]

0.05752929

In [191]:
data_res = np.empty_like(adata)
cols_res = np.empty_like(acols)
nnzs_res = np.empty_like(annz)
cl.enqueue_copy(queue, data_res, adata_buf)
cl.enqueue_copy(queue, cols_res, acols_buf)
cl.enqueue_copy(queue, nnzs_res, annzs_buf)

<pyopencl._cl.NannyEvent at 0x7fa178360770>

In [192]:
adenseadd = to_dense(data_res, cols_res, nnzs_res, ellwa, a.shape)
adenseadd

array([[ 0.90557909,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.24151757,
         0.        ,  0.57852703,  0.        ,  0.        ,  0.        ,
         0.32244816,  0.        ,  0.        ,  0.57253122,  0.60630077,
         0.9894926 ,  0.02636844,  0.        ,  0.67207873,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.98229825,
         0.3666549 ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.93975037,  0.11934852,  0.        ,  0.49876079,  0.35858813,
         0.64539164,  0.99408501,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.02024503,  0.14442326],
       [ 0.        ,  0.        ,  0.90089077,  0.07546286,  0.

In [193]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [194]:
matadd

array([[ 0.05752929, -0.0509786 , -0.60213155,  1.0984399 , -0.9453555 ,
         0.15302877,  0.14521188, -0.49142304,  0.96433103,  0.44916302,
        -1.4808178 , -1.613814  , -0.05258876, -0.12159251,  0.08476306,
        -0.10924815, -0.17528397, -1.160786  ,  2.4249256 ,  0.77042574,
        -1.4372578 , -0.7746275 ,  0.62930965, -0.54892737, -0.98212427,
         0.22562088,  2.1874866 , -0.19054703,  1.9791809 ,  0.04097094,
        -0.26948825,  0.27872083, -1.3592024 ,  3.3965538 , -1.2980617 ,
         0.9647827 , -1.1351737 ,  0.67432976,  0.5838635 , -1.0670233 ,
        -1.6267926 ,  0.15372923,  0.8212984 ,  1.5532309 ,  1.4015162 ,
         1.4737281 ,  0.46314296, -0.0056166 , -0.7598129 , -1.4425282 ,
        -0.538515  , -1.0111481 , -0.24240252, -1.1558282 , -1.5719486 ,
        -0.00770444,  1.6872655 , -0.05218272,  0.17531519,  0.7683851 ,
         0.9194887 , -0.29241613, -0.02061458,  1.0297822 ],
       [ 0.7131262 , -0.8430804 , -1.1752378 , -0.9420021 , -1.

In [195]:
a_added

array([[ 9.0557909e-01, -5.0978601e-02, -6.0213155e-01,  1.0984399e+00,
        -9.4535547e-01,  1.5302877e-01,  1.4521188e-01, -4.9142304e-01,
         9.6433103e-01,  4.4916302e-01, -1.4808178e+00, -1.6138140e+00,
        -5.2588761e-02, -1.2159251e-01,  3.2628062e-01, -1.0924815e-01,
         4.0324306e-01, -1.1607860e+00,  2.4249256e+00,  7.7042574e-01,
        -1.1148096e+00, -7.7462751e-01,  6.2930965e-01,  2.3603857e-02,
        -3.7582350e-01,  1.2151135e+00,  2.2138550e+00, -1.9054703e-01,
         2.6512597e+00,  4.0970940e-02, -2.6948825e-01,  2.7872083e-01,
        -1.3592024e+00,  3.3965538e+00, -3.1576347e-01,  1.3314376e+00,
        -1.1351737e+00,  6.7432976e-01,  5.8386350e-01, -1.0670233e+00,
        -6.8704218e-01,  2.7307776e-01,  8.2129842e-01,  2.0519917e+00,
         1.7601043e+00,  2.1191196e+00,  1.4572279e+00, -5.6165955e-03,
        -7.5981289e-01, -1.4425282e+00, -5.3851497e-01, -1.0111481e+00,
        -2.4240252e-01, -1.1558282e+00, -1.5719486e+00, -7.70444

In [196]:
adenseadd == a_added

array([[ True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, Fals

### update vals

In [169]:
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
add_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=matadd)

prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddenset(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            uint   aheight,
                            __global  float* vector_x    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint ncols = get_global_size(0);

      uint nnz    = rowNnz[gid];
      uint baseidxs = gid*ellwidth;
      
      for (uint i=0; i<aheight; i++) {
        if (nnz > ellwidth)
            break;
        uint baseidxd = i*ncols+gid;
        float addval = vector_x[baseidxd];
        //if (gid==1)
        //  printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[baseidxs+i]);
        if (addval == 0) {
          continue;
        }
        if (i == colIdx[baseidxs+i]) {
          printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] += addval;
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (i > colIdx[baseidxs+i]) {
            rowNnz[gid] += 1;
            //if (gid==1)
            //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i] = addval;
            colIdx[baseidxs+i] = i;
            if (nnz >= ellwidth)
              break;
          }
          for (uint j=nnz; j>i; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          //if (gid==1)
          //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] = addval;
          colIdx[baseidxs+i] = i;
        }
      }
    }""").build()

In [170]:
a.shape, b.shape

((8, 64), (64, 32))

In [171]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [172]:
cols = a.shape[1]

In [173]:
mult = mult.astype(np.float32)

In [174]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddenset  # Use this Kernel object for repeated calls
knl(queue, [cols], None, adatat_buf, acolst_buf, annzst_buf, np.float32(1), np.uint32(ellwat),np.uint32(a.T.shape[1]), add_buf)

<pyopencl._cl.Event at 0x7fa1783489f0>


ADD VAL:0.06 idx:0/0  col:0
ADD VAL:0.08 idx:70/14  col:0
ADD VAL:-0.18 idx:80/16  col:0
ADD VAL:-1.44 idx:100/20  col:0
ADD VAL:-0.55 idx:115/23  col:0
ADD VAL:-0.98 idx:120/24  col:0
ADD VAL:0.23 idx:125/25  col:0
ADD VAL:2.19 idx:130/26  col:0
ADD VAL:1.98 idx:140/28  col:0
ADD VAL:0.04 idx:145/29  col:0
ADD VAL:-1.30 idx:170/34  col:0
ADD VAL:0.96 idx:175/35  col:0
ADD VAL:-1.63 idx:200/40  col:0
ADD VAL:0.15 idx:205/41  col:0
ADD VAL:1.55 idx:215/43  col:0
ADD VAL:1.40 idx:220/44  col:0
ADD VAL:1.47 idx:225/45  col:0
ADD VAL:0.46 idx:230/46  col:0
ADD VAL:-1.16 idx:265/53  col:0
ADD VAL:0.92 idx:300/60  col:0
ADD VAL:-0.02 idx:310/62  col:0
ADD VAL:1.03 idx:315/63  col:0
ADD VAL:-1.54 idx:166/98  col:2
ADD VAL:-0.30 idx:191/103  col:2
ADD VAL:0.74 idx:196/104  col:2
ADD VAL:0.59 idx:211/107  col:2
ADD VAL:0.53 idx:221/109  col:2
ADD VAL:-0.84 idx:256/116  col:2
ADD VAL:-0.19 idx:261/117  col:2
ADD VAL:1.74 idx:291/123  col:2
ADD VAL:-1.18 idx:11/67  col:2
ADD VAL:-0.94 idx:16/68 

In [175]:
matadd[0][0]

0.05752929

In [176]:
datat_res = np.empty_like(adatat)
colst_res = np.empty_like(acolst)
nnzst_res = np.empty_like(annzt)
cl.enqueue_copy(queue, datat_res, adatat_buf)
cl.enqueue_copy(queue, colst_res, acolst_buf)
cl.enqueue_copy(queue, nnzst_res, annzst_buf)

<pyopencl._cl.NannyEvent at 0x7fa178348400>

In [177]:
adenseaddt = to_dense(datat_res, colst_res, nnzst_res, ellwat, a.T.shape).T
adenseaddt

array([[ 0.90557909, -0.60213155, -0.60213155,  1.09843993, -0.94535547,
         0.15302877,  0.14521188, -0.49142304,  0.96433103, -1.48081779,
        -1.48081779,  0.        , -0.05258876, -0.12159251,  0.        ,
        -0.10924815,  0.40324306, -1.16078603,  2.42492557,  0.77042574,
        -1.11480963, -0.77462751,  0.62930965,  0.        , -0.3758235 ,
         1.21511352,  2.21385503, -0.19054703,  2.65125966, -0.26948825,
        -0.26948825,  0.27872083, -1.35920238,  3.39655375, -0.31576347,
         1.33143759, -1.13517368,  0.        , -1.06702328, -1.06702328,
        -0.68704218,  0.27307776,  0.82129842,  0.        ,  1.7601043 ,
         1.45722795,  0.        , -0.0056166 , -0.75981289, -1.44252825,
         0.        , -1.0111481 , -0.24240252, -1.57194865, -1.57194865,
         0.        ,  1.68726552, -0.05218272,  0.        ,  0.76838511,
        -0.29241613, -0.29241613,  0.        ,  1.17420542],
       [ 0.71312618, -0.8430804 , -0.27434701, -0.86653924, -1.

In [178]:
a

array([[0.8480498 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24151757,
        0.        , 0.57852703, 0.        , 0.        , 0.        ,
        0.32244816, 0.        , 0.        , 0.5725312 , 0.6063008 ,
        0.9894926 , 0.02636844, 0.        , 0.6720787 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.98229825,
        0.3666549 , 0.        , 0.        , 0.        , 0.        ,
        0.9397504 , 0.11934852, 0.        , 0.4987608 , 0.35858813,
        0.64539164, 0.994085  , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02024503, 0.14442326],
       [0.        , 0.        , 0.90089077, 0.07546286, 0.        ,
        0.        , 0.03899236, 0.        , 0.        , 0. 

In [179]:
matadd

array([[ 0.05752929, -0.0509786 , -0.60213155,  1.0984399 , -0.9453555 ,
         0.15302877,  0.14521188, -0.49142304,  0.96433103,  0.44916302,
        -1.4808178 , -1.613814  , -0.05258876, -0.12159251,  0.08476306,
        -0.10924815, -0.17528397, -1.160786  ,  2.4249256 ,  0.77042574,
        -1.4372578 , -0.7746275 ,  0.62930965, -0.54892737, -0.98212427,
         0.22562088,  2.1874866 , -0.19054703,  1.9791809 ,  0.04097094,
        -0.26948825,  0.27872083, -1.3592024 ,  3.3965538 , -1.2980617 ,
         0.9647827 , -1.1351737 ,  0.67432976,  0.5838635 , -1.0670233 ,
        -1.6267926 ,  0.15372923,  0.8212984 ,  1.5532309 ,  1.4015162 ,
         1.4737281 ,  0.46314296, -0.0056166 , -0.7598129 , -1.4425282 ,
        -0.538515  , -1.0111481 , -0.24240252, -1.1558282 , -1.5719486 ,
        -0.00770444,  1.6872655 , -0.05218272,  0.17531519,  0.7683851 ,
         0.9194887 , -0.29241613, -0.02061458,  1.0297822 ],
       [ 0.7131262 , -0.8430804 , -1.1752378 , -0.9420021 , -1.

In [180]:
a_added

array([[ 9.0557909e-01, -5.0978601e-02, -6.0213155e-01,  1.0984399e+00,
        -9.4535547e-01,  1.5302877e-01,  1.4521188e-01, -4.9142304e-01,
         9.6433103e-01,  4.4916302e-01, -1.4808178e+00, -1.6138140e+00,
        -5.2588761e-02, -1.2159251e-01,  3.2628062e-01, -1.0924815e-01,
         4.0324306e-01, -1.1607860e+00,  2.4249256e+00,  7.7042574e-01,
        -1.1148096e+00, -7.7462751e-01,  6.2930965e-01,  2.3603857e-02,
        -3.7582350e-01,  1.2151135e+00,  2.2138550e+00, -1.9054703e-01,
         2.6512597e+00,  4.0970940e-02, -2.6948825e-01,  2.7872083e-01,
        -1.3592024e+00,  3.3965538e+00, -3.1576347e-01,  1.3314376e+00,
        -1.1351737e+00,  6.7432976e-01,  5.8386350e-01, -1.0670233e+00,
        -6.8704218e-01,  2.7307776e-01,  8.2129842e-01,  2.0519917e+00,
         1.7601043e+00,  2.1191196e+00,  1.4572279e+00, -5.6165955e-03,
        -7.5981289e-01, -1.4425282e+00, -5.3851497e-01, -1.0111481e+00,
        -2.4240252e-01, -1.1558282e+00, -1.5719486e+00, -7.70444

In [181]:
adenseaddt == a_added

array([[ True, False,  True,  True,  True,  True,  True,  True,  True,
        False,  True, False,  True,  True, False,  True,  True,  True,
         True,  True,  True,  True,  True, False,  True,  True,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,
         True, False, False,  True,  True,  True,  True, False,  True,
        False, False,  True,  True,  True, False,  True,  True, False,
         True, False,  True,  True, False,  True, False,  True, False,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,
        False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  Tru

### Make Random

In [183]:
rand = SparseTensor.uniform(2,4)
rand

<SparseTensor <GPUBuffer with shape (8,)> with grad None>

In [None]:
rand.to_numpy()

In [None]:
rand.data

### update vals

In [None]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)

In [None]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint gid2 = get_global_id(1);
      uint topk = get_global_size(0);
      uint bs = get_global_size(1);
      uint baseupdateidx = topk*topk*gid2;
      uint baseidxidx = topk*gid2;
      uint col = updateyidx[baseidxidx+gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint row = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

In [None]:
knl = prg.addvals  # Use this Kernel object for repeated calls
knl(queue, [topk,bs], None, adata_buf, acols_buf, annzs_buf, np.float32(1), np.uint32(ellwa), x_cp_buf, x_idx_buf, y_idx_buf)

resa = np.empty_like(adata)
resaidx = np.zeros(acols.shape).astype(np.uint32)
resannz = np.zeros(annz.shape).astype(np.uint32)

cl.enqueue_copy(queue, resa, adata_buf)
cl.enqueue_copy(queue, resaidx, acols_buf)
cl.enqueue_copy(queue, resannz, annzs_buf)

In [None]:
resa.shape, resaidx.shape, resannz.shape, ellwa, a.T.shape

In [None]:
adenseadd = to_dense(resa, resaidx, resannz, ellwa, a.T.shape)
adenseadd

In [None]:
adenseadd - adense

In [None]:
adenseadd == adense

In [None]:
ellwa

In [None]:
adata2 = adata.reshape(-1, ellwa)
adata2

In [None]:
resa = resa.reshape(-1, ellwa)
resa

In [None]:
resa - adata2

In [None]:
acols

In [None]:
resaidx

In [None]:
resannz

In [None]:
annz

### update vals2

In [None]:
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)

In [None]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint gid2 = get_global_id(1);
      uint topk = get_global_size(0);
      uint bs = get_global_size(1);
      uint baseupdateidx = topk*topk*gid2;
      uint baseidxidx = topk*gid2;
      uint row = updateyidx[baseidxidx+gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint col = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

In [None]:
knl = prg.addvals  # Use this Kernel object for repeated calls
knl(queue, [topk,bs], None, adatat_buf, acolst_buf, annzst_buf, np.float32(1), np.uint32(ellwat), x_cp_buf, x_idx_buf, y_idx_buf)

resat = np.empty_like(adatat)
resaidxt = np.zeros(acolst.shape).astype(np.uint32)
resannzt = np.zeros(annzt.shape).astype(np.uint32)

cl.enqueue_copy(queue, resat, adatat_buf)
cl.enqueue_copy(queue, resaidxt, acolst_buf)
cl.enqueue_copy(queue, resannzt, annzst_buf)

In [None]:
ellwa

In [None]:
resat.shape, resaidxt.shape, resannzt.shape

In [None]:
adenseaddt = to_dense(resat, resaidxt, resannzt, ellwat, a.T.shape)
adenseaddt

In [None]:
adenseadd == adenseaddt.T

In [None]:
adata2t = adatat.reshape(-1, ellwat)
adata2t

In [None]:
resat = resat.reshape(-1, ellwat)
resat

In [None]:
resat - adata2t

In [None]:
acols

In [None]:
resaidx

In [None]:
resannz

In [None]:
annz

# OTHER

import numpy as np
import pyopencl as cl

mf = cl.mem_flags

dim = 16
topk = 4

x = np.random.rand(dim).astype(np.float32)
y = np.random.rand(dim).astype(np.float32)
x.shape,y.shape

dim1 = 4
dim2 = 8
dim3 = 1

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.2

a = np.zeros((dim1,dim2))
b = np.random.rand(dim2,dim3).flatten().astype(np.float32)

a.shape, b.shape

In [None]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
val_out_buf = cl.Buffer(ctx, mf.READ_WRITE, 4*topk*topk)
x_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)

prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                         __global  float* y,    // INPUT
                         __global  float* xout,    // INPUT
                         uint topk,
                         __global  uint* xoutidx,    // INPUT
                         __global  uint* youtidx    // INPUT
                        ) { // LOCAL SHARED BUFFER
  uint gid = get_global_id(0);
  uint n = get_global_size(0);
  
  xout[gid] = x[gid];
  xoutidx[gid] = gid;
  youtidx[gid] = gid;
  
  float valx = x[gid];
  float valy = y[gid];
  uint posx = 0;
  uint posy = 0;
  for (uint i = 0; i < n; i++) {
    float tempval = x[i];
    float tempval2 = y[i];
    bool larger = tempval > valx;
    bool larger2 = tempval2 > valy;
      
    posx += (larger)?1:0;
    posy += (larger2)?1:0;
  }
  //printf("posx:%i", posx);
  if (posx < topk) {
    xoutidx[posx] = gid;
  }
  if (posy < topk) {
    youtidx[posy] = gid;
  }
  if (gid < topk) {
    uint i = gid;
    for (uint j=0; j<topk; j++) {
      xout[gid*topk+j] = x[xoutidx[gid]] * y[youtidx[j]];
    }
  }
}""").build()

In [None]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
event = knl(queue, [dim,], None, x_buf, y_buf, val_out_buf, np.uint32(topk), x_idx_buf, y_idx_buf)

#event.wait()
val_out = np.zeros(topk*topk).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)

cl.enqueue_copy(queue, val_out, val_out_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf, wait_for=[event])
cl.enqueue_copy(queue, resyidx, y_idx_buf)

In [None]:
val_out

In [None]:
resxidx

In [None]:
resyidx

In [None]:
asdf

In [None]:
from __future__ import division

KERNEL_CODE = """
// Thread block size
#define BLOCK_SIZE %(block_size)d
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA %(w_a)d // Matrix A width
#define HA %(h_a)d // Matrix A height
#define WB %(w_b)d // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
/* Matrix multiplication: C = A * B.
 * Device code.
 */
#define AS(j, i) As[i + j * BLOCK_SIZE]
#define BS(j, i) Bs[i + j * BLOCK_SIZE]
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! WA is A's width and WB is B's width
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(16,16,1))) 
void
matrixMul( __global float* C, __global float* A, __global float* B)
{
    __local float As[BLOCK_SIZE*BLOCK_SIZE];
    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);
    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);
    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + WA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * WB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0.0f;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + WA * ty + tx];
        BS(ty, tx) = B[b + WB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        barrier(CLK_LOCAL_MEM_FENCE);
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
"""
