In [1]:
from tinygrad.densetensor import DenseTensor
from tinygrad.sparsetensor import SparseTensor
import numpy as np

%load_ext autoreload
%autoreload 2

DEVICE:GPU


In [2]:
x_init = np.random.randn(2,6).astype(np.float32)
x2_init = np.random.randn(3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(6,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)

x = DenseTensor(x_init)
W = DenseTensor(W_init)
m = DenseTensor(m_init)
out = x.dot(W).relu()
out = out.logsoftmax()
out = out.mul(m).add(m).sum()
out.backward()

out.cpu().data, x

x2 = DenseTensor(x2_init)#.gpu()
W = SparseTensor(W_init)
out = W.dot(x2).relu().sum()

out.backward()

out.cpu().data, x

In [3]:
import numpy as np
import pyopencl as cl

mf = cl.mem_flags

In [4]:
dim1 = 2
dim2 = 6
dim3 = 4
topkx = 2
topky = 4
topk  = topkx
bs = dim3

np.random.seed(9)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.4

a = np.zeros((dim1,dim2))
b = np.zeros((dim2,dim3)).astype(np.float32)

a.shape, b.shape

((2, 6), (6, 4))

In [5]:
x_init = np.random.randn(dim1,dim3).astype(np.float32)
w_init = np.random.randn(dim2,dim3).astype(np.float32)

In [6]:
w_init

array([[-0.24068058, -0.64794743,  0.6358911 ,  1.7401173 ],
       [ 0.2966822 ,  0.7075037 ,  1.8228158 ,  0.43076903],
       [ 1.5427296 , -0.9007212 , -0.13712502,  1.297579  ],
       [ 0.67527115,  0.03195812,  0.9181459 ,  0.38050947],
       [ 0.5163675 , -0.35523945,  0.208777  ,  0.32841107],
       [-0.49822477, -2.0917768 , -0.08258774,  2.4551826 ]],
      dtype=float32)

In [7]:
def fill_sparse(mat, sparsity=0.5):
    indices = np.array(range(mat.shape[1]))
    nrows = int(mat.shape[1]*sparsity)
    for row in range(mat.shape[0]):
        lim = nrows #+ int(np.random.random()*3)
        mat[row][np.random.permutation(indices)[:lim]] = np.random.random(lim)
    return mat

a = fill_sparse(a, sparsity).astype(np.float32)
b = fill_sparse(b, sparsity).astype(np.float32)

In [8]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [9]:
b

array([[0.        , 0.        , 0.27972782, 0.        ],
       [0.        , 0.        , 0.16953142, 0.        ],
       [0.        , 0.12835628, 0.        , 0.        ],
       [0.48449844, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.2667744 ],
       [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32)

In [10]:
x2_init.T

array([0.5358892, 1.8311105, 0.5782868], dtype=float32)

In [11]:
mult = a.dot(b)
mult.shape

(2, 4)

In [12]:
mult.shape

(2, 4)

In [13]:
def to_data(mat):
    all_rows = []
    all_idxs = []
    all_nnzs = []
    for row in range(mat.shape[0]):
        rowdata = []
        colidxs = []
        all_nnzs.append(0)
        for col in range(mat.shape[1]):
            val = mat[row][col]
            if val != 0:
                rowdata.append(val)
                colidxs.append(col)
                all_nnzs[-1] += 1
        all_rows.append(rowdata)
        all_idxs.append(colidxs)
    
    ellwidth = min(int(np.sqrt(np.max(all_nnzs))+1)**2, mat.shape[1])
    #all_rows = np.array(all_rows)#.astype(np.float32).flatten()           
    for row in range(mat.shape[0]):
        #print(row, all_rows)
        all_rows[row] = np.array(all_rows[row])
        all_rows[row].resize(ellwidth)
        all_idxs[row] = np.array(all_idxs[row])
        all_idxs[row].resize(ellwidth)
        #print(all_idxs[row])
    all_rows = np.array(all_rows)
    all_idxs = np.array(all_idxs)
    all_nnzs = np.array(all_nnzs)
    
#     while (not all_rows[:,-1].any()):
#         all_rows = all_rows[:,:-1]
#         all_idxs = all_idxs[:,:-1]
#         ellwidth -= 1
        
    
    all_rows = np.array(all_rows).astype(np.float32).flatten()
    all_idxs = np.array(all_idxs).astype(np.uint32).flatten()
    
    all_nnzs = np.array(all_nnzs).astype(np.uint32)
    
    
    return all_rows, all_idxs, all_nnzs, ellwidth

In [14]:
def to_dense(data, cols, nnzs, ellw, shape):
    out = np.zeros(shape)
    for row in range(shape[0]):
        for icol in range(nnzs[row]):
            out[row,cols[row*ellw+icol]] = data[row*ellw+icol]
    return out

In [15]:
wdata, wcols, wnnz, ellww = to_data(w_init)
wdata, wcols, wnnz, ellww

(array([-0.24068058, -0.64794743,  0.6358911 ,  1.7401173 ,  0.2966822 ,
         0.7075037 ,  1.8228158 ,  0.43076903,  1.5427296 , -0.9007212 ,
        -0.13712502,  1.297579  ,  0.67527115,  0.03195812,  0.9181459 ,
         0.38050947,  0.5163675 , -0.35523945,  0.208777  ,  0.32841107,
        -0.49822477, -2.0917768 , -0.08258774,  2.4551826 ], dtype=float32),
 array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
        2, 3], dtype=uint32),
 array([4, 4, 4, 4, 4, 4], dtype=uint32),
 4)

In [16]:
wdatat, wcolst, wnnzt, ellwwt = to_data(w_init.T)
wdatat, wcolst, wnnzt, ellwwt

(array([-0.24068058,  0.2966822 ,  1.5427296 ,  0.67527115,  0.5163675 ,
        -0.49822477, -0.64794743,  0.7075037 , -0.9007212 ,  0.03195812,
        -0.35523945, -2.0917768 ,  0.6358911 ,  1.8228158 , -0.13712502,
         0.9181459 ,  0.208777  , -0.08258774,  1.7401173 ,  0.43076903,
         1.297579  ,  0.38050947,  0.32841107,  2.4551826 ], dtype=float32),
 array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
        4, 5], dtype=uint32),
 array([6, 6, 6, 6], dtype=uint32),
 6)

In [17]:
adata, acols, annz, ellwa = to_data(a)
adata, acols, annz, ellwa

(array([0.43555546, 0.47797403, 0.        , 0.        , 0.9252001 ,
        0.7035292 , 0.        , 0.        ], dtype=float32),
 array([2, 4, 0, 0, 0, 3, 0, 0], dtype=uint32),
 array([2, 2], dtype=uint32),
 4)

In [18]:
adatat, acolst, annzt, ellwat = to_data(a.T)
adatat, acolst, annzt, ellwat

(array([0.9252001 , 0.        , 0.        , 0.        , 0.43555546,
        0.        , 0.7035292 , 0.        , 0.47797403, 0.        ,
        0.        , 0.        ], dtype=float32),
 array([1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=uint32),
 array([1, 0, 1, 1, 1, 0], dtype=uint32),
 2)

In [19]:
bdata, bcols, bnnz, ellwb = to_data(b)
bdata, bcols, bnnz, ellwb

(array([0.27972782, 0.        , 0.        , 0.        , 0.16953142,
        0.        , 0.        , 0.        , 0.12835628, 0.        ,
        0.        , 0.        , 0.48449844, 0.        , 0.        ,
        0.        , 0.2667744 , 0.        , 0.        , 0.        ,
        0.0712197 , 0.        , 0.        , 0.        ], dtype=float32),
 array([2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0,
        0, 0], dtype=uint32),
 array([1, 1, 1, 1, 1, 1], dtype=uint32),
 4)

In [20]:
bdatat, bcolst, bnnzt, ellwbt = to_data(b.T)
adatat, bcolst, bnnzt, ellwbt

(array([0.9252001 , 0.        , 0.        , 0.        , 0.43555546,
        0.        , 0.7035292 , 0.        , 0.47797403, 0.        ,
        0.        , 0.        ], dtype=float32),
 array([3, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 4, 5, 0, 0], dtype=uint32),
 array([1, 1, 2, 2], dtype=uint32),
 4)

In [21]:
adense = to_dense(adata, acols, annz, ellwa, a.shape)

In [22]:
adenset = to_dense(adatat, acolst, annzt, ellwat, a.T.shape)

In [23]:
bdense = to_dense(bdata, bcols, bnnz, ellwb, b.shape)

In [24]:
bdenset = to_dense(bdatat, bcolst, bnnzt, ellwbt, b.T.shape)

In [25]:
adense

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.70352918, 0.        ,
        0.        ]])

In [26]:
adenset.T == adense

array([[ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True]])

In [27]:
bdenset.T == bdense

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True]])

In [28]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [29]:
a == adense

array([[ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True]])

In [30]:
a.shape

(2, 6)

In [31]:
adata.shape, acols.shape, annz.shape, ellwa

((8,), (8,), (2,), 4)

In [32]:
#acols = acols.astype(np.uint32)
#annz = annz.astype(np.uint32)

In [33]:
adata, acols, annz, b

(array([0.43555546, 0.47797403, 0.        , 0.        , 0.9252001 ,
        0.7035292 , 0.        , 0.        ], dtype=float32),
 array([2, 4, 0, 0, 0, 3, 0, 0], dtype=uint32),
 array([2, 2], dtype=uint32),
 array([[0.        , 0.        , 0.27972782, 0.        ],
        [0.        , 0.        , 0.16953142, 0.        ],
        [0.        , 0.12835628, 0.        , 0.        ],
        [0.48449844, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.2667744 ],
        [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32))

## MatMul (Sparse-Dense)

adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // SPARSE x DENSE
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      

      uint nnz    = rowNnz[gid];
      float sum = 0;
      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[col*ncols+gid2];
          //if (gid==0 && gid2==2)
          //  printf("aval, xval: %.2f,%.2f: (%i,%i) \\n", aval, xval, col, index);
          sum  += aval * xval;
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [34]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // SPARSE x DENSE
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      

      uint nnz    = rowNnz[gid];
      
      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          uint xidx = col*ncols+gid2;
          float xval  = vector_x[xidx];
          if (gid==0 && gid2==1)
            printf("aval, xval: %.2f,%.2f: (%i,%i) - %i \\n", aval, xval, col, index, xidx);
          sum  += aval * xval;
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [35]:
a.shape, b.shape

((2, 6), (6, 4))

In [36]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [37]:
rows = a.shape[0]

In [38]:
mult = mult.astype(np.float32)

In [39]:
outshape = (a.shape[0], b.shape[1])
outshape

(2, 4)

In [40]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul2  # Use this Kernel object for repeated calls
knl(queue, [outshape[0]], None, adata_buf, acols_buf, annzs_buf, np.uint32(ellwa), np.uint32(outshape[1]), b_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

aval, xval: 0.44,0.13: (2,0) - 9 
aval, xval: 0.48,0.00: (4,1) - 17 


<pyopencl._cl.NannyEvent at 0x7f48ec13aef0>

In [41]:
(res_np-mult).sum()

0.0

In [42]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [43]:
b

array([[0.        , 0.        , 0.27972782, 0.        ],
       [0.        , 0.        , 0.16953142, 0.        ],
       [0.        , 0.12835628, 0.        , 0.        ],
       [0.48449844, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.2667744 ],
       [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32)

In [44]:
res_buf

<pyopencl._cl.Buffer at 0x7f48ec124f90>

In [45]:
res_np

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [46]:
mult

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [47]:
res_np==mult

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True]])

In [48]:
res_np.shape

(2, 4)

In [49]:
mult.shape

(2, 4)

## MatMul (dense * sparse)

In [50]:
bdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdata)
bcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcols)
bnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnz)
bdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdatat)
bcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcolst)
bnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnzt)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // DENSE x SPARSE
    __kernel void matmul(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            uint   ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        uint nnz = rowNnz[gid2];
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid2 * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[gid*mwidth+col];
          sum  += aval * xval;
          if (gid==0 && gid2==0)
            printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid*ncols+gid2] = sum;
      }
    }""").build()

In [51]:
a.shape, b.shape

((2, 6), (6, 4))

In [52]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [53]:
rows = a.shape[0]

In [54]:
a.shape, b.shape

((2, 6), (6, 4))

In [55]:
mult = a.dot(b)
mult = mult.astype(np.float32)

In [56]:
outshape = np.array([a.shape[0], b.shape[1]])
outshape

array([2, 4])

In [57]:
b.T

array([[0.        , 0.        , 0.        , 0.48449844, 0.        ,
        0.        ],
       [0.        , 0.        , 0.12835628, 0.        , 0.        ,
        0.        ],
       [0.27972782, 0.16953142, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.2667744 ,
        0.0712197 ]], dtype=float32)

In [58]:
a.T

array([[0.        , 0.9252001 ],
       [0.        , 0.        ],
       [0.43555546, 0.        ],
       [0.        , 0.7035292 ],
       [0.47797403, 0.        ],
       [0.        , 0.        ]], dtype=float32)

In [59]:
outshape.T

array([2, 4])

In [60]:
b.shape, outshape

((6, 4), array([2, 4]))

In [61]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul  # Use this Kernel object for repeated calls
knl(queue, [outshape.T[0]], None, bdatat_buf, bcolst_buf, bnnzst_buf, np.uint32(ellwbt), np.uint32(b.shape[0]), np.uint32(outshape.T[1]), a_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec149810>

aval, xval: 0.48,0.00 - 0.00: (3,0) 


In [62]:
(res_np-mult).sum()

0.0

In [63]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [64]:
b

array([[0.        , 0.        , 0.27972782, 0.        ],
       [0.        , 0.        , 0.16953142, 0.        ],
       [0.        , 0.12835628, 0.        , 0.        ],
       [0.48449844, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.2667744 ],
       [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32)

In [65]:
res_buf

<pyopencl._cl.Buffer at 0x7f48ec39cbd0>

In [66]:
res_np

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [67]:
mult

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [68]:
res_np==mult

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True]])

In [69]:
res_np-mult

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.]], dtype=float32)

In [70]:
res_np.shape

(2, 4)

In [71]:
mult.shape

(2, 4)

## MatMul2 (dense * sparse)

wdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wdata)
wcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wcols)
wnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wnnz)
wdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wdatat)
wcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wcolst)
wnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wnnzt)
x_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=x_init)

prg = cl.Program(ctx, """
    // DENSE x SPARSE-T
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            uint   ncols,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      uint nnz = rowNnz[gid];

      for (uint gid2 = 0; gid2 < ncols; gid2++) {
        float sum = 0;
        for (uint i = 0; i < nnz; i++) {
          uint index   = (gid * ellwidth) + i;
          uint col     = colIdx[index];
          float aval  = matData[index];
          float xval  = vector_x[gid2*ncols+col];
          sum  += aval * xval;
          if (gid==0 && gid2==1)
            printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        }
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        vector_y[gid2*ncols+gid] = sum;
      }
    }""").build()

In [72]:
wdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wdata)
wcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wcols)
wnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wnnz)
wdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wdatat)
wcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wcolst)
wnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=wnnzt)
x_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=x_init)

prg = cl.Program(ctx, """
    // DENSE x SPARSE-T
    __kernel void matmul2(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            uint   ncols0,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      uint nnz = rowNnz[gid];
      uint gid2 = get_global_id(1);
      uint ncols = get_global_size(1);

      float sum = 0;
      for (uint i = 0; i < nnz; i++) {
        uint index   = (gid2 * ellwidth) + i;
        uint col     = colIdx[index];
        float aval  = matData[index];
        float xval  = vector_x[gid*mwidth+col];
        sum  += aval * xval;
        if (gid==1 && gid2==0) {
          printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        }
      }
      //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
      vector_y[gid*ncols+gid2] = sum;
    }""").build()

In [73]:
outshape

array([2, 4])

In [74]:
w_init.shape, x_init.shape
w = w_init
x = x_init

In [75]:
res = np.zeros(w.shape[0]).astype(np.float32)
#res

In [76]:
rows = w.shape[0]

In [77]:
mult = mult.astype(np.float32)

In [78]:
outshape = np.array([x.shape[0], w.shape[0]])
outshape

array([2, 6])

In [79]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmul2  # Use this Kernel object for repeated calls
knl(queue, outshape, None, wdata_buf, wcols_buf, wnnzs_buf, np.uint32(ellww), np.uint32(w.shape[1]), np.uint32(x.shape[1]), x_buf, res_buf)

res_np = np.zeros(outshape).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

aval, xval: -0.24,-0.38 - 0.09: (0,0) 
aval, xval: -0.65,-0.48 - 0.40: (1,1) 
aval, xval: 0.64,-1.52 - -0.56: (2,2) 
aval, xval: 1.74,-0.49 - -1.42: (3,3) 


<pyopencl._cl.NannyEvent at 0x7f48ec1543b0>

In [80]:
mult = x.dot(w_init.T)
mult.shape

(2, 6)

In [81]:
mult

array([[-0.54477155, -2.2444572 ,  0.39883292, -1.0381185 , -0.13380992,
         0.66565317],
       [-1.4162176 , -3.4299257 , -0.5792218 , -1.8507855 , -0.50244695,
         0.11506943]], dtype=float32)

In [82]:
res_np

array([[-0.54477155, -2.2444572 ,  0.39883292, -1.0381185 , -0.13380992,
         0.66565317],
       [-1.4162176 , -3.4299257 , -0.5792218 , -1.8507855 , -0.50244695,
         0.11506943]], dtype=float32)

In [83]:
x

array([[ 1.1085547e-03, -2.8954408e-01, -1.1160663e+00, -1.2882757e-02],
       [-3.7836146e-01, -4.8113537e-01, -1.5173311e+00, -4.9087200e-01]],
      dtype=float32)

In [84]:
w

array([[-0.24068058, -0.64794743,  0.6358911 ,  1.7401173 ],
       [ 0.2966822 ,  0.7075037 ,  1.8228158 ,  0.43076903],
       [ 1.5427296 , -0.9007212 , -0.13712502,  1.297579  ],
       [ 0.67527115,  0.03195812,  0.9181459 ,  0.38050947],
       [ 0.5163675 , -0.35523945,  0.208777  ,  0.32841107],
       [-0.49822477, -2.0917768 , -0.08258774,  2.4551826 ]],
      dtype=float32)

In [85]:
(res_np-mult).sum()

0.0

In [86]:
mult

array([[-0.54477155, -2.2444572 ,  0.39883292, -1.0381185 , -0.13380992,
         0.66565317],
       [-1.4162176 , -3.4299257 , -0.5792218 , -1.8507855 , -0.50244695,
         0.11506943]], dtype=float32)

In [87]:
res_np==mult

array([[ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True]])

In [88]:
res_np-mult

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], dtype=float32)

In [89]:
res_np.shape

(2, 6)

In [90]:
mult.shape

(2, 6)

## MatMul (dense * sparse) NEW

bdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdata)
bcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcols)
bnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnz)
bdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdatat)
bcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcolst)
bnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnzt)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // DENSE x SPARSE
    __kernel void matmulnew(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      uint gid2 = get_global_id(1);
      uint ncols = get_global_size(1);
      uint nnz = rowNnz[gid2];
      float sum = 0;
      for (uint i = 0; i < nnz; i++) {
        uint index   = (gid2 * ellwidth) + i;
        uint col     = colIdx[index];
        float aval  = matData[index];
        float xval  = vector_x[gid*mwidth+col];
        vector_y[gid2*nrows+gid] += aval * xval;
        if (gid==0 && gid2==0)
          printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
        
      }
      
    }""").build()

In [91]:
bdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdata)
bcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcols)
bnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnz)
bdatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bdatat)
bcolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bcolst)
bnnzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=bnnzt)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // DENSE x SPARSE
    __kernel void matmulnew(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            uint   ellwidth,
                            uint   mwidth,
                            __global  float* vector_x,    // INPUT
                            __global  float* vector_y    // OUTPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);
      uint gid2 = get_global_id(1);
      uint ncols = get_global_size(1);
      uint nnz = rowNnz[gid2];
      float sum = 0;
      for (uint i = 0; i < nnz; i++) {
        uint index   = (gid2 * ellwidth) + i;
        uint col     = colIdx[index];
        float aval  = matData[index];
        float xval  = vector_x[gid*mwidth+col];
        sum  += aval * xval;
        if (gid==1 && gid2==0)
          printf("aval, xval: %.2f,%.2f - %.2f: (%i,%i) \\n", aval, xval, sum, col, index);
        //printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
      }
      vector_y[gid2*ncols+gid] = sum;
    }""").build()

In [92]:
a.shape, b.shape

((2, 6), (6, 4))

In [93]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [94]:
rows = a.shape[0]

In [95]:
mult = mult.astype(np.float32)

In [96]:
outshape = np.array([a.shape[0], b.shape[1]])
outshape

array([2, 4])

In [97]:
b.T

array([[0.        , 0.        , 0.        , 0.48449844, 0.        ,
        0.        ],
       [0.        , 0.        , 0.12835628, 0.        , 0.        ,
        0.        ],
       [0.27972782, 0.16953142, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.2667744 ,
        0.0712197 ]], dtype=float32)

In [98]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [99]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod(outshape)*4)
knl = prg.matmulnew  # Use this Kernel object for repeated calls
knl(queue, outshape, None, bdatat_buf, bcolst_buf, bnnzst_buf, np.uint32(ellwbt), np.uint32(b.shape[0]), a_buf, res_buf)

res_np = np.zeros((outshape[0],)).astype(np.float32)
print(res_np.shape)
cl.enqueue_copy(queue, res_np, res_buf)

(2,)aval, xval: 0.48,0.70 - 0.34: (3,0) 



<pyopencl._cl.NannyEvent at 0x7f48ec1542c0>

In [100]:
(res_np-mult.T).sum()

12.605351

In [101]:
res_buf

<pyopencl._cl.Buffer at 0x7f48ec154e50>

In [102]:
res_np.T

array([0.       , 0.3408588], dtype=float32)

In [103]:
mult

array([[-0.54477155, -2.2444572 ,  0.39883292, -1.0381185 , -0.13380992,
         0.66565317],
       [-1.4162176 , -3.4299257 , -0.5792218 , -1.8507855 , -0.50244695,
         0.11506943]], dtype=float32)

In [104]:
res_np-mult.T

array([[ 0.54477155,  1.7570764 ],
       [ 2.2444572 ,  3.7707844 ],
       [-0.39883292,  0.92008054],
       [ 1.0381185 ,  2.1916442 ],
       [ 0.13380992,  0.8433057 ],
       [-0.66565317,  0.22578935]], dtype=float32)

In [105]:
res_np.shape

(2,)

In [106]:
mult.shape

(2, 6)

In [107]:
asdf

NameError: name 'asdf' is not defined

# Matmult Dense Dense

In [None]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      uint osize = get_global_size(1);
      int gidx = get_global_id(0); // row
      int gidy = get_global_id(1); // col

      float ret = 0.0;
      for (int i = 0; i < msize; i++) {
        uint xidx = gidx*msize+i; 
        float xval = x[xidx];
        uint yidx = osize*i+gidy;
        float yval = y[yidx];
        ret += xval*yval;
        if (gidx==0 && gidy==0)
          printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, res, xidx, yidx);
      }

      //if (gidx==0&&gidy==0)
      //  printf("\\nsum:%.2f", ret);
      res[gidx * osize + gidy] = ret;
    }""").build()

In [None]:
a.shape, b.shape

In [None]:
rows = a.shape[0]

In [108]:
mult = mult.astype(np.float32)

In [109]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows,b.shape[1]], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

AttributeError: 'matmul0' was not found as a program info attribute or as a kernel name

In [110]:
(res_np-mult).sum()

ValueError: operands could not be broadcast together with shapes (2,) (2,6) 

In [111]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [112]:
b

array([[0.        , 0.        , 0.27972782, 0.        ],
       [0.        , 0.        , 0.16953142, 0.        ],
       [0.        , 0.12835628, 0.        , 0.        ],
       [0.48449844, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.2667744 ],
       [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32)

In [113]:
res_np

array([0.       , 0.3408588], dtype=float32)

In [114]:
a.dot(b)

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [115]:
res_np==mult

  res_np==mult


False

In [116]:
res_np.shape

(2,)

In [117]:
mult.shape

(2, 6)

# Matmult Dense Transposed

In [118]:
b

array([[0.        , 0.        , 0.27972782, 0.        ],
       [0.        , 0.        , 0.16953142, 0.        ],
       [0.        , 0.12835628, 0.        , 0.        ],
       [0.48449844, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.2667744 ],
       [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32)

In [119]:
c=np.zeros(b.T.shape)
bt = b.T
for row in range(bt.shape[0]):
    for col in range(bt.shape[1]):
        c[row][col] = bt[row][col]

In [120]:
bt

array([[0.        , 0.        , 0.        , 0.48449844, 0.        ,
        0.        ],
       [0.        , 0.        , 0.12835628, 0.        , 0.        ,
        0.        ],
       [0.27972782, 0.16953142, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.2667744 ,
        0.0712197 ]], dtype=float32)

In [121]:
c

array([[0.        , 0.        , 0.        , 0.48449844, 0.        ,
        0.        ],
       [0.        , 0.        , 0.12835628, 0.        , 0.        ,
        0.        ],
       [0.27972782, 0.16953142, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.26677439,
        0.0712197 ]])

In [122]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      uint osize = get_global_size(1);
      int gidx = get_global_id(0); // row
      int gidy = get_global_id(1); // col

      float ret = 0.0;
      for (int i = 0; i < msize; i++) {
        uint xidx = gidx*msize+i;
        float xval = x[xidx];
        uint yidx = msize*gidy+i;
        float yval = y[yidx];
        ret += xval*yval;
        if (gidx==0 && gidy==0)
          printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, res, xidx, yidx);
      }

      //if (gidx==0&&gidy==0)
      //  printf("\\nsum:%.2f", ret);
      res[gidx * osize + gidy] = ret;
    }""").build()

In [123]:
a.shape, b.T.shape

((2, 6), (4, 6))

In [124]:
rows = a.shape[0]

In [125]:
mult = mult.astype(np.float32)

In [126]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows,b.shape[1]], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)


mult: 0.00 x 0.00 - 0.00  -- 0/0
mult: 0.00 x 0.00 - 0.00  -- 1/1
mult: 0.44 x 0.00 - 0.00  -- 2/2
mult: 0.00 x 0.48 - 0.00  -- 3/3
mult: 0.48 x 0.00 - 0.00  -- 4/4
mult: 0.00 x 0.00 - 0.00  -- 5/5

<pyopencl._cl.NannyEvent at 0x7f48ec089e50>

In [127]:
(res_np-mult).sum()

ValueError: operands could not be broadcast together with shapes (2,4) (2,6) 

In [128]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [129]:
b

array([[0.        , 0.        , 0.27972782, 0.        ],
       [0.        , 0.        , 0.16953142, 0.        ],
       [0.        , 0.12835628, 0.        , 0.        ],
       [0.48449844, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.2667744 ],
       [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32)

In [130]:
res_np

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [131]:
a.dot(b)

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [132]:
res_np==mult

  res_np==mult


False

In [133]:
res_np.shape

(2, 4)

In [134]:
mult.shape

(2, 6)

# Matmult Transposed Dense

In [135]:
b

array([[0.        , 0.        , 0.27972782, 0.        ],
       [0.        , 0.        , 0.16953142, 0.        ],
       [0.        , 0.12835628, 0.        , 0.        ],
       [0.48449844, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.2667744 ],
       [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32)

In [136]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [137]:
at

array([[0.        , 0.9252001 ],
       [0.        , 0.        ],
       [0.43555546, 0.        ],
       [0.        , 0.7035292 ],
       [0.47797403, 0.        ],
       [0.        , 0.        ]], dtype=float32)

In [138]:
c

array([[0.        , 0.9252001 ],
       [0.        , 0.        ],
       [0.43555546, 0.        ],
       [0.        , 0.70352918],
       [0.47797403, 0.        ],
       [0.        , 0.        ]])

In [139]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
    // multilplies x TRANSPOSED by y (dense-dense)
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize,
                          uint isize
                          ) { // LOCAL SHARED BUFFER
      uint osize = get_global_size(0);
      int gidy = get_global_id(0); // row
      
      for (uint gidx = 0; gidx < isize; gidx++) {
        float ret = 0.0;
        for (uint i = 0; i < msize; i++) {
          uint xidx = i*isize+gidx;
          float xval = x[xidx];
          uint yidx = osize*i+gidy;
          float yval = y[yidx];
          ret += xval*yval;
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f  -- %i/%i", xval, yval, ret, xidx, yidx);
        }
        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        res[gidx * osize + gidy] = ret;
      }
    }""").build()

In [140]:
a.shape, b.T.shape

((2, 6), (4, 6))

In [141]:
rows = a.shape[0]

In [142]:
mult = mult.astype(np.float32)

In [143]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [b.shape[1]], None, a_buf, b_buf, res_buf, np.uint32(a.shape[1]), np.uint32(rows))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)


mult: 0.00 x 0.00 - 0.00  -- 0/0
mult: 0.00 x 0.00 - 0.00  -- 2/4
mult: 0.44 x 0.00 - 0.00  -- 4/8
mult: 0.00 x 0.48 - 0.00  -- 6/12
mult: 0.48 x 0.00 - 0.00  -- 8/16
mult: 0.00 x 0.00 - 0.00  -- 10/20

<pyopencl._cl.NannyEvent at 0x7f48ec095ef0>

In [144]:
(res_np-mult).sum()

ValueError: operands could not be broadcast together with shapes (2,4) (2,6) 

In [145]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [146]:
b

array([[0.        , 0.        , 0.27972782, 0.        ],
       [0.        , 0.        , 0.16953142, 0.        ],
       [0.        , 0.12835628, 0.        , 0.        ],
       [0.48449844, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.2667744 ],
       [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32)

In [147]:
res_np

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [148]:
a.dot(b)

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [149]:
res_np==mult

  res_np==mult


False

In [150]:
res_np.shape

(2, 4)

In [151]:
mult.shape

(2, 6)

# Matmult Dense Transposed2

In [152]:
b_buf2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

prg = cl.Program(ctx, """
    // multilplies x by y WITH Y TRANSPOSED INDEXING
    __kernel void matmul0(__global  float* x,     // INPUT MATRIX DATA
                          __global  float* y,    // INPUT
                          __global  float* res,    // INPUT
                          uint msize,
                          uint osize
                          ) { // LOCAL SHARED BUFFER
      uint isize = get_global_size(0);
      // osize = get_global_size(1);
      int gidx = get_global_id(0); // col
      // int gidy = get_global_id(1); // row

      for (uint gidy = 0; gidy < osize; gidy++) {
        float ret = 0.0;
        for (uint i = 0; i < msize; i++) {
          ret += x[gidx*msize+i]*y[i*osize+gidy];
          if (gidx==0 && gidy==0)
            printf("\\nmult: %.2f x %.2f - %.2f", x[gidx*msize+i],y[i*msize+gidy], ret);
        }

        //if (gidx==0&&gidy==0)
        //  printf("\\nsum:%.2f", ret);
        res[gidx * osize + gidy] = ret;
      }
    }""").build()

In [153]:
a.shape, b.shape

((2, 6), (6, 4))

In [154]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [155]:
rows = a.shape[0]

In [156]:
mult = mult.astype(np.float32)

In [157]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.matmul0  # Use this Kernel object for repeated calls
knl(queue, [rows], None, a_buf, b_buf2, res_buf, np.uint32(a.shape[1]), np.uint32(b.shape[1]))

res_np = np.zeros([rows,b.shape[1]]).astype(np.float32)
cl.enqueue_copy(queue, res_np, res_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec08a8b0>


mult: 0.00 x 0.00 - 0.00
mult: 0.00 x 0.17 - 0.00
mult: 0.44 x 0.48 - 0.00
mult: 0.00 x 0.00 - 0.00
mult: 0.48 x 0.00 - 0.00
mult: 0.00 x 0.00 - 0.00

In [158]:
(res_np-mult).sum()

ValueError: operands could not be broadcast together with shapes (2,4) (2,6) 

In [159]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [160]:
b.T

array([[0.        , 0.        , 0.        , 0.48449844, 0.        ,
        0.        ],
       [0.        , 0.        , 0.12835628, 0.        , 0.        ,
        0.        ],
       [0.27972782, 0.16953142, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.2667744 ,
        0.0712197 ]], dtype=float32)

In [161]:
a[0]

array([0.        , 0.        , 0.43555546, 0.        , 0.47797403,
       0.        ], dtype=float32)

In [162]:
b.T[0]

array([0.        , 0.        , 0.        , 0.48449844, 0.        ,
       0.        ], dtype=float32)

In [163]:
res_buf

<pyopencl._cl.Buffer at 0x7f48ec0951d0>

In [164]:
res_np

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [165]:
a.dot(b)

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [166]:
res_np==mult

  res_np==mult


False

In [167]:
res_np.shape

(2, 4)

In [168]:
mult.shape

(2, 6)

## Weight update kernel

In [169]:
bs = 4

In [170]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [171]:
dim = 8

x = np.random.rand(bs,dim).astype(np.float32)
y = np.random.rand(bs,dim).astype(np.float32)
x.shape,y.shape, topk

((4, 8), (4, 8), 2)

x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             __global  float* xout,    // INPUT
                             uint topk,
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      uint bs = get_global_size(1);
      uint gid2 = get_global_id(1);

      uint idx = n*gid2+gid;

      float valx = x[idx];
      float valy = y[idx];
      uint posx = 0;
      uint posy = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval = x[idx2];
        float tempval2 = y[idx2];
        bool larger = tempval > valx;
        bool larger2 = tempval2 > valy;

        barrier(CLK_GLOBAL_MEM_FENCE);
        posx += (larger)?1:0;
        posy += (larger2)?1:0;
        barrier(CLK_GLOBAL_MEM_FENCE);
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
      //printf("posx:%i", posx);
      if (posx < topk) {
        xoutidx[posx+topk*gid2] = gid;
      }
      if (posy < topk) {
        youtidx[posy+topk*gid2] = gid;
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
      if (gid < topk) {
        for (uint j=0; j<topk; j++) {
          float res = x[xoutidx[gid+topk*gid2]+gid2*n] * y[youtidx[j+topk*gid2]+gid2*n];
          //printf("\\nJ:%i  gid:%i", j, gid);
          //printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
          barrier(CLK_GLOBAL_MEM_FENCE);
          xout[gid2*topk*topk+j*topk+gid] = res;
          barrier(CLK_GLOBAL_MEM_FENCE);
          
        }
      }
      barrier(CLK_GLOBAL_MEM_FENCE);
    }""").build()

In [172]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
#x_cp_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*topk*4)
#x_idx_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)
#y_idx_buft = cl.Buffer(ctx, mf.WRITE_ONLY, bs*topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             __global  float* xout,    // INPUT
                             uint topk,
                             uint bs,
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      //uint bs = get_global_size(1);
      //uint gid2 = get_global_id(1);

      for (uint gid2=0; gid2<bs; gid2++){
        uint idx = n*gid2+gid;

        float valx = x[idx];
        float valy = y[idx];
        uint posx = 0;
        uint posy = 0;
        for (uint i = 0; i < n; i++) {
          uint idx2 = n*gid2+i;
          float tempval = x[idx2];
          float tempval2 = y[idx2];
          bool larger = tempval > valx;
          bool larger2 = tempval2 > valy;

          posx += (larger)?1:0;
          posy += (larger2)?1:0;
        }
        //printf("posx:%i", posx);
        if (posx < topk) {
        xoutidx[posx+topk*gid2] = gid;
        }
        if (posy < topk) {
          youtidx[posy+topk*gid2] = gid;
        }
      }
      for (uint gid2=0; gid2<bs; gid2++){
        if (gid < topk) {
          for (uint j=0; j<topk; j++) {
            float res = x[xoutidx[gid+topk*gid2]+gid2*n] * y[youtidx[j+topk*gid2]+gid2*n];
            //printf("\\nJ:%i  gid:%i", j, gid);
            //printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
            //barrier(CLK_GLOBAL_MEM_FENCE);
            xout[gid2*topk*topk+j*topk+gid] = res;
          }
        }
      }
    }""").build()

In [173]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
evt = knl(queue, [dim], None, x_buf, y_buf, x_cp_buf, np.uint32(topk), np.uint32(bs), x_idx_buf, y_idx_buf)

#evt.wait()
resx = np.zeros(bs*topk*topk).astype(np.float32)
resxidx = np.zeros(bs*topk).astype(np.uint32)
resyidx = np.zeros(bs*topk).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec08fb30>

knl(queue, [dim], None, y_buf, x_buf, x_cp_buft, np.uint32(topk), np.uint32(bs), x_idx_buft, y_idx_buft)

#evt.wait()
resx = np.zeros(bs*topk*topk).astype(np.float32)
resxidx = np.zeros(bs*topk).astype(np.uint32)
resyidx = np.zeros(bs*topk).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

In [174]:
x

array([[0.34280524, 0.4441123 , 0.49813962, 0.41434634, 0.21425596,
        0.21663971, 0.23230712, 0.7290856 ],
       [0.41591868, 0.5838982 , 0.4943366 , 0.43004683, 0.7347658 ,
        0.5562165 , 0.30263063, 0.9707982 ],
       [0.22484261, 0.13876915, 0.6149325 , 0.49192894, 0.18315274,
        0.10931052, 0.5395755 , 0.28929928],
       [0.10305405, 0.49060154, 0.6320917 , 0.03796922, 0.50010794,
        0.08354343, 0.79131824, 0.758036  ]], dtype=float32)

In [175]:
y

array([[0.26227322, 0.10085859, 0.9171927 , 0.5174602 , 0.13353968,
        0.11254311, 0.8416411 , 0.54607475],
       [0.26101658, 0.5758255 , 0.3138526 , 0.7916232 , 0.49309072,
        0.96515816, 0.52031195, 0.9030216 ],
       [0.24980456, 0.5214873 , 0.5057457 , 0.8130715 , 0.07319422,
        0.836207  , 0.80423564, 0.40632126],
       [0.03645125, 0.7672541 , 0.67892754, 0.91892886, 0.5601856 ,
        0.96771896, 0.30838436, 0.33888996]], dtype=float32)

In [176]:
x.shape, y.shape

((4, 8), (4, 8))

In [177]:
resx

array([0.668712  , 0.45689002, 0.61362845, 0.41925478, 0.9369738 ,
       0.7091652 , 0.8766517 , 0.66350937, 0.5142108 , 0.45119682,
       0.49998406, 0.43871346, 0.76577365, 0.7335658 , 0.72716516,
       0.6965812 ], dtype=float32)

In [178]:
resx.reshape(bs,topk,topk)

array([[[0.668712  , 0.45689002],
        [0.61362845, 0.41925478]],

       [[0.9369738 , 0.7091652 ],
        [0.8766517 , 0.66350937]],

       [[0.5142108 , 0.45119682],
        [0.49998406, 0.43871346]],

       [[0.76577365, 0.7335658 ],
        [0.72716516, 0.6965812 ]]], dtype=float32)

In [179]:
resxidx

array([7, 2, 7, 4, 2, 6, 6, 7], dtype=uint32)

In [180]:
resyidx

array([2, 6, 5, 7, 5, 3, 5, 3], dtype=uint32)

In [181]:
idx = 1
xy0 = x[idx].reshape(dim,1)*y[idx]
xy0.shape

(8, 8)

In [182]:
xy0[3][7]

0.38834158

## Weight update kernel new

In [183]:
b

array([[0.        , 0.        , 0.27972782, 0.        ],
       [0.        , 0.        , 0.16953142, 0.        ],
       [0.        , 0.12835628, 0.        , 0.        ],
       [0.48449844, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.2667744 ],
       [0.        , 0.        , 0.        , 0.0712197 ]], dtype=float32)

In [184]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [185]:
at

array([[0.        , 0.9252001 ],
       [0.        , 0.        ],
       [0.43555546, 0.        ],
       [0.        , 0.7035292 ],
       [0.47797403, 0.        ],
       [0.        , 0.        ]], dtype=float32)

In [186]:
c

array([[0.        , 0.9252001 ],
       [0.        , 0.        ],
       [0.43555546, 0.        ],
       [0.        , 0.70352918],
       [0.47797403, 0.        ],
       [0.        , 0.        ]])

a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate3(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             uint topk,
                             uint msize,
                             __global  float* xout,    // INPUT
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      uint bs = get_global_size(1);
      uint gid2 = get_global_id(1);



      uint idx = n*gid2+gid;
      float valx = x[idx];
      uint posx = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval = x[idx2];
        bool larger = tempval > valx;
        posx += (larger)?1:0;
      }
      
      uint idxy = n*gid2+gid;
      float valy = y[idx];
      uint posy = 0;
      for (uint i = 0; i < n; i++) {
        uint idx2 = n*gid2+i;
        float tempval2 = y[idx2];
        bool larger2 = tempval2 > valy;
        posy += (larger2)?1:0;
      }
      
      if (posx < topk) {
        xoutidx[posx+topk*gid2] = idx;
      }
      if (posy < topk) {
        youtidx[posy+topk*gid2] = idxy;
      }
      return;
      if (gid < topk) {
        for (uint j=0; j<topk; j++) {
          float res = x[xoutidx[gid+topk*gid2]+gid2*msize] * y[youtidx[j+topk*gid2]+gid2*msize];
          printf("\\nJ:%i  gid:(%i,%i)", j, gid, gid2);
          printf("\\nRES:%.2f - %i - %i -  %.2f - %.2f",res, xoutidx[gid+topk*gid2], youtidx[j+topk*gid2], x[xoutidx[gid+topk*gid2]+gid2*n], y[youtidx[j+topk*gid2]+gid2*n]);
          //barrier(CLK_GLOBAL_MEM_FENCE);
          xout[gid2*topk*topk+j*topk+gid] = res;
        }
      }
    }""").build()

In [187]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*topk*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topk*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate3(__global  float* x,     // INPUT MATRIX DATA
                              __global  float* y,    // INPUT
                              __global  float* xsum,    // INPUT
                              __global  float* ysum,    // INPUT
                              uint isize,
                              uint msize,
                              uint osize,
                              uint topk,
                              __global  float* xout,
                              __global  uint* xoutidx,   
                              __global  uint* youtidx    
                              ) { 
      uint gid = get_global_id(0);
      
      // get for a: sum axis0  b: sum axis1 then get topk
      ///////////////////////////////////////////////////
      if (gid < isize) {
        xsum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = x[i*isize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
          //}
          xsum[gid] += val;
        }
        
        float valx = xsum[gid];
        uint posx = 0;
        for (uint i = 0; i < isize; i++) {
          float tempval = xsum[i];
          bool larger = tempval > valx;
          posx += (larger)?1:0;
        }
        if (posx < topk) {
          xoutidx[posx] = gid;
        }
      }
      
      if (gid < osize) {
        ysum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = y[i*osize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, gid*osize+i);
          //}
          ysum[gid] += val;
        }
        
        float valy = ysum[gid];
        uint posy = 0;
        for (uint i = 0; i < osize; i++) {
          float tempval = ysum[i];
          bool larger = tempval > valy;
          posy += (larger)?1:0;
        }
      
        if (posy < topk) {
          youtidx[posy] = gid;
        }
      }
      
      if (gid < topk) {
        float valx = xoutidx[gid];
        uint posx = 0;
        for (uint i = 0; i < topk; i++) {
          float tempval = xoutidx[i];
          bool larger = tempval < valx;
          posx += (larger)?1:0;
        }
        xoutidx[posx] = valx;
        
        float valy = youtidx[gid];
        uint posy = 0;
        for (uint i = 0; i < topk; i++) {
          float tempval = youtidx[i];
          bool larger = tempval < valy;
          posy += (larger)?1:0;
        }
        youtidx[posy] = valy;
      }
      
      // only calc matrix multiplications for used grads
      ///////////////////////////////////////////////////
      if (gid < topk) {
        uint idxx = xoutidx[gid];
        for (uint j=0; j<topk; j++) {
          uint idxy = youtidx[j];
          xout[j*topk+gid] = 0;
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            xout[j*topk+gid] += x[xidx2] * y[yidx2];
            //if (gid == 0 && j == 1)
            //  printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
        }
      }
    }""").build()

In [188]:
a.shape, b.shape

((2, 6), (6, 4))

In [189]:
rows = a.shape[0]
msize = a.shape[1]

In [190]:
cols = b.shape[1]

In [191]:
mult = a.dot(b)

In [192]:
mult = mult.astype(np.float32)

In [193]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.genwupdate3  # Use this Kernel object for repeated calls
evt = knl(queue, [max(rows,cols)], None, a_buf, b_buf, x_sum_buf, y_sum_buf, np.uint32(rows), np.uint32(msize),np.uint32(cols), np.uint32(topk), x_cp_buf, x_idx_buf, y_idx_buf)

resxsum = np.zeros(a.shape[0]).astype(np.float32)
resysum = np.zeros(b.shape[1]).astype(np.float32)
resx = np.zeros(topk*topk).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)

cl.enqueue_copy(queue, resxsum, x_sum_buf)
cl.enqueue_copy(queue, resysum, y_sum_buf)
cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec08ab80>

In [194]:
resx.reshape(topk,topk)

array([[0.       , 0.3408588],
       [0.       , 0.2588042]], dtype=float32)

In [195]:
resxsum

array([0.9135295, 1.6287293], dtype=float32)

In [196]:
resysum

array([0.48449844, 0.12835628, 0.44925922, 0.3379941 ], dtype=float32)

In [197]:
a.sum(axis=1)

array([0.9135295, 1.6287293], dtype=float32)

In [198]:
b.sum(axis=0)

array([0.48449844, 0.12835628, 0.44925922, 0.3379941 ], dtype=float32)

In [199]:
mult

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [200]:
resxidx

array([0, 1], dtype=uint32)

In [201]:
resyidx

array([0, 2], dtype=uint32)

In [202]:
idx = 1
xy0 = x[idx].reshape(dim,1)*y[idx]
xy0.shape

(8, 8)

In [203]:
xy0[0][0]

0.10856167

## Weight update kernel new2 (sparse ouput)

In [204]:
c=np.zeros(a.T.shape)
at = a.T
for row in range(at.shape[0]):
    for col in range(at.shape[1]):
        c[row][col] = at[row][col]

In [205]:
a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=c.astype(np.float32))
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
x_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.shape[0]*4)
y_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.shape[1]*4)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topkx*4)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, topky*4)
sdata_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topkx*4)
sidxs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*topkx*4)
snnzs_buf = cl.Buffer(ctx, mf.READ_WRITE, a.shape[0]*4)
sdatat_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topky*4)
sidxst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*topky*4)
snnzst_buf = cl.Buffer(ctx, mf.READ_WRITE, b.shape[1]*4)

prg = cl.Program(ctx, """
    // sorts x and y in ascending order and returns sorted indices
    __kernel void genwupdate4(__global  float* x,     // INPUT MATRIX DATA
                              __global  float* y,    // INPUT
                              __global  float* xsum,    // INPUT
                              __global  float* ysum,    // INPUT
                              uint isize,
                              uint msize,
                              uint osize,
                              uint topkx,
                              uint topky,
                              __global  uint*  xoutidx,
                              __global  uint*  youtidx,
                              __global  float* matData,     // OUTPUT MATRIX DATA
                              __global  uint*  colIdx,
                              __global  uint*  rowNnz,
                              __global  float* matDatat,    // OUTPUT MATRIX DATA
                              __global  uint*  colIdxt,
                              __global  uint*  rowNnzt
                              ) {
      uint gid = get_global_id(0);

      // get for a: sum axis0  b: sum axis1 then get topk
      ///////////////////////////////////////////////////
      if (gid < isize) {
        xsum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = x[i*isize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, i*msize+gid);
          //}
          xsum[gid] += val;
        }

        float valx = xsum[gid];
        uint posx = 0;
        for (uint i = 0; i < isize; i++) {
          float tempval = fabs(xsum[i]);
          bool larger = tempval > fabs(valx);
          posx += (larger)?1:0;
        }
        if (posx < topkx) {
          xoutidx[posx] = gid;
        }
      }

      if (gid < osize) {
        ysum[gid] = 0;
        for (uint i=0; i<msize; i++) {
          float val = y[i*osize+gid];
          //if (gid == 0) {
          //  printf("\\nADD VALx: %.2f - %i", val, gid*osize+i);
          //}
          ysum[gid] += val;
        }

        float valy = ysum[gid];
        uint posy = 0;
        for (uint i = 0; i < osize; i++) {
          float tempval = fabs(ysum[i]);
          bool larger = tempval > fabs(valy);
          posy += (larger)?1:0;
        }

        if (posy < topky) {
          youtidx[posy] = gid;
        }
      }

      if (gid < topkx) {
        float valx = xoutidx[gid];
        uint posx = 0;
        for (uint i = 0; i < topkx; i++) {
          float tempval = xoutidx[i];
          bool larger = tempval < valx;
          posx += (larger)?1:0;
        }
        xoutidx[posx] = valx;
      }

      if (gid < topky) {
        float valy = youtidx[gid];
        uint posy = 0;
        for (uint i = 0; i < topky; i++) {
          float tempval = youtidx[i];
          bool larger = tempval < valy;
          posy += (larger)?1:0;
        }
        youtidx[posy] = valy;
      }

      // only calc matrix multiplications for used grads
      ///////////////////////////////////////////////////
      if (gid < isize) {
        for (uint i=0; i<topkx; i++) {
          matData[gid*topkx+i] = 0;
          colIdx[gid*topkx+i] = 0;
        }
        rowNnz[gid] = 0;
      }
      if (gid < osize) {
        for (uint i=0; i<topky; i++) {
          matDatat[gid*topky+i] = 0;
          colIdxt[gid*topky+i] = 0;
        }
        rowNnzt[gid] = 0;
      }


      if (gid < topky) {
        uint idxx = xoutidx[gid];
        for (uint j=0; j<topkx; j++) {
          uint idxy = youtidx[j];
          //printf("\\nIDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            uint colidx = idxy;
            matData[idxx*topkx+j] += x[xidx2] * y[yidx2];
            colIdx[idxx*topkx+j] = idxy;
            //if (gid == 0)
            //  printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnz[idxx] += 1;
        }
      }

      if (gid < topkx) {
        uint idxy = youtidx[gid];
        for (uint j=0; j<topky; j++) {
          uint idxx = xoutidx[j];
          //printf("\\nB-IDXX:%i  IDXY:%i", idxx, idxy);
          for (uint k=0; k<msize; k++) {
            uint xidx2 = isize*k+idxx;
            uint yidx2 = osize*k+idxy;
            uint colidx = idxy;
            float addval = x[xidx2] * y[yidx2];
            matDatat[idxy*topky+j] += addval;
            colIdxt[idxy*topky+j] = idxx;
            //if (gid == 0)
            //  printf("\\n ADD VAL:%.2f,%.2f - (%i,%i) - (%i,%i,%i)", x[xidx2], y[yidx2], idxx, idxy, gid, j, k);
          }
          rowNnzt[idxy] += 1;
          //printf("\\nAdd NNz:%i - %i", idxy, rowNnzt[idxy]);
        }
      }
    }""").build()

In [206]:
a.shape, b.shape

((2, 6), (6, 4))

In [207]:
rows = a.shape[0]
msize = a.shape[1]

In [208]:
cols = b.shape[1]

In [209]:
mult = a.dot(b)

In [210]:
mult = mult.astype(np.float32)

In [211]:
res_buf = cl.Buffer(ctx, mf.READ_WRITE, np.prod([rows,b.shape[1]])*4)
knl = prg.genwupdate4  # Use this Kernel object for repeated calls
evt = knl(queue, [max(rows,cols)], None, a_buf, b_buf, x_sum_buf, y_sum_buf, np.uint32(rows), np.uint32(msize),np.uint32(cols), 
          np.uint32(topkx),np.uint32(topky), x_idx_buf, y_idx_buf, sdata_buf, sidxs_buf, snnzs_buf, sdatat_buf, sidxst_buf, snnzst_buf)

In [212]:
resxsum = np.zeros(a.shape[0]).astype(np.float32)
resysum = np.zeros(b.shape[1]).astype(np.float32)
resxidx = np.zeros(topkx).astype(np.uint32)
resyidx = np.zeros(topky).astype(np.uint32)
resxdat = np.zeros(a.shape[0]*topkx).astype(np.float32)
resxcol = np.zeros(a.shape[0]*topkx).astype(np.uint32)
resxnnz = np.zeros(a.shape[0]).astype(np.uint32)
resxdatt = np.zeros(b.shape[1]*topky).astype(np.float32)
resxcolt = np.zeros(b.shape[1]*topky).astype(np.uint32)
resxnnzt = np.zeros(b.shape[1]).astype(np.uint32)

cl.enqueue_copy(queue, resxsum, x_sum_buf)
cl.enqueue_copy(queue, resysum, y_sum_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)
cl.enqueue_copy(queue, resxdat, sdata_buf)
cl.enqueue_copy(queue, resxcol, sidxs_buf)
cl.enqueue_copy(queue, resxnnz, snnzs_buf)
cl.enqueue_copy(queue, resxdatt, sdatat_buf)
cl.enqueue_copy(queue, resxcolt, sidxst_buf)
cl.enqueue_copy(queue, resxnnzt, snnzst_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec124c20>

## results

In [213]:
mult.T

array([[0.        , 0.3408588 ],
       [0.05590628, 0.        ],
       [0.        , 0.2588042 ],
       [0.12751123, 0.        ]], dtype=float32)

In [214]:
resxdat.reshape(a.shape[0],topkx)

array([[0.        , 0.05590628],
       [0.3408588 , 0.        ]], dtype=float32)

In [215]:
resxcol.reshape(a.shape[0],topkx)

array([[0, 1],
       [0, 1]], dtype=uint32)

In [216]:
resxnnz.reshape(a.shape[0])

array([2, 2], dtype=uint32)

In [217]:
resxdatt.reshape(b.shape[1],topky)

array([[0.        , 0.3408588 , 0.        , 0.        ],
       [0.05590628, 0.        , 0.05590628, 0.05590628],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ]], dtype=float32)

In [218]:
resxcolt.reshape(b.shape[1],topky)

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=uint32)

In [219]:
resxnnzt.reshape(b.shape[1])

array([4, 4, 0, 0], dtype=uint32)

In [220]:
resdense = to_dense(resxdat, resxcol, resxnnz, topk, mult.shape)

In [221]:
resdenset = to_dense(resxdatt, resxcolt, resxnnzt, topk, mult.T.shape)

In [222]:
resdense == resdenset.T

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True]])

## comp

In [223]:
resxsum

array([0.9135295, 1.6287293], dtype=float32)

In [224]:
a.sum(axis=1)

array([0.9135295, 1.6287293], dtype=float32)

In [225]:
resysum

array([0.48449844, 0.12835628, 0.44925922, 0.3379941 ], dtype=float32)

In [226]:
b.sum(axis=0)

array([0.48449844, 0.12835628, 0.44925922, 0.3379941 ], dtype=float32)

In [227]:
mult

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [228]:
resxidx

array([0, 1], dtype=uint32)

In [229]:
resyidx

array([0, 1, 2, 3], dtype=uint32)

In [230]:
asdf

NameError: name 'asdf' is not defined

## Prune Weights

In [231]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)

prg = cl.Program(ctx, """
    // prunes weights smaller than a constant C
    __kernel void prune(__global  float* matData,     // INPUT MATRIX DATA
                        __global  uint*  colIdx,
                        __global  uint*  rowNnz,
                        uint ellw,
                        float pruneval) { 
      uint gid = get_global_id(0);
      
      uint nnzs = rowNnz[gid];
      for (uint i=0; i<nnzs; i++) {
        uint idx = ellw * gid + i;
        float val = matData[idx];
        printf("\\nDATA:%.2f - %.2f", matData[idx], pruneval);
        if(fabs(val)<pruneval) {
          printf("\\nPRUNE(%i): %.2f", gid, matData[idx]);
          for (uint j=i; j<=nnzs-1; j++) {
            uint idx2 = ellw * gid + j;
            matData[idx2] = matData[idx2+1];
            colIdx[idx2] = colIdx[idx2+1];
          }
          matData[ellw*gid+nnzs] = 0;
          colIdx[ellw*gid+nnzs] = 0;
          rowNnz[gid] -= 1;
          nnzs = rowNnz[gid];
        }
      }
    }""").build()

In [232]:
a.shape

(2, 6)

In [233]:
rows = a.shape[0]
cols = a.shape[1]

pruneval = .35

In [234]:
knl = prg.prune  # Use this Kernel object for repeated calls
evt = knl(queue, [rows,], None, adata_buf, acols_buf, annzs_buf, np.uint32(ellwa), np.float32(pruneval))


DATA:0.44 - 0.35
DATA:0.93 - 0.35
DATA:0.48 - 0.35
DATA:0.70 - 0.35

In [235]:
resxdat = np.zeros(adata.shape).astype(np.float32)
resxcol = np.zeros(acols.shape).astype(np.uint32)
resxnnz = np.zeros(annz.shape).astype(np.uint32)

cl.enqueue_copy(queue, resxdat, adata_buf)
cl.enqueue_copy(queue, resxcol, acols_buf)
cl.enqueue_copy(queue, resxnnz, annzs_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec089c20>

In [236]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [237]:
adata.reshape((4,-1))

array([[0.43555546, 0.47797403],
       [0.        , 0.        ],
       [0.9252001 , 0.7035292 ],
       [0.        , 0.        ]], dtype=float32)

In [238]:
acols.reshape((4,-1))

array([[2, 4],
       [0, 0],
       [0, 3],
       [0, 0]], dtype=uint32)

In [239]:
resxdat.reshape((4,-1))

array([[0.43555546, 0.47797403],
       [0.        , 0.        ],
       [0.9252001 , 0.7035292 ],
       [0.        , 0.        ]], dtype=float32)

In [240]:
resxcol.reshape((4,-1))

array([[2, 4],
       [0, 0],
       [0, 3],
       [0, 0]], dtype=uint32)

In [241]:
resxnnz

array([2, 2], dtype=uint32)

## results

In [242]:
mult.T

array([[0.        , 0.3408588 ],
       [0.05590628, 0.        ],
       [0.        , 0.2588042 ],
       [0.12751123, 0.        ]], dtype=float32)

In [243]:
resxdat.reshape(a.shape[0],topk)

ValueError: cannot reshape array of size 8 into shape (2,2)

In [244]:
resxcol.reshape(a.shape[0],topk)

ValueError: cannot reshape array of size 8 into shape (2,2)

In [245]:
resxnnz.reshape(a.shape[0])

array([2, 2], dtype=uint32)

In [246]:
resxdatt.reshape(b.shape[1],topk)

ValueError: cannot reshape array of size 16 into shape (4,2)

In [247]:
resxcolt.reshape(b.shape[1],topk)

ValueError: cannot reshape array of size 16 into shape (4,2)

In [248]:
resxnnzt.reshape(b.shape[1])

array([4, 4, 0, 0], dtype=uint32)

### Update Vals (add sparse)

In [249]:
multdata, multcols, multnnz, multellw = to_data(mult)
multdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multdata)
multcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multcols)
multnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multnnz)


prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddense(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            __global  float* matDataAdd,     // INPUT MATRIX DATA
                            __global  uint*  colIdxAdd,
                            __global  uint*  rowNnzAdd,
                            uint ellwidthAdd
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      uint nnz    = rowNnz[gid];
      
      uint baseidxs = gid*ellwidth;
      uint baseidxd = gid*ellwidthAdd;
      
      uint nnzadd = rowNnzAdd[gid];
      printf("\\nNNZs: %i   GID:%i", nnzadd, gid);
      
      for (uint i=0; i<nnzadd; i++) {
        float addval = matDataAdd[baseidxd+i];
        uint addcol = colIdxAdd[baseidxd+i];
        
        uint refcol = colIdx[baseidxs+i];
        uint m = 0;
        while (addcol > refcol) {
          m += 1;
          refcol = colIdx[baseidxs+i+m];
        }
        
        //printf("\\nADD VAL:%.2f  ADDCOL:%i  idxs/d:(%i/%i)  gid/i:(%i/%i)", addval, addcol, baseidxs, baseidxd, gid,i);
        if (addval == 0.0) {
          //printf("\\nZERO VAL, CONT: %.2f - %i", addval, gid);
          continue;
        }
        if (addcol == refcol) {
          matData[baseidxs+i+m] += addval;
          printf("\\nINCREMENT: %.2f",addval);
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (addcol > refcol) {
            rowNnz[gid] += 1;
            printf("\\nSET VAL0:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i+m] = addval;
            colIdx[baseidxs+i+m] = addcol;
            continue;
          }
          for (uint j=nnz; j>i+m; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          
          printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i+m] = addval;
          colIdx[baseidxs+i+m] = addcol;
          if (nnz >= ellwidth)
            break;
        }
      }
    }""").build()

In [250]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [251]:
rows = a.shape[0]

In [252]:
mult = mult.astype(np.float32)

In [253]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, multdata_buf, multcols_buf, multnnzs_buf, np.float32(1), np.uint32(multellw), 
    sdata_buf, sidxs_buf, snnzs_buf, np.uint32(topk))

<pyopencl._cl.Event at 0x7f48ec0896d0>


NNZs: 2   GID:0
NNZs: 2   GID:1
INCREMENT: 0.34
SET VAL:0.06 idx:1/1  col:3

In [254]:
mult

array([[0.        , 0.05590628, 0.        , 0.12751123],
       [0.3408588 , 0.        , 0.2588042 , 0.        ]], dtype=float32)

In [255]:
data_res = np.empty_like(multdata)
cols_res = np.empty_like(multcols)
nnzs_res = np.empty_like(multnnz)
cl.enqueue_copy(queue, data_res, multdata_buf, is_blocking=True)
cl.enqueue_copy(queue, cols_res, multcols_buf, is_blocking=True)
cl.enqueue_copy(queue, nnzs_res, multnnzs_buf, is_blocking=True)

<pyopencl._cl.NannyEvent at 0x7f48ec033090>

In [256]:
adenseadd = to_dense(data_res, cols_res, nnzs_res, multellw, mult.shape)
adenseadd.T

array([[0.        , 0.68171757],
       [0.05590628, 0.        ],
       [0.        , 0.2588042 ],
       [0.12751123, 0.        ]])

In [257]:
mult-adenseadd

array([[ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.34085879,  0.        ,  0.        ,  0.        ]])

### Update Vals (add sparset)

In [258]:
multt=np.zeros(mult.T.shape)

for row in range(multt.shape[0]):
    for col in range(multt.shape[1]):
        multt[row][col] = mult[col][row]

In [259]:
multdata, multcols, multnnz, multellw = to_data(multt)
multdata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multdata)
multcols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multcols)
multnnzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=multnnz)

In [260]:
a.shape, b.shape

((2, 6), (6, 4))

In [261]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [262]:
rows = mult.T.shape[0]

In [263]:
mult = mult.astype(np.float32)

In [264]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, multdata_buf, multcols_buf, multnnzs_buf, np.float32(1), np.uint32(multellw), 
    sdatat_buf, sidxst_buf, snnzst_buf, np.uint32(topk))

<pyopencl._cl.Event at 0x7f48ec1242c0>


NNZs: 4   GID:0
NNZs: 4   GID:1
NNZs: 0   GID:2
NNZs: 0   GID:3
INCREMENT: 0.34
SET VAL:0.06 idx:4/4  col:0

In [265]:
mult.T

array([[0.        , 0.3408588 ],
       [0.05590628, 0.        ],
       [0.        , 0.2588042 ],
       [0.12751123, 0.        ]], dtype=float32)

In [266]:
data_res = np.empty_like(multdata)
cols_res = np.empty_like(multcols)
nnzs_res = np.empty_like(multnnz)
cl.enqueue_copy(queue, data_res, multdata_buf, is_blocking=True)
cl.enqueue_copy(queue, cols_res, multcols_buf, is_blocking=True)
cl.enqueue_copy(queue, nnzs_res, multnnzs_buf, is_blocking=True)

<pyopencl._cl.NannyEvent at 0x7f48ec033c20>

In [267]:
multt-data_res.reshape(multt.shape)

array([[-0.34085879,  0.34085879],
       [ 0.        ,  0.        ],
       [-0.05590628,  0.2588042 ],
       [ 0.        ,  0.        ]])

In [268]:
nnzs_res

array([1, 2, 1, 1], dtype=uint32)

In [269]:
adenseaddt = to_dense(data_res, cols_res, nnzs_res, multellw, multt.shape)
adenseaddt

array([[0.        , 0.34085879],
       [0.        , 0.        ],
       [0.05590628, 0.        ],
       [0.12751123, 0.        ]])

In [270]:
multt-adenseaddt

array([[ 0.        ,  0.        ],
       [ 0.05590628,  0.        ],
       [-0.05590628,  0.2588042 ],
       [ 0.        ,  0.        ]])

In [271]:
adenseaddt

array([[0.        , 0.34085879],
       [0.        , 0.        ],
       [0.05590628, 0.        ],
       [0.12751123, 0.        ]])

In [272]:
adenseadd.T == adenseaddt

array([[ True, False],
       [False,  True],
       [False, False],
       [ True,  True]])

### Update Vals (add topk to sparse)

In [273]:
matadd = np.random.randn(*a.shape).astype(np.float32)
matadd

array([[ 0.92007613,  1.0549185 ,  0.54028803,  0.30906352,  0.9708612 ,
         1.1547241 ],
       [ 0.42260882,  0.9992089 ,  0.88352025, -0.53019917,  0.81413484,
        -1.0198842 ]], dtype=float32)

In [274]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [275]:
a_added = a + matadd

In [276]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
add_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=matadd)

prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddense(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            uint   awidth,
                            __global  float* vector_x    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint nrows = get_global_size(0);

      uint nnz    = rowNnz[gid];
      uint baseidxs = gid*ellwidth;
      uint baseidxd = gid*awidth;
      
      for (uint i=0; i<awidth; i++) {
        float addval = vector_x[baseidxd+i];
        //if (gid==1)
        //  printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[baseidxs+i]);
        if (addval == 0) {
          continue;
        }
        if (i == colIdx[baseidxs+i]) {
          matData[baseidxs+i] += addval;
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (i > colIdx[baseidxs+i]) {
            rowNnz[gid] += 1;
            //if (gid==1)
            //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i] = addval;
            colIdx[baseidxs+i] = i;
            continue;
          }
          for (uint j=nnz; j>i; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          //if (gid==1)
          //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] = addval;
          colIdx[baseidxs+i] = i;
          if (nnz >= ellwidth)
            break;
        }
      }
    }""").build()

In [277]:
a.shape, b.shape

((2, 6), (6, 4))

In [278]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [279]:
rows = a.shape[0]

In [280]:
mult = mult.astype(np.float32)

In [281]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddense  # Use this Kernel object for repeated calls
knl(queue, [rows], None, adata_buf, acols_buf, annzs_buf, np.float32(1), np.uint32(ellwa),np.uint32(a.shape[1]), add_buf)

<pyopencl._cl.Event at 0x7f48ec08e720>

In [282]:
matadd[0][0]

0.92007613

In [283]:
data_res = np.empty_like(adata)
cols_res = np.empty_like(acols)
nnzs_res = np.empty_like(annz)
cl.enqueue_copy(queue, data_res, adata_buf)
cl.enqueue_copy(queue, cols_res, acols_buf)
cl.enqueue_copy(queue, nnzs_res, annzs_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec022a90>

In [284]:
adenseadd = to_dense(data_res, cols_res, nnzs_res, ellwa, a.shape)
adenseadd

array([[0.92007613, 1.05491853, 0.43555546, 0.        , 0.47797403,
        0.        ],
       [1.34780896, 0.99920893, 0.88352025, 0.70352918, 0.        ,
        0.        ]])

In [285]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [286]:
matadd

array([[ 0.92007613,  1.0549185 ,  0.54028803,  0.30906352,  0.9708612 ,
         1.1547241 ],
       [ 0.42260882,  0.9992089 ,  0.88352025, -0.53019917,  0.81413484,
        -1.0198842 ]], dtype=float32)

In [287]:
a_added

array([[ 0.92007613,  1.0549185 ,  0.9758435 ,  0.30906352,  1.4488353 ,
         1.1547241 ],
       [ 1.347809  ,  0.9992089 ,  0.88352025,  0.17333001,  0.81413484,
        -1.0198842 ]], dtype=float32)

In [288]:
adenseadd

array([[0.92007613, 1.05491853, 0.43555546, 0.        , 0.47797403,
        0.        ],
       [1.34780896, 0.99920893, 0.88352025, 0.70352918, 0.        ,
        0.        ]])

In [289]:
adenseadd == a_added

array([[ True,  True, False, False, False, False],
       [ True,  True,  True, False, False, False]])

### update vals

In [290]:
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)
add_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=matadd)

prg = cl.Program(ctx, """
    // Every global_id_0 works on a row
    __kernel void adddenset(__global  float* matData,     // INPUT MATRIX DATA
                            __global  uint*  colIdx,
                            __global  uint*  rowNnz,
                            float  lr,
                            uint   ellwidth,
                            uint   aheight,
                            __global  float* vector_x    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint ncols = get_global_size(0);

      uint nnz    = rowNnz[gid];
      uint baseidxs = gid*ellwidth;
      
      for (uint i=0; i<aheight; i++) {
        if (nnz > ellwidth)
            break;
        uint baseidxd = i*ncols+gid;
        float addval = vector_x[baseidxd];
        //if (gid==1)
        //  printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[baseidxs+i]);
        if (addval == 0) {
          continue;
        }
        if (i == colIdx[baseidxs+i]) {
          printf("\\nADD VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] += addval;
        } else {
          if (rowNnz[gid] >= ellwidth) {
            break;
          }
          if (i > colIdx[baseidxs+i]) {
            rowNnz[gid] += 1;
            //if (gid==1)
            //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
            matData[baseidxs+i] = addval;
            colIdx[baseidxs+i] = i;
            continue;
          }
          for (uint j=nnz; j>i; j--) {
            //printf("\\nMOVE:%.2f", matData[baseidx+j-1]);
            colIdx[baseidxs+j] = colIdx[baseidxs+j-1];
            matData[baseidxs+j] = matData[baseidxs+j-1];
          }
          rowNnz[gid] += 1;
          nnz = rowNnz[gid];
          //if (gid==1)
          //  printf("\\nSET VAL:%.2f idx:%i/%i  col:%i", addval, baseidxs+i, baseidxd+i, colIdx[i]);
          matData[baseidxs+i] = addval;
          colIdx[baseidxs+i] = i;
        }
      }
    }""").build()

In [291]:
a.shape, b.shape

((2, 6), (6, 4))

In [292]:
res = np.zeros(a.shape[0]).astype(np.float32)
#res

In [293]:
cols = a.shape[1]

In [294]:
mult = mult.astype(np.float32)

In [295]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
knl = prg.adddenset  # Use this Kernel object for repeated calls
knl(queue, [cols], None, adatat_buf, acolst_buf, annzst_buf, np.float32(1), np.uint32(ellwat),np.uint32(a.T.shape[1]), add_buf)

<pyopencl._cl.Event at 0x7f48ec08ed10>


ADD VAL:1.05 idx:2/1  col:1
ADD VAL:0.54 idx:4/2  col:1
ADD VAL:0.97 idx:8/4  col:1
ADD VAL:1.15 idx:10/5  col:1
ADD VAL:0.42 idx:1/7  col:1
ADD VAL:-0.53 idx:7/10  col:1

In [296]:
matadd[0][0]

0.92007613

In [297]:
datat_res = np.empty_like(adatat)
colst_res = np.empty_like(acolst)
nnzst_res = np.empty_like(annzt)
cl.enqueue_copy(queue, datat_res, adatat_buf)
cl.enqueue_copy(queue, colst_res, acolst_buf)
cl.enqueue_copy(queue, nnzst_res, annzst_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec051090>

In [298]:
adenseaddt = to_dense(datat_res, colst_res, nnzst_res, ellwat, a.T.shape).T
adenseaddt

array([[0.92007613, 1.05491853, 0.97584349, 0.30906352, 1.44883525,
        1.15472412],
       [1.34780896, 0.        , 0.88352025, 0.17333001, 0.81413484,
        0.        ]])

In [299]:
a

array([[0.        , 0.        , 0.43555546, 0.        , 0.47797403,
        0.        ],
       [0.9252001 , 0.        , 0.        , 0.7035292 , 0.        ,
        0.        ]], dtype=float32)

In [300]:
matadd

array([[ 0.92007613,  1.0549185 ,  0.54028803,  0.30906352,  0.9708612 ,
         1.1547241 ],
       [ 0.42260882,  0.9992089 ,  0.88352025, -0.53019917,  0.81413484,
        -1.0198842 ]], dtype=float32)

In [301]:
a_added

array([[ 0.92007613,  1.0549185 ,  0.9758435 ,  0.30906352,  1.4488353 ,
         1.1547241 ],
       [ 1.347809  ,  0.9992089 ,  0.88352025,  0.17333001,  0.81413484,
        -1.0198842 ]], dtype=float32)

In [302]:
adenseaddt == a_added

array([[ True,  True,  True,  True,  True,  True],
       [ True, False,  True,  True,  True, False]])

### Make Random

In [303]:
rand = SparseTensor.uniform(2,4)
rand

<SparseTensor <GPUBuffer with shape (8,)> with grad None>

In [304]:
rand.to_numpy()

array([[0.00535888, 0.0038081 , 0.00178618, 0.00885832],
       [0.00457069, 0.00786512, 0.00743295, 0.00593193]])

In [305]:
rand.data

<GPUBuffer with shape (8,)>

### update vals

In [306]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)

In [307]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint gid2 = get_global_id(1);
      uint topk = get_global_size(0);
      uint bs = get_global_size(1);
      uint baseupdateidx = topk*topk*gid2;
      uint baseidxidx = topk*gid2;
      uint col = updateyidx[baseidxidx+gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint row = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

In [308]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint topk = get_global_size(0);
      uint col = updateyidx[gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint row = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];m
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

RuntimeError: clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE

Build on <pyopencl.Device 'GeForce GTX 1080 Ti' on 'NVIDIA CUDA' at 0x557a9fbab750>:

<kernel>:17:32: error: use of undeclared identifier 'baseupdateidx'
        float val = updatevals[baseupdateidx+gid*topk+i];
                               ^
<kernel>:18:31: error: use of undeclared identifier 'baseidxidx'
        uint row = updatexidx[baseidxidx+i];
                              ^
<kernel>:32:49: error: use of undeclared identifier 'm'
                matData[idx2] = matData[idx2-1];m
                                                ^

(options: -I /home/fpaboim/.conda/envs/tinygrad/lib/python3.8/site-packages/pyopencl/cl)
(source saved as /tmp/tmpzpsmy200.cl)

In [None]:
knl = prg.addvals  # Use this Kernel object for repeated calls
knl(queue, [topk,1], None, adata_buf, acols_buf, annzs_buf, np.float32(1), np.uint32(ellwa), x_cp_buf, x_idx_buf, y_idx_buf)

resa = np.empty_like(adata)
resaidx = np.zeros(acols.shape).astype(np.uint32)
resannz = np.zeros(annz.shape).astype(np.uint32)

cl.enqueue_copy(queue, resa, adata_buf)
cl.enqueue_copy(queue, resaidx, acols_buf)
cl.enqueue_copy(queue, resannz, annzs_buf)

In [309]:
resa.shape, resaidx.shape, resannz.shape, ellwa, a.T.shape

NameError: name 'resa' is not defined

In [310]:
adenseadd = to_dense(resa, resaidx, resannz, ellwa, a.shape)
adenseadd

NameError: name 'resa' is not defined

In [311]:
adenseadd - adense

array([[0.92007613, 1.05491853, 0.        , 0.        , 0.        ,
        0.        ],
       [0.42260885, 0.99920893, 0.88352025, 0.        , 0.        ,
        0.        ]])

In [312]:
adenseadd == adense

array([[False, False,  True,  True,  True,  True],
       [False, False, False,  True,  True,  True]])

In [313]:
ellwa

4

In [314]:
adata2 = adata.reshape(-1, ellwa)
adata2

array([[0.43555546, 0.47797403, 0.        , 0.        ],
       [0.9252001 , 0.7035292 , 0.        , 0.        ]], dtype=float32)

In [315]:
resa = resa.reshape(-1, ellwa)
resa

NameError: name 'resa' is not defined

In [316]:
resa - adata2

NameError: name 'resa' is not defined

In [317]:
acols

array([2, 4, 0, 0, 0, 3, 0, 0], dtype=uint32)

In [318]:
resaidx

NameError: name 'resaidx' is not defined

In [319]:
resannz

NameError: name 'resannz' is not defined

In [320]:
annz

array([2, 2], dtype=uint32)

### update vals2

In [321]:
adatat_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adatat)
acolst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acolst)
annzst_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annzt)

In [322]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
    __kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         float lr,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,
                         __global  uint* updateyidx
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint gid2 = get_global_id(1);
      uint topk = get_global_size(0);
      uint bs = get_global_size(1);
      uint baseupdateidx = topk*topk*gid2;
      uint baseidxidx = topk*gid2;
      uint row = updateyidx[baseidxidx+gid];

      for (uint i=0; i<topk; i++) {
        float val = updatevals[baseupdateidx+gid*topk+i];
        uint col = updatexidx[baseidxidx+i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = row*ellwidth+i;
          if (colIdx[idx] >= col) {
            //printf("\\nFOUND:%i/%i  - idx:%i", colIdx[idx], col, idx);
            if (colIdx[idx] == col) {
              matData[idx] += -val*lr;
              printf("\\nUPDATE[%i,%i]: %f", row,col, val);
              break;
            } else {
              // insert new column
              printf("\\nINSERT[%i,%i]: %.2f", row,col, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = row*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = -val*lr;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }
    }""").build()

In [323]:
knl = prg.addvals  # Use this Kernel object for repeated calls
knl(queue, [topk,bs], None, adatat_buf, acolst_buf, annzst_buf, np.float32(1), np.uint32(ellwat), x_cp_buf, x_idx_buf, y_idx_buf)

resat = np.empty_like(adatat)
resaidxt = np.zeros(acolst.shape).astype(np.uint32)
resannzt = np.zeros(annzt.shape).astype(np.uint32)

cl.enqueue_copy(queue, resat, adatat_buf)
cl.enqueue_copy(queue, resaidxt, acolst_buf)
cl.enqueue_copy(queue, resannzt, annzst_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec022720>


UPDATE[2,0]: 0.000000
INSERT[0,0]: 0.00
INSERT[0,0]: 0.00
INSERT[0,0]: 0.00
INSERT[0,0]: 0.00
INSERT[0,0]: 0.00
INSERT[3,0]: 0.00
UPDATE[2,0]: 0.000000

In [324]:
ellwa

4

In [325]:
resat.shape, resaidxt.shape, resannzt.shape

((12,), (12,), (6,))

In [326]:
adenseaddt = to_dense(resat, resaidxt, resannzt, ellwat, a.T.shape)
adenseaddt

array([[-0.00000000e+00,  9.25200105e-01],
       [ 0.00000000e+00,  0.00000000e+00],
       [ 4.35555458e-01,  0.00000000e+00],
       [-7.00649232e-45,  7.03529179e-01],
       [ 0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00]])

In [327]:
adenseadd == adenseaddt.T

array([[False, False,  True, False, False,  True],
       [False, False, False,  True,  True,  True]])

In [328]:
adata2t = adatat.reshape(-1, ellwat)
adata2t

array([[0.9252001 , 0.        ],
       [0.        , 0.        ],
       [0.43555546, 0.        ],
       [0.7035292 , 0.        ],
       [0.47797403, 0.        ],
       [0.        , 0.        ]], dtype=float32)

In [329]:
resat = resat.reshape(-1, ellwat)
resat

array([[-0.0000000e+00, -0.0000000e+00],
       [-0.0000000e+00,  9.2520010e-01],
       [ 4.3555546e-01,  0.0000000e+00],
       [-7.0064923e-45,  7.0352918e-01],
       [ 0.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00]], dtype=float32)

In [330]:
resat - adata2t

array([[-0.9252001 , -0.        ],
       [-0.        ,  0.9252001 ],
       [ 0.        ,  0.        ],
       [-0.7035292 ,  0.7035292 ],
       [-0.47797403,  0.        ],
       [ 0.        ,  0.        ]], dtype=float32)

In [331]:
acols

array([2, 4, 0, 0, 0, 3, 0, 0], dtype=uint32)

In [332]:
resaidx

NameError: name 'resaidx' is not defined

In [333]:
resannz

NameError: name 'resannz' is not defined

In [334]:
annz

array([2, 2], dtype=uint32)

# OTHER

import numpy as np
import pyopencl as cl

mf = cl.mem_flags

dim = 16
topk = 4

x = np.random.rand(dim).astype(np.float32)
y = np.random.rand(dim).astype(np.float32)
x.shape,y.shape

dim1 = 4
dim2 = 8
dim3 = 1

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.2

a = np.zeros((dim1,dim2))
b = np.random.rand(dim2,dim3).flatten().astype(np.float32)

a.shape, b.shape

In [335]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
val_out_buf = cl.Buffer(ctx, mf.READ_WRITE, 4*topk*topk)
x_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)

prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                         __global  float* y,    // INPUT
                         __global  float* xout,    // INPUT
                         uint topk,
                         __global  uint* xoutidx,    // INPUT
                         __global  uint* youtidx    // INPUT
                        ) { // LOCAL SHARED BUFFER
  uint gid = get_global_id(0);
  uint n = get_global_size(0);
  
  xout[gid] = x[gid];
  xoutidx[gid] = gid;
  youtidx[gid] = gid;
  
  float valx = x[gid];
  float valy = y[gid];
  uint posx = 0;
  uint posy = 0;
  for (uint i = 0; i < n; i++) {
    float tempval = x[i];
    float tempval2 = y[i];
    bool larger = tempval > valx;
    bool larger2 = tempval2 > valy;
      
    posx += (larger)?1:0;
    posy += (larger2)?1:0;
  }
  //printf("posx:%i", posx);
  if (posx < topk) {
    xoutidx[posx] = gid;
  }
  if (posy < topk) {
    youtidx[posy] = gid;
  }
  if (gid < topk) {
    uint i = gid;
    for (uint j=0; j<topk; j++) {
      xout[gid*topk+j] = x[xoutidx[gid]] * y[youtidx[j]];
    }
  }
}""").build()

In [336]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
event = knl(queue, [dim,], None, x_buf, y_buf, val_out_buf, np.uint32(topk), x_idx_buf, y_idx_buf)

#event.wait()
val_out = np.zeros(topk*topk).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)

cl.enqueue_copy(queue, val_out, val_out_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf, wait_for=[event])
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7f48ec02f9f0>

In [337]:
val_out

array([0.668712  , 0.61362845, 0.45689002, 0.41925478], dtype=float32)

In [338]:
resxidx

array([7, 2], dtype=uint32)

In [339]:
resyidx

array([2, 6], dtype=uint32)

In [340]:
asdf

NameError: name 'asdf' is not defined

In [341]:
from __future__ import division

KERNEL_CODE = """
// Thread block size
#define BLOCK_SIZE %(block_size)d
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA %(w_a)d // Matrix A width
#define HA %(h_a)d // Matrix A height
#define WB %(w_b)d // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
/* Matrix multiplication: C = A * B.
 * Device code.
 */
#define AS(j, i) As[i + j * BLOCK_SIZE]
#define BS(j, i) Bs[i + j * BLOCK_SIZE]
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! WA is A's width and WB is B's width
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(16,16,1))) 
void
matrixMul( __global float* C, __global float* A, __global float* B)
{
    __local float As[BLOCK_SIZE*BLOCK_SIZE];
    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);
    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);
    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + WA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * WB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0.0f;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + WA * ty + tx];
        BS(ty, tx) = B[b + WB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        barrier(CLK_LOCAL_MEM_FENCE);
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
"""


In [342]:
a2 = np.random.rand(4,4)

In [343]:
a2

array([[0.17847956, 0.57971078, 0.46363438, 0.67322183],
       [0.47612445, 0.95251115, 0.85325763, 0.89850711],
       [0.0255691 , 0.1564229 , 0.47689815, 0.73472184],
       [0.35630948, 0.56445363, 0.0454695 , 0.64044131]])

In [344]:
a2.sum(axis=1)

array([1.89504655, 3.18040034, 1.393612  , 1.60667391])

In [345]:
b2 = np.random.rand(4,4)

In [346]:
b2

array([[0.68120638, 0.39563748, 0.13667301, 0.41645775],
       [0.73846283, 0.58430764, 0.88343323, 0.57874961],
       [0.17797046, 0.59175169, 0.05387774, 0.74341313],
       [0.20103766, 0.90467534, 0.47356788, 0.44165318]])

In [347]:
b2.sum(axis=0)

array([1.79867734, 2.47637215, 1.54755186, 2.18027368])

In [348]:
matmul = a2.dot(b2)
matmul

array([[0.76753245, 1.29274625, 0.88032491, 1.05183904],
       [1.36022152, 2.06270609, 1.37802907, 1.78070263],
       [0.36551088, 1.04840521, 0.51531866, 0.78020273],
       [0.79639337, 1.07708206, 0.85309721, 0.79172073]])