In [1]:
from tinygrad.densetensor import DenseTensor
from tinygrad.sparsetensor import SparseTensor
import numpy as np

%load_ext autoreload
%autoreload 2

DEVICE:GPU


In [2]:
x_init = np.random.randn(1,3).astype(np.float32)
x2_init = np.random.randn(3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)

In [3]:
x = DenseTensor(x_init)
W = DenseTensor(W_init)
m = DenseTensor(m_init)
out = x.dot(W).relu()
out = out.logsoftmax()
out = out.mul(m).add(m).sum()
out.backward()

out.cpu().data, x

(array([0.03697176], dtype=float32),
 <DenseTensor <GPUBuffer with shape (1, 3)> with grad <GPUBuffer with shape (1, 3)>>)

In [4]:
x2 = DenseTensor(x2_init)#.gpu()
W = SparseTensor(W_init)
out = W.dot(x2).relu().sum()

out.backward()

out.cpu().data, x

SPARSE!


(array([2.9896617], dtype=float32),
 <DenseTensor <GPUBuffer with shape (1, 3)> with grad <GPUBuffer with shape (1, 3)>>)

In [5]:
import numpy as np
import pyopencl as cl

mf = cl.mem_flags

In [6]:
dim1 = 16
dim2 = 16
dim3 = 16

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.2

a = np.zeros((dim1,dim2))
b = np.random.rand(dim3).flatten().astype(np.float32)

a.shape, b.shape

((16, 16), (16,))

In [7]:
def fill_sparse(mat, sparsity=0.1):
    indices = np.array(range(mat.shape[1]))
    nrows = int(mat.shape[1]*sparsity)
    for row in range(mat.shape[0]):
        lim = nrows #+ int(np.random.random()*3)
        mat[row][np.random.permutation(indices)[:lim]] = np.random.random(lim)
    return mat

a = fill_sparse(a, sparsity)
#b = fill_sparse(b, sparsity)

In [8]:
a

array([[0.        , 0.        , 0.        , 0.22884285, 0.72049601,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37543965, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.42571469, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.12234535, 0.        ,
        0.03941203],
       [0.        , 0.        , 0.        , 0.6174901 , 0.07481382,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.37348408, 0.        , 0.        ,
        0.        ],
       [0.        , 0.25003672, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.26944973, 0.        ,
        0.52407222],
       [0.        , 0.92721524, 0.        , 0.        , 0.        ,
        0.        , 0.        , 

In [9]:
b

array([0.9134758 , 0.48745564, 0.6517366 , 0.6415785 , 0.6639154 ,
       0.9750189 , 0.27635226, 0.44200438, 0.84631807, 0.29869252,
       0.435801  , 0.03999306, 0.46944958, 0.06950701, 0.4052567 ,
       0.9115287 ], dtype=float32)

In [10]:
x2_init.T

array([ 0.5816005 ,  0.16773574, -1.1323636 ], dtype=float32)

In [11]:
mult = a.dot(b)
mult

array([0.78878602, 0.25194608, 0.62117036, 0.61831732, 0.53253507,
       1.15707201, 1.24639114, 0.84228357, 0.63913486, 0.47997026,
       0.56223503, 0.62352903, 1.29664743, 0.31691541, 0.3673037 ,
       0.54798406])

In [12]:
mult.shape

(16,)

In [13]:
def to_data(mat):
    ellwidth = int(mat.shape[1]/2)
    all_rows = []
    all_idxs = []
    all_nnzs = []
    for row in range(mat.shape[0]):
        rowdata = []
        colidxs = []
        all_nnzs.append(0)
        for col in range(mat.shape[1]):
            val = mat[row][col]
            if val != 0:
                rowdata.append(val)
                colidxs.append(col)
                all_nnzs[-1] += 1
        rowdata = np.array(rowdata)
        rowdata.resize(ellwidth)
        all_rows.append(rowdata)
        colidxs = np.array(colidxs)
        colidxs.resize(ellwidth)
        all_idxs.append(colidxs)
    all_rows = np.array(all_rows).astype(np.float32).flatten()
    all_idxs = np.array(all_idxs).astype(np.uint32).flatten()
    all_nnzs = np.array(all_nnzs).astype(np.uint32)
    return all_rows, all_idxs, all_nnzs, ellwidth

In [14]:
adata, acols, annz, ellwa = to_data(a)
adata, acols, annz, ellwa

(array([0.22884285, 0.720496  , 0.37543964, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.4257147 , 0.12234534,
        0.03941203, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.6174901 , 0.07481382, 0.37348408, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.25003672,
        0.26944974, 0.52407223, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.9272152 , 0.6210204 , 0.80167836,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.71759963, 0.07308079, 0.99252075, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.42398492, 0.9372269 ,
        0.3767659 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.81243616, 0.01329434, 0.0506534 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.97979206,
        0.6906237 , 0.96958005, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.9060216 , 0.03

In [15]:
#acols = acols.astype(np.uint32)
#annz = annz.astype(np.uint32)

In [16]:
adata, acols, annz, b

(array([0.22884285, 0.720496  , 0.37543964, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.4257147 , 0.12234534,
        0.03941203, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.6174901 , 0.07481382, 0.37348408, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.25003672,
        0.26944974, 0.52407223, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.9272152 , 0.6210204 , 0.80167836,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.71759963, 0.07308079, 0.99252075, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.42398492, 0.9372269 ,
        0.3767659 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.81243616, 0.01329434, 0.0506534 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.97979206,
        0.6906237 , 0.96958005, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.9060216 , 0.03

In [17]:
adata_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=annz)
b_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void SpMVNaive(__global  float* matData,     // INPUT MATRIX DATA
                        __global  uint*  colIdx,
                        __global  uint*  rowNnz,
                        uint   ellwidth,
                        __global  float* vector_x,    // INPUT
                        __global  float* vector_y    // OUTPUT
                        ) { // LOCAL SHARED BUFFER
  uint gid = get_global_id(0);
  
  
  uint nnz    = rowNnz[gid];
  float sum = 0;
  for (uint i = 0; i < nnz; i++) {
    uint index   = gid * ellwidth + i;
    uint col     = colIdx[index];
    float aval  = matData[index];
    float xval  = vector_x[col];
    printf("aval, xval: %.2f,%.2f:%i-%i \\n", aval, xval, col, index);
    sum  += aval * xval;
  }
  printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
  vector_y[gid] = sum;
}""").build()

In [18]:
a.shape, b.shape

((16, 16), (16,))

In [19]:
res = np.zeros(a.shape[0]).astype(np.float32)
res

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [20]:
rows = a.shape[0]

In [21]:
ellw = np.array([ellwa]).astype(np.uint32)
ellw

array([8], dtype=uint32)

In [22]:
mult = mult.astype(np.float32)

In [23]:
mult

array([0.788786  , 0.2519461 , 0.62117034, 0.6183173 , 0.5325351 ,
       1.157072  , 1.2463912 , 0.84228355, 0.6391349 , 0.47997025,
       0.56223506, 0.623529  , 1.2966474 , 0.3169154 , 0.3673037 ,
       0.54798406], dtype=float32)

In [24]:
mult.nbytes

64

In [25]:
len(mult)*4

64

In [26]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, mult.nbytes)
knl = prg.SpMVNaive  # Use this Kernel object for repeated calls
knl(queue, [rows,], None, adata_buf, acols_buf, annzs_buf, ellw, b_buf, res_buf)

res_np = np.empty_like(a)
cl.enqueue_copy(queue, res, res_buf)

<pyopencl._cl.NannyEvent at 0x7efbc820ff90>

aval, xval: 0.23,0.64:3-0 
aval, xval: 0.43,0.49:1-8 
aval, xval: 0.62,0.64:3-16 
aval, xval: 0.25,0.49:1-24 
aval, xval: 0.93,0.49:1-32 
aval, xval: 0.72,0.91:0-40 
aval, xval: 0.42,0.65:2-48 
aval, xval: 0.81,0.98:5-56 
aval, xval: 0.98,0.28:6-64 
aval, xval: 0.91,0.49:1-72 
aval, xval: 0.19,0.65:2-80 
aval, xval: 0.66,0.66:4-88 
aval, xval: 0.68,0.66:4-96 
aval, xval: 0.05,0.64:3-104 
aval, xval: 0.01,0.91:0-112 
aval, xval: 0.34,0.28:6-120 
aval, xval: 0.72,0.66:4-1 
aval, xval: 0.12,0.07:13-9 
aval, xval: 0.07,0.66:4-17 
aval, xval: 0.27,0.07:13-25 
aval, xval: 0.62,0.04:11-33 
aval, xval: 0.07,0.49:1-41 
aval, xval: 0.94,0.85:8-49 
aval, xval: 0.01,0.30:9-57 
aval, xval: 0.69,0.44:10-65 
aval, xval: 0.04,0.64:3-73 
aval, xval: 0.63,0.64:3-81 
aval, xval: 0.21,0.44:7-89 
aval, xval: 0.66,0.98:5-97 
aval, xval: 0.28,0.41:14-105 
aval, xval: 0.10,0.66:4-113 
aval, xval: 0.29,0.44:7-121 
aval, xval: 0.38,0.44:10-2 
aval, xval: 0.04,0.91:15-10 
aval, xval: 0.37,0.47:12-18 
aval, xval:

In [27]:
res_buf

<pyopencl._cl.Buffer at 0x7efbc8c75ea0>

In [28]:
res

array([0.788786  , 0.2519461 , 0.62117034, 0.61831737, 0.532535  ,
       1.157072  , 1.2463912 , 0.84228355, 0.6391348 , 0.47997025,
       0.56223506, 0.623529  , 1.2966474 , 0.31691542, 0.3673037 ,
       0.54798406], dtype=float32)

In [29]:
mult

array([0.788786  , 0.2519461 , 0.62117034, 0.6183173 , 0.5325351 ,
       1.157072  , 1.2463912 , 0.84228355, 0.6391349 , 0.47997025,
       0.56223506, 0.623529  , 1.2966474 , 0.3169154 , 0.3673037 ,
       0.54798406], dtype=float32)

In [30]:
(res-mult).sum()

-2.9802322e-08

## Weight update kernel

In [31]:
dim = 16
topk = 4

x = np.random.rand(dim).astype(np.float32)
y = np.random.rand(dim).astype(np.float32)
x.shape,y.shape

((16,), (16,))

In [32]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, x.nbytes)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, x.nbytes)
y_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, x.nbytes)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, x.nbytes)

prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void genwupdate(__global  float* x,     // INPUT MATRIX DATA
                         __global  float* y,    // INPUT
                         __global  float* xout,    // INPUT
                         __global  uint* xoutidx,    // INPUT
                         __global  float* yout,    // INPUT
                         __global  uint* youtidx    // INPUT
                        ) { // LOCAL SHARED BUFFER
  uint gid = get_global_id(0);
  uint n = get_global_size(0);
  
  xout[gid] = x[gid];
  xoutidx[gid] = gid;
  yout[gid] = y[gid];
  youtidx[gid] = gid;
  
  float valx = x[gid];
  float valy = y[gid];
  uint idx = xoutidx[gid];
  uint posx = 0;
  uint posy = 0;
  for (uint i = 0; i < n; i++) {
    float tempval = x[i];
    float tempval2 = y[i];
    bool smaller = tempval > valx;
    bool smaller2 = tempval2 > valy;
      
    posx += (smaller)?1:0;
    posy += (smaller2)?1:0;
  }
  //printf("posx:%i", posx);
  xout[posx] = valx;
  xoutidx[posx] = gid;
  yout[posy] = valy;
  youtidx[posy] = gid;
}""").build()

In [33]:
knl = prg.genwupdate  # Use this Kernel object for repeated calls
knl(queue, [dim,], None, x_buf, y_buf, x_cp_buf, x_idx_buf, y_cp_buf, y_idx_buf)

resx = np.zeros(dim).astype(np.float32)
resxidx = np.zeros(dim).astype(np.uint32)
resy = np.zeros(dim).astype(np.float32)
resyidx = np.zeros(dim).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resy, y_cp_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7efbc820fe50>

In [34]:
x

array([0.38436326, 0.6282593 , 0.23239243, 0.84487927, 0.34840786,
       0.0562746 , 0.5742207 , 0.1643646 , 0.7778896 , 0.2617748 ,
       0.15786238, 0.94446903, 0.43403378, 0.5977317 , 0.28563315,
       0.32617348], dtype=float32)

In [35]:
resx

array([0.94446903, 0.84487927, 0.7778896 , 0.6282593 , 0.5977317 ,
       0.5742207 , 0.43403378, 0.38436326, 0.34840786, 0.32617348,
       0.28563315, 0.2617748 , 0.23239243, 0.1643646 , 0.15786238,
       0.0562746 ], dtype=float32)

In [36]:
resxidx

array([11,  3,  8,  1, 13,  6, 12,  0,  4, 15, 14,  9,  2,  7, 10,  5],
      dtype=uint32)

In [37]:
y

array([0.34142965, 0.912634  , 0.22059757, 0.38525775, 0.5819617 ,
       0.17507966, 0.40275997, 0.48376763, 0.7341737 , 0.14274661,
       0.5187501 , 0.41498157, 0.20583646, 0.07330994, 0.31233174,
       0.910271  ], dtype=float32)

In [38]:
resy

array([0.912634  , 0.910271  , 0.7341737 , 0.5819617 , 0.5187501 ,
       0.48376763, 0.41498157, 0.40275997, 0.38525775, 0.34142965,
       0.31233174, 0.22059757, 0.20583646, 0.17507966, 0.14274661,
       0.07330994], dtype=float32)

In [39]:
resyidx

array([ 1, 15,  8,  4, 10,  7, 11,  6,  3,  0, 14,  2, 12,  5,  9, 13],
      dtype=uint32)

## Weight update kernel

dim = 16
topk = 4

x = np.random.rand(dim).astype(np.float32)
y = np.random.rand(dim).astype(np.float32)
x.shape,y.shape

In [40]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
x_cp_buf = cl.Buffer(ctx, mf.WRITE_ONLY, 4*topk*topk)
x_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, 4*topk)
y_idx_buf = cl.Buffer(ctx, mf.WRITE_ONLY, 4*topk)

prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                             __global  float* y,    // INPUT
                             __global  float* xout,    // INPUT
                             uint topk,
                             __global  uint* xoutidx,    // INPUT
                             __global  uint* youtidx    // INPUT
                            ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);

      xout[gid] = x[gid];
      xoutidx[gid] = gid;
      youtidx[gid] = gid;

      float valx = x[gid];
      float valy = y[gid];
      uint posx = 0;
      uint posy = 0;
      for (uint i = 0; i < n; i++) {
        float tempval = x[i];
        float tempval2 = y[i];
        bool larger = tempval > valx;
        bool larger2 = tempval2 > valy;

        posx += (larger)?1:0;
        posy += (larger2)?1:0;
      }
      //printf("posx:%i", posx);
      if (posx < topk) {
        xoutidx[posx] = gid;
      }
      if (posy < topk) {
        youtidx[posy] = gid;
      }
      if (gid < topk) {
        uint i = gid;
        for (uint j=0; j<topk; j++) {
          xout[gid*topk+j] = x[xoutidx[gid]] * y[youtidx[j]];
        }
      }
    }""").build()

In [41]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
knl(queue, [dim,], None, x_buf, y_buf, x_cp_buf, np.uint32(topk), x_idx_buf, y_idx_buf)

resx = np.zeros(dim).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)

cl.enqueue_copy(queue, resx, x_cp_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf)
cl.enqueue_copy(queue, resyidx, y_idx_buf)

<pyopencl._cl.NannyEvent at 0x7efbc821f4f0>

In [42]:
x

array([0.38436326, 0.6282593 , 0.23239243, 0.84487927, 0.34840786,
       0.0562746 , 0.5742207 , 0.1643646 , 0.7778896 , 0.2617748 ,
       0.15786238, 0.94446903, 0.43403378, 0.5977317 , 0.28563315,
       0.32617348], dtype=float32)

In [43]:
resx

array([0.86195457, 0.85972273, 0.6934043 , 0.54964477, 0.77106553,
       0.7690691 , 0.62028813, 0.49168736, 0.7099285 , 0.70809036,
       0.5711061 , 0.45270196, 0.5733708 , 0.57188624, 0.46125147,
       0.36562285], dtype=float32)

In [44]:
resx.reshape(4,4)

array([[0.86195457, 0.85972273, 0.6934043 , 0.54964477],
       [0.77106553, 0.7690691 , 0.62028813, 0.49168736],
       [0.7099285 , 0.70809036, 0.5711061 , 0.45270196],
       [0.5733708 , 0.57188624, 0.46125147, 0.36562285]], dtype=float32)

In [45]:
resxidx

array([11,  3,  8,  1], dtype=uint32)

In [46]:
resyidx

array([ 1, 15,  8,  4], dtype=uint32)

In [47]:
x*y

array([0.13123302, 0.5733708 , 0.0512652 , 0.3254963 , 0.20276003,
       0.00985254, 0.23127311, 0.07951427, 0.5711061 , 0.03736747,
       0.08189112, 0.39193726, 0.08933998, 0.04381968, 0.0892123 ,
       0.29690626], dtype=float32)

In [48]:
x.reshape(dim,1)*y[7]

array([[0.1859425 ],
       [0.3039315 ],
       [0.11242393],
       [0.40872523],
       [0.16854845],
       [0.02722383],
       [0.27778938],
       [0.07951427],
       [0.3763178 ],
       [0.12663817],
       [0.07636871],
       [0.45690355],
       [0.20997149],
       [0.28916326],
       [0.13818008],
       [0.15779217]], dtype=float32)

In [49]:
y

array([0.34142965, 0.912634  , 0.22059757, 0.38525775, 0.5819617 ,
       0.17507966, 0.40275997, 0.48376763, 0.7341737 , 0.14274661,
       0.5187501 , 0.41498157, 0.20583646, 0.07330994, 0.31233174,
       0.910271  ], dtype=float32)

In [50]:
resy

array([0.912634  , 0.910271  , 0.7341737 , 0.5819617 , 0.5187501 ,
       0.48376763, 0.41498157, 0.40275997, 0.38525775, 0.34142965,
       0.31233174, 0.22059757, 0.20583646, 0.17507966, 0.14274661,
       0.07330994], dtype=float32)

In [51]:
resyidx

array([ 1, 15,  8,  4], dtype=uint32)

### update vals

In [52]:
prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void addvals(__global  float* matData,     // INPUT MATRIX DATA
                         __global  uint*  colIdx,
                         __global  uint*  rowNnz,
                         uint   ellwidth,
                         __global  float* updatevals,    // INPUT
                         __global  uint* updatexidx,   
                         __global  uint* updateyidx   
                         ) { // LOCAL SHARED BUFFER
      uint gid = get_global_id(0);
      uint n = get_global_size(0);
      uint row = updateyidx[gid];
      
      for (uint i=0; i<n; i++) {
        float val = updatevals[gid*n+i];
        uint col = updatexidx[i];
        for (uint i=0; i<rowNnz[row]; i++) {
          uint idx = gid*ellwidth+i;
          if (colIdx[idx] >= col) {
            if (colIdx[idx] == col) {
              matData[idx] += val;
              printf("UPDATE[%i]: %.2f", idx, val);
              break;
            } else {
              // insert new column
              printf("INSERT[%i]: %.2f", idx, val);
              for (uint j=rowNnz[row]+1; j>i; j--) {
                uint idx2 = gid*ellwidth+j;
                matData[idx2] = matData[idx2-1];
                colIdx[idx2] = colIdx[idx2-1];
              }
              matData[idx] = val;
              colIdx[idx] = col;
              rowNnz[row] += 1;
              break;
            }
          }
        }
        if (rowNnz[row] >= ellwidth) {
          break;
        }
      }

      
    }""").build()

In [53]:
knl = prg.addvals  # Use this Kernel object for repeated calls
knl(queue, [topk,], None, adata_buf, acols_buf, annzs_buf, np.uint32(ellw), x_cp_buf, x_idx_buf, y_idx_buf)

resa = np.empty_like(adata)
resaidx = np.zeros(acols.shape).astype(np.uint32)
resannz = np.zeros(annz.shape).astype(np.uint32)

cl.enqueue_copy(queue, resa, adata_buf)
cl.enqueue_copy(queue, resaidx, acols_buf)
cl.enqueue_copy(queue, resannz, annzs_buf)

INSERT[9]: 0.77INSERT[25]: 0.57INSERT[18]: 0.71UPDATE[0]: 0.86UPDATE[16]: 0.71INSERT[9]: 0.77INSERT[25]: 0.57INSERT[2]: 0.69INSERT[10]: 0.62INSERT[18]: 0.57INSERT[26]: 0.46UPDATE[8]: 0.49UPDATE[24]: 0.37INSERT[0]: 0.55INSERT[16]: 0.45

<pyopencl._cl.NannyEvent at 0x7efbc823d360>

In [54]:
resa

array([0.54964477, 1.0885656 , 0.720496  , 0.6934043 , 0.37543964,
       0.        , 0.        , 0.        , 0.917402  , 0.7690691 ,
       0.62028813, 0.77106553, 0.12234534, 0.03941203, 0.        ,
       0.        , 0.45270196, 1.3255805 , 0.07481382, 0.5711061 ,
       0.7099285 , 0.37348408, 0.        , 0.        , 0.6156596 ,
       0.57188624, 0.46125147, 0.5733708 , 0.26944974, 0.52407223,
       0.        , 0.        , 0.9272152 , 0.6210204 , 0.80167836,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.71759963, 0.07308079, 0.99252075, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.42398492, 0.9372269 ,
       0.3767659 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.81243616, 0.01329434, 0.0506534 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.97979206,
       0.6906237 , 0.96958005, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.9060216 , 0.03547956, 0.38911

In [55]:
adata

array([0.22884285, 0.720496  , 0.37543964, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.4257147 , 0.12234534,
       0.03941203, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.6174901 , 0.07481382, 0.37348408, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.25003672,
       0.26944974, 0.52407223, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.9272152 , 0.6210204 , 0.80167836,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.71759963, 0.07308079, 0.99252075, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.42398492, 0.9372269 ,
       0.3767659 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.81243616, 0.01329434, 0.0506534 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.97979206,
       0.6906237 , 0.96958005, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.9060216 , 0.03547956, 0.38911

In [56]:
acols

array([ 3,  4, 10,  0,  0,  0,  0,  0,  1, 13, 15,  0,  0,  0,  0,  0,  3,
        4, 12,  0,  0,  0,  0,  0,  1, 13, 15,  0,  0,  0,  0,  0,  1, 11,
       13,  0,  0,  0,  0,  0,  0,  1, 12,  0,  0,  0,  0,  0,  2,  8, 12,
        0,  0,  0,  0,  0,  5,  9, 15,  0,  0,  0,  0,  0,  6, 10, 13,  0,
        0,  0,  0,  0,  1,  3, 11,  0,  0,  0,  0,  0,  2,  3, 11,  0,  0,
        0,  0,  0,  4,  7, 14,  0,  0,  0,  0,  0,  4,  5,  9,  0,  0,  0,
        0,  0,  3, 14, 15,  0,  0,  0,  0,  0,  0,  4, 15,  0,  0,  0,  0,
        0,  6,  7,  8,  0,  0,  0,  0,  0], dtype=uint32)

In [57]:
resaidx

array([ 1,  3,  4,  8, 10,  0,  0,  0,  1,  3,  8, 11, 13, 15,  0,  0,  1,
        3,  4,  8, 11, 12,  0,  0,  1,  3,  8, 11, 13, 15,  0,  0,  1, 11,
       13,  0,  0,  0,  0,  0,  0,  1, 12,  0,  0,  0,  0,  0,  2,  8, 12,
        0,  0,  0,  0,  0,  5,  9, 15,  0,  0,  0,  0,  0,  6, 10, 13,  0,
        0,  0,  0,  0,  1,  3, 11,  0,  0,  0,  0,  0,  2,  3, 11,  0,  0,
        0,  0,  0,  4,  7, 14,  0,  0,  0,  0,  0,  4,  5,  9,  0,  0,  0,
        0,  0,  3, 14, 15,  0,  0,  0,  0,  0,  0,  4, 15,  0,  0,  0,  0,
        0,  6,  7,  8,  0,  0,  0,  0,  0], dtype=uint32)

In [58]:
resannz

array([3, 5, 3, 3, 6, 3, 3, 3, 6, 3, 3, 3, 3, 3, 3, 6], dtype=uint32)

In [59]:
asdf

NameError: name 'asdf' is not defined

# OTHER

import numpy as np
import pyopencl as cl

mf = cl.mem_flags

dim = 16
topk = 4

x = np.random.rand(dim).astype(np.float32)
y = np.random.rand(dim).astype(np.float32)
x.shape,y.shape

dim1 = 4
dim2 = 8
dim3 = 1

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

sparsity = 0.2

a = np.zeros((dim1,dim2))
b = np.random.rand(dim2,dim3).flatten().astype(np.float32)

a.shape, b.shape

In [None]:
x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
y_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
val_out_buf = cl.Buffer(ctx, mf.READ_WRITE, 4*topk*topk)
x_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)
y_idx_buf = cl.Buffer(ctx, mf.READ_WRITE, topk*4)

prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void genwupdate2(__global  float* x,     // INPUT MATRIX DATA
                         __global  float* y,    // INPUT
                         __global  float* xout,    // INPUT
                         uint topk,
                         __global  uint* xoutidx,    // INPUT
                         __global  uint* youtidx    // INPUT
                        ) { // LOCAL SHARED BUFFER
  uint gid = get_global_id(0);
  uint n = get_global_size(0);
  
  xout[gid] = x[gid];
  xoutidx[gid] = gid;
  youtidx[gid] = gid;
  
  float valx = x[gid];
  float valy = y[gid];
  uint posx = 0;
  uint posy = 0;
  for (uint i = 0; i < n; i++) {
    float tempval = x[i];
    float tempval2 = y[i];
    bool larger = tempval > valx;
    bool larger2 = tempval2 > valy;
      
    posx += (larger)?1:0;
    posy += (larger2)?1:0;
  }
  //printf("posx:%i", posx);
  if (posx < topk) {
    xoutidx[posx] = gid;
  }
  if (posy < topk) {
    youtidx[posy] = gid;
  }
  if (gid < topk) {
    uint i = gid;
    for (uint j=0; j<topk; j++) {
      xout[gid*topk+j] = x[xoutidx[gid]] * y[youtidx[j]];
    }
  }
}""").build()

In [None]:
knl = prg.genwupdate2  # Use this Kernel object for repeated calls
event = knl(queue, [dim,], None, x_buf, y_buf, val_out_buf, np.uint32(topk), x_idx_buf, y_idx_buf)

#event.wait()
val_out = np.zeros(topk*topk).astype(np.float32)
resxidx = np.zeros(topk).astype(np.uint32)
resyidx = np.zeros(topk).astype(np.uint32)

cl.enqueue_copy(queue, val_out, val_out_buf)
cl.enqueue_copy(queue, resxidx, x_idx_buf, wait_for=[event])
cl.enqueue_copy(queue, resyidx, y_idx_buf)

In [None]:
val_out

In [None]:
resxidx

In [None]:
resyidx

In [None]:
from __future__ import division

KERNEL_CODE = """
// Thread block size
#define BLOCK_SIZE %(block_size)d
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA %(w_a)d // Matrix A width
#define HA %(h_a)d // Matrix A height
#define WB %(w_b)d // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
/* Matrix multiplication: C = A * B.
 * Device code.
 */
#define AS(j, i) As[i + j * BLOCK_SIZE]
#define BS(j, i) Bs[i + j * BLOCK_SIZE]
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! WA is A's width and WB is B's width
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(16,16,1))) 
void
matrixMul( __global float* C, __global float* A, __global float* B)
{
    __local float As[BLOCK_SIZE*BLOCK_SIZE];
    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);
    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);
    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + WA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * WB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0.0f;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + WA * ty + tx];
        BS(ty, tx) = B[b + WB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        barrier(CLK_LOCAL_MEM_FENCE);
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
"""
