In [7]:
from tinygrad.tensor import Tensor
from tinygrad.sparsetensor import SparseTensor
import numpy as np

In [8]:
x_init = np.random.randn(1,3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)

In [10]:
x = Tensor(x_init)
W = Tensor(W_init)
m = Tensor(m_init)
out = x.dot(W).relu()
out = out.logsoftmax()
out = out.mul(m).add(m).sum()
out.backward()

out.cpu().data, x

(array([0.17237905], dtype=float32),
 <Tensor <GPUBuffer with shape (1, 3)> with grad <GPUBuffer with shape (1, 3)>>)

In [12]:
x = Tensor(x_init)
W = SparseTensor(W_init)
m = Tensor(m_init)
out = x.dot(W).relu()
out = out.logsoftmax()
out = out.mul(m).add(m).sum()
out.backward()

out.cpu().data, x

NameError: name 'to_ell' is not defined

In [2]:
import pyopencl as cl
from time import time
import numpy

block_size = 16

ctx = cl.create_some_context()

for dev in ctx.devices:
    assert dev.local_mem_size > 0

queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

#queue = cl.CommandQueue(ctx)

if False:
    a_height = 4096
    #a_height = 1024
    a_width = 2048
    #a_width = 256
    #b_height == a_width
    b_width = a_height

elif False:
    # like PyCUDA
    a_height = 2516
    a_width = 1472
    b_height = a_width
    b_width = 2144

else:
    # CL SDK
    a_width = 128*block_size
    a_height = 128*block_size
    b_width = 50*block_size
    b_height = a_width

c_width = b_width
c_height = a_height

h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32)
h_b = numpy.random.rand(b_height, b_width).astype(numpy.float32)
h_c = numpy.empty((c_height, c_width)).astype(numpy.float32)

In [3]:

kernel_params = {"block_size": block_size,
        "w_a":a_width, "h_a":a_height, "w_b":b_width}

prg = cl.Program(ctx, KERNEL_CODE % kernel_params,
        ).build(options="-cl-mad-enable -cl-fast-relaxed-math")
kernel = prg.matrixMul
#print prg.binaries[0]

assert a_width % block_size == 0
assert a_height % block_size == 0
assert b_width % block_size == 0

# transfer host -> device -----------------------------------------------------
mf = cl.mem_flags

t1 = time()

d_a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_a)
d_b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_b)
d_c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=h_c.nbytes)

push_time = time()-t1

In [4]:
# warmup ----------------------------------------------------------------------
for i in range(5):
    event = kernel(queue, h_c.shape, (block_size, block_size), 
            d_c_buf, d_a_buf, d_b_buf)
    event.wait()

queue.finish()

# actual benchmark ------------------------------------------------------------
t1 = time()

count = 20
for i in range(count):
    event = kernel(queue, h_c.shape, (block_size, block_size),
            d_c_buf, d_a_buf, d_b_buf)

event.wait()

gpu_time = (time()-t1)/count

# transfer device -> host -----------------------------------------------------
t1 = time()
cl.enqueue_copy(queue, d_c_buf, h_c)#.wait()
pull_time = time()-t1

# timing output ---------------------------------------------------------------
gpu_total_time = gpu_time+push_time+pull_time

print("GPU push+compute+pull total [s]:", gpu_total_time)
print("GPU push [s]:", push_time)
print("GPU pull [s]:", pull_time)
print("GPU compute (host-timed) [s]:", gpu_time)
print("GPU compute (event-timed) [s]: ", (event.profile.end-event.profile.start)*1e-9)

gflop = h_c.size * (a_width * 2.) / (1000**3.)
gflops = gflop / gpu_time

print("GFlops/s:", gflops)

# cpu comparison --------------------------------------------------------------
t1 = time()
h_c_cpu = numpy.dot(h_a,h_b)
cpu_time = time()-t1

print("---------------------------")
print("GPU==CPU:",numpy.allclose(h_c, h_c_cpu))
print("CPU time (s)", cpu_time)

print("GPU speedup (with transfer): ", cpu_time/gpu_total_time)
print("GPU speedup (without transfer): ", cpu_time/gpu_time)

GPU push+compute+pull total [s]: 0.014005398750305176
GPU push [s]: 0.0073621273040771484
GPU pull [s]: 0.001420736312866211
GPU compute (host-timed) [s]: 0.005222535133361817
GPU compute (event-timed) [s]:  0.005093408000000001
GFlops/s: 1284.986357895521
---------------------------
GPU==CPU: False
CPU time (s) 0.029859066009521484
GPU speedup (with transfer):  2.131968288933641
GPU speedup (without transfer):  5.717350912352943


In [5]:
import numpy as np
import pyopencl as cl

In [6]:
dim1 = 4
dim2 = 8
dim3 = 1

sparsity = 0.2

a = np.zeros((dim1,dim2))
b = np.random.rand(dim2,dim3).flatten().astype(np.float32)

a.shape, b.shape

((4, 8), (8,))

In [7]:
def fill_sparse(mat, sparsity=0.1):
    indices = np.array(range(mat.shape[1]))
    nrows = int(mat.shape[1]*sparsity)
    for row in range(mat.shape[0]):
        lim = nrows #+ int(np.random.random()*3)
        mat[row][np.random.permutation(indices)[:lim]] = np.random.random(lim)
    return mat

a = fill_sparse(a, sparsity)
#b = fill_sparse(b, sparsity)

In [8]:
a

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.46130043],
       [0.        , 0.        , 0.        , 0.43101542, 0.        ,
        0.        , 0.        , 0.        ],
       [0.00360091, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.18287891, 0.        , 0.        ]])

In [9]:
b

array([0.4183415 , 0.49459058, 0.40111175, 0.0073482 , 0.24470419,
       0.3563211 , 0.7307064 , 0.02117306], dtype=float32)

In [10]:
mult = a.dot(b)
mult

array([0.00976714, 0.00316719, 0.00150641, 0.06516361])

In [11]:
mult.shape

(4,)

In [12]:
def to_data(mat):
    ellwidth = int(mat.shape[1]/2)
    all_rows = []
    all_idxs = []
    all_nnzs = []
    for row in range(mat.shape[0]):
        rowdata = []
        colidxs = []
        all_nnzs.append(0)
        for col in range(mat.shape[1]):
            val = mat[row][col]
            if val != 0:
                rowdata.append(val)
                colidxs.append(col)
                all_nnzs[-1] += 1
        rowdata = np.array(rowdata)
        rowdata.resize(ellwidth)
        all_rows.append(rowdata)
        colidxs = np.array(colidxs)
        colidxs.resize(ellwidth)
        all_idxs.append(colidxs)
    all_rows = np.array(all_rows).astype(np.float32).flatten()
    all_idxs = np.array(all_idxs).astype(np.uint32).flatten()
    all_nnzs = np.array(all_nnzs).astype(np.uint32)
    return all_rows, all_idxs, all_nnzs, ellwidth

In [13]:
adata, acols, annz, ellwa = to_data(a)
adata, acols, annz, ellwa

(array([0.46130043, 0.        , 0.        , 0.        , 0.4310154 ,
        0.        , 0.        , 0.        , 0.00360091, 0.        ,
        0.        , 0.        , 0.18287891, 0.        , 0.        ,
        0.        ], dtype=float32),
 array([7, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0], dtype=uint32),
 array([1, 1, 1, 1], dtype=uint32),
 4)

In [14]:
#acols = acols.astype(np.uint32)
#annz = annz.astype(np.uint32)

In [15]:
adata, acols, annz, b

(array([0.46130043, 0.        , 0.        , 0.        , 0.4310154 ,
        0.        , 0.        , 0.        , 0.00360091, 0.        ,
        0.        , 0.        , 0.18287891, 0.        , 0.        ,
        0.        ], dtype=float32),
 array([7, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0], dtype=uint32),
 array([1, 1, 1, 1], dtype=uint32),
 array([0.4183415 , 0.49459058, 0.40111175, 0.0073482 , 0.24470419,
        0.3563211 , 0.7307064 , 0.02117306], dtype=float32))

In [16]:
mf = cl.mem_flags
adata_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=adata)
acols_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=acols)
annzs_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=annz)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)

prg = cl.Program(ctx, """
// Every global_id_0 works on a row
__kernel void SpMVNaive(__global  float* matData,     // INPUT MATRIX DATA
                        __global  uint*  colIdx,
                        __global  uint*  rowNnz,
                        uint   ellwidth,
                        __global  float* vector_x,    // INPUT
                        __global  float* vector_y    // OUTPUT
                        ) { // LOCAL SHARED BUFFER
  uint gid = get_global_id(0);
  
  
  uint nnz    = rowNnz[gid];
  float sum = 0;
  for (uint i = 0; i < nnz; i++) {
    uint index   = gid * ellwidth + i;
    uint col     = colIdx[index];
    float aval  = matData[index];
    float xval  = vector_x[col];
    printf("aval, xval: %.2f,%.2f:%i-%i \\n", aval, xval, col, index);
    sum  += aval * xval;
  }
  printf("SUM/NNZ: %.2f %i \\n", sum, nnz);
  vector_y[gid] = sum;
}""").build()

In [17]:
a.shape, b.shape

((4, 8), (8,))

In [18]:
res = np.zeros(a.shape[0]).astype(np.float32)
res

array([0., 0., 0., 0.], dtype=float32)

In [19]:
rows = a.shape[0]

In [20]:
ellw = np.array([ellwa]).astype(np.uint32)
ellw

array([4], dtype=uint32)

In [21]:
mult = mult.astype(np.float32)

In [22]:
res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, mult.nbytes)
knl = prg.SpMVNaive  # Use this Kernel object for repeated calls
knl(queue, [rows,], None, adata_buf, acols_buf, annzs_buf, ellw, b_buf, res_buf)

res_np = np.empty_like(a)
cl.enqueue_copy(queue, res, res_buf)

<pyopencl._cl.NannyEvent at 0x7f786db89ca8>

In [23]:
res_buf

<pyopencl._cl.Buffer at 0x7f7864092f68>

In [24]:
res

array([0.00976714, 0.00316719, 0.00150641, 0.06516361], dtype=float32)

In [25]:
mult

array([0.00976714, 0.00316719, 0.00150641, 0.06516361], dtype=float32)

In [26]:
(res-mult).sum()

-2.3283064e-10

In [28]:
asfd

NameError: name 'asfd' is not defined

In [30]:
from tinygrad import SparseTensor

TypeError: Expected maxsize to be an integer or None

In [27]:
class SparseTensor:
    def __init__(self, dense_data, _children=(), _op=''):
        data, idxs, nnzs, ellw = to_ell(dense_data)
        self.data = data
        self.idxs = idxs
        self.nnzs = nnzs
        self.ellw = ellw
        self.shape = dense_data.shape
        self.grad = 0
        # internal variables used for autograd graph construction
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc
        
        def _backward(in_grad=0.0):
            self.grad = in_grad
            return (in_grad,)
        self._backward = _backward
        


    def __add__(self, other):
        other = other if isinstance(other, SparseTensor) else SparseTensor(other)
        out = SparseTensor(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out

    def backward(self):
        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"

In [1]:
from __future__ import division

KERNEL_CODE = """
// Thread block size
#define BLOCK_SIZE %(block_size)d
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA %(w_a)d // Matrix A width
#define HA %(h_a)d // Matrix A height
#define WB %(w_b)d // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
/* Matrix multiplication: C = A * B.
 * Device code.
 */
#define AS(j, i) As[i + j * BLOCK_SIZE]
#define BS(j, i) Bs[i + j * BLOCK_SIZE]
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! WA is A's width and WB is B's width
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(16,16,1))) 
void
matrixMul( __global float* C, __global float* A, __global float* B)
{
    __local float As[BLOCK_SIZE*BLOCK_SIZE];
    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);
    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);
    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + WA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * WB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0.0f;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + WA * ty + tx];
        BS(ty, tx) = B[b + WB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        barrier(CLK_LOCAL_MEM_FENCE);
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
"""
