In [None]:
# Select matplotlib's inline backend for notebook output.
# NOTE(review): no plotting call is visible in this part of the notebook.
%matplotlib inline

In [None]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp

In [None]:
# Absolute tolerance applied to the max-norm error checks further down.
tol = 1e-10


def mnorm(x):
    """Return the max norm of ``x``: the largest absolute value of its entries."""
    magnitudes = np.abs(x)
    return np.max(magnitudes)

In [None]:
# Obtain an OpenCL context (device selection is left to pyopencl) and a
# command queue on it; the device arrays and kernel launches below all
# go through `queue`.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

In [None]:
# Problem size: A is n x n and x has n entries.
n = 16 * 10**3

# Deterministic ramp data built on the device and scaled into [0, 1).
# A random initialisation is kept here, disabled, for reference:
# x_d = cl.clrandom.rand(queue, n, dtype=np.double)
# A_d = cl.clrandom.rand(queue, (n, n), dtype=np.double)
x_d = cl.array.arange(queue, n, dtype=np.float64) / n
A_d = cl.array.arange(queue, n * n, dtype=np.float64).reshape((n, n)) / (n * n)

# Host copies of the inputs, and the NumPy reference product that the
# kernel outputs are compared against.
x, A = x_d.get(), A_d.get()
b = np.dot(A, x)

## Plain Kernel

In [None]:
# Matrix-vector product out = A x as a loopy kernel: a single instruction
# over the index domain {0 <= i, j < n} that reduces over j for each i.
knl = lp.make_kernel(
        "[n] -> { [i,j]: 0<=i,j<n }",
        "out[i] = sum(j, A[i,j]*x[j])",
        [lp.GlobalArg("A", shape="n, n", dtype=np.double),
         lp.GlobalArg("x", shape=("n",), dtype=np.double),
         lp.GlobalArg("out", shape=("n",), dtype=np.double),
         # `np.int` was merely an alias of the builtin `int`; it was
         # deprecated in NumPy 1.20 and removed in 1.24, so spell it `int`.
         lp.ValueArg("n", int, approximately=n)],
        assumptions="n>=0")
# Turn on loopy's "write_cl" option for this kernel.
knl = lp.set_options(knl, "write_cl")

# Keep a handle on the untransformed kernel; the tiled variant is derived
# from it later.
knl_single = knl

In [None]:
# Launch the untransformed kernel; its single output array is bound to b_d.
evt, (b_d,) = knl_single(queue, A=A_d, x=x_d, n=n)
# Block until the launch has completed.
evt.wait()

In [None]:
# Is the largest absolute deviation from the NumPy reference b below tol?
mnorm(b_d.get() - b) < tol

In [None]:
# Time a launch of the untransformed kernel, including the wait on its event.
%timeit evt, _ = knl_single(queue, A=A_d, x=x_d, n=n); evt.wait()

## Tiled Kernel

In [None]:
# Derive the tiled variant from the plain kernel: split both i and j into
# outer/inner loop pairs with an inner length of tile_size.
tile_size = 16

# Alternative splits of i that also attach g.0 / l.0 tags, left disabled:
#knl = lp.split_iname(knl, "i", tile_size, outer_tag="g.0", inner_tag="l.0")
#knl = lp.split_iname(knl, "i", tile_size, outer_tag="g.0")
#knl = lp.split_iname(knl, "i", tile_size, inner_tag="l.0")
knl_tile = lp.split_iname(knl_single, "i", tile_size)
knl_tile = lp.split_iname(knl_tile, "j", tile_size)

# Request the nesting order j_outer, i_inner, j_inner for the split loops.
knl_tile = lp.set_loop_priority(knl_tile, "j_outer,i_inner,j_inner")

# `knl` tracks the most recently built kernel, as in the original cell.
knl = knl_tile

In [None]:
# Launch the tiled kernel; its single output array is bound to b_d.
# n is passed explicitly, matching every other kernel launch in this
# notebook, rather than being left for loopy to work out.
evt, (b_d,) = knl_tile(queue, A=A_d, x=x_d, n=n)
# Block until the launch has completed.
evt.wait()

In [None]:
# Is the largest absolute deviation from the NumPy reference b below tol?
mnorm(b_d.get() - b) < tol

In [None]:
# Copy the current device result b_d back to the host and display it.
b_d.get()

In [None]:
# Time a launch of the tiled kernel, including the wait on its event.
%timeit evt, _ = knl_tile(queue, A=A_d, x=x_d, n=n); evt.wait()

In [None]:
#lp.show_dependency_graph(knl_tile)