In [1]:
versioninfo()

Julia Version 0.5.0
Commit 3c9d753 (2016-09-19 18:14 UTC)
Platform Info:
  System: Darwin (x86_64-apple-darwin13.4.0)
  CPU: Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas64_
  LIBM: libopenlibm
  LLVM: libLLVM-3.7.1 (ORCJIT, haswell)


In [2]:
using OpenCL
using Gadfly

In [3]:
diff_kernel = "
inline int xy_to_i(int x, int y, int col_size, int mesh_size) {
    return (x % col_size) + (y % col_size) * col_size; 
}

__kernel void diff(__global const float *a,
__global float *diff,
int col_size,
int mesh_size)
    {
      int gid = get_global_id(0);
    int x_idx = gid % col_size;
    int y_idx = gid / col_size;
    
    diff[gid] = - 4. * a[gid]
    + a[xy_to_i(x_idx - 1, y_idx, col_size, mesh_size)]
    + a[xy_to_i(x_idx + 1, y_idx, col_size, mesh_size)]
    + a[xy_to_i(x_idx, y_idx - 1, col_size, mesh_size)]
    + a[xy_to_i(x_idx, y_idx + 1, col_size, mesh_size)];
    }
";

In [4]:
sum_kernel = "
   __kernel void sum(__global float *a,
__global const float *diff,
float step_size)
    {
      int gid = get_global_id(0);
    a[gid] += step_size * diff[gid];
    }
";

In [5]:
function initial_cond()
    a = zeros(Float32, (5000, 5000))
    a[Int(size(a)[1] / 2), Int(size(a)[2] / 2)] = 10
    return a
end

initial_cond (generic function with 1 method)

In [12]:
function do_openCL(input)
#    device, ctx, queue = cl.create_compute_context()
#    print(device, ctx, queue)
    ctx    = cl.Context(cl.devices()[3])
    device = cl.devices(ctx)
    queue  = cl.CmdQueue(ctx)
    print(ctx, device, queue)

    input_buff = cl.Buffer(Float32, ctx, (:rw, :copy), hostbuf=input)
    diff_buff = cl.Buffer(Float32, ctx, :rw, length(input))

    diff_p = cl.Program(ctx, source=diff_kernel) |> cl.build!
    diff_k = cl.Kernel(diff_p, "diff")

    sum_p = cl.Program(ctx, source=sum_kernel) |> cl.build!
    sum_k = cl.Kernel(sum_p, "sum")

    step_size = 0.001

    for i in 1:1000
        queue(diff_k, length(input), nothing, input_buff, diff_buff, size(input)[1], length(input))
        queue(sum_k, length(input), nothing, input_buff, diff_buff, Float32(step_size))
    end
    r = cl.read(queue, input_buff);
    r = reshape(r, size(input))
    return r
end




do_openCL (generic function with 1 method)

In [13]:
@time do_openCL(initial_cond())

OpenCL.Context(@0x00007fce5a9e5950 on AMD Radeon R9 M370X Compute Engine)OpenCL.cl.Device[OpenCL.Device(AMD Radeon R9 M370X Compute Engine on Apple @0x0000000001021c00)]OpenCL.CmdQueue(@0x00007fce5a95b860) 25.156822 seconds (109.65 k allocations: 194.293 MB, 0.51% gc time)


5000×5000 Array{Float32,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  

In [None]:
cl.devices(cl.create_some_context())

In [None]:
cl.platforms()

In [None]:
cl.Context(cl.devices()[3])

In [None]:
function diff_step(input, output)
    maxX, maxY = size(input)
    for iter in eachindex(input)
        x, y = ind2sub(input, iter)
        output[iter] = - 4. * input[iter] + input[x, y - 1 == 0 ? maxY : y - 1] + 
        input[x, y + 1 > maxY ? 1 : y + 1] +
        input[x - 1 == 0 ? maxX : x - 1, y] +
        input[x + 1 > maxX ? 1 : x + 1, y]
    end
    return output
end

In [None]:
function sum_step(input, diff, step_size)
    return input + (step_size * diff)
end
    

In [None]:
function do_naive(input)
    step_size = 0.001

    diff = similar(input)
    for i in 1:10000
        diff = diff_step(input, diff)
        input = sum_step(input, diff, step_size)
    end
    return input
end

In [None]:
res = do_naive(initial_cond())

In [None]:
@time do_openCL(initial_cond())

In [None]:
@time do_naive(initial_cond())

In [None]:
cl.devices()

In [None]:
Pkg.add('PyPlot')

In [None]:
initial_cond()[1,2]