In [5]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl
from time import time


a_np = np.random.rand(50000).astype(np.float32)
b_np = np.random.rand(50000).astype(np.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

mf = cl.mem_flags
a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)

prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g, __global float *work_item)
{
  int gid = get_global_id(0);
  work_item[gid] = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()

res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
work_item_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)


start = time()
prg.sum(queue, a_np.shape, None, a_g, b_g, res_g, work_item_g)

res_np = np.empty_like(a_np)
work_item = np.empty_like(a_np)

cl.enqueue_copy(queue, res_np, res_g)
cl.enqueue_copy(queue, work_item, work_item_g)

# Check on CPU with Numpy:
print(res_np - (a_np + b_np))

print(np.linalg.norm(res_np - (a_np + b_np)))

print("Temp de traitement : "+str(time()-start))

print("Work Item : "+str(work_item))


[ 0.  0.  0. ...,  0.  0.  0.]
0.0
Temp de traitement : 1.0935630798339844
Work Item : [  0.00000000e+00   1.00000000e+00   2.00000000e+00 ...,   4.99970000e+04
   4.99980000e+04   4.99990000e+04]


In [1]:
# example provided by Roger Pau Monn'e

from __future__ import print_function
from __future__ import absolute_import
import pyopencl as cl
import numpy
import numpy.linalg as la
import datetime
from time import time

data_points = 2**23 # ~8 million data points, ~32 MB data
workers = 2**8 # 256 workers, play with this to see performance differences
               # eg: 2**0 => 1 worker will be non-parallel execution on gpu
               # data points must be a multiple of workers

a = numpy.random.rand(data_points).astype(numpy.float32)
b = numpy.random.rand(data_points).astype(numpy.float32)
c_result = numpy.empty_like(a)

# Speed in normal CPU usage
time1 = time()
c_temp = (a+b) # adds each element in a to its corresponding element in b
c_temp = c_temp * c_temp # element-wise multiplication
c_result = c_temp * (a/2.0) # element-wise half a and multiply
time2 = time()

print("Execution time of test without OpenCL: ", time2 - time1, "s")


for platform in cl.get_platforms():
    for device in platform.get_devices():
        print("===============================================================")
        print("Platform name:", platform.name)
        print("Platform profile:", platform.profile)
        print("Platform vendor:", platform.vendor)
        print("Platform version:", platform.version)
        print("---------------------------------------------------------------")
        print("Device name:", device.name)
        #print("Device type:", cl.device_type.to_string(device.type))
        print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
        print("Device max clock speed:", device.max_clock_frequency, 'MHz')
        print("Device compute units:", device.max_compute_units)
        print("Device max work group size:", device.max_work_group_size)
        print("Device max work item sizes:", device.max_work_item_sizes)

        # Simnple speed test
        ctx = cl.Context([device])
        queue = cl.CommandQueue(ctx, 
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        mf = cl.mem_flags
        a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
        b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
        dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

        prg = cl.Program(ctx, """
            __kernel void sum(__global const float *a,
            __global const float *b, __global float *c)
            {
                        int gid = get_global_id(0);
                        float a_temp;
                        float b_temp;
                        float c_temp;
                        a_temp = a[gid]; // my a element (by global ref)
                        b_temp = b[gid]; // my b element (by global ref)
                        
                        c_temp = a_temp+b_temp; // sum of my elements
                        c_temp = c_temp * c_temp; // product of sums
                        c_temp = c_temp * (a_temp/2.0f); // times 1/2 my a
                        c[gid] = c_temp; // store result in global memory
                }
                """).build()

        global_size=(data_points,)
        local_size=(workers,)
        preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info( \
            cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, \
            device)

        print("Data points:", data_points)
        print("Workers:", workers)
        print("Preferred work group size multiple:", preferred_multiple)

        if (workers % preferred_multiple):
            print("Number of workers not a preferred multiple (%d*N)." \
                    % (preferred_multiple))
            print("Performance may be reduced.")

        exec_evt = prg.sum(queue, global_size, local_size, a_buf, b_buf, dest_buf)
        exec_evt.wait()
        elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)

        print("Execution time of test: %g s" % elapsed)

        c = numpy.empty_like(a)
        cl.enqueue_read_buffer(queue, dest_buf, c).wait()
        equal = numpy.all( c == c_result)

        if not equal:
                print("Results doesn't match!!")
        else:
            print("Results OK")

Execution time of test without OpenCL:  0.08559370040893555 s
Platform name: Portable Computing Language
Platform profile: FULL_PROFILE
Platform vendor: The pocl project
Platform version: OpenCL 2.0 pocl 0.13, LLVM 3.8.0
---------------------------------------------------------------
Device name: pthread-Intel(R) Xeon(R) CPU E3-1240 v5 @ 3.50GHz
Device memory:  1515 MB
Device max clock speed: 4294 MHz
Device compute units: 3
Device max work group size: 4096
Device max work item sizes: [4096, 4096, 4096]
Data points: 8388608
Workers: 256
Preferred work group size multiple: 8
Execution time of test: 0.0235249 s
Results doesn't match!!




In [3]:
cl.VERSION

(2016, 2)

In [4]:
cl.get_cl_header_version()

(2, 0)

In [18]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl
from time import time


a_np = np.zeros(50).astype(np.int)
b_np = np.ones(50).astype(np.int)

info_np = np.zeros(10).astype(np.int)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

mf = cl.mem_flags
a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)

prg = cl.Program(ctx, """
__kernel void sum(
    __global const int *a_g, __global int *b_g, __global int *res_g, __global int *info)
{
  int gid = get_global_id(0);
  
  info[0] = get_global_size(0);
  info[1]  = get_global_size(1);
  info[2] = get_local_size(0);
  info[3]  = get_local_size(1);
  info[4]  = get_local_id(0);
  info[5]  = get_local_id(1);
  info[6]  = get_global_id(0);
  info[7]  = get_global_id(1);
  info[8]  = get_work_dim();
  info[9]  = get_num_groups(0);
  info[10]  = get_num_groups(1);
  info[11]  = get_group_id(0);
  
  
  //for (y=get_global_id(0); y<25; y+=get_global_size(0)){
      //info[11] = info[11]+1;
  //}
  //res_g[gid] = a_g[gid] + get_global_id(0);
    res_g[gid] = a_g[gid] + get_group_id(0);
}
""").build()

res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
info_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)


start = time()
prg.sum(queue, a_np.shape, None, a_g, b_g, res_g, info_g)

res_np = np.empty_like(a_np)
info = np.empty_like(a_np)


cl.enqueue_copy(queue, res_np, res_g)
cl.enqueue_copy(queue, info, info_g)

        
# Check on CPU with Numpy:
print("\n a_np : "+str(a_np))

print("\n b_np : "+str(b_np))

print("\n res_np : "+str(res_np))

print("\n Temp de traitement : "+str(time()-start))
print("----------------------------------------------")
print("get_global_size(0) [Nombre de thread] : "+str(info[0]))
print("get_global_size(1) : "+str(info[1]) )
print("get_local_size(0) : "+str(info[2]) )
print("get_local_size(1) : "+str(info[3]) )
print("get_local_id(0) : "+str(info[4]) )
print("get_local_id(1) : "+str(info[5]) )
print("get_global_id(0) [] : "+str(info[6] ) )
print("get_global_id(1) : "+str(info[7]))
print("get_work_dim() : "+str(info[8]))
print("get_num_groups(0) [Nombre de Work Group-(Coeur 0)] : " +str(info[9]))
print("get_num_groups(1) [Nombre de Work Group-(Coeur 1)] : " +str(info[10]))
print("get_group_id(0) : " +str(info[11]))



 a_np : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]

 b_np : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]

 res_np : [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49]

 Temp de traitement : 0.0038101673126220703
----------------------------------------------
get_global_size(0) [Nombre de thread] : 50
get_global_size(1) : 1
get_local_size(0) : 1
get_local_size(1) : 1
get_local_id(0) : 0
get_local_id(1) : 0
get_global_id(0) : 15
get_global_id(1) : 0
get_work_dim() : 1
get_num_groups(0) [Nombre de Work Group-(Coeur 0)] : 50
get_num_groups(1) [Nombre de Work Group-(Coeur 1)] : 1
get_group_id(0) : 15


In [8]:
#!/usr/bin/python
import pyopencl as cl
kcode = """kernel void test() { printf("Hello from DSP (%d)\\n", get_group_id(0)); }"""
ctx   = cl.create_some_context()
Q     = cl.CommandQueue(ctx)
prg   = cl.Program(ctx, kcode).build(options="")
prg.test(Q, [8], [1]).wait()