In [17]:
!uname -a
!whoami
!echo $PYTHONPATH
import pyopencl

Linux ucdeneleccalc-01.ucdenver.pvt 3.17.4-301.fc21.x86_64 #1 SMP Thu Nov 27 19:09:10 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
yaffe.kravitz
:/usr/lib64/python2.7/site-packages/cffi-1.5.2-py2.7-linux-x86_64.egg


In [18]:
import pyopencl as cl
import pyopencl.array as cla
import numpy as n




In [19]:
def test1():
    '''you might think the Array class is something that simplifies code.
       this example shows it is not (change the comments and see the errors).
       instead, it seems to be something that looks like a numpy array,
       but which farms out work to the GPU (operation by operation).'''
    
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    a = n.array([0], dtype=n.int32)

    # this works
    a_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, a.nbytes)
    # this alternative (plus below) fails
    #a_dev = cla.to_device(queue, a)

    prg = cl.Program(ctx, """
        __kernel void test1(__global int* a) {
            a[0] = 1;
        }
        """).build()

    event = prg.test1(queue, (1,), None, a_dev)
    event.wait()

    # this works
    cl.enqueue_copy(queue, a, a_dev)
    # this alternative(plus above) fails
    #a = a_dev.get()
    
    print(a)




In [20]:
def test3():
    '''constants (like b) don't need buffering.'''
    
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    a = n.array([0], dtype=n.int32)
    b = n.int32(4)

    a_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, a.nbytes)

    prg = cl.Program(ctx, """
        __kernel void test1(__global int* a, const int b) {
            a[0] = b;
        }
        """).build()

    event = prg.test1(queue, (1,), None, a_dev, b)
    event.wait()

    cl.enqueue_copy(queue, a, a_dev)

    print(a)

In [21]:
# Copying Bytes

def test4():

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    a = n.array([0], dtype=n.int32)
    b = n.uint8(129)

    a_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, a.nbytes)

    prg = cl.Program(ctx, """
        __kernel void test1(__global int* a, const uchar b) {
            a[0] = b;
        }
        """).build()

    event = prg.test1(queue, (1,), None, a_dev, b)
    event.wait()

    cl.enqueue_copy(queue, a, a_dev)

    print(a)

In [22]:
# Using Array

def test5():

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    a = n.array([0], dtype=n.int32)

    a_array = cla.to_device(queue, a)

    prg = cl.Program(ctx, """
        __kernel void test1(__global int* a) {
            a[0] = 1;
        }
        """).build()

    event = prg.test1(queue, (1,), None, a_array.data)
    event.wait()

    a = a_array.get()

    print(a)

In [23]:
# Struct and packing

def test6():

    # i have intel and amd installed (running on cpu).
    # switching gives different error messages (useful at times!)
#    p = cl.get_platforms();
#    print(p)
#    d = p[0].get_devices() # 1 is amd
#    print(d)
#    ctx = cl.Context(devices=d)
    
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    for use_struct in (True, False):

        if use_struct:
            a = s.pack('=ii',1,2)
            print(a, len(a))
            a_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, len(a))
        else:
            a = n.array([(1,2)], dtype=n.dtype('2i4', align=True))
#            a = n.array([(1,2)], dtype=n.dtype('2i4'))
            print(a, a.itemsize, a.nbytes)
            a_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, a.nbytes)

        b = n.array([0], dtype='i4')
        print(b, b.itemsize, b.nbytes)
        b_dev = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, b.nbytes)

        c = n.array([0], dtype='i4')
        print(c, c.itemsize, c.nbytes)
        c_dev = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, c.nbytes)

        prg = cl.Program(ctx, """
            typedef struct s {
                int f0;
                int f1 __attribute__ ((packed));
            } s;
            __kernel void test(__global const s *a, __global int *b, __global
            int *c) {
                *b = a->f0;
                *c = a->f1;
            }
            """).build()

        cl.enqueue_copy(queue, a_dev, a)
        event = prg.test(queue, (1,), None, a_dev, b_dev, c_dev)
        event.wait()
        cl.enqueue_copy(queue, b, b_dev)
        print(b)
        cl.enqueue_copy(queue, c, c_dev)
        print(c)


if __name__ == '__main__':
    test6()

('\x01\x00\x00\x00\x02\x00\x00\x00', 8)
(array([0], dtype=int32), 4, 4)
(array([0], dtype=int32), 4, 4)
[1]
[2]
(array([[1, 2]], dtype=int32), 4, 8)
(array([0], dtype=int32), 4, 4)
(array([0], dtype=int32), 4, 4)
[1]
[2]


In [24]:
# Bytes in struct
# Like taking candy from a baby :o)  (note that writing to bytes is an oencl
# extension, which is why I am using ints as output).

import struct as s

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

for use_struct in (True, False):

    if use_struct:
        a = s.pack('=bb',1,2)
        print(a, len(a))
        a_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, len(a))
    else:
#            a = n.array([(1,2)], dtype=n.dtype('2i1', align=True))
        a = n.array([(1,2)], dtype=n.dtype('2i1'))
        print(a, a.itemsize, a.nbytes)
        a_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, a.nbytes)

    b = n.array([0], dtype='i4')
    print(b, b.itemsize, b.nbytes)
    b_dev = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, b.nbytes)

    c = n.array([0], dtype='i4')
    print(c, c.itemsize, c.nbytes)
    c_dev = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, c.nbytes)

    prg = cl.Program(ctx, """
        typedef struct s {
            char f0;
            char f1 __attribute__ ((packed));
        } s;
        __kernel void test(__global const s *a, __global int *b, __global
        int *c) {
            *b = a->f0;
            *c = a->f1;
        }
        """).build()

    cl.enqueue_copy(queue, a_dev, a)
    event = prg.test(queue, (1,), None, a_dev, b_dev, c_dev)
    event.wait()
    cl.enqueue_copy(queue, b, b_dev)
    print(b)
    cl.enqueue_copy(queue, c, c_dev)
    print(c)



('\x01\x02', 2)
(array([0], dtype=int32), 4, 4)
(array([0], dtype=int32), 4, 4)
[1]
[2]
(array([[1, 2]], dtype=int8), 1, 2)
(array([0], dtype=int32), 4, 4)
(array([0], dtype=int32), 4, 4)
[1]
[2]


In [28]:
# Trickier Alignment

# This shows the importance of numpy's "align" keyword.

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

for use_struct in (True, False):

    if use_struct:
        a = s.pack('=bi',1,2)
        print(a, len(a))
        a_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, len(a))
    else:
        a = n.array([(1,2)], dtype=n.dtype('i4', align=True))
        # this no longer works - without align=True we get the wrong value
        #a = n.array([(1,2)], dtype=n.dtype('i1i4'))
        print(a, a.itemsize, a.nbytes)
        a_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, a.nbytes)

    b = n.array([0], dtype='i4')
    print(b, b.itemsize, b.nbytes)
    b_dev = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, b.nbytes)

    c = n.array([0], dtype='i4')
    print(c, c.itemsize, c.nbytes)
    c_dev = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, c.nbytes)

    prg = cl.Program(ctx, """
        typedef struct s {
            char f0;
            int f1 __attribute__ ((packed));
        } s;
        __kernel void test(__global const s *a, __global int *b, __global
        int *c) {
            *b = a->f0;
            *c = a->f1;
        }
        """).build()

    cl.enqueue_copy(queue, a_dev, a)
    event = prg.test(queue, (1,), None, a_dev, b_dev, c_dev)
    event.wait()
    cl.enqueue_copy(queue, b, b_dev)
    print(b)
    cl.enqueue_copy(queue, c, c_dev)
    print(c)

('\x01\x02\x00\x00\x00', 5)
(array([0], dtype=int32), 4, 4)
(array([0], dtype=int32), 4, 4)
[1]
[2]
(array([[1, 2]], dtype=int32), 4, 8)
(array([0], dtype=int32), 4, 4)
(array([0], dtype=int32), 4, 4)
[1]
[33554432]


In [31]:
test3()
test4()
test5()

[4]
[129]
[1]
