In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, print_function
import pyopencl as cl
import numpy as np

## Seleccionar plataforma y dispositivo a usar

In [9]:
# Index of the OpenCL platform to use and of the device inside that platform.
ID_P, ID_D = 1, 0

# Enumerate every OpenCL platform visible to the runtime and display them.
platforms = cl.get_platforms()
list(platforms)

[<pyopencl.Platform 'NVIDIA CUDA' at 0x556573a956f0>,
 <pyopencl.Platform 'Portable Computing Language' at 0x7f65a8600020>]

In [8]:
# Take device ID_D of platform ID_P and keep it wrapped in a one-element list,
# which is the form passed to cl.Context(devices=...) further down.
platform_devices = platforms[ID_P].get_devices()
devices = [platform_devices[ID_D]]
list(devices)

[<pyopencl.Device 'pthread-AMD Ryzen Threadripper 1950X 16-Core Processor' on 'Portable Computing Language' at 0x556573a8dd80>]

## Crear contexto y mostrar los dispositivos asignados 

In [10]:
# Create a context limited to the selected device and pinned explicitly to its platform.
platform_props = [(cl.context_properties.PLATFORM, platforms[ID_P])]
context = cl.Context(devices=devices, properties=platform_props)

# Show which devices the context ended up with.
context.get_info(cl.context_info.DEVICES)

[<pyopencl.Device 'pthread-AMD Ryzen Threadripper 1950X 16-Core Processor' on 'Portable Computing Language' at 0x556573a8dd80>]

## Creación de una cola de comandos

In [11]:
# The queue is created in out-of-order mode, so the order in which commands are
# enqueued does not by itself define execution order; dependencies between
# commands have to be expressed with events (wait_for=...).
queue_flags = cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
queue = cl.CommandQueue(context, properties=queue_flags)

# Show the context the queue is attached to.
queue.get_info(cl.command_queue_info.CONTEXT)

<pyopencl.Context at 0x556573a8b600 on <pyopencl.Device 'pthread-AMD Ryzen Threadripper 1950X 16-Core Processor' on 'Portable Computing Language' at 0x556573a8dd80>>

## Código fuente



In [12]:
# OpenCL C source for the two kernels used below. Each work-item processes the
# single element selected by its global id in dimension 0; there is no bounds
# check, so the global size must equal the number of elements in the buffers.
#
# All buffers hold uchar (8-bit) values. The product a[i]*b[i] is computed in
# int and narrowed to uchar when stored, i.e. kept modulo 256. Running
# vector_mul followed by vector_subtract on the same output therefore leaves
# every element at 0, which is the value the host-side loops check for.
kernel='''
// output[i] = a[i] * b[i]   (stored modulo 256)
__kernel void vector_mul(__global const uchar *a, __global const uchar *b, __global uchar *output){

    size_t index = get_global_id(0);
    output[index] = a[index]*b[index];
}

// output[i] = output[i] - a[i] * b[i]   (stored modulo 256)
__kernel void vector_subtract(__global const uchar *a, __global const uchar *b, __global uchar *output){

    size_t index = get_global_id(0);
    output[index] -= a[index]*b[index];
}
'''

## Compilación del programa

In [13]:
# Wrap the OpenCL C source in a program object bound to the context.
prg = cl.Program(context, kernel)

# Compile it without extra compiler options; the kernels are later called as
# attributes of `exe` (exe.vector_mul, exe.vector_subtract).
build_options = []
exe = prg.build(options=build_options)

## Inicialización de memoria en el host

In [19]:
# First input operand on the host: 10 * 2**20 random bytes drawn from [0, 127)
# (the upper bound of randint is exclusive).
host_vector_a = np.random.randint(low=0, high=127, size=10 * 2**20, dtype=np.uint8)
host_vector_a

array([ 93,  85,  29, ...,  29,  19, 113], dtype=uint8)

In [18]:
# Second input operand on the host: 10 * 2**20 random bytes drawn from [0, 127)
# (the upper bound of randint is exclusive).
host_vector_b = np.random.randint(low=0, high=127, size=10 * 2**20, dtype=np.uint8)
host_vector_b

array([113,  45, 108, ..., 117,   0,   2], dtype=uint8)

In [20]:
# Host-side destination for the result read back from the device: zero-filled,
# with the same shape and dtype as the first input vector.
host_output = np.zeros(host_vector_a.shape, dtype=host_vector_a.dtype)
host_output

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

## Inicialización de memoria en el dispositivo

In [21]:
# Allocate uninitialised device buffers for the two inputs and the result.
# All three are created read/write; the output buffer is sized from
# host_vector_b, which has the same byte size as the other host arrays.
mem_flags = cl.mem_flags
device_vector_a = cl.Buffer(context, mem_flags.READ_WRITE, size=host_vector_a.nbytes)
device_vector_b = cl.Buffer(context, mem_flags.READ_WRITE, size=host_vector_b.nbytes)
device_output = cl.Buffer(context, mem_flags.READ_WRITE, size=host_vector_b.nbytes)
device_output

<pyopencl._cl.Buffer at 0x7f659ab6b4d0>

## Copiar datos del host al dispositivo

In [22]:
# Upload both input vectors to the device.
#
# The copies are enqueued non-blocking so they can overlap, but the queue is
# out-of-order and the kernel launches in later cells do not list these copies
# in their wait_for. Without an explicit wait here, nothing would order those
# kernels after the uploads, so they could read device buffers that have not
# been filled yet. Keep both events and wait for them before the cell ends.
upload_a_event = cl.enqueue_copy(queue, device_vector_a, host_vector_a, is_blocking=False)
upload_b_event = cl.enqueue_copy(queue, device_vector_b, host_vector_b, is_blocking=False)
cl.wait_for_events([upload_a_event, upload_b_event])

# Display the event of the last upload as the cell result.
upload_b_event

<pyopencl._cl.NannyEvent at 0x7f659ab6b1d0>

## Ejecución de los kernels

In [25]:
# With wait_for=None on the read-back (intentionally unsynchronised demo).
#
# Each iteration multiplies into device_output, subtracts the same product
# again, and copies device_output back to the host. Only the subtract kernel
# declares a dependency (on the multiply). The read-back is enqueued with
# wait_for=None on a queue created in out-of-order mode, so it is not ordered
# after either kernel, and it is non-blocking, so host_output.sum() is
# evaluated without waiting for the copy to finish. The printed sums are
# therefore not guaranteed to be the final result of the two kernels.
for i in range(40):
    # NOTE: `add_event` is the event of the multiply kernel despite its name.
    add_event = exe.vector_mul(queue, host_vector_a.shape, None, device_vector_a, device_vector_b, device_output, wait_for=None)
    # The subtract kernel is the only command here that waits on another one.
    sub_event = exe.vector_subtract(queue, host_vector_a.shape, None, device_vector_a, device_vector_b, device_output, wait_for=[add_event])
    # Non-blocking device-to-host copy with no dependency on the kernels above.
    cl.enqueue_copy(queue, host_output, device_output, is_blocking=False, wait_for=None)
    # Reads host_output immediately, without waiting for the copy's event.
    print(host_output.sum())

198761895
216449771
198682335
228634861
253745002
223196771
210322693
276210453
291280024
154588534
223803003
277432065
212980290
325992873
275889409
233820391
66796535
238652164
146819926
247846186
180724846
190682054
255475189
249713270
232952570
220864417
209844467
0
243547793
208499020
231080777
335875794
196807937
181055301
255596509
262075248
179787549
222045477
265053171
320464013


In [27]:
# With wait_for=[sub_event] on the read-back (fully synchronised version).
#
# Dependency chain per iteration: multiply -> subtract -> read-back -> host read.
# The read-back stays non-blocking, so its event must be waited on before the
# host touches host_output; otherwise the sum could be computed while the copy
# is still in flight. Waiting here also guarantees that the next iteration's
# multiply (enqueued with wait_for=None on an out-of-order queue) cannot
# overwrite device_output while the previous read-back is still reading it.
#
# Expected output: 0 on every iteration, because the uchar product is
# subtracted from itself modulo 256.
for i in range(40):
    # NOTE: `add_event` is the event of the multiply kernel; the name is kept
    # unchanged so that any code relying on it keeps working.
    add_event = exe.vector_mul(queue, host_vector_a.shape, None, device_vector_a, device_vector_b, device_output, wait_for=None)
    sub_event = exe.vector_subtract(queue, host_vector_a.shape, None, device_vector_a, device_vector_b, device_output, wait_for=[add_event])
    copy_event = cl.enqueue_copy(queue, host_output, device_output, is_blocking=False, wait_for=[sub_event])
    # Block until the device-to-host copy has completed before reading the result.
    copy_event.wait()
    print(host_output.sum())

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
