In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, print_function
import pyopencl as cl
import numpy as np

## Seleccionar plataforma y dispositivo a usar

In [2]:
ID_P = 0
ID_D = 2
platforms = cl.get_platforms()
devices = [platforms[ID_P].get_devices()[ID_D]]
list(devices)

[<pyopencl.Device 'GeForce RTX 2080' on 'NVIDIA CUDA' at 0x56535e7d9240>]

## Crear contexto y mostrar los dispositivos asignados 

In [3]:
context = cl.Context(devices=devices,properties=[(cl.context_properties.PLATFORM, platforms[ID_P])])

In [4]:
context.get_info(cl.context_info.DEVICES)

[<pyopencl.Device 'GeForce RTX 2080' on 'NVIDIA CUDA' at 0x56535e7d9240>]

## Creación de una cola de comandos

In [5]:
queue = cl.CommandQueue(context, properties=None)
queue.get_info(cl.command_queue_info.CONTEXT)

<pyopencl.Context at 0x56535e84cea0 on <pyopencl.Device 'GeForce RTX 2080' on 'NVIDIA CUDA' at 0x56535e7d9240>>

## Código fuente



In [6]:
kernel='''
__kernel void vector_sum(__global  uchar *a, __global uchar *b, __global uchar *output){
    
    uint index = get_global_id(0);
    output[index]=a[index]+b[index];

}
'''

## Compilación del programa
https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clCreateProgramWithSource.html<br>
https://documen.tician.de/pyopencl/runtime_program.html#program

In [22]:
# Crear el programa
prg = cl.Program(context, kernel)
# compilar
exe = prg.build()

## Preparación de la memoria

In [24]:

host_vector_a = np.random.randint(128, size=10*(2**20), dtype=np.uint8)
host_vector_b = np.random.randint(128, size=10*(2**20), dtype=np.uint8)

print("Host Vector A \nTotal registros: ",len(host_vector_a),'\nContenido resumen: ',host_vector_a)
print("\n")
print("Host Vector B \nTotal registros: ",len(host_vector_b),'\nContenido resumen: ',host_vector_b)

Host Vector A 
Total registros:  10485760 
Contenido resumen:  [ 11  91 108 ...  16  87  93]


Host Vector B 
Total registros:  10485760 
Contenido resumen:  [ 94  36   1 ... 126  25  72]


In [30]:
device_vector_a = cl.Buffer(context, cl.mem_flags.READ_ONLY, size=host_vector_a.nbytes)
device_vector_b = cl.Buffer(context, cl.mem_flags.READ_ONLY, size=host_vector_b.nbytes)
device_output = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=host_vector_b.nbytes)
type(device_output)

pyopencl._cl.Buffer

In [26]:
cl.enqueue_copy(queue, device_vector_a, host_vector_a, wait_for=None)
cl.enqueue_copy(queue, device_vector_b, host_vector_b, wait_for=None)

<pyopencl._cl.NannyEvent at 0x7f2c88688e90>

## Ejecución del kernel

https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clEnqueueNDRangeKernel.html<br>
https://documen.tician.de/pyopencl/runtime_program.html#pyopencl.Kernel.__call__

In [31]:
exe.vector_sum(queue, host_vector_a.shape, None, device_vector_a, device_vector_b, device_output)

<pyopencl._cl.Event at 0x7f2dac57be30>

## Imprimir resultados

In [32]:
host_output = np.zeros_like(host_vector_a)
cl.enqueue_copy(queue, host_output, device_output)

<pyopencl._cl.NannyEvent at 0x7f2c88688830>

In [33]:
values=np.equal(host_vector_a+host_vector_b,host_output)

In [34]:
print(host_output.shape[0])
print(values.sum())

10485760
610
