This notebook assumes that GPU support is available and that pyopencl is installed and set up correctly!

In [1]:
from time import perf_counter, time

import matplotlib.pyplot as plt
import numpy as np
import pyopencl
import pyopencl.array  # explicit: pyopencl.array.to_device / empty are used below

import matmul
%matplotlib inline

In [2]:
# Instantiate the class exported by the `matmul` module; later cells use its
# `queue`, `ctx` and `matmul` members.
# NOTE(review): presumably this is the Futhark-generated PyOpenCL wrapper for
# matmul.fut -- confirm.
o = matmul.matmul()

In [3]:
# Problem size: A is (n, m) and B is (m, p), so the product C is (n, p).
n = 1024
m = 512
p = 1024

# Seeded generator so that every Restart & Run All multiplies the same matrices.
SEED = 0
VALUE_BOUND = 1000  # entries are drawn from [0, VALUE_BOUND)
rng = np.random.default_rng(SEED)

# Each entry of C is a sum of m products, each at most (VALUE_BOUND - 1)**2;
# make sure that sum cannot wrap around in int32.
assert m * (VALUE_BOUND - 1) ** 2 <= np.iinfo(np.int32).max, \
    "int32 accumulation could overflow for this m / VALUE_BOUND"

A_host = rng.integers(0, VALUE_BOUND, size=(n, m), dtype=np.int32)
B_host = rng.integers(0, VALUE_BOUND, size=(m, p), dtype=np.int32)

print(o.queue.device)

# Upload both operands and allocate the (n, p) int32 result array on the device.
A_device = pyopencl.array.to_device(o.queue, A_host)
B_device = pyopencl.array.to_device(o.queue, B_host)
C_device = pyopencl.array.empty(o.queue, (n, p), np.int32)

# finish() blocks until everything queued so far has completed, so a separate
# barrier before it is not needed.
o.queue.finish()

print(A_device.shape, A_device.dtype)

<pyopencl.Device 'NVIDIA GeForce GTX 1650 Ti' on 'NVIDIA CUDA' at 0x263a920>
(1024, 512) int32


In [4]:
# Run the multiplication through the `matmul` entry point of `o`, then copy
# the device result back to the host for inspection.
C_device2 = o.matmul(A_device, B_device)
C_host2 = C_device2.get()
print(f"{C_host2.shape} {C_host2.dtype}")

(1024, 1024) int32


In [5]:
# Reference product computed by NumPy on the host.
C_np = np.matmul(A_host, B_host)
print(C_np.shape, C_np.dtype)

# Print the comparison as before, but also fail the cell on a mismatch so a
# wrong device result cannot go unnoticed during Run All.
reference_matches = np.array_equal(C_host2, C_np)
print(reference_matches)
assert reference_matches, "o.matmul result differs from the NumPy reference"

(1024, 1024) int32
True


In [6]:
# Reference:
# https://github.com/stefanv/PyOpenCL/blob/master/examples/matrix-multiply.py
with open('matmul.cl', 'r') as cl_file:
    program_opencl = cl_file.read()

# Work-group edge length; the kernel is launched with
# (block_size, block_size) work-groups.
block_size = 16

# The launch covers an (n, p) global range with (block_size, block_size)
# work-groups, so n and p must divide evenly.
# NOTE(review): the kernel presumably also needs m to be a multiple of
# block_size -- confirm against matmul.cl.
assert all(dim % block_size == 0 for dim in (n, m, p)), (
    f"n={n}, m={m}, p={p} must all be multiples of block_size={block_size}")

# Values substituted into the %-style placeholders of the kernel source:
# width/height of A (shape (n, m)) and width of B (shape (m, p)).
kernel_params = {
    "block_size": block_size,
    "w_a": m,
    "h_a": n,
    "w_b": p,
}

build_options = "-cl-mad-enable -cl-fast-relaxed-math"
prg = pyopencl.Program(o.ctx, program_opencl % kernel_params).build(options=build_options)
kernel = prg.matrixMul

In [7]:
# Launch the hand-written kernel once and wait for it to complete.
# NOTE(review): the global size is C_device.shape = (n, p); this coincides with
# the reversed shape only because n == p here -- confirm which axis matmul.cl
# maps to get_global_id(0) before using non-square outputs.
event = kernel(o.queue, C_device.shape, (block_size, block_size),
               C_device.data, A_device.data, B_device.data)
event.wait()

C_host = C_device.get()
print(C_host.shape, C_host.dtype)

# Print the comparison as before, but also stop the notebook on a mismatch so
# the benchmark below never times a kernel that computes the wrong result.
kernels_agree = np.array_equal(C_host, C_host2)
print(kernels_agree)
assert kernels_agree, "OpenCL kernel result differs from the o.matmul result"

(1024, 1024) int32
True


In [8]:
# Host-side benchmark of the hand-written OpenCL kernel.
# perf_counter() (imported in the first cell) is used instead of time(): it is
# monotonic and offers the highest available timer resolution, which is what
# short benchmark intervals need.

# The launch arguments never change, so build them once instead of on every
# iteration.
launch_args = (o.queue, C_device.shape, (block_size, block_size),
               C_device.data, A_device.data, B_device.data)

# warmup ----------------------------------------------------------------------
for _rep in range(5):
    event = kernel(*launch_args)
    event.wait()

o.queue.finish()

# actual benchmark ------------------------------------------------------------
count = 1000
t1 = perf_counter()

for _rep in range(count):
    event = kernel(*launch_args)

# The loop above only enqueues the launches; finish() waits for all of them.
o.queue.finish()

gpu_time = (perf_counter() - t1) / count

# transfer device -> host -----------------------------------------------------
t1 = perf_counter()
C_device.get()  # result is discarded; only the transfer is being timed
pull_time = perf_counter() - t1

print(f"opencl time compute (host) {(gpu_time * 1000):.3f} ms")
print(f"opencl time download (host) {(pull_time * 1000):.3f} ms")

opencl time compute (host) 3.014 ms
opencl time download (host) 1.641 ms


In [9]:

# Host-side benchmark of the `o.matmul` entry point, structured like the
# OpenCL benchmark so the two timings are comparable.
# perf_counter() (imported in the first cell) is used instead of time(): it is
# monotonic and offers the highest available timer resolution.

# warmup ----------------------------------------------------------------------
for _rep in range(5):
    C_device2 = o.matmul(A_device, B_device)

o.queue.finish()

# actual benchmark ------------------------------------------------------------
count = 1000
t1 = perf_counter()

for _rep in range(count):
    C_device2 = o.matmul(A_device, B_device)

# Wait for any work still queued before stopping the clock.
o.queue.finish()

gpu_time = (perf_counter() - t1) / count

# transfer device -> host -----------------------------------------------------
t1 = perf_counter()
C_device2.get()  # result is discarded; only the transfer is being timed
pull_time = perf_counter() - t1

print(f"futhark time compute (host) {(gpu_time * 1000):.3f} ms")
print(f"futhark time download (host) {(pull_time * 1000):.3f} ms")

futhark time compute (host) 1.499 ms
futhark time download (host) 1.617 ms


Compare with: `futhark bench matmul.fut --backend=opencl`