# Matrix multiplication
clEsperanto brings operations for multiplying images and matrices, which are also available in numpy and cupy. Let's see how numpy and cupy perform in comparison with our OpenCL-based operations. When doing similar comparisons with ImageJ, we saw larger performance benefits from GPU-accelerating 3D operations than 2D operations; see https://clij.github.io/clij-benchmarking/benchmarking_operations_jmh

**Note:** Benchmarking results vary heavily depending on image size, kernel size, the operations and parameters used, and the hardware. Adapt this notebook to your use case and benchmark on your target hardware. If you have different scenarios or use cases, you are very welcome to submit your notebook as a pull request!

In [1]:
import pyclesperanto_prototype as cle
import time
import numpy as np
import cupy as cp

# To measure kernel execution duration properly, we need to set this flag. It will slow down execution of workflows a bit, though.
cle.set_wait_for_kernel_finish(True)

# Select a GPU with the following in its name. This will fall back to any other GPU if none with this name is found.
cle.select_device('RTX')

<NVIDIA GeForce RTX 3050 Ti Laptop GPU on Platform: NVIDIA CUDA (1 refs)>

## Matrix multiplication

In [2]:
# Test data: two random matrices whose inner dimensions (512) match,
# so that their product has shape (1024, 1024).
test_matrix1 = np.random.random(size=(1024, 512))
test_matrix2 = np.random.random(size=(512, 1024))

In [5]:
# multiply with clesperanto
# The destination is created once, outside the loop, and handed to every call.
result_matrix = cle.create([1024, 1024])

# Push the inputs before the timed loop so that the push is not part of the measurement.
test_matrix1_gpu = cle.push_zyx(test_matrix1)
test_matrix2_gpu = cle.push_zyx(test_matrix2)

for i in range(0, 10):
    # time.perf_counter() is the clock intended for measuring short durations;
    # time.time() is a wall clock that can be coarse and may be adjusted while running.
    start_time = time.perf_counter()
    cle.multiply_matrix(test_matrix1_gpu, test_matrix2_gpu, result_matrix)
    print("clEsperanto matrix multiplication duration: " + str(time.perf_counter() - start_time))


clEsperanto matrix multiplication duration: 0.028005361557006836
clEsperanto matrix multiplication duration: 0.010002374649047852
clEsperanto matrix multiplication duration: 0.011002540588378906
clEsperanto matrix multiplication duration: 0.011009454727172852
clEsperanto matrix multiplication duration: 0.010985374450683594
clEsperanto matrix multiplication duration: 0.01000213623046875
clEsperanto matrix multiplication duration: 0.011009693145751953
clEsperanto matrix multiplication duration: 0.009986162185668945
clEsperanto matrix multiplication duration: 0.010003328323364258
clEsperanto matrix multiplication duration: 0.010001420974731445


In [3]:
# multiply with numpy
# Starting with None lets the first call allocate the output; the array returned
# by that call is then passed back in as `out` and reused by the later iterations.
result_matrix = None

for i in range(0, 10):
    # time.perf_counter() is the clock intended for measuring short durations;
    # time.time() is a wall clock that can be coarse and may be adjusted while running.
    start_time = time.perf_counter()
    result_matrix = np.matmul(test_matrix1, test_matrix2, out=result_matrix)
    print("Numpy matrix multiplication duration: " + str(time.perf_counter() - start_time))

# sanity check of the output dimensions
print(result_matrix.shape)

Numpy matrix multiplication duration: 0.012001752853393555
Numpy matrix multiplication duration: 0.007002592086791992
Numpy matrix multiplication duration: 0.011000394821166992
Numpy matrix multiplication duration: 0.0050013065338134766
Numpy matrix multiplication duration: 0.006001472473144531
Numpy matrix multiplication duration: 0.0050013065338134766
Numpy matrix multiplication duration: 0.006000995635986328
Numpy matrix multiplication duration: 0.00500178337097168
Numpy matrix multiplication duration: 0.005000591278076172
Numpy matrix multiplication duration: 0.006001710891723633
(1024, 1024)


In [4]:
# multiply with cupy
# Starting with None lets the first call allocate the output; the array returned
# by that call is then passed back in as `out` and reused by the later iterations.
result_matrix = None

# Convert the inputs to cupy arrays before the timed loop so the conversion is not measured.
# NOTE(review): np.random.random produces float64 data and it is passed on unchanged here;
# confirm which precision the clesperanto cell works with before comparing numbers directly.
cu_test_matrix1 = cp.asarray(test_matrix1)
cu_test_matrix2 = cp.asarray(test_matrix2)

for i in range(0, 10):
    # time.perf_counter() is the clock intended for measuring short durations;
    # time.time() is a wall clock that can be coarse and may be adjusted while running.
    start_time = time.perf_counter()
    result_matrix = cp.matmul(cu_test_matrix1, cu_test_matrix2, out=result_matrix)
    cp.cuda.stream.get_current_stream().synchronize() # we need to wait here to measure time properly
    print("cupy matrix multiplication duration: " + str(time.perf_counter() - start_time))

# sanity check of the output dimensions
print(result_matrix.shape)

cupy matrix multiplication duration: 0.6461458206176758
cupy matrix multiplication duration: 0.0200042724609375
cupy matrix multiplication duration: 0.020011425018310547
cupy matrix multiplication duration: 0.023001909255981445
cupy matrix multiplication duration: 0.02199578285217285
cupy matrix multiplication duration: 0.020005226135253906
cupy matrix multiplication duration: 0.0200045108795166
cupy matrix multiplication duration: 0.0200042724609375
cupy matrix multiplication duration: 0.020003795623779297
cupy matrix multiplication duration: 0.01900482177734375
(1024, 1024)


## Elementwise multiplication

In [6]:
# test data
import numpy as np

# Two independent random 3D stacks of identical shape, drawn one after the other.
test_image1, test_image2 = (
    np.random.random(size=(100, 512, 512)) for _ in range(2)
)

In [7]:
# multiply with numpy
# No `out` argument is passed below, so every iteration produces a new result array.
result_image = None

for i in range(0, 10):
    # time.perf_counter() is the clock intended for measuring short durations;
    # time.time() is a wall clock that can be coarse and may be adjusted while running.
    start_time = time.perf_counter()
    result_image = np.multiply(test_image1, test_image2)
    print("Numpy elementwise multiplication duration: " + str(time.perf_counter() - start_time))
    

Numpy elementwise multiplication duration: 0.062011003494262695
Numpy elementwise multiplication duration: 0.07701683044433594
Numpy elementwise multiplication duration: 0.07101678848266602
Numpy elementwise multiplication duration: 0.07201623916625977
Numpy elementwise multiplication duration: 0.0800178050994873
Numpy elementwise multiplication duration: 0.06601476669311523
Numpy elementwise multiplication duration: 0.0690155029296875
Numpy elementwise multiplication duration: 0.07401680946350098
Numpy elementwise multiplication duration: 0.07201623916625977
Numpy elementwise multiplication duration: 0.07001614570617676


In [8]:
# multiply with pyclesperanto
# The value returned by each call is passed back in as the third argument of the
# next call; only the first iteration receives None.
result_image = None

# Push the inputs before the timed loop so that the push is not part of the measurement.
test_image1_gpu = cle.push_zyx(test_image1)
test_image2_gpu = cle.push_zyx(test_image2)

for i in range(0, 10):
    # time.perf_counter() is the clock intended for measuring short durations;
    # time.time() is a wall clock that can be coarse and may be adjusted while running.
    start_time = time.perf_counter()
    result_image = cle.multiply_images(test_image1_gpu, test_image2_gpu, result_image)
    print("clEsperanto elementwise multiplication duration: " + str(time.perf_counter() - start_time))

clEsperanto elementwise multiplication duration: 0.03593897819519043
clEsperanto elementwise multiplication duration: 0.003011941909790039
clEsperanto elementwise multiplication duration: 0.003000497817993164
clEsperanto elementwise multiplication duration: 0.003000497817993164
clEsperanto elementwise multiplication duration: 0.003000974655151367
clEsperanto elementwise multiplication duration: 0.003000497817993164
clEsperanto elementwise multiplication duration: 0.003000497817993164
clEsperanto elementwise multiplication duration: 0.003000974655151367
clEsperanto elementwise multiplication duration: 0.002990245819091797
clEsperanto elementwise multiplication duration: 0.003000974655151367


In [9]:
# multiply with pyclesperanto while _not_ reusing memory
# No destination is passed to multiply_images below, so the measured time
# includes whatever the call does to provide its own output.
result_image = None

# Push the inputs before the timed loop so that the push is not part of the measurement.
test_image1_gpu = cle.push_zyx(test_image1)
test_image2_gpu = cle.push_zyx(test_image2)

for i in range(0, 10):
    # time.perf_counter() is the clock intended for measuring short durations;
    # time.time() is a wall clock that can be coarse and may be adjusted while running.
    start_time = time.perf_counter()
    result_image = cle.multiply_images(test_image1_gpu, test_image2_gpu)
    print("clEsperanto elementwise multiplication duration (+ memory allocation): " + str(time.perf_counter() - start_time))

clEsperanto elementwise multiplication duration (+ memory allocation): 0.010002374649047852
clEsperanto elementwise multiplication duration (+ memory allocation): 0.03779125213623047
clEsperanto elementwise multiplication duration (+ memory allocation): 0.013278961181640625
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010860681533813477
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010389566421508789
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010413408279418945
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010461091995239258
clEsperanto elementwise multiplication duration (+ memory allocation): 0.01034092903137207
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010332584381103516
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010344982147216797
