# Matrix multiplication
clEsperanto brings operations for multiplying images and matrices which are also available with numpy. Let's see how numpy performs in comparison with our OpenCL stuff. When doing similar comparisons with ImageJ, we saw more performance benefits when GPU-accelerating 3D operations compared to 2D operations. https://clij.github.io/clij-benchmarking/benchmarking_operations_jmh

**Note:** benchmarking results vary heavily depending on image size, kernel size, used operations, parameters and used hardware. Use this notebook to adapt it to your use-case scenario and benchmark on your target hardware. If you have different scenarios or use-cases, you are very welcome to submit your notebook as pull-request!

In [1]:
import pyclesperanto_prototype as cle
import time
import numpy as np
import cupy as cp

# to measure kernel execution duration properly, we need to set this flag. It will slow down exection of workflows a bit though
cle.set_wait_for_kernel_finish(True)

# selet a GPU with the following in the name. This will fallback to any other GPU if none with this name is found
cle.select_device('RTX')

<NVIDIA GeForce RTX 3050 Ti Laptop GPU on Platform: NVIDIA CUDA (1 refs)>

## Matrix multiplication

In [2]:
# test data
test_matrix1 = np.random.random([1024, 512])
test_matrix2 = np.random.random([512, 1024])

In [3]:
# multiply with clesperanto
result_matrix = cle.create([1024, 1024])

test_matrix1_gpu = cle.push_zyx(test_matrix1)
test_matrix2_gpu = cle.push_zyx(test_matrix2)

for i in range(0, 10):
    start_time = time.time()
    cle.multiply_matrix(test_matrix1_gpu, test_matrix2_gpu, result_matrix)
    print("clEsperanto matrix multiplication duration: " + str(time.time() - start_time))


clEsperanto matrix multiplication duration: 0.02301478385925293
clEsperanto matrix multiplication duration: 0.01399540901184082
clEsperanto matrix multiplication duration: 0.014003276824951172
clEsperanto matrix multiplication duration: 0.01000213623046875
clEsperanto matrix multiplication duration: 0.009001731872558594
clEsperanto matrix multiplication duration: 0.010002374649047852
clEsperanto matrix multiplication duration: 0.011009454727172852
clEsperanto matrix multiplication duration: 0.009995222091674805
clEsperanto matrix multiplication duration: 0.010002374649047852
clEsperanto matrix multiplication duration: 0.010009527206420898


In [4]:
# multiply with numpy
result_matrix = None
cu_test_matrix1 = cp.asarray(test_matrix1)
cu_test_matrix2 = cp.asarray(test_matrix2)

for i in range(0, 10):
    start_time = time.time()
    result_matrix = cp.matmul(cu_test_matrix1, cu_test_matrix2, out=result_matrix)
    cp.cuda.stream.get_current_stream().synchronize() # we need to wait here to measure time properly
    print("cupy matrix multiplication duration: " + str(time.time() - start_time))

print(result_matrix.shape)

cupy matrix multiplication duration: 0.4006175994873047
cupy matrix multiplication duration: 0.018997907638549805
cupy matrix multiplication duration: 0.01900482177734375
cupy matrix multiplication duration: 0.01900339126586914
cupy matrix multiplication duration: 0.019004106521606445
cupy matrix multiplication duration: 0.01900458335876465
cupy matrix multiplication duration: 0.019004106521606445
cupy matrix multiplication duration: 0.019004106521606445
cupy matrix multiplication duration: 0.0200045108795166
cupy matrix multiplication duration: 0.019004344940185547
(1024, 1024)


In [5]:
# multiply with numpy
result_matrix = None

for i in range(0, 10):
    start_time = time.time()
    result_matrix = np.matmul(test_matrix1, test_matrix2, out=result_matrix)
    print("Numpy matrix multiplication duration: " + str(time.time() - start_time))

print(result_matrix.shape)

Numpy matrix multiplication duration: 0.011002540588378906
Numpy matrix multiplication duration: 0.007002115249633789
Numpy matrix multiplication duration: 0.006000995635986328
Numpy matrix multiplication duration: 0.011002540588378906
Numpy matrix multiplication duration: 0.00400090217590332
Numpy matrix multiplication duration: 0.0040013790130615234
Numpy matrix multiplication duration: 0.004000663757324219
Numpy matrix multiplication duration: 0.006001472473144531
Numpy matrix multiplication duration: 0.0110015869140625
Numpy matrix multiplication duration: 0.0042684078216552734
(1024, 1024)


## Elementwise multiplication

In [6]:
# test data
import numpy as np

test_image1 = np.random.random([100, 512, 512])
test_image2 = np.random.random([100, 512, 512])

In [7]:
# multiply with numpy
result_image = None

for i in range(0, 10):
    start_time = time.time()
    result_image = np.multiply(test_image1, test_image2)
    print("Numpy elementwise multiplication duration: " + str(time.time() - start_time))
    

Numpy elementwise multiplication duration: 0.06801581382751465
Numpy elementwise multiplication duration: 0.08701801300048828
Numpy elementwise multiplication duration: 0.089019775390625
Numpy elementwise multiplication duration: 0.07501673698425293
Numpy elementwise multiplication duration: 0.08201789855957031
Numpy elementwise multiplication duration: 0.08601880073547363
Numpy elementwise multiplication duration: 0.08401870727539062
Numpy elementwise multiplication duration: 0.07001566886901855
Numpy elementwise multiplication duration: 0.08402299880981445
Numpy elementwise multiplication duration: 0.07301664352416992


In [8]:
# multiply with pyclesperanto
result_image = None

test_image1_gpu = cle.push_zyx(test_image1)
test_image2_gpu = cle.push_zyx(test_image2)

for i in range(0, 10):
    start_time = time.time()
    result_image = cle.multiply_images(test_image1_gpu, test_image2_gpu, result_image)
    print("clEsperanto elementwise multiplication duration: " + str(time.time() - start_time))

clEsperanto elementwise multiplication duration: 0.01300358772277832
clEsperanto elementwise multiplication duration: 0.0030014514923095703
clEsperanto elementwise multiplication duration: 0.0030007362365722656
clEsperanto elementwise multiplication duration: 0.003000497817993164
clEsperanto elementwise multiplication duration: 0.002000570297241211
clEsperanto elementwise multiplication duration: 0.004003763198852539
clEsperanto elementwise multiplication duration: 0.00499272346496582
clEsperanto elementwise multiplication duration: 0.003000497817993164
clEsperanto elementwise multiplication duration: 0.0030007362365722656
clEsperanto elementwise multiplication duration: 0.003000974655151367


In [9]:
# multiply with pyclesperanto while _not_ reusing memory
result_image = None

test_image1_gpu = cle.push_zyx(test_image1)
test_image2_gpu = cle.push_zyx(test_image2)

for i in range(0, 10):
    start_time = time.time()
    result_image = cle.multiply_images(test_image1_gpu, test_image2_gpu)
    print("clEsperanto elementwise multiplication duration (+ memory allocation): " + str(time.time() - start_time))

clEsperanto elementwise multiplication duration (+ memory allocation): 0.010004997253417969
clEsperanto elementwise multiplication duration (+ memory allocation): 0.011002779006958008
clEsperanto elementwise multiplication duration (+ memory allocation): 0.01453709602355957
clEsperanto elementwise multiplication duration (+ memory allocation): 0.011003971099853516
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010394573211669922
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010910749435424805
clEsperanto elementwise multiplication duration (+ memory allocation): 0.011003494262695312
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010691165924072266
clEsperanto elementwise multiplication duration (+ memory allocation): 0.01069784164428711
clEsperanto elementwise multiplication duration (+ memory allocation): 0.010003089904785156
