In [1]:
import pyclesperanto as cle
import numpy as np

# Choose the compute device for the rest of the notebook: entry 1 among the
# devices of type "gpu". On the recorded run this resolved to the
# Apple M2 Max (OpenCL 1.2) shown in the cell output.
# NOTE(review): the index 1 is presumably machine-specific — confirm which
# device it maps to before running this notebook on another machine.
cle.select_device(1, "gpu")

(OpenCL) Apple M2 Max (OpenCL 1.2 )
	Vendor:                      Apple
	Driver Version:              1.2 1.0
	Device Type:                 GPU
	Compute Units:               30
	Global Memory Size:          21845 MB
	Local Memory Size:           0 MB
	Maximum Buffer Size:         4096 MB
	Max Clock Frequency:         1000 MHz
	Image Support:               Yes

In [5]:
# Build two reproducible random float32 matrices on the host and upload them
# to the device; they are the operands of the matrix product computed below.
MATRIX_SIZE = 10000  # side length of the square test matrices
SEED = 42            # fixed seed so Restart & Run All yields the same inputs
rng = np.random.default_rng(SEED)

# Sample float32 directly: np.random.rand(...) would first allocate a float64
# array (~800 MB at this size) and then copy it into a float32 one.
arr_a = cle.push(rng.random((MATRIX_SIZE, MATRIX_SIZE), dtype=np.float32))
arr_b = cle.push(rng.random((MATRIX_SIZE, MATRIX_SIZE), dtype=np.float32))

# Destination array for the product, allocated from arr_a's layout
# (valid here because both operands are square and equally sized).
arr_c = cle.create_like(arr_a)

In [6]:
# OpenCL source of a tiled matrix multiplication, dst = src0 x src1.
#
# Launch requirements (must be provided by the caller):
#   * a compile-time constant TILE_SIZE, a multiple of 4;
#   * a 2D work-group of exactly TILE_SIZE x TILE_SIZE work items;
#   * a global range that covers dst (it may be padded up to a multiple of
#     TILE_SIZE; padded work items load zeros and skip the final write).
source = """
// The inner product loop below is unrolled 4x and would index past the tile
// for any other TILE_SIZE, so reject such configurations at compile time.
#if (TILE_SIZE % 4) != 0
#error TILE_SIZE must be a multiple of 4
#endif

__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

__kernel void multiply_matrix_optimized(
    IMAGE_src0_TYPE  src0,
    IMAGE_src1_TYPE  src1,
    IMAGE_dst_TYPE   dst
)
{
  // Each work item produces the single output element dst(x, y).
  const int x = get_global_id(0);
  const int y = get_global_id(1);
  const int local_x = get_local_id(0);
  const int local_y = get_local_id(1);

  const int src0_width = GET_IMAGE_WIDTH(src0);
  const int src1_width = GET_IMAGE_WIDTH(src1);
  const int src0_height = GET_IMAGE_HEIGHT(src0);
  const int src1_height = GET_IMAGE_HEIGHT(src1);
  const int dst_width = GET_IMAGE_WIDTH(dst);
  const int dst_height = GET_IMAGE_HEIGHT(dst);

  // One tile of each operand, shared by the whole work-group.
  __local float tile_src0[TILE_SIZE][TILE_SIZE];
  __local float tile_src1[TILE_SIZE][TILE_SIZE];

  // Four independent partial sums for this ONE output element. Splitting the
  // accumulation breaks the dependency chain of the unrolled loop; the four
  // lanes are added together at the end.
  float4 sum = (float4)(0.0f);

  // Walk along the shared dimension (src0 width == src1 height) tile by tile.
  for (int tile = 0; tile < (src0_width + TILE_SIZE - 1) / TILE_SIZE; ++tile) {
      // Every work item loads one element of each tile; positions outside
      // the matrices are zero-filled so they do not contribute to the sum.
      const int tile_col = tile * TILE_SIZE + local_x;
      const int tile_row = tile * TILE_SIZE + local_y;

      if (tile_col < src0_width && y < src0_height) {
          tile_src0[local_y][local_x] = READ_IMAGE(src0, sampler, POS_src0_INSTANCE(tile_col, y, 0, 0)).x;
      } else {
          tile_src0[local_y][local_x] = 0;
      }

      if (tile_row < src1_height && x < src1_width) {
          tile_src1[local_y][local_x] = READ_IMAGE(src1, sampler, POS_src1_INSTANCE(x, tile_row, 0, 0)).x;
      } else {
          tile_src1[local_y][local_x] = 0;
      }

      // Required: both tiles must be completely loaded before any work item
      // starts reading elements written by its neighbours.
      barrier(CLK_LOCAL_MEM_FENCE);

      // Partial dot product over this tile, unrolled 4x (one lane per step).
      for (int i = 0; i < TILE_SIZE; i += 4) {
          sum.x += tile_src0[local_y][i]     * tile_src1[i][local_x];
          sum.y += tile_src0[local_y][i + 1] * tile_src1[i + 1][local_x];
          sum.z += tile_src0[local_y][i + 2] * tile_src1[i + 2][local_x];
          sum.w += tile_src0[local_y][i + 3] * tile_src1[i + 3][local_x];
      }

      // Also required: nobody may overwrite the tiles for the next iteration
      // while another work item is still reading the current ones.
      barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Combine the four partial sums into the final dot product.
  const float final_sum = sum.x + sum.y + sum.z + sum.w;

  // The global range is padded to a multiple of TILE_SIZE, so some work items
  // lie outside dst. They had to stay alive for the barriers above, but they
  // must not write.
  if (x < dst_width && y < dst_height) {
      WRITE_IMAGE(dst, POS_dst_INSTANCE(x, y, 0, 0), CONVERT_dst_PIXEL_TYPE(final_sum));
  }
}
"""

In [8]:
# Launch configuration for the tiled matrix-multiplication kernel.
tile_size = 32  # side of the square work-group / local-memory tile

# Kernel arguments, keyed by the parameter names used in the kernel signature.
param_dict = {'src0': arr_a, 'src1': arr_b, 'dst': arr_c}
# Compile-time constants handed to the kernel.
dict_dict = {'TILE_SIZE': tile_size}

# Round each extent UP to the next multiple of tile_size so the global range
# is evenly divisible by the work-group size. This must use floor division:
# with true division, (10000 + 31) / 32 * 32 evaluates to 10031, which is not
# a multiple of 32; with // it is 10016.
# NOTE(review): both extents come from arr_a, which is fine for the square
# inputs used here — revisit for non-square operands.
global_range = (
    ((arr_a.shape[0] + tile_size - 1) // tile_size) * tile_size,
    ((arr_a.shape[1] + tile_size - 1) // tile_size) * tile_size,
)
local_range = (tile_size, tile_size)

In [9]:
# Compile the kernel string and launch it with an explicit work-group size.
# NOTE(review): the recorded run failed with "unexpected keyword argument
# 'local_size'", i.e. the installed pyclesperanto's execute() does not take
# that argument. Presumably a release whose execute() accepts local_size is
# required — confirm; the kernel's local-memory tiles depend on it.
cle.execute(
    kernel_name="multiply_matrix_optimized",
    kernel_source=source,
    parameters=param_dict,
    constants=dict_dict,
    global_size=global_range,
    local_size=local_range,
)

TypeError: execute() got an unexpected keyword argument 'local_size'. Did you mean 'global_size'?