From 3ae94b82abeea419a4ac3da31d55f7f193ec0c1e Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Fri, 26 Apr 2024 10:23:29 +0100 Subject: [PATCH] Example changes for Intel PVC pipeline --- examples/CMakeLists.txt | 1 + examples/sycl/CMakeLists.txt | 31 ++ examples/sycl/pvc/CMakeLists.txt | 34 ++ .../sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp | 420 ++++++++++++++++++ include/cutlass/cutlass.h | 15 +- include/cutlass/kernel_hardware_info.h | 10 +- include/cutlass/pipeline/sm90_pipeline.hpp | 2 +- include/cutlass/relatively_equal.h | 6 +- .../util/include/cutlass/util/device_memory.h | 24 +- .../util/reference/device/gemm_complex.h | 94 +++- .../reference/device/kernel/tensor_foreach.h | 10 +- .../util/reference/device/tensor_compare.h | 63 ++- .../util/reference/device/tensor_foreach.h | 32 +- 13 files changed, 707 insertions(+), 35 deletions(-) create mode 100644 examples/sycl/CMakeLists.txt create mode 100644 examples/sycl/pvc/CMakeLists.txt create mode 100644 examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0ca87da08..861794e06 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -149,6 +149,7 @@ foreach(EXAMPLE 57_hopper_grouped_gemm 58_ada_fp8_gemm 59_ampere_gather_scatter_conv + sycl ) add_subdirectory(${EXAMPLE}) diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt new file mode 100644 index 000000000..737d6467f --- /dev/null +++ b/examples/sycl/CMakeLists.txt @@ -0,0 +1,31 @@ + +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +add_subdirectory(pvc) diff --git a/examples/sycl/pvc/CMakeLists.txt b/examples/sycl/pvc/CMakeLists.txt new file mode 100644 index 000000000..76b356459 --- /dev/null +++ b/examples/sycl/pvc/CMakeLists.txt @@ -0,0 +1,34 @@ + +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + pvc_bfloat_dpas_gemm_cute + pvc_bfloat_dpas_gemm_cute.cpp +) diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp new file mode 100644 index 000000000..432a4070c --- /dev/null +++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp @@ -0,0 +1,420 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +template +static void fill_matrix(std::vector &M) +{ + std::random_device dev; + std::mt19937 rng(dev()); + std::uniform_real_distribution dist(1.0, 2.0); + std::generate(std::begin(M), std::end(M), [&] + { return static_cast(dist(rng)); }); +} + +template +static void vnni_matrix( + T* dst, const T* src, + int batch, int numRows, int numCols, int factor) +{ + for (int b = 0; b < batch; b++) { + for (int r = 0; r < numRows / factor; r++) { + for (int c = 0; c < numCols; c++) { + for (int k = 0; k < factor; k++) { + dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = + src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; + } + } + } + } +} + +template +static void compute_reference( + std::vector& C, + const std::vector& A, const std::vector& B, + int batch, int M, int N, int K) +{ + for (int b = 0; b < batch; b++) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + DstT sum = 0; + for (int k = 0; k < K; k++) { + sum = std::fma(static_cast(A[(b * M + m) * K + k]), + static_cast(B[(b * K + k) * N + n]), sum); + } + C[(b * M + m) * N + n] = sum; + } + } + } +} + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(4096), n(4096), k(4096), l(1), iterations(100), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 4096); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. 
+ std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_B_vnni; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + syclcompat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are relatively equal or not + // need to set a larger error margin for comparison to succeed + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), 0.5f, 0.5f); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, 
cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_B_vnni.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + // TODO: Enable initialization on device directly once RNG is + // available through SYCL. + std::vector a(K * M * L); + std::vector b(K * N * L); + std::vector b_vnni(b.size()); + std::vector c(M * N * L); + std::vector d(M * N * L, ElementC{0}); + + fill_matrix(a); + fill_matrix(b); + fill_matrix(c); + vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); + + syclcompat::memcpy(block_A.get(), a.data(), a.size() * sizeof(ElementA)); + syclcompat::memcpy(block_B.get(), b.data(), b.size() * sizeof(ElementB)); + syclcompat::memcpy(block_B_vnni.get(), b_vnni.data(), b.size() * sizeof(ElementB)); + syclcompat::memcpy(block_C.get(), c.data(), c.size() * sizeof(ElementC)); + syclcompat::memcpy(block_D.get(), d.data(), d.size() * sizeof(ElementC)); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B_vnni.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + + syclcompat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "PVC GEMM Example : " << (passed ? "Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + syclcompat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + printf("PVC GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. 
+ using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using GmemTiledCopyA = XE_2D_LOAD; + using GmemTiledCopyB = XE_2D_LOAD; + + using TileShape = Shape<_32, _32, _16>; + + using TiledMma = TiledMMA, + Layout>>; + + using DispatchPolicy = cutlass::gemm::MainloopIntelPVCUnpredicated; + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + cutlass::gemm::TagToStrideC_t, + cutlass::gemm::TagToStrideC_t, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + +// Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, + ElementInputB, + cutlass::gemm::TagToStrideB_t, + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index 32851afdb..a43af992d 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -178,7 +178,7 @@ CUTLASS_HOST_DEVICE uint BlockDimX() { #if defined(__CUDA_ARCH__) return blockDim.x; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::work_group_range::x(); + return syclcompat::local_range::x(); #else return 0; #endif @@ -188,7 +188,7 @@ CUTLASS_HOST_DEVICE uint BlockDimY() { #if defined(__CUDA_ARCH__) return blockDim.y; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::work_group_range::y(); + return syclcompat::local_range::y(); #else return 0; #endif @@ -198,7 +198,7 @@ CUTLASS_HOST_DEVICE uint BlockDimZ() { #if defined(__CUDA_ARCH__) return blockDim.z; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::work_group_range::z(); + return syclcompat::local_range::z(); #else return 0; #endif @@ -208,7 +208,7 @@ CUTLASS_HOST_DEVICE uint GridDimX() { #if defined(__CUDA_ARCH__) return gridDim.x; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::global_range::x(); + return syclcompat::work_group_range::x(); #else return 0; #endif @@ -218,7 +218,7 @@ CUTLASS_HOST_DEVICE uint GridDimY() { #if defined(__CUDA_ARCH__) return gridDim.y; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::global_range::y(); + return syclcompat::work_group_range::y(); #else return 0; #endif @@ -228,7 +228,7 @@ CUTLASS_HOST_DEVICE uint GridDimZ() { #if defined(__CUDA_ARCH__) return 
gridDim.z; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::global_range::z(); + return syclcompat::work_group_range::z(); #else return 0; #endif @@ -372,6 +372,9 @@ CUTLASS_DEVICE int atomicCAS(int *address, int compare, int val) { CUTLASS_HOST_DEVICE bool thread0() { #if defined(__CUDA_ARCH__) return (!threadIdx.x && !threadIdx.y && !threadIdx.z) && (!blockIdx.x && !blockIdx.y && !blockIdx.z); + #elif defined(CUTLASS_ENABLE_SYCL) + return (!syclcompat::local_id::x() && !syclcompat::local_id::y() && !syclcompat::local_id::z()) && + (!syclcompat::work_group_id::x() && !syclcompat::work_group_id::y() && !syclcompat::work_group_id::z()); #else return false; #endif diff --git a/include/cutlass/kernel_hardware_info.h b/include/cutlass/kernel_hardware_info.h index b69399ff0..de1001675 100644 --- a/include/cutlass/kernel_hardware_info.h +++ b/include/cutlass/kernel_hardware_info.h @@ -59,7 +59,15 @@ struct KernelHardwareInfo { // Methods // -#if !defined(__CUDACC_RTC__) +#if defined (CUTLASS_ENABLE_SYCL) + static inline int + query_device_multiprocessor_count(int device_id = 0) { + syclcompat::device_ext dev; + int multiprocessor_count = dev.get_max_compute_units(); + return multiprocessor_count; + } + +#elif !defined(__CUDACC_RTC__) static inline int query_device_multiprocessor_count(int device_id = 0) { cudaError_t result = cudaGetDevice(&device_id); diff --git a/include/cutlass/pipeline/sm90_pipeline.hpp b/include/cutlass/pipeline/sm90_pipeline.hpp index e48d27560..ffe95376b 100644 --- a/include/cutlass/pipeline/sm90_pipeline.hpp +++ b/include/cutlass/pipeline/sm90_pipeline.hpp @@ -417,7 +417,7 @@ private : } // Most likely you have elected more than one leader - if (params_.is_leader && (threadIdx.x % 32 != 0)) { + if (params_.is_leader && (ThreadIdxX() % 32 != 0)) { asm volatile ("brkpt;\n" ::); } #endif diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h index 81e80281b..510b4295a 100644 --- a/include/cutlass/relatively_equal.h +++ b/include/cutlass/relatively_equal.h @@ -55,8 +55,10 @@ namespace detail { template CUTLASS_HOST_DEVICE bool relatively_equal_float(T a, T b, T epsilon, T nonzero_floor) { - -#if defined(__CUDACC_RTC__) + +#if defined (CUTLASS_ENABLE_SYCL) + using sycl::abs; +#elif defined(__CUDACC_RTC__) using cuda::std::abs; #else using std::abs; diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index 4ccc6447a..e8d3cccbb 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -58,11 +58,19 @@ T* allocate(size_t count = 1) { bytes = count * sizeof(T); +#if defined(CUTLASS_ENABLE_SYCL) + if (bytes > 0) { + ptr = reinterpret_cast(syclcompat::malloc(bytes)); + if ((void*)ptr == nullptr) { + throw std::runtime_error("Failed to allocate memory"); + } + } +#else cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes); - if (cuda_error != cudaSuccess) { throw cuda_exception("Failed to allocate memory", cuda_error); } +#endif return ptr; } @@ -71,10 +79,14 @@ T* allocate(size_t count = 1) { template void free(T* ptr) { if (ptr) { +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::free(reinterpret_cast(ptr)); +#else cudaError_t cuda_error = (cudaFree(ptr)); if (cuda_error != cudaSuccess) { throw cuda_exception("Failed to free device memory", cuda_error); } +#endif } } @@ -87,10 +99,14 @@ void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) { size_t bytes = count * sizeof_bits::value / 8; if (bytes == 0 && count > 
0) bytes = 1; +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::memcpy(dst, src, bytes); +#else cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind)); if (cuda_error != cudaSuccess) { throw cuda_exception("cudaMemcpy() failed", cuda_error); - } + } +#endif } template @@ -140,12 +156,16 @@ class DeviceAllocation { /// Delete functor for CUDA device memory struct deleter { void operator()(T* ptr) { +#ifdef CUTLASS_ENABLE_SYCL + syclcompat::free(reinterpret_cast(ptr)); +#else cudaError_t cuda_error = (cudaFree(ptr)); if (cuda_error != cudaSuccess) { // noexcept // throw cuda_exception("cudaFree() failed", cuda_error); return; } +#endif } }; diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h index b4d41bd28..f49439b62 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h @@ -102,16 +102,16 @@ __global__ void GemmComplex( ConvertOp convert_op; InnerProductOp inner_product_op; - int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; - int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; - int batch_idx = blockIdx.z; + int row_block = (BlockIdxX() * BlockDimX() + ThreadIdxX()) * kMblock; + int col_block = (BlockIdxY() * BlockDimY() + ThreadIdxY()) * kNblock; + int batch_idx = BlockIdxZ(); tensor_a.add_pointer_offset(batch_idx * batch_stride_A); tensor_b.add_pointer_offset(batch_idx * batch_stride_B); tensor_c.add_pointer_offset(batch_idx * batch_stride_C); tensor_d.add_pointer_offset(batch_idx * batch_stride_D); - for (; batch_idx < batch_count; batch_idx += gridDim.z) { + for (; batch_idx < batch_count; batch_idx += GridDimZ()) { // Compute matrix product using blocks ComputeType accum[kMblock][kNblock]; @@ -171,10 +171,10 @@ __global__ void GemmComplex( } } - tensor_a.add_pointer_offset(batch_stride_A * gridDim.z); - tensor_b.add_pointer_offset(batch_stride_B * gridDim.z); - tensor_c.add_pointer_offset(batch_stride_C * gridDim.z); - tensor_d.add_pointer_offset(batch_stride_D * gridDim.z); + tensor_a.add_pointer_offset(batch_stride_A * GridDimZ()); + tensor_b.add_pointer_offset(batch_stride_B * GridDimZ()); + tensor_c.add_pointer_offset(batch_stride_C * GridDimZ()); + tensor_d.add_pointer_offset(batch_stride_D * GridDimZ()); } // for (batch_idx) } @@ -236,6 +236,42 @@ void GemmComplex( ); if (grid.y <= std::numeric_limits::max()) { +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + + syclcompat::launch>(sycl_grid, sycl_block, + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); +#else kernel::GemmComplex< ElementA, LayoutA, @@ -267,6 +303,7 @@ void GemmComplex( batch_stride_C, batch_stride_D ); +#endif } else { // Using bigger thread tile size int const kBigMblock = 4; @@ -274,11 +311,47 @@ void GemmComplex( dim3 Bigblock(16, 8); dim3 Biggrid( - (problem_size.m() + block.x * kBigMblock - 1) / (block.x * kBigMblock), - (problem_size.n() + block.y * kBigNblock - 1) / (block.y * kBigNblock), + (problem_size.m() + Bigblock.x * kBigMblock - 1) / (Bigblock.x * kBigMblock), + (problem_size.n() + Bigblock.y * kBigNblock - 1) / (Bigblock.y * kBigNblock), batch_count % std::numeric_limits::max() ); +#if 
defined (CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(Bigblock.x, Bigblock.y, Bigblock.z); + const auto sycl_grid = syclcompat::dim3(Biggrid.x, Biggrid.y, Biggrid.z); + + syclcompat::launch>(sycl_grid, sycl_block, + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); +#else kernel::GemmComplex< ElementA, LayoutA, @@ -310,6 +383,7 @@ void GemmComplex( batch_stride_C, batch_stride_D ); +#endif } } diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h index a64a419d8..3c52294b7 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h @@ -94,7 +94,7 @@ __global__ void TensorForEach(Coord size, Params params = Params()) { Func func(params); - int64_t index = threadIdx.x + blockIdx.x * blockDim.x; + int64_t index = ThreadIdxX() + BlockIdxX() * BlockDimX(); int64_t max_index = 1; CUTLASS_PRAGMA_UNROLL @@ -107,7 +107,7 @@ __global__ void TensorForEach(Coord size, Params params = Params()) { Coord coord; detail::TensorForEachHelper(func, size, coord, index); - index += blockDim.x * gridDim.x; + index += BlockDimX() * GridDimX(); } } @@ -119,7 +119,7 @@ __global__ void TensorDiagonalForEach(Coord size, Params params, int start Func func(params); - int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start; + int64_t index = ThreadIdxX() + BlockIdxX() * BlockDimX() + start; if (index < end) { Coord coord; @@ -143,9 +143,9 @@ __global__ void BlockForEach( Func func(params); - size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t index = ThreadIdxX() + BlockIdxX() * BlockDimX(); - for (; index < capacity; index += blockDim.x * gridDim.x) { + for (; index < capacity; index += BlockDimX() * GridDimX()) { ReferenceFactory::get(ptr, index) = func(); } } diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h index e6b36990f..041f76813 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h @@ -59,9 +59,9 @@ __global__ void BlockCompareEqual( Element const *ptr_B, size_t capacity) { - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + size_t idx = ThreadIdxX() + BlockDimX() * BlockIdxX(); - for (; idx < capacity; idx += gridDim.x * blockDim.x) { + for (; idx < capacity; idx += GridDimX() * BlockDimX()) { Element a = cutlass::ReferenceFactory::get(ptr_A, idx); Element b = cutlass::ReferenceFactory::get(ptr_B, idx); @@ -83,9 +83,9 @@ __global__ void BlockCompareRelativelyEqual( Element epsilon, Element nonzero_floor) { - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + size_t idx = ThreadIdxX() + BlockDimX() * BlockIdxX(); - for (; idx < capacity; idx += gridDim.x * blockDim.x) { + for (; idx < capacity; idx += GridDimX() * BlockDimX()) { Element a = cutlass::ReferenceFactory::get(ptr_A, idx); Element b = cutlass::ReferenceFactory::get(ptr_B, idx); @@ -114,6 +114,13 @@ bool BlockCompareEqual( int equal_flag = 1; int *device_equal_flag = nullptr; +#if defined (CUTLASS_ENABLE_SYCL) + device_equal_flag = reinterpret_cast(syclcompat::malloc(sizeof(int))); + if (device_equal_flag == nullptr) { + throw 
std::runtime_error("Failed to allocate device flag."); + } + syclcompat::memcpy(device_equal_flag, &equal_flag, sizeof(int)); +#else if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { throw std::runtime_error("Failed to allocate device flag."); } @@ -126,9 +133,14 @@ bool BlockCompareEqual( throw std::runtime_error("Failed to copy equality flag to device."); } +#endif if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + block_size = 128; + grid_size = (capacity + block_size - 1) / block_size; + grid_size = (grid_size < 64 ? grid_size : 64); // limit grid size to avoid out_of_resources runtime error. +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -142,11 +154,23 @@ bool BlockCompareEqual( // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? block_size : 128); +#endif } dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + + syclcompat::launch>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity); + syclcompat::wait(); + + syclcompat::memcpy(&equal_flag, device_equal_flag, sizeof(int)); + + syclcompat::free(reinterpret_cast(device_equal_flag)); +#else kernel::BlockCompareEqual<<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity); if (cudaMemcpy( @@ -161,6 +185,7 @@ bool BlockCompareEqual( } cudaFree(device_equal_flag); +#endif return equal_flag; } @@ -181,6 +206,13 @@ bool BlockCompareRelativelyEqual( int equal_flag = 1; int *device_equal_flag = nullptr; +#if defined (CUTLASS_ENABLE_SYCL) + device_equal_flag = reinterpret_cast(syclcompat::malloc(sizeof(int))); + if (device_equal_flag == nullptr) { + throw std::runtime_error("Failed to allocate device flag."); + } + syclcompat::memcpy(device_equal_flag, &equal_flag, sizeof(int)); +#else if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { throw std::runtime_error("Failed to allocate device flag."); } @@ -193,9 +225,14 @@ bool BlockCompareRelativelyEqual( throw std::runtime_error("Failed to copy equality flag to device."); } +#endif if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + block_size = 128; + grid_size = (capacity + block_size - 1) / block_size; + grid_size = (grid_size < 64 ? grid_size : 64); // limit grid size to avoid out_of_resources runtime error. +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -209,11 +246,24 @@ bool BlockCompareRelativelyEqual( // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? 
block_size : 128); +#endif } dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + + syclcompat::launch>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity, + epsilon, nonzero_floor); + syclcompat::wait(); + + syclcompat::memcpy(&equal_flag, device_equal_flag, sizeof(int)); + + syclcompat::free(reinterpret_cast(device_equal_flag)); +#else kernel::BlockCompareRelativelyEqual<<< grid, block >>>( device_equal_flag, ptr_A, @@ -235,6 +285,7 @@ bool BlockCompareRelativelyEqual( } cudaFree(device_equal_flag); +#endif return equal_flag; } diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index 3911b0240..be5bda948 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -51,7 +51,11 @@ struct TensorForEach { cudaStream_t stream = nullptr) { if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + // TODO: query the queue for block size + block_size = 128; + grid_size = (size(size) + block_size - 1) / block_size; +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -65,12 +69,19 @@ struct TensorForEach { // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? block_size : 128); +#endif } dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + syclcompat::launch>(sycl_grid, sycl_block, 0, size, params); +#else kernel::TensorForEach<<< grid, block, 0, stream >>>(size, params); +#endif } }; @@ -93,8 +104,14 @@ struct TensorDiagonalForEach { dim3 block(block_size, 1, 1); dim3 grid((end - start + block_size - 1) / block_size, 1, 1); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + syclcompat::launch>(sycl_grid, sycl_block, 0, size, params, start, end); +#else kernel::TensorDiagonalForEach<<< grid, block, 0, stream >>>( size, params, start, end); +#endif } }; @@ -114,7 +131,11 @@ struct BlockForEach { cudaStream_t stream = nullptr) { if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + // TODO: query the queue for block size + block_size = 128; + grid_size = (capacity + block_size - 1) / block_size; +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -128,12 +149,19 @@ struct BlockForEach { // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? 
block_size : 128);
+#endif
   }
 
   dim3 grid(grid_size, 1, 1);
   dim3 block(block_size, 1, 1);
 
+#if defined(CUTLASS_ENABLE_SYCL)
+    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
+    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+    syclcompat::launch<kernel::BlockForEach<Element, Func>>(sycl_grid, sycl_block, 0, ptr, capacity, params);
+#else
     kernel::BlockForEach<Element, Func><<< grid, block, 0, stream >>>(ptr, capacity, params);
+#endif
   }
 };
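
To try the new example locally, one possible flow is sketched below. The CMake options and build-directory layout are assumptions about the SYCL toolchain setup and are not defined by this patch; the executable name comes from cutlass_example_add_executable and the runtime flags come from the example's own Options parser:

    CXX=clang++ cmake -G Ninja .. -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=intel_gpu_pvc   # assumed option names
    ninja pvc_bfloat_dpas_gemm_cute
    ./examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute --m=4096 --n=4096 --k=4096 --l=1 --alpha=1 --beta=0 --iterations=100

On success the example prints "PVC GEMM Example : Passed" after verifying against the reference GemmComplex kernel, followed by an averaged TFLOP/s and per-iteration time over the requested number of iterations.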