From c4677bcb9099f42b762d4af092ba9c6c974767dd Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir <muhammad.tanvir@codeplay.com>
Date: Fri, 26 Apr 2024 10:23:29 +0100
Subject: [PATCH 1/7] Example changes for Intel PVC pipeline

---
 examples/CMakeLists.txt                       |   1 +
 examples/sycl/CMakeLists.txt                  |  31 ++
 examples/sycl/pvc/CMakeLists.txt              |  34 ++
 .../sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp    | 400 ++++++++++++++++++
 include/cutlass/kernel_hardware_info.h        |  10 +-
 include/cutlass/pipeline/sm90_pipeline.hpp    |   2 +-
 include/cutlass/relatively_equal.h            |   6 +-
 .../util/include/cutlass/util/device_memory.h |  24 +-
 .../util/reference/device/gemm_complex.h      |  94 +++-
 .../reference/device/kernel/tensor_foreach.h  |  10 +-
 .../util/reference/device/tensor_compare.h    |  63 ++-
 .../util/reference/device/tensor_foreach.h    |  32 +-
 12 files changed, 678 insertions(+), 29 deletions(-)
 create mode 100644 examples/sycl/CMakeLists.txt
 create mode 100644 examples/sycl/pvc/CMakeLists.txt
 create mode 100644 examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0ca87da083..861794e062 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -149,6 +149,7 @@ foreach(EXAMPLE
   57_hopper_grouped_gemm
   58_ada_fp8_gemm
   59_ampere_gather_scatter_conv
+  sycl
   )
 
   add_subdirectory(${EXAMPLE})
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
new file mode 100644
index 0000000000..737d6467fd
--- /dev/null
+++ b/examples/sycl/CMakeLists.txt
@@ -0,0 +1,31 @@
+
+# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+add_subdirectory(pvc)
diff --git a/examples/sycl/pvc/CMakeLists.txt b/examples/sycl/pvc/CMakeLists.txt
new file mode 100644
index 0000000000..76b3564591
--- /dev/null
+++ b/examples/sycl/pvc/CMakeLists.txt
@@ -0,0 +1,34 @@
+
+# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+cutlass_example_add_executable(
+  pvc_bfloat_dpas_gemm_cute
+  pvc_bfloat_dpas_gemm_cute.cpp
+)
diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
new file mode 100644
index 0000000000..6974c211e6
--- /dev/null
+++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
@@ -0,0 +1,400 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/collective/collective_mma.hpp"
+#include "cutlass/util/GPU_Clock.hpp"
+
+#include <cute/tensor.hpp>
+#include <random>
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+
+template <typename T>
+static void fill_matrix(std::vector<T> &M)
+{
+  std::random_device dev;
+  std::mt19937 rng(dev());
+  std::uniform_real_distribution<float> dist(1.0, 2.0);
+  std::generate(std::begin(M), std::end(M), [&]
+                { return static_cast<T>(dist(rng)); });
+}
+
+template <typename T>
+static void vnni_matrix(
+    T* dst, const T* src,
+    int batch, int numRows, int numCols, int factor)
+{
+    for (int b = 0; b < batch; b++) {
+      for (int r = 0; r < numRows / factor; r++) {
+          for (int c = 0; c < numCols; c++) {
+              for (int k = 0; k < factor; k++) {
+                  dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] =
+                      src[((b * (numRows / factor) + r) * factor + k) * numCols + c];
+              }
+          }
+      }
+    }
+}
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+  bool error;
+
+  int m, n, k, l, iterations;
+  float alpha, beta;
+
+  Options():
+    help(false),
+    error(false),
+    m(4096), n(4096), k(4096), l(1), iterations(100),
+    alpha(1.f), beta(0.f)
+  { }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+      return;
+    }
+
+    cmd.get_cmd_line_argument("m", m, 4096);
+    cmd.get_cmd_line_argument("n", n, 4096);
+    cmd.get_cmd_line_argument("k", k, 4096);
+    cmd.get_cmd_line_argument("l", l, 1);
+    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+    cmd.get_cmd_line_argument("beta", beta, 0.f);
+    cmd.get_cmd_line_argument("iterations", iterations, 100);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "PVC GEMM Example\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement\n\n"
+      << "  --m=<int>                   Sets the M extent of the GEMM\n"
+      << "  --n=<int>                   Sets the N extent of the GEMM\n"
+      << "  --k=<int>                   Sets the K extent of the GEMM\n"
+      << "  --l=<int>                   Sets the L extent (batch count) of the GEMM\n"
+      << "  --alpha=<s32>               Epilogue scalar alpha\n"
+      << "  --beta=<s32>                Epilogue scalar beta\n\n"
+      << "  --iterations=<int>          Iterations\n\n";
+
+    return out;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class Gemm
+>
+struct ExampleRunner {
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  using LayoutA = typename Gemm::LayoutA;
+  using LayoutB = typename Gemm::LayoutB;
+  using LayoutC = typename Gemm::LayoutC;
+  using LayoutD = typename Gemm::LayoutD;
+
+  using ElementA = typename Gemm::ElementA;
+  using ElementB = typename Gemm::ElementB;
+  using ElementAcc = typename Gemm::ElementAccumulator;
+
+  using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
+  using ElementC = typename Gemm::ElementC;
+  using ElementOutput = typename CollectiveEpilogue::ElementOutput;
+  using ElementCompute = typename CollectiveEpilogue::ElementCompute;
+  using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
+  using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+  //
+  // Data members
+  //
+
+  /// Initialization
+  StrideA stride_A;
+  StrideB stride_B;
+  StrideC stride_C;
+  StrideD stride_D;
+
+  cutlass::DeviceAllocation<ElementA> block_A;
+  cutlass::DeviceAllocation<ElementB> block_B;
+  cutlass::DeviceAllocation<ElementB> block_B_vnni;
+  cutlass::DeviceAllocation<ElementC> block_C;
+  cutlass::DeviceAllocation<ElementOutput> block_D;
+  cutlass::DeviceAllocation<ElementOutput> block_ref_D;
+
+  //
+  // Methods
+  //
+
+  bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
+    auto [M, N, K, L] = problem_size;
+
+    cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K}));
+    cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N}));
+    cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N}));
+    cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N}));
+
+    cutlass::reference::device::GemmComplex(
+          {M, N, K},
+          alpha,
+          ref_A,
+          cutlass::ComplexTransform::kNone,
+          ref_B,
+          cutlass::ComplexTransform::kNone,
+          beta,
+          ref_C,
+          ref_D,
+          ElementAccumulator(0),
+          L,     // batch_count
+          M * K, // batch_stride_A
+          K * N, // batch_stride_B
+          M * N, // batch_stride_C
+          M * N  // batch_stride_D
+        );
+
+    syclcompat::wait();
+
+    // Check if output from CUTLASS kernel and reference kernel are relatively equal or not
+    // need to set a larger error margin for comparison to succeed
+    bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), 0.5f, 0.5f);
+
+    return passed;
+  }
+
+  /// Initialize operands to be used in the GEMM and reference GEMM
+  void initialize(const ProblemShapeType& problem_size) {
+    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+    stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
+    stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
+    stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
+
+    block_A.reset(M * K * L);
+    block_B.reset(K * N * L);
+    block_B_vnni.reset(K * N * L);
+    block_C.reset(M * N * L);
+    block_D.reset(M * N * L);
+    block_ref_D.reset(M * N * L);
+
+    // TODO: Enable initialization on device directly once RNG is 
+    // available through SYCL.
+    std::vector<ElementA> a(K * M * L);
+    std::vector<ElementB> b(K * N * L);
+    std::vector<ElementB> b_vnni(b.size());
+    std::vector<ElementC> c(M * N * L);
+    std::vector<ElementC> d(M * N * L, ElementC{0});
+
+    fill_matrix(a);
+    fill_matrix(b);
+    fill_matrix(c);
+    vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2);
+
+    syclcompat::memcpy(block_A.get(), a.data(), a.size() * sizeof(ElementA));
+    syclcompat::memcpy(block_B.get(), b.data(), b.size() * sizeof(ElementB));
+    syclcompat::memcpy(block_B_vnni.get(), b_vnni.data(), b.size() * sizeof(ElementB));
+    syclcompat::memcpy(block_C.get(), c.data(), c.size() * sizeof(ElementC));
+    syclcompat::memcpy(block_D.get(), d.data(), d.size() * sizeof(ElementC));
+  }
+
+  void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
+    ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+    initialize(problem_size);
+
+    typename Gemm::GemmKernel::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      problem_size,
+      {block_A.get(), stride_A, block_B_vnni.get(), stride_B},
+      {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
+      hw_info
+    };
+
+    Gemm gemm_op;
+
+    size_t workspace_size = Gemm::get_workspace_size(arguments);
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    gemm_op.can_implement(arguments);
+
+    gemm_op.initialize(arguments, workspace.get());
+
+    // Run the GEMM
+    gemm_op.run();
+
+    syclcompat::wait();
+
+    // Verify that the result is correct
+    bool passed = verify(problem_size, options.alpha, options.beta);
+    std::cout << "PVC GEMM Example : " << (passed ? "Passed" : "Failed") << std::endl;
+
+    if (passed && options.iterations > 0) {
+      GPU_Clock timer;
+      timer.start();
+      for (int i = 0; i < options.iterations; ++i) {
+        gemm_op.run();
+      }
+      syclcompat::wait();
+
+      float cute_time = timer.seconds() / options.iterations;
+      double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
+      printf("PVC GEMM Performance:     [%4.3f]TFlop/s  (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
+    }
+
+    return;
+  }
+
+};
+
+int main(int argc, const char** argv)
+{
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+  //
+  // Run examples
+  //
+
+  // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
+  // information is used by the underlying kernel.
+  cutlass::KernelHardwareInfo hw_info;
+
+  // Change device_id to another value if you are running on a machine with multiple GPUs and wish
+  // to use a GPU other than that with device ID 0.
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  bool passed;
+
+  // The code section below describes datatype for input, output matrices and computation between
+  // elements in input matrices.
+  using ElementAccumulator = float;                   // <- data type of accumulator
+  using ElementComputeEpilogue = float;  // <- data type of epilogue operations
+  using ElementInputA = bfloat16_t;                        // <- data type of elements in input matrix A
+  using ElementInputB = bfloat16_t;                        // <- data type of elements in input matrix B
+  using ElementOutput = float;                        // <- data type of elements in output matrix D
+
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::RowMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = cutlass::layout::RowMajor;
+
+  using GmemTiledCopyA = XE_2D_LOAD;
+  using GmemTiledCopyB = XE_2D_LOAD;
+
+  using TileShape = Shape<_32, _32, _16>;
+
+  using TiledMma = TiledMMA<MMA_Atom<XE_8x16x16_BF16BF16F32F32_NN>,
+          Layout<Shape<_8,_16,_1>>>;
+
+  using DispatchPolicy = cutlass::gemm::MainloopIntelPVCUnpredicated;
+
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+          ElementOutput,                                     // <- data type of output matrix
+          128 / cutlass::sizeof_bits<ElementOutput>::value,  // <- the number of elements per vectorized
+          // memory access. For a byte, it's 16
+          // elements. This becomes the vector width of
+          // math instructions in the epilogue too
+          ElementAccumulator,                                // <- data type of accumulator
+          ElementComputeEpilogue>;  // <- data type for alpha/beta in linear combination function
+
+  using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
+          cutlass::gemm::TagToStrideC_t<LayoutC>,
+          cutlass::gemm::TagToStrideC_t<LayoutD>,
+          EpilogueOp,
+          cutlass::gemm::EpilogueDefault>;
+
+// Mainloop
+  using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+          DispatchPolicy,
+          TileShape,
+          ElementInputA,
+          cutlass::gemm::TagToStrideA_t<LayoutA>,
+          ElementInputB,
+          cutlass::gemm::TagToStrideB_t<LayoutB>,
+          TiledMma,
+          GmemTiledCopyA, void, void, cute::identity,  // A
+          GmemTiledCopyB, void, void, cute::identity   // B
+  >;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+  Shape<int, int, int, int>,
+  CollectiveMainloop,
+  CollectiveEpilogue
+  >;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  ExampleRunner<Gemm> runner;
+
+  runner.run(options, hw_info);
+
+  return 0;
+}
diff --git a/include/cutlass/kernel_hardware_info.h b/include/cutlass/kernel_hardware_info.h
index b69399ff0a..de10016753 100644
--- a/include/cutlass/kernel_hardware_info.h
+++ b/include/cutlass/kernel_hardware_info.h
@@ -59,7 +59,15 @@ struct KernelHardwareInfo {
   // Methods
   //
 
-#if !defined(__CUDACC_RTC__)
+#if defined (CUTLASS_ENABLE_SYCL)
+  static inline int
+  query_device_multiprocessor_count(int device_id = 0) {
+    syclcompat::device_ext dev;
+    int multiprocessor_count = dev.get_max_compute_units();
+    return multiprocessor_count;
+  }
+
+#elif !defined(__CUDACC_RTC__)
   static inline int
   query_device_multiprocessor_count(int device_id = 0) {
     cudaError_t result = cudaGetDevice(&device_id);
diff --git a/include/cutlass/pipeline/sm90_pipeline.hpp b/include/cutlass/pipeline/sm90_pipeline.hpp
index e48d275609..ffe95376b9 100644
--- a/include/cutlass/pipeline/sm90_pipeline.hpp
+++ b/include/cutlass/pipeline/sm90_pipeline.hpp
@@ -417,7 +417,7 @@ private :
     }
 
     // Most likely you have elected more than one leader
-    if (params_.is_leader && (threadIdx.x % 32 != 0)) {
+    if (params_.is_leader && (ThreadIdxX() % 32 != 0)) {
       asm volatile ("brkpt;\n" ::);
     }
     #endif
diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h
index 81e80281b9..510b4295a5 100644
--- a/include/cutlass/relatively_equal.h
+++ b/include/cutlass/relatively_equal.h
@@ -55,8 +55,10 @@ namespace detail {
 template <typename T>
 CUTLASS_HOST_DEVICE
 bool relatively_equal_float(T a, T b, T epsilon, T nonzero_floor) {
-  
-#if defined(__CUDACC_RTC__)
+
+#if defined (CUTLASS_ENABLE_SYCL)
+  using sycl::abs;
+#elif defined(__CUDACC_RTC__)
   using cuda::std::abs;
 #else
   using std::abs;
diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h
index 4ccc6447ac..e8d3cccbb8 100644
--- a/tools/util/include/cutlass/util/device_memory.h
+++ b/tools/util/include/cutlass/util/device_memory.h
@@ -58,11 +58,19 @@ T* allocate(size_t count = 1) {
 
   bytes = count * sizeof(T);
 
+#if defined(CUTLASS_ENABLE_SYCL)
+  if (bytes > 0) {
+    ptr = reinterpret_cast<T*>(syclcompat::malloc(bytes));
+    if ((void*)ptr == nullptr) {
+      throw std::runtime_error("Failed to allocate memory");
+    }
+  }
+#else
   cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes);
-
   if (cuda_error != cudaSuccess) {
     throw cuda_exception("Failed to allocate memory", cuda_error);
   }
+#endif
 
   return ptr;
 }
@@ -71,10 +79,14 @@ T* allocate(size_t count = 1) {
 template <typename T>
 void free(T* ptr) {
   if (ptr) {
+#if defined(CUTLASS_ENABLE_SYCL)
+    syclcompat::free(reinterpret_cast<void*>(ptr));
+#else
     cudaError_t cuda_error = (cudaFree(ptr));
     if (cuda_error != cudaSuccess) {
       throw cuda_exception("Failed to free device memory", cuda_error);
     }
+#endif
   }
 }
 
@@ -87,10 +99,14 @@ void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) {
   size_t bytes = count * sizeof_bits<T>::value / 8;
   if (bytes == 0 && count > 0)
     bytes = 1;
+#if defined(CUTLASS_ENABLE_SYCL)
+  syclcompat::memcpy(dst, src, bytes);
+#else
   cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind));
   if (cuda_error != cudaSuccess) {
     throw cuda_exception("cudaMemcpy() failed", cuda_error);
-  }
+  }  
+#endif
 }
 
 template <typename T>
@@ -140,12 +156,16 @@ class DeviceAllocation {
   /// Delete functor for CUDA device memory
   struct deleter {
     void operator()(T* ptr) {
+#ifdef CUTLASS_ENABLE_SYCL
+      syclcompat::free(reinterpret_cast<void*>(ptr));
+#else
       cudaError_t cuda_error = (cudaFree(ptr));
       if (cuda_error != cudaSuccess) {
         // noexcept
         //                throw cuda_exception("cudaFree() failed", cuda_error);
         return;
       }
+#endif
     }
   };
 
diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
index b4d41bd28e..f49439b629 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
@@ -102,16 +102,16 @@ __global__ void GemmComplex(
   ConvertOp convert_op;
   InnerProductOp inner_product_op;
   
-  int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock;
-  int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; 
-  int batch_idx = blockIdx.z;
+  int row_block = (BlockIdxX() * BlockDimX() + ThreadIdxX()) * kMblock;
+  int col_block = (BlockIdxY() * BlockDimY() + ThreadIdxY()) * kNblock; 
+  int batch_idx = BlockIdxZ();
 
   tensor_a.add_pointer_offset(batch_idx * batch_stride_A);
   tensor_b.add_pointer_offset(batch_idx * batch_stride_B);
   tensor_c.add_pointer_offset(batch_idx * batch_stride_C);
   tensor_d.add_pointer_offset(batch_idx * batch_stride_D);
 
-  for (; batch_idx < batch_count; batch_idx += gridDim.z) {
+  for (; batch_idx < batch_count; batch_idx += GridDimZ()) {
 
     // Compute matrix product using blocks
     ComputeType accum[kMblock][kNblock];
@@ -171,10 +171,10 @@ __global__ void GemmComplex(
       }
     }
 
-    tensor_a.add_pointer_offset(batch_stride_A * gridDim.z);
-    tensor_b.add_pointer_offset(batch_stride_B * gridDim.z);
-    tensor_c.add_pointer_offset(batch_stride_C * gridDim.z);
-    tensor_d.add_pointer_offset(batch_stride_D * gridDim.z);
+    tensor_a.add_pointer_offset(batch_stride_A * GridDimZ());
+    tensor_b.add_pointer_offset(batch_stride_B * GridDimZ());
+    tensor_c.add_pointer_offset(batch_stride_C * GridDimZ());
+    tensor_d.add_pointer_offset(batch_stride_D * GridDimZ());
 
   } // for (batch_idx)
 }
@@ -236,6 +236,42 @@ void GemmComplex(
   );
 
   if (grid.y <= std::numeric_limits<uint16_t>::max()) {
+#if defined(CUTLASS_ENABLE_SYCL)
+  const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
+  const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+
+  syclcompat::launch<kernel::GemmComplex<
+                      ElementA,
+                      LayoutA,
+                      ElementB,
+                      LayoutB,
+                      ElementC,
+                      LayoutC,
+                      ScalarType,
+                      ComputeType,
+                      ElementD,
+                      ConvertOp,
+                      InnerProductOp,
+                      kMblock,
+                      kNblock
+                    >>(sycl_grid, sycl_block, 
+                        problem_size,
+                        alpha,
+                        tensor_a,
+                        transform_a,
+                        tensor_b,
+                        transform_b,
+                        beta,
+                        tensor_c,
+                        tensor_d,
+                        initial_accum,
+                        batch_count,
+                        batch_stride_A,
+                        batch_stride_B,
+                        batch_stride_C,
+                        batch_stride_D
+                    );
+#else
     kernel::GemmComplex<
       ElementA,
       LayoutA,
@@ -267,6 +303,7 @@ void GemmComplex(
       batch_stride_C,
       batch_stride_D
     );
+#endif
   } else {
     // Using bigger thread tile size
     int const kBigMblock = 4;
@@ -274,11 +311,47 @@ void GemmComplex(
 
     dim3 Bigblock(16, 8);
     dim3 Biggrid(
-      (problem_size.m() + block.x * kBigMblock - 1) / (block.x * kBigMblock),
-      (problem_size.n() + block.y * kBigNblock - 1) / (block.y * kBigNblock),
+      (problem_size.m() + Bigblock.x * kBigMblock - 1) / (Bigblock.x * kBigMblock),
+      (problem_size.n() + Bigblock.y * kBigNblock - 1) / (Bigblock.y * kBigNblock),
       batch_count % std::numeric_limits<uint16_t>::max()
     );
 
+#if defined (CUTLASS_ENABLE_SYCL)
+  const auto sycl_block = syclcompat::dim3(Bigblock.x, Bigblock.y, Bigblock.z);
+  const auto sycl_grid = syclcompat::dim3(Biggrid.x, Biggrid.y, Biggrid.z);
+
+  syclcompat::launch<kernel::GemmComplex<
+                      ElementA,
+                      LayoutA,
+                      ElementB,
+                      LayoutB,
+                      ElementC,
+                      LayoutC,
+                      ScalarType,
+                      ComputeType,
+                      ElementD,
+                      ConvertOp,
+                      InnerProductOp,
+                      kBigMblock,
+                      kBigNblock
+                    >>(sycl_grid, sycl_block, 
+                        problem_size,
+                        alpha,
+                        tensor_a,
+                        transform_a,
+                        tensor_b,
+                        transform_b,
+                        beta,
+                        tensor_c,
+                        tensor_d,
+                        initial_accum,
+                        batch_count,
+                        batch_stride_A,
+                        batch_stride_B,
+                        batch_stride_C,
+                        batch_stride_D
+                    );
+#else
     kernel::GemmComplex<
       ElementA,
       LayoutA,
@@ -310,6 +383,7 @@ void GemmComplex(
       batch_stride_C,
       batch_stride_D
     );
+#endif
   }
 }
 
diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
index a64a419d8a..3c52294b7c 100644
--- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
@@ -94,7 +94,7 @@ __global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {
 
   Func func(params);
 
-  int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
+  int64_t index = ThreadIdxX() + BlockIdxX() * BlockDimX();
   int64_t max_index = 1;
 
   CUTLASS_PRAGMA_UNROLL
@@ -107,7 +107,7 @@ __global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {
     Coord<Rank> coord;
 
     detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord, index); 
-    index += blockDim.x * gridDim.x;
+    index += BlockDimX() * GridDimX();
   }
 }
 
@@ -119,7 +119,7 @@ __global__ void TensorDiagonalForEach(Coord<Rank> size, Params params, int start
 
   Func func(params);
 
-  int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start;
+  int64_t index = ThreadIdxX() + BlockIdxX() * BlockDimX() + start;
 
   if (index < end) {
     Coord<Rank> coord;
@@ -143,9 +143,9 @@ __global__ void BlockForEach(
 
   Func func(params);
 
-  size_t index = threadIdx.x + blockIdx.x * blockDim.x;
+  size_t index = ThreadIdxX() + BlockIdxX() * BlockDimX();
 
-  for (; index < capacity; index += blockDim.x * gridDim.x) {
+  for (; index < capacity; index += BlockDimX() * GridDimX()) {
     ReferenceFactory<Element>::get(ptr, index) = func();
   }
 }
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h
index e6b36990f0..041f768138 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h
@@ -59,9 +59,9 @@ __global__ void BlockCompareEqual(
   Element const *ptr_B,
   size_t capacity) {
 
-  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  size_t idx = ThreadIdxX() + BlockDimX() * BlockIdxX();
 
-  for (; idx < capacity; idx += gridDim.x * blockDim.x) {
+  for (; idx < capacity; idx += GridDimX() * BlockDimX()) {
 
     Element a = cutlass::ReferenceFactory<Element>::get(ptr_A, idx);
     Element b = cutlass::ReferenceFactory<Element>::get(ptr_B, idx);
@@ -83,9 +83,9 @@ __global__ void BlockCompareRelativelyEqual(
   Element epsilon,
   Element nonzero_floor) {
 
-  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  size_t idx = ThreadIdxX() + BlockDimX() * BlockIdxX();
 
-  for (; idx < capacity; idx += gridDim.x * blockDim.x) {
+  for (; idx < capacity; idx += GridDimX() * BlockDimX()) {
 
     Element a = cutlass::ReferenceFactory<Element>::get(ptr_A, idx);
     Element b = cutlass::ReferenceFactory<Element>::get(ptr_B, idx);
@@ -114,6 +114,13 @@ bool BlockCompareEqual(
   int equal_flag = 1;
   int *device_equal_flag = nullptr;
 
+#if defined (CUTLASS_ENABLE_SYCL)
+  device_equal_flag = reinterpret_cast<int*>(syclcompat::malloc(sizeof(int)));
+  if (device_equal_flag == nullptr) {
+    throw std::runtime_error("Failed to allocate device flag.");
+  }
+  syclcompat::memcpy(device_equal_flag, &equal_flag, sizeof(int));
+#else
   if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
     throw std::runtime_error("Failed to allocate device flag.");
   }
@@ -126,9 +133,14 @@ bool BlockCompareEqual(
 
     throw std::runtime_error("Failed to copy equality flag to device.");
   }
+#endif
 
   if (!grid_size || !block_size) {
-
+#if defined (CUTLASS_ENABLE_SYCL)
+    block_size = 128;
+    grid_size = (capacity + block_size - 1) / block_size;
+    grid_size = (grid_size < 64 ? grid_size : 64); // limit grid size to avoid out_of_resources runtime error.
+#else
     // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
     cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
       &grid_size,
@@ -142,11 +154,23 @@ bool BlockCompareEqual(
     // Limit block size. This has the effect of increasing the number of items processed by a
     // single thread and reduces the impact of initialization overhead.
     block_size = (block_size < 128 ? block_size : 128);
+#endif
   }
 
   dim3 grid(grid_size, 1, 1);
   dim3 block(block_size, 1, 1);
 
+#if defined(CUTLASS_ENABLE_SYCL)
+  const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
+  const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+
+  syclcompat::launch<kernel::BlockCompareEqual<Element>>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity);
+  syclcompat::wait();
+
+  syclcompat::memcpy(&equal_flag, device_equal_flag, sizeof(int));
+
+  syclcompat::free(reinterpret_cast<void*>(device_equal_flag));
+#else
   kernel::BlockCompareEqual<Element><<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity);
 
   if (cudaMemcpy(
@@ -161,6 +185,7 @@ bool BlockCompareEqual(
   }
 
   cudaFree(device_equal_flag);
+#endif
 
   return equal_flag;
 }
@@ -181,6 +206,13 @@ bool BlockCompareRelativelyEqual(
   int equal_flag = 1;
   int *device_equal_flag = nullptr;
 
+#if defined (CUTLASS_ENABLE_SYCL)
+  device_equal_flag = reinterpret_cast<int*>(syclcompat::malloc(sizeof(int)));
+  if (device_equal_flag == nullptr) {
+    throw std::runtime_error("Failed to allocate device flag.");
+  }
+  syclcompat::memcpy(device_equal_flag, &equal_flag, sizeof(int));
+#else
   if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
     throw std::runtime_error("Failed to allocate device flag.");
   }
@@ -193,9 +225,14 @@ bool BlockCompareRelativelyEqual(
 
     throw std::runtime_error("Failed to copy equality flag to device.");
   }
+#endif
 
   if (!grid_size || !block_size) {
-
+#if defined (CUTLASS_ENABLE_SYCL)
+    block_size = 128;
+    grid_size = (capacity + block_size - 1) / block_size;
+    grid_size = (grid_size < 64 ? grid_size : 64); // limit grid size to avoid out_of_resources runtime error.
+#else
     // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
     cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
       &grid_size,
@@ -209,11 +246,24 @@ bool BlockCompareRelativelyEqual(
     // Limit block size. This has the effect of increasing the number of items processed by a
     // single thread and reduces the impact of initialization overhead.
     block_size = (block_size < 128 ? block_size : 128);
+#endif
   }
 
   dim3 grid(grid_size, 1, 1);
   dim3 block(block_size, 1, 1);
 
+#if defined(CUTLASS_ENABLE_SYCL)
+  const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
+  const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+
+  syclcompat::launch<kernel::BlockCompareRelativelyEqual<Element>>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity,
+                                                                  epsilon, nonzero_floor);
+  syclcompat::wait();
+
+  syclcompat::memcpy(&equal_flag, device_equal_flag, sizeof(int));
+
+  syclcompat::free(reinterpret_cast<void*>(device_equal_flag));
+#else
   kernel::BlockCompareRelativelyEqual<Element><<< grid, block >>>(
     device_equal_flag, 
     ptr_A, 
@@ -235,6 +285,7 @@ bool BlockCompareRelativelyEqual(
   }
 
   cudaFree(device_equal_flag);
+#endif
 
   return equal_flag;
 }
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
index 3911b0240c..be5bda948a 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
@@ -51,7 +51,11 @@ struct TensorForEach {
     cudaStream_t stream = nullptr) {
 
     if (!grid_size || !block_size) {
-
+#if defined (CUTLASS_ENABLE_SYCL)
+      // TODO: query the queue for block size
+      block_size = 128;
+      grid_size = (size(size) + block_size - 1) / block_size;
+#else
       // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
       cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
         &grid_size,
@@ -65,12 +69,19 @@ struct TensorForEach {
       // Limit block size. This has the effect of increasing the number of items processed by a
       // single thread and reduces the impact of initialization overhead.
       block_size = (block_size < 128 ? block_size : 128);
+#endif
     }
 
     dim3 grid(grid_size, 1, 1);
     dim3 block(block_size, 1, 1);
 
+#if defined(CUTLASS_ENABLE_SYCL)
+    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
+    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+    syclcompat::launch<kernel::TensorForEach<Func, Rank, Params>>(sycl_grid, sycl_block, 0, size, params);
+#else
     kernel::TensorForEach<Func, Rank, Params><<< grid, block, 0, stream >>>(size, params);
+#endif
   }
 };
 
@@ -93,8 +104,14 @@ struct TensorDiagonalForEach {
     dim3 block(block_size, 1, 1);
     dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
 
+#if defined(CUTLASS_ENABLE_SYCL)
+    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
+    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+    syclcompat::launch<kernel::TensorDiagonalForEach<Func, Rank, Params>>(sycl_grid, sycl_block, 0, size, params, start, end);
+#else
     kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block, 0, stream >>>(
       size, params, start, end);
+#endif
   }
 };
 
@@ -114,7 +131,11 @@ struct BlockForEach {
     cudaStream_t stream = nullptr) {
 
     if (!grid_size || !block_size) {
-
+#if defined (CUTLASS_ENABLE_SYCL)
+      // TODO: query the queue for block size
+      block_size = 128;
+      grid_size = (capacity + block_size - 1) / block_size;
+#else
       // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
       cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
         &grid_size,
@@ -128,12 +149,19 @@ struct BlockForEach {
       // Limit block size. This has the effect of increasing the number of items processed by a
       // single thread and reduces the impact of initialization overhead.
       block_size = (block_size < 128 ? block_size : 128);
+#endif
     }
 
     dim3 grid(grid_size, 1, 1);
     dim3 block(block_size, 1, 1);
 
+#if defined(CUTLASS_ENABLE_SYCL)
+    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
+    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+    syclcompat::launch<kernel::BlockForEach<Element, Func>>(sycl_grid, sycl_block, 0, ptr, capacity, params);
+#else
     kernel::BlockForEach<Element, Func><<< grid, block, 0, stream >>>(ptr, capacity, params);
+#endif
   }
 };
 

From 38ac223274e4f6e331821dadd681765cb9e20c35 Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir <muhammad.tanvir@codeplay.com>
Date: Fri, 26 Apr 2024 14:49:05 +0100
Subject: [PATCH 2/7] Use cutlass::abs instead of sycl::abs

---
 include/cutlass/relatively_equal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h
index 510b4295a5..a2817d776d 100644
--- a/include/cutlass/relatively_equal.h
+++ b/include/cutlass/relatively_equal.h
@@ -57,7 +57,7 @@ CUTLASS_HOST_DEVICE
 bool relatively_equal_float(T a, T b, T epsilon, T nonzero_floor) {
 
 #if defined (CUTLASS_ENABLE_SYCL)
-  using sycl::abs;
+  using cutlass::abs;
 #elif defined(__CUDACC_RTC__)
   using cuda::std::abs;
 #else

From 1815c82582999b8d876e5a87d85df85108b542bf Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir <muhammad.tanvir@codeplay.com>
Date: Mon, 29 Apr 2024 10:15:08 +0100
Subject: [PATCH 3/7] Fix call to thread0()


From e51f32ba6ff8f1c8eb2a9150a90d406d103f359a Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir <muhammad.tanvir@codeplay.com>
Date: Tue, 30 Apr 2024 10:42:23 +0100
Subject: [PATCH 4/7] Enable pvc sample only when SYCL support is enabled

---
 examples/CMakeLists.txt          | 5 ++++-
 examples/sycl/CMakeLists.txt     | 5 +++--
 examples/sycl/pvc/CMakeLists.txt | 1 -
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 861794e062..d80e122f7d 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -149,9 +149,12 @@ foreach(EXAMPLE
   57_hopper_grouped_gemm
   58_ada_fp8_gemm
   59_ampere_gather_scatter_conv
-  sycl
   )
 
   add_subdirectory(${EXAMPLE})
 
 endforeach()
+
+if(${CUTLASS_ENABLE_SYCL})
+  add_subdirectory(sycl)
+endif()
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index 737d6467fd..ef0449f902 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 # Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
@@ -28,4 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
-add_subdirectory(pvc)
+if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc")
+  add_subdirectory(pvc)
+endif()
diff --git a/examples/sycl/pvc/CMakeLists.txt b/examples/sycl/pvc/CMakeLists.txt
index 76b3564591..3ac67d2319 100644
--- a/examples/sycl/pvc/CMakeLists.txt
+++ b/examples/sycl/pvc/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 # Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #

From 6096cefe6239d49fbb076c3d21939d2b560b29c2 Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir <muhammad.tanvir@codeplay.com>
Date: Tue, 30 Apr 2024 12:47:06 +0100
Subject: [PATCH 5/7] Sample working with new copy atoms

---
 examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
index 6974c211e6..b1af08037c 100644
--- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
+++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
@@ -346,10 +346,10 @@ int main(int argc, const char** argv)
   using LayoutC = cutlass::layout::RowMajor;
   using LayoutD = cutlass::layout::RowMajor;
 
-  using GmemTiledCopyA = XE_2D_LOAD;
-  using GmemTiledCopyB = XE_2D_LOAD;
+  using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N;
+  using GmemTiledCopyB = XE_2D_U16x16x16x2x1_LD_N;
 
-  using TileShape = Shape<_32, _32, _16>;
+  using TileShape = Shape<_32, _64, _32>;
 
   using TiledMma = TiledMMA<MMA_Atom<XE_8x16x16_BF16BF16F32F32_NN>,
           Layout<Shape<_8,_16,_1>>>;

From 3121facd6a2db00be4efb75d19dcedfef2550702 Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir <muhammad.tanvir@codeplay.com>
Date: Tue, 30 Apr 2024 14:54:41 +0100
Subject: [PATCH 6/7] Fix bug in GEMM computation

---
 examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
index b1af08037c..024df458f2 100644
--- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
+++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
@@ -219,7 +219,7 @@ struct ExampleRunner {
     auto [M, N, K, L] = problem_shape_MNKL;
 
     stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
-    stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
+    stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(K, N, L));
     stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
     stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
 

From e18e96bb4422fa7c4dcd27cab4c269886c3ab502 Mon Sep 17 00:00:00 2001
From: Muhammad Tanvir <muhammad.tanvir@codeplay.com>
Date: Wed, 8 May 2024 11:54:59 +0100
Subject: [PATCH 7/7] Example working after rebase

---
 .../util/include/cutlass/util/command_line.h  |  2 ++
 .../util/include/cutlass/util/device_memory.h |  5 ++-
 .../util/reference/device/gemm_complex.h      | 20 ++++++-----
 .../reference/device/kernel/tensor_foreach.h  | 29 ++++++++++++++--
 .../util/reference/device/tensor_compare.h    | 33 +++++++++++--------
 .../util/reference/device/tensor_foreach.h    | 27 +++++++--------
 6 files changed, 76 insertions(+), 40 deletions(-)

diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h
index 9dc3a11740..eeeba2bd1f 100644
--- a/tools/util/include/cutlass/util/command_line.h
+++ b/tools/util/include/cutlass/util/command_line.h
@@ -42,7 +42,9 @@
 #include <string>
 #include <vector>
 
+#if !defined(CUTLASS_ENABLE_SYCL)
 #include <cuda_runtime.h>
+#endif
 
 #include "cutlass/cutlass.h"
 
diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h
index 7f84085149..1f6685ab00 100644
--- a/tools/util/include/cutlass/util/device_memory.h
+++ b/tools/util/include/cutlass/util/device_memory.h
@@ -80,7 +80,10 @@ template <typename T>
 void free(T* ptr) {
   if (ptr) {
 #if defined(CUTLASS_ENABLE_SYCL)
-    syclcompat::free(reinterpret_cast<void*>(ptr));
+    syclcompat::free(ptr);
+    if (ptr != nullptr) {
+      throw std::runtime_error("Failed to free device memory");
+    }
 #else
     cudaError_t cuda_error = (cudaFree(ptr));
     if (cuda_error != cudaSuccess) {
diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
index f49439b629..475a008705 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
@@ -73,7 +73,12 @@ template <
   int kMblock = 4,
   int kNblock = 4
 >
-__global__ void GemmComplex(
+#if defined (CUTLASS_ENABLE_SYCL)
+void 
+#else
+__global__ void
+#endif
+ GemmComplex(
   gemm::GemmCoord problem_size,
   ScalarType alpha,
   TensorRef<ElementA, LayoutA> tensor_a,
@@ -228,6 +233,10 @@ void GemmComplex(
   int const kMblock = 4;
   int const kNblock = 4;
 
+#if defined (CUTLASS_ENABLE_SYCL)
+using syclcompat::dim3;
+#endif
+
   dim3 block(16, 8);
   dim3 grid(
     (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock),
@@ -237,8 +246,6 @@ void GemmComplex(
 
   if (grid.y <= std::numeric_limits<uint16_t>::max()) {
 #if defined(CUTLASS_ENABLE_SYCL)
-  const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
-  const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
 
   syclcompat::launch<kernel::GemmComplex<
                       ElementA,
@@ -254,7 +261,7 @@ void GemmComplex(
                       InnerProductOp,
                       kMblock,
                       kNblock
-                    >>(sycl_grid, sycl_block, 
+                    >>(grid, block, 
                         problem_size,
                         alpha,
                         tensor_a,
@@ -317,9 +324,6 @@ void GemmComplex(
     );
 
 #if defined (CUTLASS_ENABLE_SYCL)
-  const auto sycl_block = syclcompat::dim3(Bigblock.x, Bigblock.y, Bigblock.z);
-  const auto sycl_grid = syclcompat::dim3(Biggrid.x, Biggrid.y, Biggrid.z);
-
   syclcompat::launch<kernel::GemmComplex<
                       ElementA,
                       LayoutA,
@@ -334,7 +338,7 @@ void GemmComplex(
                       InnerProductOp,
                       kBigMblock,
                       kBigNblock
-                    >>(sycl_grid, sycl_block, 
+                    >>(Biggrid, Bigblock, 
                         problem_size,
                         alpha,
                         tensor_a,
diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
index 3c52294b7c..36f8b190d2 100644
--- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
@@ -51,7 +51,11 @@ template <typename Func, int Rank, int RankRemaining>
 struct TensorForEachHelper {
 
   /// Constructor for general rank
+#if defined (CUTLASS_ENABLE_SYCL)
+  __inline__
+#else
   __inline__ __device__
+#endif
   TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
 
     int64_t product = 1;
@@ -73,7 +77,11 @@ template <typename Func, int Rank>
 struct TensorForEachHelper<Func, Rank, 0> {
 
   /// Constructor for fastest changing rank
+#if defined (CUTLASS_ENABLE_SYCL)
+  __inline__
+#else
   __inline__ __device__
+#endif
   TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
 
     coord[Rank - 1] = index;
@@ -90,7 +98,12 @@ struct TensorForEachHelper<Func, Rank, 0> {
 
 /// Kernel calls a functor for each element in a tensor's index space
 template <typename Func, int Rank, typename Params>
-__global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {
+#if defined (CUTLASS_ENABLE_SYCL)
+void
+#else
+__global__ void
+#endif
+ TensorForEach(Coord<Rank> size, Params params = Params()) {
 
   Func func(params);
 
@@ -115,7 +128,12 @@ __global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {
 
 /// Kernel calls a functor for each element along a tensor's diagonal
 template <typename Func, int Rank, typename Params>
-__global__ void TensorDiagonalForEach(Coord<Rank> size, Params params, int start, int end) {
+#if defined (CUTLASS_ENABLE_SYCL)
+void
+#else
+__global__ void
+#endif
+ TensorDiagonalForEach(Coord<Rank> size, Params params, int start, int end) {
 
   Func func(params);
 
@@ -136,7 +154,12 @@ __global__ void TensorDiagonalForEach(Coord<Rank> size, Params params, int start
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 template <typename Element, typename Func>
-__global__ void BlockForEach(
+#if defined (CUTLASS_ENABLE_SYCL)
+void
+#else
+__global__ void
+#endif
+ BlockForEach(
   Element *ptr, 
   size_t capacity, 
   typename Func::Params params) {
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h
index 041f768138..3c312f5ff8 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h
@@ -53,7 +53,12 @@ namespace device {
 namespace kernel {
 
 template <typename Element>
-__global__ void BlockCompareEqual(
+#if defined (CUTLASS_ENABLE_SYCL)
+void
+#else
+__global__ void
+#endif
+ BlockCompareEqual(
   int *equal, 
   Element const *ptr_A,
   Element const *ptr_B,
@@ -75,7 +80,12 @@ __global__ void BlockCompareEqual(
 }
 
 template <typename Element>
-__global__ void BlockCompareRelativelyEqual(
+#if defined (CUTLASS_ENABLE_SYCL)
+void
+#else
+__global__ void
+#endif
+ BlockCompareRelativelyEqual(
   int *equal, 
   Element const *ptr_A,
   Element const *ptr_B,
@@ -157,13 +167,9 @@ bool BlockCompareEqual(
 #endif
   }
 
-  dim3 grid(grid_size, 1, 1);
-  dim3 block(block_size, 1, 1);
-
 #if defined(CUTLASS_ENABLE_SYCL)
-  const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
-  const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
-
+  const auto sycl_block = syclcompat::dim3(block_size, 1, 1);
+  const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1);
   syclcompat::launch<kernel::BlockCompareEqual<Element>>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity);
   syclcompat::wait();
 
@@ -171,6 +177,8 @@ bool BlockCompareEqual(
 
   syclcompat::free(reinterpret_cast<void*>(device_equal_flag));
 #else
+  dim3 grid(grid_size, 1, 1);
+  dim3 block(block_size, 1, 1);
   kernel::BlockCompareEqual<Element><<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity);
 
   if (cudaMemcpy(
@@ -249,12 +257,9 @@ bool BlockCompareRelativelyEqual(
 #endif
   }
 
-  dim3 grid(grid_size, 1, 1);
-  dim3 block(block_size, 1, 1);
-
 #if defined(CUTLASS_ENABLE_SYCL)
-  const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
-  const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+  const auto sycl_block = syclcompat::dim3(block_size, 1, 1);
+  const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1);
 
   syclcompat::launch<kernel::BlockCompareRelativelyEqual<Element>>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity,
                                                                   epsilon, nonzero_floor);
@@ -264,6 +269,8 @@ bool BlockCompareRelativelyEqual(
 
   syclcompat::free(reinterpret_cast<void*>(device_equal_flag));
 #else
+  dim3 grid(grid_size, 1, 1);
+  dim3 block(block_size, 1, 1);
   kernel::BlockCompareRelativelyEqual<Element><<< grid, block >>>(
     device_equal_flag, 
     ptr_A, 
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
index be5bda948a..728c0a02f0 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
@@ -72,14 +72,13 @@ struct TensorForEach {
 #endif
     }
 
-    dim3 grid(grid_size, 1, 1);
-    dim3 block(block_size, 1, 1);
-
 #if defined(CUTLASS_ENABLE_SYCL)
-    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
-    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+    const auto sycl_block = syclcompat::dim3(block_size, 1, 1);
+    const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1);
     syclcompat::launch<kernel::TensorForEach<Func, Rank, Params>>(sycl_grid, sycl_block, 0, size, params);
 #else
+    dim3 grid(grid_size, 1, 1);
+    dim3 block(block_size, 1, 1);
     kernel::TensorForEach<Func, Rank, Params><<< grid, block, 0, stream >>>(size, params);
 #endif
   }
@@ -101,14 +100,13 @@ struct TensorDiagonalForEach {
       end = size.min();
     }
 
-    dim3 block(block_size, 1, 1);
-    dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
-
 #if defined(CUTLASS_ENABLE_SYCL)
-    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
-    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+    const auto sycl_block = syclcompat::dim3(block_size, 1, 1);
+    const auto sycl_grid = syclcompat::dim3((end - start + block_size - 1) / block_size, 1, 1);
     syclcompat::launch<kernel::TensorDiagonalForEach<Func, Rank, Params>>(sycl_grid, sycl_block, 0, size, params, start, end);
 #else
+    dim3 block(block_size, 1, 1);
+    dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
     kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block, 0, stream >>>(
       size, params, start, end);
 #endif
@@ -152,14 +150,13 @@ struct BlockForEach {
 #endif
     }
 
-    dim3 grid(grid_size, 1, 1);
-    dim3 block(block_size, 1, 1);
-
 #if defined(CUTLASS_ENABLE_SYCL)
-    const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);
-    const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
+    const auto sycl_block = syclcompat::dim3(block_size, 1, 1);
+    const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1);
     syclcompat::launch<kernel::BlockForEach<Element, Func>>(sycl_grid, sycl_block, 0, ptr, capacity, params);
 #else
+    dim3 grid(grid_size, 1, 1);
+    dim3 block(block_size, 1, 1);
     kernel::BlockForEach<Element, Func><<< grid, block, 0, stream >>>(ptr, capacity, params);
 #endif
   }