From c4677bcb9099f42b762d4af092ba9c6c974767dd Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Fri, 26 Apr 2024 10:23:29 +0100 Subject: [PATCH 1/7] Example changes for Intel PVC pipeline --- examples/CMakeLists.txt | 1 + examples/sycl/CMakeLists.txt | 31 ++ examples/sycl/pvc/CMakeLists.txt | 34 ++ .../sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp | 400 ++++++++++++++++++ include/cutlass/kernel_hardware_info.h | 10 +- include/cutlass/pipeline/sm90_pipeline.hpp | 2 +- include/cutlass/relatively_equal.h | 6 +- .../util/include/cutlass/util/device_memory.h | 24 +- .../util/reference/device/gemm_complex.h | 94 +++- .../reference/device/kernel/tensor_foreach.h | 10 +- .../util/reference/device/tensor_compare.h | 63 ++- .../util/reference/device/tensor_foreach.h | 32 +- 12 files changed, 678 insertions(+), 29 deletions(-) create mode 100644 examples/sycl/CMakeLists.txt create mode 100644 examples/sycl/pvc/CMakeLists.txt create mode 100644 examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0ca87da083..861794e062 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -149,6 +149,7 @@ foreach(EXAMPLE 57_hopper_grouped_gemm 58_ada_fp8_gemm 59_ampere_gather_scatter_conv + sycl ) add_subdirectory(${EXAMPLE}) diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt new file mode 100644 index 0000000000..737d6467fd --- /dev/null +++ b/examples/sycl/CMakeLists.txt @@ -0,0 +1,31 @@ + +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +add_subdirectory(pvc) diff --git a/examples/sycl/pvc/CMakeLists.txt b/examples/sycl/pvc/CMakeLists.txt new file mode 100644 index 0000000000..76b3564591 --- /dev/null +++ b/examples/sycl/pvc/CMakeLists.txt @@ -0,0 +1,34 @@ + +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + pvc_bfloat_dpas_gemm_cute + pvc_bfloat_dpas_gemm_cute.cpp +) diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp new file mode 100644 index 0000000000..6974c211e6 --- /dev/null +++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp @@ -0,0 +1,400 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include +#include + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +template +static void fill_matrix(std::vector &M) +{ + std::random_device dev; + std::mt19937 rng(dev()); + std::uniform_real_distribution dist(1.0, 2.0); + std::generate(std::begin(M), std::end(M), [&] + { return static_cast(dist(rng)); }); +} + +template +static void vnni_matrix( + T* dst, const T* src, + int batch, int numRows, int numCols, int factor) +{ + for (int b = 0; b < batch; b++) { + for (int r = 0; r < numRows / factor; r++) { + for (int c = 0; c < numCols; c++) { + for (int k = 0; k < factor; k++) { + dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = + src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; + } + } + } + } +} + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(4096), n(4096), k(4096), l(1), iterations(100), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 4096); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class Gemm +> +struct ExampleRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_B_vnni; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + + syclcompat::wait(); + + // Check if output from CUTLASS kernel and reference kernel are relatively equal or not + // need to set a larger error margin for comparison to succeed + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), 0.5f, 0.5f); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_B_vnni.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + // TODO: Enable initialization on device directly once RNG is + // available through SYCL. + std::vector a(K * M * L); + std::vector b(K * N * L); + std::vector b_vnni(b.size()); + std::vector c(M * N * L); + std::vector d(M * N * L, ElementC{0}); + + fill_matrix(a); + fill_matrix(b); + fill_matrix(c); + vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); + + syclcompat::memcpy(block_A.get(), a.data(), a.size() * sizeof(ElementA)); + syclcompat::memcpy(block_B.get(), b.data(), b.size() * sizeof(ElementB)); + syclcompat::memcpy(block_B_vnni.get(), b_vnni.data(), b.size() * sizeof(ElementB)); + syclcompat::memcpy(block_C.get(), c.data(), c.size() * sizeof(ElementC)); + syclcompat::memcpy(block_D.get(), d.data(), d.size() * sizeof(ElementC)); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B_vnni.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + + syclcompat::wait(); + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "PVC GEMM Example : " << (passed ? "Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + syclcompat::wait(); + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + printf("PVC GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + + return; + } + +}; + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using GmemTiledCopyA = XE_2D_LOAD; + using GmemTiledCopyB = XE_2D_LOAD; + + using TileShape = Shape<_32, _32, _16>; + + using TiledMma = TiledMMA, + Layout>>; + + using DispatchPolicy = cutlass::gemm::MainloopIntelPVCUnpredicated; + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + cutlass::gemm::TagToStrideC_t, + cutlass::gemm::TagToStrideC_t, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + +// Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, + ElementInputB, + cutlass::gemm::TagToStrideB_t, + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + ExampleRunner runner; + + runner.run(options, hw_info); + + return 0; +} diff --git a/include/cutlass/kernel_hardware_info.h b/include/cutlass/kernel_hardware_info.h index b69399ff0a..de10016753 100644 --- a/include/cutlass/kernel_hardware_info.h +++ b/include/cutlass/kernel_hardware_info.h @@ -59,7 +59,15 @@ struct KernelHardwareInfo { // Methods // -#if !defined(__CUDACC_RTC__) +#if defined (CUTLASS_ENABLE_SYCL) + static inline int + query_device_multiprocessor_count(int device_id = 0) { + syclcompat::device_ext dev; + int multiprocessor_count = dev.get_max_compute_units(); + return multiprocessor_count; + } + +#elif !defined(__CUDACC_RTC__) static inline int query_device_multiprocessor_count(int device_id = 0) { cudaError_t result = cudaGetDevice(&device_id); diff --git a/include/cutlass/pipeline/sm90_pipeline.hpp b/include/cutlass/pipeline/sm90_pipeline.hpp index e48d275609..ffe95376b9 100644 --- a/include/cutlass/pipeline/sm90_pipeline.hpp +++ b/include/cutlass/pipeline/sm90_pipeline.hpp @@ -417,7 +417,7 @@ private : } // Most likely you have elected more than one leader - if (params_.is_leader && (threadIdx.x % 32 != 0)) { + if (params_.is_leader && (ThreadIdxX() % 32 != 0)) { asm volatile ("brkpt;\n" ::); } #endif diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h index 81e80281b9..510b4295a5 100644 --- a/include/cutlass/relatively_equal.h +++ b/include/cutlass/relatively_equal.h @@ -55,8 +55,10 @@ namespace detail { template CUTLASS_HOST_DEVICE bool relatively_equal_float(T a, T b, T epsilon, T nonzero_floor) { - -#if defined(__CUDACC_RTC__) + +#if defined (CUTLASS_ENABLE_SYCL) + using sycl::abs; +#elif defined(__CUDACC_RTC__) using cuda::std::abs; #else using std::abs; diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index 4ccc6447ac..e8d3cccbb8 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -58,11 +58,19 @@ T* allocate(size_t count = 1) { bytes = count * sizeof(T); +#if defined(CUTLASS_ENABLE_SYCL) + if (bytes > 0) { + ptr = reinterpret_cast(syclcompat::malloc(bytes)); + if ((void*)ptr == nullptr) { + throw std::runtime_error("Failed to allocate memory"); + } + } +#else cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes); - if (cuda_error != cudaSuccess) { throw cuda_exception("Failed to allocate memory", cuda_error); } +#endif return ptr; } @@ -71,10 +79,14 @@ T* allocate(size_t count = 1) { template void free(T* ptr) { if (ptr) { +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::free(reinterpret_cast(ptr)); +#else cudaError_t cuda_error = (cudaFree(ptr)); if (cuda_error != cudaSuccess) { throw cuda_exception("Failed to free device memory", cuda_error); } +#endif } } @@ -87,10 +99,14 @@ void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) { size_t bytes = count * sizeof_bits::value / 8; if (bytes == 0 && count > 0) bytes = 1; +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::memcpy(dst, src, bytes); +#else cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind)); if (cuda_error != cudaSuccess) { throw cuda_exception("cudaMemcpy() failed", cuda_error); - } + } +#endif } template @@ -140,12 +156,16 @@ class DeviceAllocation { /// Delete functor for CUDA device memory struct deleter { void operator()(T* ptr) { +#ifdef CUTLASS_ENABLE_SYCL + syclcompat::free(reinterpret_cast(ptr)); +#else cudaError_t cuda_error = (cudaFree(ptr)); if (cuda_error != cudaSuccess) { // noexcept // throw cuda_exception("cudaFree() failed", cuda_error); return; } +#endif } }; diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h index b4d41bd28e..f49439b629 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h @@ -102,16 +102,16 @@ __global__ void GemmComplex( ConvertOp convert_op; InnerProductOp inner_product_op; - int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; - int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; - int batch_idx = blockIdx.z; + int row_block = (BlockIdxX() * BlockDimX() + ThreadIdxX()) * kMblock; + int col_block = (BlockIdxY() * BlockDimY() + ThreadIdxY()) * kNblock; + int batch_idx = BlockIdxZ(); tensor_a.add_pointer_offset(batch_idx * batch_stride_A); tensor_b.add_pointer_offset(batch_idx * batch_stride_B); tensor_c.add_pointer_offset(batch_idx * batch_stride_C); tensor_d.add_pointer_offset(batch_idx * batch_stride_D); - for (; batch_idx < batch_count; batch_idx += gridDim.z) { + for (; batch_idx < batch_count; batch_idx += GridDimZ()) { // Compute matrix product using blocks ComputeType accum[kMblock][kNblock]; @@ -171,10 +171,10 @@ __global__ void GemmComplex( } } - tensor_a.add_pointer_offset(batch_stride_A * gridDim.z); - tensor_b.add_pointer_offset(batch_stride_B * gridDim.z); - tensor_c.add_pointer_offset(batch_stride_C * gridDim.z); - tensor_d.add_pointer_offset(batch_stride_D * gridDim.z); + tensor_a.add_pointer_offset(batch_stride_A * GridDimZ()); + tensor_b.add_pointer_offset(batch_stride_B * GridDimZ()); + tensor_c.add_pointer_offset(batch_stride_C * GridDimZ()); + tensor_d.add_pointer_offset(batch_stride_D * GridDimZ()); } // for (batch_idx) } @@ -236,6 +236,42 @@ void GemmComplex( ); if (grid.y <= std::numeric_limits::max()) { +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + + syclcompat::launch>(sycl_grid, sycl_block, + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); +#else kernel::GemmComplex< ElementA, LayoutA, @@ -267,6 +303,7 @@ void GemmComplex( batch_stride_C, batch_stride_D ); +#endif } else { // Using bigger thread tile size int const kBigMblock = 4; @@ -274,11 +311,47 @@ void GemmComplex( dim3 Bigblock(16, 8); dim3 Biggrid( - (problem_size.m() + block.x * kBigMblock - 1) / (block.x * kBigMblock), - (problem_size.n() + block.y * kBigNblock - 1) / (block.y * kBigNblock), + (problem_size.m() + Bigblock.x * kBigMblock - 1) / (Bigblock.x * kBigMblock), + (problem_size.n() + Bigblock.y * kBigNblock - 1) / (Bigblock.y * kBigNblock), batch_count % std::numeric_limits::max() ); +#if defined (CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(Bigblock.x, Bigblock.y, Bigblock.z); + const auto sycl_grid = syclcompat::dim3(Biggrid.x, Biggrid.y, Biggrid.z); + + syclcompat::launch>(sycl_grid, sycl_block, + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); +#else kernel::GemmComplex< ElementA, LayoutA, @@ -310,6 +383,7 @@ void GemmComplex( batch_stride_C, batch_stride_D ); +#endif } } diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h index a64a419d8a..3c52294b7c 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h @@ -94,7 +94,7 @@ __global__ void TensorForEach(Coord size, Params params = Params()) { Func func(params); - int64_t index = threadIdx.x + blockIdx.x * blockDim.x; + int64_t index = ThreadIdxX() + BlockIdxX() * BlockDimX(); int64_t max_index = 1; CUTLASS_PRAGMA_UNROLL @@ -107,7 +107,7 @@ __global__ void TensorForEach(Coord size, Params params = Params()) { Coord coord; detail::TensorForEachHelper(func, size, coord, index); - index += blockDim.x * gridDim.x; + index += BlockDimX() * GridDimX(); } } @@ -119,7 +119,7 @@ __global__ void TensorDiagonalForEach(Coord size, Params params, int start Func func(params); - int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start; + int64_t index = ThreadIdxX() + BlockIdxX() * BlockDimX() + start; if (index < end) { Coord coord; @@ -143,9 +143,9 @@ __global__ void BlockForEach( Func func(params); - size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t index = ThreadIdxX() + BlockIdxX() * BlockDimX(); - for (; index < capacity; index += blockDim.x * gridDim.x) { + for (; index < capacity; index += BlockDimX() * GridDimX()) { ReferenceFactory::get(ptr, index) = func(); } } diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h index e6b36990f0..041f768138 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h @@ -59,9 +59,9 @@ __global__ void BlockCompareEqual( Element const *ptr_B, size_t capacity) { - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + size_t idx = ThreadIdxX() + BlockDimX() * BlockIdxX(); - for (; idx < capacity; idx += gridDim.x * blockDim.x) { + for (; idx < capacity; idx += GridDimX() * BlockDimX()) { Element a = cutlass::ReferenceFactory::get(ptr_A, idx); Element b = cutlass::ReferenceFactory::get(ptr_B, idx); @@ -83,9 +83,9 @@ __global__ void BlockCompareRelativelyEqual( Element epsilon, Element nonzero_floor) { - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + size_t idx = ThreadIdxX() + BlockDimX() * BlockIdxX(); - for (; idx < capacity; idx += gridDim.x * blockDim.x) { + for (; idx < capacity; idx += GridDimX() * BlockDimX()) { Element a = cutlass::ReferenceFactory::get(ptr_A, idx); Element b = cutlass::ReferenceFactory::get(ptr_B, idx); @@ -114,6 +114,13 @@ bool BlockCompareEqual( int equal_flag = 1; int *device_equal_flag = nullptr; +#if defined (CUTLASS_ENABLE_SYCL) + device_equal_flag = reinterpret_cast(syclcompat::malloc(sizeof(int))); + if (device_equal_flag == nullptr) { + throw std::runtime_error("Failed to allocate device flag."); + } + syclcompat::memcpy(device_equal_flag, &equal_flag, sizeof(int)); +#else if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { throw std::runtime_error("Failed to allocate device flag."); } @@ -126,9 +133,14 @@ bool BlockCompareEqual( throw std::runtime_error("Failed to copy equality flag to device."); } +#endif if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + block_size = 128; + grid_size = (capacity + block_size - 1) / block_size; + grid_size = (grid_size < 64 ? grid_size : 64); // limit grid size to avoid out_of_resources runtime error. +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -142,11 +154,23 @@ bool BlockCompareEqual( // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? block_size : 128); +#endif } dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + + syclcompat::launch>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity); + syclcompat::wait(); + + syclcompat::memcpy(&equal_flag, device_equal_flag, sizeof(int)); + + syclcompat::free(reinterpret_cast(device_equal_flag)); +#else kernel::BlockCompareEqual<<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity); if (cudaMemcpy( @@ -161,6 +185,7 @@ bool BlockCompareEqual( } cudaFree(device_equal_flag); +#endif return equal_flag; } @@ -181,6 +206,13 @@ bool BlockCompareRelativelyEqual( int equal_flag = 1; int *device_equal_flag = nullptr; +#if defined (CUTLASS_ENABLE_SYCL) + device_equal_flag = reinterpret_cast(syclcompat::malloc(sizeof(int))); + if (device_equal_flag == nullptr) { + throw std::runtime_error("Failed to allocate device flag."); + } + syclcompat::memcpy(device_equal_flag, &equal_flag, sizeof(int)); +#else if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { throw std::runtime_error("Failed to allocate device flag."); } @@ -193,9 +225,14 @@ bool BlockCompareRelativelyEqual( throw std::runtime_error("Failed to copy equality flag to device."); } +#endif if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + block_size = 128; + grid_size = (capacity + block_size - 1) / block_size; + grid_size = (grid_size < 64 ? grid_size : 64); // limit grid size to avoid out_of_resources runtime error. +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -209,11 +246,24 @@ bool BlockCompareRelativelyEqual( // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? block_size : 128); +#endif } dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + + syclcompat::launch>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity, + epsilon, nonzero_floor); + syclcompat::wait(); + + syclcompat::memcpy(&equal_flag, device_equal_flag, sizeof(int)); + + syclcompat::free(reinterpret_cast(device_equal_flag)); +#else kernel::BlockCompareRelativelyEqual<<< grid, block >>>( device_equal_flag, ptr_A, @@ -235,6 +285,7 @@ bool BlockCompareRelativelyEqual( } cudaFree(device_equal_flag); +#endif return equal_flag; } diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index 3911b0240c..be5bda948a 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -51,7 +51,11 @@ struct TensorForEach { cudaStream_t stream = nullptr) { if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + // TODO: query the queue for block size + block_size = 128; + grid_size = (size(size) + block_size - 1) / block_size; +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -65,12 +69,19 @@ struct TensorForEach { // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? block_size : 128); +#endif } dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + syclcompat::launch>(sycl_grid, sycl_block, 0, size, params); +#else kernel::TensorForEach<<< grid, block, 0, stream >>>(size, params); +#endif } }; @@ -93,8 +104,14 @@ struct TensorDiagonalForEach { dim3 block(block_size, 1, 1); dim3 grid((end - start + block_size - 1) / block_size, 1, 1); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + syclcompat::launch>(sycl_grid, sycl_block, 0, size, params, start, end); +#else kernel::TensorDiagonalForEach<<< grid, block, 0, stream >>>( size, params, start, end); +#endif } }; @@ -114,7 +131,11 @@ struct BlockForEach { cudaStream_t stream = nullptr) { if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + // TODO: query the queue for block size + block_size = 128; + grid_size = (capacity + block_size - 1) / block_size; +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -128,12 +149,19 @@ struct BlockForEach { // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? block_size : 128); +#endif } dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); + const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + syclcompat::launch>(sycl_grid, sycl_block, 0, ptr, capacity, params); +#else kernel::BlockForEach<<< grid, block, 0, stream >>>(ptr, capacity, params); +#endif } }; From 38ac223274e4f6e331821dadd681765cb9e20c35 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Fri, 26 Apr 2024 14:49:05 +0100 Subject: [PATCH 2/7] Use cutlass::abs instead of sycl::abs --- include/cutlass/relatively_equal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h index 510b4295a5..a2817d776d 100644 --- a/include/cutlass/relatively_equal.h +++ b/include/cutlass/relatively_equal.h @@ -57,7 +57,7 @@ CUTLASS_HOST_DEVICE bool relatively_equal_float(T a, T b, T epsilon, T nonzero_floor) { #if defined (CUTLASS_ENABLE_SYCL) - using sycl::abs; + using cutlass::abs; #elif defined(__CUDACC_RTC__) using cuda::std::abs; #else From 1815c82582999b8d876e5a87d85df85108b542bf Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 29 Apr 2024 10:15:08 +0100 Subject: [PATCH 3/7] Fix call to thread0() From e51f32ba6ff8f1c8eb2a9150a90d406d103f359a Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Tue, 30 Apr 2024 10:42:23 +0100 Subject: [PATCH 4/7] Enable pvc sample only when SYCL support is enabled --- examples/CMakeLists.txt | 5 ++++- examples/sycl/CMakeLists.txt | 5 +++-- examples/sycl/pvc/CMakeLists.txt | 1 - 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 861794e062..d80e122f7d 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -149,9 +149,12 @@ foreach(EXAMPLE 57_hopper_grouped_gemm 58_ada_fp8_gemm 59_ampere_gather_scatter_conv - sycl ) add_subdirectory(${EXAMPLE}) endforeach() + +if(${CUTLASS_ENABLE_SYCL}) + add_subdirectory(sycl) +endif() diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index 737d6467fd..ef0449f902 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -1,4 +1,3 @@ - # Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # @@ -28,4 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -add_subdirectory(pvc) +if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc") + add_subdirectory(pvc) +endif() diff --git a/examples/sycl/pvc/CMakeLists.txt b/examples/sycl/pvc/CMakeLists.txt index 76b3564591..3ac67d2319 100644 --- a/examples/sycl/pvc/CMakeLists.txt +++ b/examples/sycl/pvc/CMakeLists.txt @@ -1,4 +1,3 @@ - # Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # From 6096cefe6239d49fbb076c3d21939d2b560b29c2 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Tue, 30 Apr 2024 12:47:06 +0100 Subject: [PATCH 5/7] Sample working with new copy atoms --- examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp index 6974c211e6..b1af08037c 100644 --- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp +++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp @@ -346,10 +346,10 @@ int main(int argc, const char** argv) using LayoutC = cutlass::layout::RowMajor; using LayoutD = cutlass::layout::RowMajor; - using GmemTiledCopyA = XE_2D_LOAD; - using GmemTiledCopyB = XE_2D_LOAD; + using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N; + using GmemTiledCopyB = XE_2D_U16x16x16x2x1_LD_N; - using TileShape = Shape<_32, _32, _16>; + using TileShape = Shape<_32, _64, _32>; using TiledMma = TiledMMA, Layout>>; From 3121facd6a2db00be4efb75d19dcedfef2550702 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Tue, 30 Apr 2024 14:54:41 +0100 Subject: [PATCH 6/7] Fix bug in GEMM computation --- examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp index b1af08037c..024df458f2 100644 --- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp +++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp @@ -219,7 +219,7 @@ struct ExampleRunner { auto [M, N, K, L] = problem_shape_MNKL; stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(K, N, L)); stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); From e18e96bb4422fa7c4dcd27cab4c269886c3ab502 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Wed, 8 May 2024 11:54:59 +0100 Subject: [PATCH 7/7] Example working after rebase --- .../util/include/cutlass/util/command_line.h | 2 ++ .../util/include/cutlass/util/device_memory.h | 5 ++- .../util/reference/device/gemm_complex.h | 20 ++++++----- .../reference/device/kernel/tensor_foreach.h | 29 ++++++++++++++-- .../util/reference/device/tensor_compare.h | 33 +++++++++++-------- .../util/reference/device/tensor_foreach.h | 27 +++++++-------- 6 files changed, 76 insertions(+), 40 deletions(-) diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h index 9dc3a11740..eeeba2bd1f 100644 --- a/tools/util/include/cutlass/util/command_line.h +++ b/tools/util/include/cutlass/util/command_line.h @@ -42,7 +42,9 @@ #include #include +#if !defined(CUTLASS_ENABLE_SYCL) #include +#endif #include "cutlass/cutlass.h" diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index 7f84085149..1f6685ab00 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -80,7 +80,10 @@ template void free(T* ptr) { if (ptr) { #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::free(reinterpret_cast(ptr)); + syclcompat::free(ptr); + if (ptr != nullptr) { + throw std::runtime_error("Failed to free device memory"); + } #else cudaError_t cuda_error = (cudaFree(ptr)); if (cuda_error != cudaSuccess) { diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h index f49439b629..475a008705 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h @@ -73,7 +73,12 @@ template < int kMblock = 4, int kNblock = 4 > -__global__ void GemmComplex( +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + GemmComplex( gemm::GemmCoord problem_size, ScalarType alpha, TensorRef tensor_a, @@ -228,6 +233,10 @@ void GemmComplex( int const kMblock = 4; int const kNblock = 4; +#if defined (CUTLASS_ENABLE_SYCL) +using syclcompat::dim3; +#endif + dim3 block(16, 8); dim3 grid( (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), @@ -237,8 +246,6 @@ void GemmComplex( if (grid.y <= std::numeric_limits::max()) { #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); syclcompat::launch>(sycl_grid, sycl_block, + >>(grid, block, problem_size, alpha, tensor_a, @@ -317,9 +324,6 @@ void GemmComplex( ); #if defined (CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(Bigblock.x, Bigblock.y, Bigblock.z); - const auto sycl_grid = syclcompat::dim3(Biggrid.x, Biggrid.y, Biggrid.z); - syclcompat::launch>(sycl_grid, sycl_block, + >>(Biggrid, Bigblock, problem_size, alpha, tensor_a, diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h index 3c52294b7c..36f8b190d2 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h @@ -51,7 +51,11 @@ template struct TensorForEachHelper { /// Constructor for general rank +#if defined (CUTLASS_ENABLE_SYCL) + __inline__ +#else __inline__ __device__ +#endif TensorForEachHelper(Func &func, Coord const &size, Coord &coord, int64_t index) { int64_t product = 1; @@ -73,7 +77,11 @@ template struct TensorForEachHelper { /// Constructor for fastest changing rank +#if defined (CUTLASS_ENABLE_SYCL) + __inline__ +#else __inline__ __device__ +#endif TensorForEachHelper(Func &func, Coord const &size, Coord &coord, int64_t index) { coord[Rank - 1] = index; @@ -90,7 +98,12 @@ struct TensorForEachHelper { /// Kernel calls a functor for each element in a tensor's index space template -__global__ void TensorForEach(Coord size, Params params = Params()) { +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + TensorForEach(Coord size, Params params = Params()) { Func func(params); @@ -115,7 +128,12 @@ __global__ void TensorForEach(Coord size, Params params = Params()) { /// Kernel calls a functor for each element along a tensor's diagonal template -__global__ void TensorDiagonalForEach(Coord size, Params params, int start, int end) { +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + TensorDiagonalForEach(Coord size, Params params, int start, int end) { Func func(params); @@ -136,7 +154,12 @@ __global__ void TensorDiagonalForEach(Coord size, Params params, int start /////////////////////////////////////////////////////////////////////////////////////////////////// template -__global__ void BlockForEach( +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + BlockForEach( Element *ptr, size_t capacity, typename Func::Params params) { diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h index 041f768138..3c312f5ff8 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h @@ -53,7 +53,12 @@ namespace device { namespace kernel { template -__global__ void BlockCompareEqual( +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + BlockCompareEqual( int *equal, Element const *ptr_A, Element const *ptr_B, @@ -75,7 +80,12 @@ __global__ void BlockCompareEqual( } template -__global__ void BlockCompareRelativelyEqual( +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + BlockCompareRelativelyEqual( int *equal, Element const *ptr_A, Element const *ptr_B, @@ -157,13 +167,9 @@ bool BlockCompareEqual( #endif } - dim3 grid(grid_size, 1, 1); - dim3 block(block_size, 1, 1); - #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); - + const auto sycl_block = syclcompat::dim3(block_size, 1, 1); + const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity); syclcompat::wait(); @@ -171,6 +177,8 @@ bool BlockCompareEqual( syclcompat::free(reinterpret_cast(device_equal_flag)); #else + dim3 grid(grid_size, 1, 1); + dim3 block(block_size, 1, 1); kernel::BlockCompareEqual<<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity); if (cudaMemcpy( @@ -249,12 +257,9 @@ bool BlockCompareRelativelyEqual( #endif } - dim3 grid(grid_size, 1, 1); - dim3 block(block_size, 1, 1); - #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = syclcompat::dim3(block_size, 1, 1); + const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity, epsilon, nonzero_floor); @@ -264,6 +269,8 @@ bool BlockCompareRelativelyEqual( syclcompat::free(reinterpret_cast(device_equal_flag)); #else + dim3 grid(grid_size, 1, 1); + dim3 block(block_size, 1, 1); kernel::BlockCompareRelativelyEqual<<< grid, block >>>( device_equal_flag, ptr_A, diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index be5bda948a..728c0a02f0 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -72,14 +72,13 @@ struct TensorForEach { #endif } - dim3 grid(grid_size, 1, 1); - dim3 block(block_size, 1, 1); - #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = syclcompat::dim3(block_size, 1, 1); + const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, 0, size, params); #else + dim3 grid(grid_size, 1, 1); + dim3 block(block_size, 1, 1); kernel::TensorForEach<<< grid, block, 0, stream >>>(size, params); #endif } @@ -101,14 +100,13 @@ struct TensorDiagonalForEach { end = size.min(); } - dim3 block(block_size, 1, 1); - dim3 grid((end - start + block_size - 1) / block_size, 1, 1); - #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = syclcompat::dim3(block_size, 1, 1); + const auto sycl_grid = syclcompat::dim3((end - start + block_size - 1) / block_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, 0, size, params, start, end); #else + dim3 block(block_size, 1, 1); + dim3 grid((end - start + block_size - 1) / block_size, 1, 1); kernel::TensorDiagonalForEach<<< grid, block, 0, stream >>>( size, params, start, end); #endif @@ -152,14 +150,13 @@ struct BlockForEach { #endif } - dim3 grid(grid_size, 1, 1); - dim3 block(block_size, 1, 1); - #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = syclcompat::dim3(block_size, 1, 1); + const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); syclcompat::launch>(sycl_grid, sycl_block, 0, ptr, capacity, params); #else + dim3 grid(grid_size, 1, 1); + dim3 block(block_size, 1, 1); kernel::BlockForEach<<< grid, block, 0, stream >>>(ptr, capacity, params); #endif }