diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e74931eb83..dd465ac1f0 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -97,6 +97,7 @@ if (CUTLASS_ENABLE_SYCL) foreach(EXAMPLE 14_ampere_tf32_tensorop_gemm cute + sycl ) add_subdirectory(${EXAMPLE}) endforeach() diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt new file mode 100644 index 0000000000..ef0449f902 --- /dev/null +++ b/examples/sycl/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc") + add_subdirectory(pvc) +endif() diff --git a/examples/sycl/pvc/CMakeLists.txt b/examples/sycl/pvc/CMakeLists.txt new file mode 100644 index 0000000000..3ac67d2319 --- /dev/null +++ b/examples/sycl/pvc/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + pvc_bfloat_dpas_gemm_cute + pvc_bfloat_dpas_gemm_cute.cpp +) diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp new file mode 100644 index 0000000000..024df458f2 --- /dev/null +++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp @@ -0,0 +1,400 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/collective/collective_mma.hpp"
+#include "cutlass/util/GPU_Clock.hpp"
+
+#include <random>
+#include <algorithm>
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+
+template <typename T>
+static void fill_matrix(std::vector<T> &M)
+{
+  std::random_device dev;
+  std::mt19937 rng(dev());
+  std::uniform_real_distribution<float> dist(1.0, 2.0);
+  std::generate(std::begin(M), std::end(M), [&]
+                { return static_cast<T>(dist(rng)); });
+}
+
+template <typename T>
+static void vnni_matrix(
+    T* dst, const T* src,
+    int batch, int numRows, int numCols, int factor)
+{
+  for (int b = 0; b < batch; b++) {
+    for (int r = 0; r < numRows / factor; r++) {
+      for (int c = 0; c < numCols; c++) {
+        for (int k = 0; k < factor; k++) {
+          dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] =
+            src[((b * (numRows / factor) + r) * factor + k) * numCols + c];
+        }
+      }
+    }
+  }
+}
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+  bool error;
+
+  int m, n, k, l, iterations;
+  float alpha, beta;
+
+  Options():
+    help(false),
+    error(false),
+    m(4096), n(4096), k(4096), l(1), iterations(100),
+    alpha(1.f), beta(0.f)
+  { }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+      return;
+    }
+
+    cmd.get_cmd_line_argument("m", m, 4096);
+    cmd.get_cmd_line_argument("n", n, 4096);
+    cmd.get_cmd_line_argument("k", k, 4096);
+    cmd.get_cmd_line_argument("l", l, 1);
+    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+    cmd.get_cmd_line_argument("beta", beta, 0.f);
+    cmd.get_cmd_line_argument("iterations", iterations, 100);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "PVC GEMM Example\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement\n\n"
+      << "  --m=<int>                   Sets the M extent of the GEMM\n"
+      << "  --n=<int>                   Sets the N extent of the GEMM\n"
+      << "  --k=<int>                   Sets the K extent of the GEMM\n"
+      << "  --l=<int>                   Sets the L extent (batch count) of the GEMM\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha\n"
+      << "  --beta=<f32>                Epilogue scalar beta\n\n"
+      << "  --iterations=<int>          Iterations\n\n";
+
+    return out;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  class Gemm
+>
+struct ExampleRunner {
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  using LayoutA = typename Gemm::LayoutA;
+  using LayoutB = typename Gemm::LayoutB;
+  using LayoutC = typename Gemm::LayoutC;
+  using LayoutD = typename Gemm::LayoutD;
+
+  using ElementA = typename Gemm::ElementA;
+  using ElementB = typename Gemm::ElementB;
+  using ElementAcc = typename Gemm::ElementAccumulator;
+
+  using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
+  using ElementC = typename Gemm::ElementC;
+  using ElementOutput = typename CollectiveEpilogue::ElementOutput;
+  using ElementCompute = typename CollectiveEpilogue::ElementCompute;
+  using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
+  using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+  //
+  // Data members
+  //
+
+  /// Initialization
+  StrideA stride_A;
+  StrideB stride_B;
+  StrideC stride_C;
+  StrideD stride_D;
+
+  cutlass::DeviceAllocation<ElementA> block_A;
+  cutlass::DeviceAllocation<ElementB> block_B;
+  cutlass::DeviceAllocation<ElementB> block_B_vnni;
+  cutlass::DeviceAllocation<ElementC> block_C;
+  cutlass::DeviceAllocation<ElementOutput> block_D;
+  cutlass::DeviceAllocation<ElementOutput> block_ref_D;
+
+  //
+  // Methods
+  //
+
+  bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
+    auto [M, N, K, L] = problem_size;
+
+    cutlass::TensorRef<ElementA, LayoutA> ref_A(block_A.get(), LayoutA::packed({M, K}));
+    cutlass::TensorRef<ElementB, LayoutB> ref_B(block_B.get(), LayoutB::packed({K, N}));
+    cutlass::TensorRef<ElementC, LayoutC> ref_C(block_C.get(), LayoutC::packed({M, N}));
+    cutlass::TensorRef<ElementOutput, LayoutD> ref_D(block_ref_D.get(), LayoutD::packed({M, N}));
+
+    cutlass::reference::device::GemmComplex(
+          {M, N, K},
+          alpha,
+          ref_A,
+          cutlass::ComplexTransform::kNone,
+          ref_B,
+          cutlass::ComplexTransform::kNone,
+          beta,
+          ref_C,
+          ref_D,
+          ElementAccumulator(0),
+          L,     // batch_count
+          M * K, // batch_stride_A
+          K * N, // batch_stride_B
+          M * N, // batch_stride_C
+          M * N  // batch_stride_D
+        );
+
+    syclcompat::wait();
+
+    // Check if output from CUTLASS kernel and reference kernel are relatively equal or not
+    // need to set a larger error margin for comparison to succeed
+    bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), 0.5f, 0.5f);
+
+    return passed;
+  }
+
+  /// Initialize operands to be used in the GEMM and reference GEMM
+  void initialize(const ProblemShapeType& problem_size) {
+    auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+    stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(K, N, L));
+    stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
+    stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
+
+    block_A.reset(M * K * L);
+    block_B.reset(K * N * L);
+    block_B_vnni.reset(K * N * L);
+    block_C.reset(M * N * L);
+    block_D.reset(M * N * L);
+    block_ref_D.reset(M * N * L);
+
+    // TODO: Enable initialization on device directly once RNG is
+    // available through SYCL.
+    std::vector<ElementA> a(K * M * L);
+    std::vector<ElementB> b(K * N * L);
+    std::vector<ElementB> b_vnni(b.size());
+    std::vector<ElementC> c(M * N * L);
+    std::vector<ElementC> d(M * N * L, ElementC{0});
+
+    fill_matrix(a);
+    fill_matrix(b);
+    fill_matrix(c);
+    vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2);
+
+    syclcompat::memcpy(block_A.get(), a.data(), a.size() * sizeof(ElementA));
+    syclcompat::memcpy(block_B.get(), b.data(), b.size() * sizeof(ElementB));
+    syclcompat::memcpy(block_B_vnni.get(), b_vnni.data(), b.size() * sizeof(ElementB));
+    syclcompat::memcpy(block_C.get(), c.data(), c.size() * sizeof(ElementC));
+    syclcompat::memcpy(block_D.get(), d.data(), d.size() * sizeof(ElementC));
+  }
+
+  void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
+    ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+    initialize(problem_size);
+
+    typename Gemm::GemmKernel::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      problem_size,
+      {block_A.get(), stride_A, block_B_vnni.get(), stride_B},
+      {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
+      hw_info
+    };
+
+    Gemm gemm_op;
+
+    size_t workspace_size = Gemm::get_workspace_size(arguments);
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    gemm_op.can_implement(arguments);
+
+    gemm_op.initialize(arguments, workspace.get());
+
+    // Run the GEMM
+    gemm_op.run();
+
+    syclcompat::wait();
+
+    // Verify that the result is correct
+    bool passed = verify(problem_size, options.alpha, options.beta);
+    std::cout << "PVC GEMM Example : " << (passed ? "Passed" : "Failed") << std::endl;
+
+    if (passed && options.iterations > 0) {
+      GPU_Clock timer;
+      timer.start();
+      for (int i = 0; i < options.iterations; ++i) {
+        gemm_op.run();
+      }
+      syclcompat::wait();
+
+      float cute_time = timer.seconds() / options.iterations;
+      double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
+      printf("PVC GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
+    }
+
+    return;
+  }
+
+};
+
+int main(int argc, const char** argv)
+{
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+  //
+  // Run examples
+  //
+
+  // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
+  // information is used by the underlying kernel.
+  cutlass::KernelHardwareInfo hw_info;
+
+  // Change device_id to another value if you are running on a machine with multiple GPUs and wish
+  // to use a GPU other than that with device ID 0.
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  bool passed;
+
+  // The code section below describes datatype for input, output matrices and computation between
+  // elements in input matrices.
+  using ElementAccumulator = float;      // <- data type of accumulator
+  using ElementComputeEpilogue = float;  // <- data type of epilogue operations
+  using ElementInputA = bfloat16_t;      // <- data type of elements in input matrix A
+  using ElementInputB = bfloat16_t;      // <- data type of elements in input matrix B
+  using ElementOutput = float;           // <- data type of elements in output matrix D
+
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::RowMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = cutlass::layout::RowMajor;
+
+  using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N;
+  using GmemTiledCopyB = XE_2D_U16x16x16x2x1_LD_N;
+
+  using TileShape = Shape<_32, _64, _32>;
+
+  using TiledMma = TiledMMA,
+          Layout>>;
+
+  using DispatchPolicy = cutlass::gemm::MainloopIntelPVCUnpredicated;
+
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+          ElementOutput,                                     // <- data type of output matrix
+          128 / cutlass::sizeof_bits<ElementOutput>::value,  // <- the number of elements per vectorized
+                                                             // memory access. For a byte, it's 16
+                                                             // elements. This becomes the vector width of
+                                                             // math instructions in the epilogue too
+          ElementAccumulator,                                // <- data type of accumulator
+          ElementComputeEpilogue>;                           // <- data type for alpha/beta in linear combination function
+
+  using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
+          cutlass::gemm::TagToStrideC_t<LayoutC>,
+          cutlass::gemm::TagToStrideC_t<LayoutD>,
+          EpilogueOp,
+          cutlass::gemm::EpilogueDefault>;
+
+// Mainloop
+  using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+          DispatchPolicy,
+          TileShape,
+          ElementInputA,
+          cutlass::gemm::TagToStrideA_t<LayoutA>,
+          ElementInputB,
+          cutlass::gemm::TagToStrideB_t<LayoutB>,
+          TiledMma,
+          GmemTiledCopyA, void, void, cute::identity,  // A
+          GmemTiledCopyB, void, void, cute::identity   // B
+  >;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+  Shape<int, int, int, int>,
+  CollectiveMainloop,
+  CollectiveEpilogue
+  >;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  ExampleRunner<Gemm> runner;
+
+  runner.run(options, hw_info);
+
+  return 0;
+}
diff --git a/include/cutlass/kernel_hardware_info.h b/include/cutlass/kernel_hardware_info.h
index f430da243f..4dba99cf6c 100644
--- a/include/cutlass/kernel_hardware_info.h
+++ b/include/cutlass/kernel_hardware_info.h
@@ -59,7 +59,15 @@ struct KernelHardwareInfo {
   // Methods
   //
 
-#if !defined(__CUDACC_RTC__)
+#if defined (CUTLASS_ENABLE_SYCL)
+  static inline int
+  query_device_multiprocessor_count(int device_id = 0) {
+    syclcompat::device_ext dev;
+    int multiprocessor_count = dev.get_max_compute_units();
+    return multiprocessor_count;
+  }
+
+#elif !defined(__CUDACC_RTC__)
   static inline int
   query_device_multiprocessor_count(int device_id = 0) {
     cudaError_t result = cudaGetDevice(&device_id);
diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h
index 81e80281b9..a2817d776d 100644
--- a/include/cutlass/relatively_equal.h
+++ b/include/cutlass/relatively_equal.h
@@ -55,8 +55,10 @@ namespace detail {
 template <typename T>
 CUTLASS_HOST_DEVICE
 bool relatively_equal_float(T a, T b, T epsilon, T nonzero_floor) {
-
-#if defined(__CUDACC_RTC__)
+
+#if defined (CUTLASS_ENABLE_SYCL)
+  using cutlass::abs;
+#elif defined(__CUDACC_RTC__)
   using cuda::std::abs;
 #else
   using std::abs;
diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h
index 9dc3a11740..eeeba2bd1f 100644
--- a/tools/util/include/cutlass/util/command_line.h
+++ b/tools/util/include/cutlass/util/command_line.h
@@ -42,7
+42,9 @@ #include #include +#if !defined(CUTLASS_ENABLE_SYCL) #include +#endif #include "cutlass/cutlass.h" diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index 0b78640be9..953bbb6608 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -55,6 +55,9 @@ T* allocate(size_t count = 1) { T* ptr = 0; + size_t bytes = 0; + bytes = count * sizeof(T); + #if defined(CUTLASS_ENABLE_SYCL) if (count > 0) { ptr = reinterpret_cast(syclcompat::malloc(bytes)); @@ -63,11 +66,7 @@ T* allocate(size_t count = 1) { } } #else - size_t bytes = 0; - bytes = count * sizeof(T); - cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes); - if (cuda_error != cudaSuccess) { throw cuda_exception("Failed to allocate memory", cuda_error); } @@ -78,19 +77,19 @@ T* allocate(size_t count = 1) { /// Free the buffer pointed to by \p ptr template void free(T* ptr) { + if (ptr) { #if defined(CUTLASS_ENABLE_SYCL) syclcompat::free(ptr); if (ptr != nullptr) { throw std::runtime_error("Failed to free device memory"); } #else - if (ptr) { cudaError_t cuda_error = (cudaFree(ptr)); if (cuda_error != cudaSuccess) { throw cuda_exception("Failed to free device memory", cuda_error); } - } #endif + } } /****************************************************************************** diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h index b4d41bd28e..475a008705 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h @@ -73,7 +73,12 @@ template < int kMblock = 4, int kNblock = 4 > -__global__ void GemmComplex( +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + GemmComplex( gemm::GemmCoord problem_size, ScalarType alpha, TensorRef tensor_a, @@ -102,16 +107,16 @@ __global__ void GemmComplex( ConvertOp convert_op; InnerProductOp inner_product_op; - int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; - int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; - int batch_idx = blockIdx.z; + int row_block = (BlockIdxX() * BlockDimX() + ThreadIdxX()) * kMblock; + int col_block = (BlockIdxY() * BlockDimY() + ThreadIdxY()) * kNblock; + int batch_idx = BlockIdxZ(); tensor_a.add_pointer_offset(batch_idx * batch_stride_A); tensor_b.add_pointer_offset(batch_idx * batch_stride_B); tensor_c.add_pointer_offset(batch_idx * batch_stride_C); tensor_d.add_pointer_offset(batch_idx * batch_stride_D); - for (; batch_idx < batch_count; batch_idx += gridDim.z) { + for (; batch_idx < batch_count; batch_idx += GridDimZ()) { // Compute matrix product using blocks ComputeType accum[kMblock][kNblock]; @@ -171,10 +176,10 @@ __global__ void GemmComplex( } } - tensor_a.add_pointer_offset(batch_stride_A * gridDim.z); - tensor_b.add_pointer_offset(batch_stride_B * gridDim.z); - tensor_c.add_pointer_offset(batch_stride_C * gridDim.z); - tensor_d.add_pointer_offset(batch_stride_D * gridDim.z); + tensor_a.add_pointer_offset(batch_stride_A * GridDimZ()); + tensor_b.add_pointer_offset(batch_stride_B * GridDimZ()); + tensor_c.add_pointer_offset(batch_stride_C * GridDimZ()); + tensor_d.add_pointer_offset(batch_stride_D * GridDimZ()); } // for (batch_idx) } @@ -228,6 +233,10 @@ void GemmComplex( int const kMblock = 4; int const kNblock = 4; +#if defined (CUTLASS_ENABLE_SYCL) +using syclcompat::dim3; +#endif + dim3 block(16, 8); 
dim3 grid( (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), @@ -236,6 +245,40 @@ void GemmComplex( ); if (grid.y <= std::numeric_limits::max()) { +#if defined(CUTLASS_ENABLE_SYCL) + + syclcompat::launch>(grid, block, + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); +#else kernel::GemmComplex< ElementA, LayoutA, @@ -267,6 +310,7 @@ void GemmComplex( batch_stride_C, batch_stride_D ); +#endif } else { // Using bigger thread tile size int const kBigMblock = 4; @@ -274,11 +318,44 @@ void GemmComplex( dim3 Bigblock(16, 8); dim3 Biggrid( - (problem_size.m() + block.x * kBigMblock - 1) / (block.x * kBigMblock), - (problem_size.n() + block.y * kBigNblock - 1) / (block.y * kBigNblock), + (problem_size.m() + Bigblock.x * kBigMblock - 1) / (Bigblock.x * kBigMblock), + (problem_size.n() + Bigblock.y * kBigNblock - 1) / (Bigblock.y * kBigNblock), batch_count % std::numeric_limits::max() ); +#if defined (CUTLASS_ENABLE_SYCL) + syclcompat::launch>(Biggrid, Bigblock, + problem_size, + alpha, + tensor_a, + transform_a, + tensor_b, + transform_b, + beta, + tensor_c, + tensor_d, + initial_accum, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); +#else kernel::GemmComplex< ElementA, LayoutA, @@ -310,6 +387,7 @@ void GemmComplex( batch_stride_C, batch_stride_D ); +#endif } } diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h index a64a419d8a..36f8b190d2 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h @@ -51,7 +51,11 @@ template struct TensorForEachHelper { /// Constructor for general rank +#if defined (CUTLASS_ENABLE_SYCL) + __inline__ +#else __inline__ __device__ +#endif TensorForEachHelper(Func &func, Coord const &size, Coord &coord, int64_t index) { int64_t product = 1; @@ -73,7 +77,11 @@ template struct TensorForEachHelper { /// Constructor for fastest changing rank +#if defined (CUTLASS_ENABLE_SYCL) + __inline__ +#else __inline__ __device__ +#endif TensorForEachHelper(Func &func, Coord const &size, Coord &coord, int64_t index) { coord[Rank - 1] = index; @@ -90,11 +98,16 @@ struct TensorForEachHelper { /// Kernel calls a functor for each element in a tensor's index space template -__global__ void TensorForEach(Coord size, Params params = Params()) { +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + TensorForEach(Coord size, Params params = Params()) { Func func(params); - int64_t index = threadIdx.x + blockIdx.x * blockDim.x; + int64_t index = ThreadIdxX() + BlockIdxX() * BlockDimX(); int64_t max_index = 1; CUTLASS_PRAGMA_UNROLL @@ -107,7 +120,7 @@ __global__ void TensorForEach(Coord size, Params params = Params()) { Coord coord; detail::TensorForEachHelper(func, size, coord, index); - index += blockDim.x * gridDim.x; + index += BlockDimX() * GridDimX(); } } @@ -115,11 +128,16 @@ __global__ void TensorForEach(Coord size, Params params = Params()) { /// Kernel calls a functor for each element along a tensor's diagonal template -__global__ void TensorDiagonalForEach(Coord size, Params params, int start, int end) { +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + TensorDiagonalForEach(Coord size, Params params, 
int start, int end) { Func func(params); - int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start; + int64_t index = ThreadIdxX() + BlockIdxX() * BlockDimX() + start; if (index < end) { Coord coord; @@ -136,16 +154,21 @@ __global__ void TensorDiagonalForEach(Coord size, Params params, int start /////////////////////////////////////////////////////////////////////////////////////////////////// template -__global__ void BlockForEach( +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + BlockForEach( Element *ptr, size_t capacity, typename Func::Params params) { Func func(params); - size_t index = threadIdx.x + blockIdx.x * blockDim.x; + size_t index = ThreadIdxX() + BlockIdxX() * BlockDimX(); - for (; index < capacity; index += blockDim.x * gridDim.x) { + for (; index < capacity; index += BlockDimX() * GridDimX()) { ReferenceFactory::get(ptr, index) = func(); } } diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h index e6b36990f0..3c312f5ff8 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h @@ -53,15 +53,20 @@ namespace device { namespace kernel { template -__global__ void BlockCompareEqual( +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + BlockCompareEqual( int *equal, Element const *ptr_A, Element const *ptr_B, size_t capacity) { - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + size_t idx = ThreadIdxX() + BlockDimX() * BlockIdxX(); - for (; idx < capacity; idx += gridDim.x * blockDim.x) { + for (; idx < capacity; idx += GridDimX() * BlockDimX()) { Element a = cutlass::ReferenceFactory::get(ptr_A, idx); Element b = cutlass::ReferenceFactory::get(ptr_B, idx); @@ -75,7 +80,12 @@ __global__ void BlockCompareEqual( } template -__global__ void BlockCompareRelativelyEqual( +#if defined (CUTLASS_ENABLE_SYCL) +void +#else +__global__ void +#endif + BlockCompareRelativelyEqual( int *equal, Element const *ptr_A, Element const *ptr_B, @@ -83,9 +93,9 @@ __global__ void BlockCompareRelativelyEqual( Element epsilon, Element nonzero_floor) { - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + size_t idx = ThreadIdxX() + BlockDimX() * BlockIdxX(); - for (; idx < capacity; idx += gridDim.x * blockDim.x) { + for (; idx < capacity; idx += GridDimX() * BlockDimX()) { Element a = cutlass::ReferenceFactory::get(ptr_A, idx); Element b = cutlass::ReferenceFactory::get(ptr_B, idx); @@ -114,6 +124,13 @@ bool BlockCompareEqual( int equal_flag = 1; int *device_equal_flag = nullptr; +#if defined (CUTLASS_ENABLE_SYCL) + device_equal_flag = reinterpret_cast(syclcompat::malloc(sizeof(int))); + if (device_equal_flag == nullptr) { + throw std::runtime_error("Failed to allocate device flag."); + } + syclcompat::memcpy(device_equal_flag, &equal_flag, sizeof(int)); +#else if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { throw std::runtime_error("Failed to allocate device flag."); } @@ -126,9 +143,14 @@ bool BlockCompareEqual( throw std::runtime_error("Failed to copy equality flag to device."); } +#endif if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + block_size = 128; + grid_size = (capacity + block_size - 1) / block_size; + grid_size = (grid_size < 64 ? grid_size : 64); // limit grid size to avoid out_of_resources runtime error. 
+#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -142,11 +164,21 @@ bool BlockCompareEqual( // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? block_size : 128); +#endif } +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block_size, 1, 1); + const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); + syclcompat::launch>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity); + syclcompat::wait(); + + syclcompat::memcpy(&equal_flag, device_equal_flag, sizeof(int)); + + syclcompat::free(reinterpret_cast(device_equal_flag)); +#else dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); - kernel::BlockCompareEqual<<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity); if (cudaMemcpy( @@ -161,6 +193,7 @@ bool BlockCompareEqual( } cudaFree(device_equal_flag); +#endif return equal_flag; } @@ -181,6 +214,13 @@ bool BlockCompareRelativelyEqual( int equal_flag = 1; int *device_equal_flag = nullptr; +#if defined (CUTLASS_ENABLE_SYCL) + device_equal_flag = reinterpret_cast(syclcompat::malloc(sizeof(int))); + if (device_equal_flag == nullptr) { + throw std::runtime_error("Failed to allocate device flag."); + } + syclcompat::memcpy(device_equal_flag, &equal_flag, sizeof(int)); +#else if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { throw std::runtime_error("Failed to allocate device flag."); } @@ -193,9 +233,14 @@ bool BlockCompareRelativelyEqual( throw std::runtime_error("Failed to copy equality flag to device."); } +#endif if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + block_size = 128; + grid_size = (capacity + block_size - 1) / block_size; + grid_size = (grid_size < 64 ? grid_size : 64); // limit grid size to avoid out_of_resources runtime error. +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -209,11 +254,23 @@ bool BlockCompareRelativelyEqual( // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? 
block_size : 128); +#endif } +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block_size, 1, 1); + const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); + + syclcompat::launch>(sycl_grid, sycl_block, device_equal_flag, ptr_A, ptr_B, capacity, + epsilon, nonzero_floor); + syclcompat::wait(); + + syclcompat::memcpy(&equal_flag, device_equal_flag, sizeof(int)); + + syclcompat::free(reinterpret_cast(device_equal_flag)); +#else dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); - kernel::BlockCompareRelativelyEqual<<< grid, block >>>( device_equal_flag, ptr_A, @@ -235,6 +292,7 @@ bool BlockCompareRelativelyEqual( } cudaFree(device_equal_flag); +#endif return equal_flag; } diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index 3911b0240c..728c0a02f0 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -51,7 +51,11 @@ struct TensorForEach { cudaStream_t stream = nullptr) { if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + // TODO: query the queue for block size + block_size = 128; + grid_size = (size(size) + block_size - 1) / block_size; +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -65,12 +69,18 @@ struct TensorForEach { // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? block_size : 128); +#endif } +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block_size, 1, 1); + const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1); + syclcompat::launch>(sycl_grid, sycl_block, 0, size, params); +#else dim3 grid(grid_size, 1, 1); dim3 block(block_size, 1, 1); - kernel::TensorForEach<<< grid, block, 0, stream >>>(size, params); +#endif } }; @@ -90,11 +100,16 @@ struct TensorDiagonalForEach { end = size.min(); } +#if defined(CUTLASS_ENABLE_SYCL) + const auto sycl_block = syclcompat::dim3(block_size, 1, 1); + const auto sycl_grid = syclcompat::dim3((end - start + block_size - 1) / block_size, 1, 1); + syclcompat::launch>(sycl_grid, sycl_block, 0, size, params, start, end); +#else dim3 block(block_size, 1, 1); dim3 grid((end - start + block_size - 1) / block_size, 1, 1); - kernel::TensorDiagonalForEach<<< grid, block, 0, stream >>>( size, params, start, end); +#endif } }; @@ -114,7 +129,11 @@ struct BlockForEach { cudaStream_t stream = nullptr) { if (!grid_size || !block_size) { - +#if defined (CUTLASS_ENABLE_SYCL) + // TODO: query the queue for block size + block_size = 128; + grid_size = (capacity + block_size - 1) / block_size; +#else // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API cudaError_t result = cudaOccupancyMaxPotentialBlockSize( &grid_size, @@ -128,12 +147,18 @@ struct BlockForEach { // Limit block size. This has the effect of increasing the number of items processed by a // single thread and reduces the impact of initialization overhead. block_size = (block_size < 128 ? 
block_size : 128);
+#endif
     }
 
+#if defined(CUTLASS_ENABLE_SYCL)
+    const auto sycl_block = syclcompat::dim3(block_size, 1, 1);
+    const auto sycl_grid = syclcompat::dim3(grid_size, 1, 1);
+    syclcompat::launch<kernel::BlockForEach<Element, Func>>(sycl_grid, sycl_block, 0, ptr, capacity, params);
+#else
     dim3 grid(grid_size, 1, 1);
     dim3 block(block_size, 1, 1);
-
     kernel::BlockForEach<Element, Func><<< grid, block, 0, stream >>>(ptr, capacity, params);
+#endif
   }
 };
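
Note on the B-operand packing used by the new example: vnni_matrix() interleaves groups of `factor` consecutive K-rows of B so that the bf16 values accumulated together along K sit next to each other in memory, which is the layout the PVC DPAS/XMX path consumes. The following standalone sketch is illustrative only (the 4x4 size, the int element type and the main() harness are assumptions, not taken from the patch); it applies the same index arithmetic as vnni_matrix() with batch == 1 and factor == 2:

// Minimal sketch of the reordering performed by vnni_matrix() above,
// shown on a small row-major matrix.
#include <cstdio>

int main() {
  const int rows = 4, cols = 4, factor = 2;
  int src[rows * cols], dst[rows * cols];
  for (int i = 0; i < rows * cols; ++i) src[i] = i;  // element (r, c) == r * cols + c

  // Same loops as vnni_matrix() with batch == 1: each group of `factor`
  // consecutive rows is interleaved element-by-element along the columns.
  for (int r = 0; r < rows / factor; ++r)
    for (int c = 0; c < cols; ++c)
      for (int k = 0; k < factor; ++k)
        dst[(r * cols + c) * factor + k] = src[(r * factor + k) * cols + c];

  // Prints: 0 4 1 5 2 6 3 7 8 12 9 13 10 14 11 15
  for (int i = 0; i < rows * cols; ++i) printf("%d ", dst[i]);
  printf("\n");
  return 0;
}

Rows 0 and 1 are merged column by column (0, 4, 1, 5, ...), then rows 2 and 3, matching the factor == 2 call made in ExampleRunner::initialize().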
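
Note on the performance printout: the example counts 2 * m * n * k * l floating-point operations per GEMM. With the default problem size (m = n = k = 4096, l = 1) this is 2 * 4096^3 ~= 1.374e11 FLOP, i.e. about 0.137 TFLOP, so a hypothetical averaged iteration time of 1 ms would be reported as roughly 137 TFlop/s; the achieved figure depends entirely on the hardware and build.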