From 00b210e67396f3b36ac23d4e560b880f7298f62f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Thu, 11 Apr 2024 15:55:36 +0100 Subject: [PATCH 1/3] [SYCL][COMPAT] shared memory test split from main tests (#13063) Some tests were being completely skipped entirely because a part of them required `aspect::usm_shared_allocations`. This PR splits the tests in various files to disable only the usages of shared memory. --- .../test-e2e/syclcompat/math/math_complex.cpp | 11 +- .../syclcompat/memory/memory_fixt.hpp | 10 +- .../memory/memory_management_shared.cpp | 91 ++++++++++++++ .../memory/memory_management_test2.cpp | 42 ------- .../syclcompat/memory/usm_allocations.cpp | 73 ++--------- .../memory/usm_shared_allocations.cpp | 113 ++++++++++++++++++ 6 files changed, 225 insertions(+), 115 deletions(-) create mode 100644 sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp create mode 100644 sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp diff --git a/sycl/test-e2e/syclcompat/math/math_complex.cpp b/sycl/test-e2e/syclcompat/math/math_complex.cpp index cd81246f07ca..3b7dba8b2496 100644 --- a/sycl/test-e2e/syclcompat/math/math_complex.cpp +++ b/sycl/test-e2e/syclcompat/math/math_complex.cpp @@ -30,8 +30,7 @@ //===---------------------------------------------------------------===// // REQUIRES: aspect-fp64 -// REQUIRES: usm_shared_allocations -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out +// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out // RUN: %{run} %t.out #include @@ -72,18 +71,20 @@ template class ComplexLauncher { protected: int *result_; int cpu_result_{0}; + int h_result_; public: ComplexLauncher() { - result_ = (int *)syclcompat::malloc_shared(sizeof(int)); - *result_ = 0; + result_ = (int *)syclcompat::malloc(sizeof(int)); + syclcompat::memset(result_, 0, sizeof(int)); }; ~ComplexLauncher() { syclcompat::free(result_); } void launch() { F(&cpu_result_); // Run on host syclcompat::launch(1, 1, result_); // Run on device syclcompat::wait(); - assert(*result_ == 1); + syclcompat::memcpy(&h_result_, result_, 1); + assert(h_result_ == 1); assert(cpu_result_ == 1); } }; diff --git a/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp b/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp index 7c613e25a4a4..e5b8c6ef3797 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp +++ b/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp @@ -123,7 +123,7 @@ template struct USMTest { skip{should_skip(syclcompat::get_current_device())} {} void launch_kernel() { - auto &dd_A = d_A; + auto &dd_A = data; return q_ .submit([&](sycl::handler &cgh) { cgh.parallel_for( @@ -135,15 +135,15 @@ template struct USMTest { // Check result is identity vector // Handles memcpy for USM device alloc void check_result() { - sycl::usm::alloc ptr_type = sycl::get_pointer_type(d_A, q_.get_context()); + sycl::usm::alloc ptr_type = sycl::get_pointer_type(data, q_.get_context()); assert(ptr_type != sycl::usm::alloc::unknown); T *result; if (ptr_type == sycl::usm::alloc::device) { result = static_cast(std::malloc(sizeof(T) * size_)); - syclcompat::memcpy(result, d_A, sizeof(T) * size_); + syclcompat::memcpy(result, data, sizeof(T) * size_); } else { - result = d_A; + result = data; } for (size_t i = 0; i < size_; i++) { @@ -157,7 +157,7 @@ template struct USMTest { sycl::queue q_; syclcompat::dim3 const grid_; syclcompat::dim3 const thread_; - T *d_A; + T *data; size_t size_; bool skip; }; diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp new file mode 100644 index 000000000000..b9641c6864f9 --- /dev/null +++ b/sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp @@ -0,0 +1,91 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCLcompat API + * + * memory_management_shared.cpp + * + * Description: + * memory operations tests with shared memory + **************************************************************************/ + +// The original source was under the license below: +// ====------ memory_management_test2.cpp---------- -*- C++ -* ----===//// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===----------------------------------------------------------------------===// + +// REQUIRES: usm_shared_allocations +// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out +// RUN: %{run} %t.out + +#include + +#include + +#include "../common.hpp" +#include "memory_common.hpp" + +constexpr size_t DataW = 100; +constexpr size_t DataH = 100; + +void test_shared_memory() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + syclcompat::shared_memory s_A(DataW); + syclcompat::shared_memory s_B(DataW); + syclcompat::shared_memory s_C(DataW); + + s_A.init(); + s_B.init(); + s_C.init(); + + for (int i = 0; i < DataW; i++) { + s_A[i] = 1.0f; + s_B[i] = 2.0f; + } + + { + syclcompat::get_default_queue().submit([&](sycl::handler &cgh) { + float *d_A = s_A.get_ptr(); + float *d_B = s_B.get_ptr(); + float *d_C = s_C.get_ptr(); + cgh.parallel_for(sycl::range<1>(DataW), [=](sycl::id<1> id) { + int i = id[0]; + float *A = d_A; + float *B = d_B; + float *C = d_C; + C[i] = A[i] + B[i]; + }); + }); + syclcompat::get_default_queue().wait_and_throw(); + } + + // verify hostD + for (int i = 0; i < DataW; i++) { + for (int j = 0; j < DataH; j++) { + assert(fabs(s_C[i] - s_A[i] - s_B[i]) <= 1e-5); + } + } +} + +int main() { + test_shared_memory(); + + return 0; +} diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp index 9774faa8e8ba..afb1f6a5f5a8 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp @@ -30,7 +30,6 @@ // // ===----------------------------------------------------------------------===// -// REQUIRES: usm_shared_allocations // RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out // RUN: %{run} %t.out @@ -217,46 +216,6 @@ void test_global_memory() { } } -void test_shared_memory() { - std::cout << __PRETTY_FUNCTION__ << std::endl; - - syclcompat::shared_memory s_A(DataW); - syclcompat::shared_memory s_B(DataW); - syclcompat::shared_memory s_C(DataW); - - s_A.init(); - s_B.init(); - s_C.init(); - - for (int i = 0; i < DataW; i++) { - s_A[i] = 1.0f; - s_B[i] = 2.0f; - } - - { - syclcompat::get_default_queue().submit([&](sycl::handler &cgh) { - float *d_A = s_A.get_ptr(); - float *d_B = s_B.get_ptr(); - float *d_C = s_C.get_ptr(); - cgh.parallel_for(sycl::range<1>(DataW), [=](sycl::id<1> id) { - int i = id[0]; - float *A = d_A; - float *B = d_B; - float *C = d_C; - C[i] = A[i] + B[i]; - }); - }); - syclcompat::get_default_queue().wait_and_throw(); - } - - // verify hostD - for (int i = 0; i < DataW; i++) { - for (int j = 0; j < DataH; j++) { - assert(fabs(s_C[i] - s_A[i] - s_B[i]) <= 1e-5); - } - } -} - void test_constant_memory() { std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -366,7 +325,6 @@ int main() { test_memcpy_pitched_q(); test_global_memory(); - test_shared_memory(); test_constant_memory(); return 0; } diff --git a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp index fb8a8a52da10..78eb46581986 100644 --- a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp +++ b/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp @@ -20,7 +20,6 @@ * USM allocation tests **************************************************************************/ -// REQUIRES: usm_shared_allocations // RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out // RUN: %{run} %t.out @@ -41,10 +40,10 @@ template void test_malloc() { if (usm_fixture.skip) return; // Skip unsupported - usm_fixture.d_A = syclcompat::malloc(usm_fixture.size_); + usm_fixture.data = syclcompat::malloc(usm_fixture.size_); usm_fixture.launch_kernel(); usm_fixture.check_result(); - syclcompat::free(usm_fixture.d_A); + syclcompat::free(usm_fixture.data); } template void test_host() { @@ -52,66 +51,37 @@ template void test_host() { USMTest usm_fixture; if (usm_fixture.skip) return; // Skip unsupported - if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_host_allocations)) return; // Skip unsupported - usm_fixture.d_A = syclcompat::malloc_host(usm_fixture.size_); - usm_fixture.launch_kernel(); - usm_fixture.check_result(); - syclcompat::free(usm_fixture.d_A); -} - -template void test_shared() { - std::cout << __PRETTY_FUNCTION__ << std::endl; - USMTest usm_fixture; - if (usm_fixture.skip) - return; // Skip unsupported - if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_shared_allocations)) - return; // Skip unsupported - usm_fixture.d_A = syclcompat::malloc_shared(usm_fixture.size_); + usm_fixture.data = syclcompat::malloc_host(usm_fixture.size_); usm_fixture.launch_kernel(); usm_fixture.check_result(); - syclcompat::free(usm_fixture.d_A); + syclcompat::free(usm_fixture.data); } void test_non_templated_malloc() { std::cout << __PRETTY_FUNCTION__ << std::endl; USMTest usm_fixture; - usm_fixture.d_A = + usm_fixture.data = static_cast(syclcompat::malloc(usm_fixture.size_ * sizeof(int))); usm_fixture.launch_kernel(); usm_fixture.check_result(); - syclcompat::free(usm_fixture.d_A); + syclcompat::free(usm_fixture.data); } void test_non_templated_host() { std::cout << __PRETTY_FUNCTION__ << std::endl; USMTest usm_fixture; - if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_host_allocations)) return; // Skip unsupported - usm_fixture.d_A = static_cast( + usm_fixture.data = static_cast( syclcompat::malloc_host(usm_fixture.size_ * sizeof(int))); usm_fixture.launch_kernel(); usm_fixture.check_result(); - syclcompat::free(usm_fixture.d_A); -} - -void test_non_templated_shared() { - std::cout << __PRETTY_FUNCTION__ << std::endl; - USMTest usm_fixture; - - if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_shared_allocations)) - return; - - usm_fixture.d_A = static_cast( - syclcompat::malloc_shared(usm_fixture.size_ * sizeof(int))); - usm_fixture.launch_kernel(); - usm_fixture.check_result(); - syclcompat::free(usm_fixture.d_A); + syclcompat::free(usm_fixture.data); } // Test deduce direction @@ -120,11 +90,12 @@ void test_deduce() { using memcpy_direction = syclcompat::detail::memcpy_direction; auto default_queue = syclcompat::get_default_queue(); + if (!default_queue.get_device().has(sycl::aspect::usm_host_allocations)) + return; // Skip unsupported int *h_ptr = (int *)syclcompat::malloc_host(sizeof(int)); int *sys_ptr = (int *)std::malloc(sizeof(int)); int *d_ptr = (int *)syclcompat::malloc(sizeof(int)); - int *s_ptr = (int *)syclcompat::malloc_shared(sizeof(int)); // * to host assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr, @@ -135,9 +106,6 @@ void test_deduce() { assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr, d_ptr) == memcpy_direction::device_to_device); - assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr, - s_ptr) == - memcpy_direction::device_to_device); // * to sys assert(syclcompat::detail::deduce_memcpy_direction( @@ -148,8 +116,6 @@ void test_deduce() { assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr, d_ptr) == memcpy_direction::device_to_host); - assert(syclcompat::detail::deduce_memcpy_direction( - default_queue, sys_ptr, s_ptr) == memcpy_direction::host_to_host); // * to dev assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr, @@ -161,39 +127,20 @@ void test_deduce() { assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr, d_ptr) == memcpy_direction::device_to_device); - assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr, - s_ptr) == - memcpy_direction::device_to_device); - - // * to shared - assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, - h_ptr) == - memcpy_direction::device_to_device); - assert(syclcompat::detail::deduce_memcpy_direction( - default_queue, s_ptr, sys_ptr) == memcpy_direction::host_to_host); - assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, - d_ptr) == - memcpy_direction::device_to_device); - assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, - s_ptr) == - memcpy_direction::device_to_device); std::free(sys_ptr); syclcompat::free(h_ptr); syclcompat::free(d_ptr); - syclcompat::free(s_ptr); } int main() { INSTANTIATE_ALL_TYPES(value_type_list, test_malloc); INSTANTIATE_ALL_TYPES(value_type_list, test_host); - INSTANTIATE_ALL_TYPES(value_type_list, test_shared); // Avoid combinatorial explosion by only testing non-templated // syclcompat::malloc with int type test_non_templated_malloc(); test_non_templated_host(); - test_non_templated_shared(); test_deduce(); diff --git a/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp new file mode 100644 index 000000000000..6e5f7bcfef34 --- /dev/null +++ b/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp @@ -0,0 +1,113 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCLcompat API + * + * usm_allocations.cpp + * + * Description: + * USM allocation tests + **************************************************************************/ + +// REQUIRES: usm_shared_allocations +// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out +// RUN: %{run} %t.out + +#include +#include + +#include + +#include + +#include "../common.hpp" +#include "memory_common.hpp" +#include "memory_fixt.hpp" + +template void test_shared() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + USMTest usm_fixture; + + if (usm_fixture.skip) + return; // Skip unsupported + + usm_fixture.data = syclcompat::malloc_shared(usm_fixture.size_); + usm_fixture.launch_kernel(); + usm_fixture.check_result(); + syclcompat::free(usm_fixture.data); +} + +void test_non_templated_shared() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + USMTest usm_fixture; + + usm_fixture.data = static_cast( + syclcompat::malloc_shared(usm_fixture.size_ * sizeof(int))); + usm_fixture.launch_kernel(); + usm_fixture.check_result(); + syclcompat::free(usm_fixture.data); +} + +// Test deduce direction +void test_deduce_shared() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + using memcpy_direction = syclcompat::detail::memcpy_direction; + auto default_queue = syclcompat::get_default_queue(); + + int *h_ptr = (int *)syclcompat::malloc_host(sizeof(int)); + int *sys_ptr = (int *)std::malloc(sizeof(int)); + int *d_ptr = (int *)syclcompat::malloc(sizeof(int)); + int *s_ptr = (int *)syclcompat::malloc_shared(sizeof(int)); + + // * to host + assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr, + s_ptr) == + memcpy_direction::device_to_device); + + // * to sys + assert(syclcompat::detail::deduce_memcpy_direction( + default_queue, sys_ptr, s_ptr) == memcpy_direction::host_to_host); + + // * to dev + assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr, + s_ptr) == + memcpy_direction::device_to_device); + + // * to shared + assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, + h_ptr) == + memcpy_direction::device_to_device); + assert(syclcompat::detail::deduce_memcpy_direction( + default_queue, s_ptr, sys_ptr) == memcpy_direction::host_to_host); + assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, + d_ptr) == + memcpy_direction::device_to_device); + assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr, + s_ptr) == + memcpy_direction::device_to_device); + + syclcompat::free(s_ptr); + std::free(sys_ptr); + syclcompat::free(h_ptr); + syclcompat::free(d_ptr); +} + +int main() { + INSTANTIATE_ALL_TYPES(value_type_list, test_shared); + test_non_templated_shared(); + test_deduce_shared(); + + return 0; +} From 05644a470303c2af3385b9533b8d23ebdea99eb7 Mon Sep 17 00:00:00 2001 From: Chunyang Dai Date: Thu, 11 Apr 2024 23:51:33 +0800 Subject: [PATCH 2/3] [OpenCL] Config dependent-load flag to exclude CWD from DLL search path (#13327) This change is to avoid DLL hijacking security issue. --- opencl/opencl-aot/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/opencl/opencl-aot/CMakeLists.txt b/opencl/opencl-aot/CMakeLists.txt index ff905fd883e6..a05f094df971 100644 --- a/opencl/opencl-aot/CMakeLists.txt +++ b/opencl/opencl-aot/CMakeLists.txt @@ -12,6 +12,11 @@ set(OPENCL_AOT_PROJECT_NAME opencl-aot) add_llvm_tool(${OPENCL_AOT_PROJECT_NAME} ${TARGET_SOURCES}) +if (WIN32) + # 0x2000: exclude CWD from DLL loading path + target_link_options(${OPENCL_AOT_PROJECT_NAME} PRIVATE "/DEPENDENTLOADFLAG:0x2000") +endif() + if(NOT MSVC) # FIXME: when built with clang it produces a warning. target_compile_options(${OPENCL_AOT_PROJECT_NAME} PRIVATE "-Wno-unused-parameter") From 5332773b17efbf10e1b72cd633c1d7e2b4f75125 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Fri, 12 Apr 2024 01:51:19 +0900 Subject: [PATCH 3/3] [SYCL][ESIMD] atomic_update with data size less than 4 bytes should use LSC atomics (#13340) SVM doesn't support less than 4 bytes on Gen12, we either get an error or the wrong answer. --------- Signed-off-by: Sarnie, Nick --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 32 ++--- sycl/test/esimd/memory_properties.cpp | 126 ++++++++++++++++++- 2 files changed, 142 insertions(+), 16 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 7ef701b7edc8..188bce9d59b6 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -5862,8 +5862,8 @@ __ESIMD_API simd slm_atomic_update_impl(simd offsets, template __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 0, simd> slm_atomic_update(simd byte_offset, simd_mask mask = 1) { - // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are - // supported only by LSC. + // 2 byte, 8 byte types, non-power of two, and operations wider than + // 32 are supported only by LSC. if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || !__ESIMD_DNS::isPowerOf2(N, 32)) { return slm_atomic_update_impl __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, simd> slm_atomic_update(simd byte_offset, simd src0, simd_mask mask = 1) { - // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are - // supported only by LSC. + // 2 byte, 8 byte types, non-power of two, and operations wider than + // 32 are supported only by LSC. if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || !__ESIMD_DNS::isPowerOf2(N, 32)) { // half and short are supported in LSC. @@ -6031,8 +6031,8 @@ template __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, simd> slm_atomic_update(simd byte_offset, simd src0, simd src1, simd_mask mask = 1) { - // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are - // supported only by LSC. + // 2 byte, 8 byte types, non-power of two, and operations wider than + // 32 are supported only by LSC. if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || !__ESIMD_DNS::isPowerOf2(N, 32)) { // 2-argument lsc_atomic_update arguments order matches the standard one - @@ -6417,7 +6417,7 @@ atomic_update(T *p, simd byte_offset, simd_mask mask, static_assert(std::is_integral_v, "Unsupported offset type"); if constexpr (detail::has_cache_hints() || - !__ESIMD_DNS::isPowerOf2(N, 32)) { + !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) { return detail::atomic_update_impl< Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>( p, byte_offset, mask); @@ -6640,7 +6640,7 @@ atomic_update(T *p, simd byte_offset, simd src0, if constexpr (detail::has_cache_hints() || (Op == atomic_op::fmin) || (Op == atomic_op::fmax) || (Op == atomic_op::fadd) || (Op == atomic_op::fsub) || - !__ESIMD_DNS::isPowerOf2(N, 32)) { + !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) { return detail::atomic_update_impl< Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>( p, byte_offset, src0, mask); @@ -6888,9 +6888,11 @@ atomic_update(T *p, simd byte_offset, simd src0, static_assert(std::is_integral_v, "Unsupported offset type"); // Use LSC atomic when cache hints are present, FP atomics is used, - // non-power of two length is used, or operation width greater than 32. + // non-power of two length is used, or operation width greater than 32, or the + // data size is less than 4 bytes. if constexpr (detail::has_cache_hints() || - Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) { + Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) || + sizeof(T) < 4) { // 2-argument lsc_atomic_update arguments order matches the standard one - // expected value first, then new value. But atomic_update uses reverse // order, hence the src1/src0 swap. @@ -7116,7 +7118,7 @@ atomic_update(AccessorTy acc, simd byte_offset, simd_mask mask, static_assert(std::is_integral_v, "Unsupported offset type"); if constexpr (detail::has_cache_hints() || - !detail::isPowerOf2(N, 32)) { + !detail::isPowerOf2(N, 32) || sizeof(T) < 4) { return detail::atomic_update_impl< Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( acc, byte_offset, mask); @@ -7384,7 +7386,7 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, if constexpr (detail::has_cache_hints() || Op == atomic_op::fmin || Op == atomic_op::fmax || Op == atomic_op::fadd || Op == atomic_op::fsub || - !__ESIMD_DNS::isPowerOf2(N, 32)) { + !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) { return detail::atomic_update_impl< Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( acc, byte_offset, src0, mask); @@ -7681,9 +7683,11 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, static_assert(std::is_integral_v, "Unsupported offset type"); static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported"); // Use LSC atomic when cache hints are present, FP atomics is used, - // non-power of two length is used, or operation width greater than 32. + // non-power of two length is used, operation width greater than 32, or the + // data size is less than 4 bytes, if constexpr (detail::has_cache_hints() || - Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) { + Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) || + sizeof(T) < 4) { // 2-argument lsc_atomic_update arguments order matches the standard one - // expected value first, then new value. But atomic_update uses reverse // order, hence the src1/src0 swap. diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 2c69d3a69d78..b23697d91922 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -322,6 +322,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, atomic_update(ptr, offsets, pred); } + // Try with int16_t to check that LSC atomic is generated + // The result is later casted to int16, not captured here. + // CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef) + { + int16_t *ptr = 0; + constexpr int VL = 8; + simd offsets = simd(1) * sizeof(int16_t); + auto atomic_res = + atomic_update(ptr, offsets); + } + // Accessor // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef) @@ -377,6 +388,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto atomic_res_acc = atomic_update(acc, offsets, pred); } + // Try with int16_t to check that LSC atomic is generated + // The result is later casted to int16, not captured here. + // CHECK-STATEFUL: call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 {{[^)]+}}, <8 x i32> undef) + // CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef) + { + using AccType = + sycl::accessor; + AccType *acc = nullptr; + constexpr int VL = 8; + simd offsets = simd(1) * sizeof(int16_t); + auto atomic_res = + atomic_update(*acc, offsets); + } } // Test atomic update with one operand. @@ -432,6 +456,18 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto res_atomic_8 = atomic_update(ptr, offsets, add, pred); + // Try with int16_t to check that LSC atomic is generated + // The result is later casted to int16, not captured here. + // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32>{{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + { + int16_t *ptr = 0; + constexpr int VL = 4; + simd offsets = simd(1) * sizeof(int16_t); + auto add = simd(5); + auto atomic_res = + atomic_update(ptr, offsets, add); + } + // Accessors // CHECK-STATEFUL-COUNT-14: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef) @@ -483,6 +519,21 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.sub.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) auto res_atomic_17 = atomic_update(acc, offsets, add, pred); + + // Try with int16_t to check that LSC atomic is generated + // The result is later casted to int16, not captured here. + // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef) + // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + { + using AccType = + sycl::accessor; + AccType *acc = nullptr; + constexpr int VL = 4; + simd offsets = simd(1) * sizeof(int16_t); + auto add = simd(5); + auto atomic_res = + atomic_update(*acc, offsets, add); + } } // Test atomic update with two operands. @@ -626,6 +677,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto res_atomic_100 = atomic_update( ptr, offsets, swap, compare, pred); + // Try with int16_t to check that LSC atomic is generated + // The result is later casted to int16, not captured here. + // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + { + int16_t *ptr = 0; + constexpr int VL = 4; + simd offsets = simd(1) * sizeof(int16_t); + simd swap = simd(1) * sizeof(int); + auto compare = swap * 2; + auto atomic_res = atomic_update( + ptr, offsets, swap, compare); + } + // Accessors // CHECK-STATEFUL-COUNT-30: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef) @@ -751,6 +815,22 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.cmpxchg.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef) auto res_atomic_33 = atomic_update( acc, offsets, swap, compare, pred); + + // Try with int16_t to check that LSC atomic is generated + // The result is later casted to int16, not captured here. + // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef) + // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + { + using AccType = + sycl::accessor; + AccType *acc = nullptr; + constexpr int VL = 4; + simd offsets = simd(1) * sizeof(int16_t); + simd swap = simd(1) * sizeof(int); + auto compare = swap * 2; + auto atomic_res = atomic_update( + *acc, offsets, compare, swap); + } } // Test slm_atomic_update without operands. @@ -824,12 +904,11 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, { constexpr int VL = 16; simd offsets = simd(1) * sizeof(int16_t); - auto pred = simd_mask(1); simd add = simd(1) * sizeof(int); // CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> undef, i32 0, <16 x i32> undef) auto res_slm_atomic_0 = - slm_atomic_update(offsets, add, pred); + slm_atomic_update(offsets, add); } // Expect DWORD for fmin. { @@ -934,6 +1013,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, offsets_view.select(), swap_view.select(), compare_view.select()); + // Expect LSC for short. + { + constexpr int VL = 16; + simd offsets = simd(1) * sizeof(int16_t); + auto compare = simd(VL, 1); + auto swap = compare * 2; + + // CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, i32 0, <16 x i32> undef) + auto res_slm_atomic_0 = + slm_atomic_update(offsets, swap, + compare); + } + // Expect LSC for int64_t. { constexpr int VL = 16; @@ -964,6 +1056,15 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, local_acc, offsets_view.select(), pred); auto res_slm_atomic_6 = atomic_update( local_acc, offsets_view.select()); + + // Expect LSC for short. + { + using LocalAccType = sycl::local_accessor; + LocalAccType *local_acc = nullptr; + // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef) + auto res_slm_atomic_1 = + atomic_update(*local_acc, offsets); + } } // One operand atomic. { @@ -997,6 +1098,16 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, pred); res_slm_atomic_8 = atomic_update( local_acc, offsets_view.select(), add_view.select()); + + // Expect LSC for short. + { + using LocalAccType = sycl::local_accessor; + LocalAccType *local_acc = nullptr; + simd add = simd(1) * sizeof(int); + // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef) + auto res_slm_atomic_1 = + atomic_update(*local_acc, offsets, add); + } } // Two operand atomic. { @@ -1069,6 +1180,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, res_slm_atomic_16 = atomic_update( local_acc, offsets_view.select(), swap_view.select(), compare_view.select()); + + // Expect LSC for short. + { + using LocalAccType = sycl::local_accessor; + LocalAccType *local_acc = nullptr; + auto compare = simd(VL, 1); + auto swap = compare * 2; + // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef) + auto res_slm_atomic_1 = atomic_update( + *local_acc, offsets, swap, compare); + } } }