From 00b210e67396f3b36ac23d4e560b880f7298f62f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?=
 <alberto.cabrera@codeplay.com>
Date: Thu, 11 Apr 2024 15:55:36 +0100
Subject: [PATCH 1/3] [SYCL][COMPAT] shared memory test split from main tests
 (#13063)

Some tests were being completely skipped entirely because a part of them
required `aspect::usm_shared_allocations`.
This PR splits the tests in various files to disable only the usages of
shared memory.
---
 .../test-e2e/syclcompat/math/math_complex.cpp |  11 +-
 .../syclcompat/memory/memory_fixt.hpp         |  10 +-
 .../memory/memory_management_shared.cpp       |  91 ++++++++++++++
 .../memory/memory_management_test2.cpp        |  42 -------
 .../syclcompat/memory/usm_allocations.cpp     |  73 ++---------
 .../memory/usm_shared_allocations.cpp         | 113 ++++++++++++++++++
 6 files changed, 225 insertions(+), 115 deletions(-)
 create mode 100644 sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp
 create mode 100644 sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp
diff --git a/sycl/test-e2e/syclcompat/math/math_complex.cpp b/sycl/test-e2e/syclcompat/math/math_complex.cpp
index cd81246f07ca..3b7dba8b2496 100644
--- a/sycl/test-e2e/syclcompat/math/math_complex.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_complex.cpp
@@ -30,8 +30,7 @@
 //===---------------------------------------------------------------===//
 
 // REQUIRES: aspect-fp64
-// REQUIRES: usm_shared_allocations
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <complex>
@@ -72,18 +71,20 @@ template <auto F> class ComplexLauncher {
 protected:
   int *result_;
   int cpu_result_{0};
+  int h_result_;
 
 public:
   ComplexLauncher() {
-    result_ = (int *)syclcompat::malloc_shared(sizeof(int));
-    *result_ = 0;
+    result_ = (int *)syclcompat::malloc(sizeof(int));
+    syclcompat::memset(result_, 0, sizeof(int));
   };
   ~ComplexLauncher() { syclcompat::free(result_); }
   void launch() {
     F(&cpu_result_);                      // Run on host
     syclcompat::launch<F>(1, 1, result_); // Run on device
     syclcompat::wait();
-    assert(*result_ == 1);
+    syclcompat::memcpy<int>(&h_result_, result_, 1);
+    assert(h_result_ == 1);
     assert(cpu_result_ == 1);
   }
 };
diff --git a/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp b/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp
index 7c613e25a4a4..e5b8c6ef3797 100644
--- a/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp
+++ b/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp
@@ -123,7 +123,7 @@ template <typename T> struct USMTest {
         skip{should_skip<T>(syclcompat::get_current_device())} {}
 
   void launch_kernel() {
-    auto &dd_A = d_A;
+    auto &dd_A = data;
     return q_
         .submit([&](sycl::handler &cgh) {
           cgh.parallel_for(
@@ -135,15 +135,15 @@ template <typename T> struct USMTest {
   // Check result is identity vector
   // Handles memcpy for USM device alloc
   void check_result() {
-    sycl::usm::alloc ptr_type = sycl::get_pointer_type(d_A, q_.get_context());
+    sycl::usm::alloc ptr_type = sycl::get_pointer_type(data, q_.get_context());
     assert(ptr_type != sycl::usm::alloc::unknown);
 
     T *result;
     if (ptr_type == sycl::usm::alloc::device) {
       result = static_cast<T *>(std::malloc(sizeof(T) * size_));
-      syclcompat::memcpy(result, d_A, sizeof(T) * size_);
+      syclcompat::memcpy(result, data, sizeof(T) * size_);
     } else {
-      result = d_A;
+      result = data;
     }
 
     for (size_t i = 0; i < size_; i++) {
@@ -157,7 +157,7 @@ template <typename T> struct USMTest {
   sycl::queue q_;
   syclcompat::dim3 const grid_;
   syclcompat::dim3 const thread_;
-  T *d_A;
+  T *data;
   size_t size_;
   bool skip;
 };
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp
new file mode 100644
index 000000000000..b9641c6864f9
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  memory_management_shared.cpp
+ *
+ *  Description:
+ *    memory operations tests with shared memory
+ **************************************************************************/
+
+// The original source was under the license below:
+// ====------ memory_management_test2.cpp---------- -*- C++ -* ----===////
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===----------------------------------------------------------------------===//
+
+// REQUIRES: usm_shared_allocations
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/sycl.hpp>
+
+#include <syclcompat/memory.hpp>
+
+#include "../common.hpp"
+#include "memory_common.hpp"
+
+constexpr size_t DataW = 100;
+constexpr size_t DataH = 100;
+
+void test_shared_memory() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  syclcompat::shared_memory<float, 1> s_A(DataW);
+  syclcompat::shared_memory<float, 1> s_B(DataW);
+  syclcompat::shared_memory<float, 1> s_C(DataW);
+
+  s_A.init();
+  s_B.init();
+  s_C.init();
+
+  for (int i = 0; i < DataW; i++) {
+    s_A[i] = 1.0f;
+    s_B[i] = 2.0f;
+  }
+
+  {
+    syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
+      float *d_A = s_A.get_ptr();
+      float *d_B = s_B.get_ptr();
+      float *d_C = s_C.get_ptr();
+      cgh.parallel_for(sycl::range<1>(DataW), [=](sycl::id<1> id) {
+        int i = id[0];
+        float *A = d_A;
+        float *B = d_B;
+        float *C = d_C;
+        C[i] = A[i] + B[i];
+      });
+    });
+    syclcompat::get_default_queue().wait_and_throw();
+  }
+
+  // verify hostD
+  for (int i = 0; i < DataW; i++) {
+    for (int j = 0; j < DataH; j++) {
+      assert(fabs(s_C[i] - s_A[i] - s_B[i]) <= 1e-5);
+    }
+  }
+}
+
+int main() {
+  test_shared_memory();
+
+  return 0;
+}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp
index 9774faa8e8ba..afb1f6a5f5a8 100644
--- a/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp
+++ b/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp
@@ -30,7 +30,6 @@
 //
 // ===----------------------------------------------------------------------===//
 
-// REQUIRES: usm_shared_allocations
 // RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
 // RUN: %{run} %t.out
 
@@ -217,46 +216,6 @@ void test_global_memory() {
   }
 }
 
-void test_shared_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::shared_memory<float, 1> s_A(DataW);
-  syclcompat::shared_memory<float, 1> s_B(DataW);
-  syclcompat::shared_memory<float, 1> s_C(DataW);
-
-  s_A.init();
-  s_B.init();
-  s_C.init();
-
-  for (int i = 0; i < DataW; i++) {
-    s_A[i] = 1.0f;
-    s_B[i] = 2.0f;
-  }
-
-  {
-    syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
-      float *d_A = s_A.get_ptr();
-      float *d_B = s_B.get_ptr();
-      float *d_C = s_C.get_ptr();
-      cgh.parallel_for(sycl::range<1>(DataW), [=](sycl::id<1> id) {
-        int i = id[0];
-        float *A = d_A;
-        float *B = d_B;
-        float *C = d_C;
-        C[i] = A[i] + B[i];
-      });
-    });
-    syclcompat::get_default_queue().wait_and_throw();
-  }
-
-  // verify hostD
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      assert(fabs(s_C[i] - s_A[i] - s_B[i]) <= 1e-5);
-    }
-  }
-}
-
 void test_constant_memory() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
@@ -366,7 +325,6 @@ int main() {
   test_memcpy_pitched_q();
 
   test_global_memory();
-  test_shared_memory();
   test_constant_memory();
   return 0;
 }
diff --git a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp
index fb8a8a52da10..78eb46581986 100644
--- a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp
+++ b/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp
@@ -20,7 +20,6 @@
  *    USM allocation tests
  **************************************************************************/
 
-// REQUIRES: usm_shared_allocations
 // RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
 // RUN: %{run} %t.out
 
@@ -41,10 +40,10 @@ template <typename T> void test_malloc() {
   if (usm_fixture.skip)
     return; // Skip unsupported
 
-  usm_fixture.d_A = syclcompat::malloc<T>(usm_fixture.size_);
+  usm_fixture.data = syclcompat::malloc<T>(usm_fixture.size_);
   usm_fixture.launch_kernel();
   usm_fixture.check_result();
-  syclcompat::free(usm_fixture.d_A);
+  syclcompat::free(usm_fixture.data);
 }
 
 template <typename T> void test_host() {
@@ -52,66 +51,37 @@ template <typename T> void test_host() {
   USMTest<T> usm_fixture;
   if (usm_fixture.skip)
     return; // Skip unsupported
-
   if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_host_allocations))
     return; // Skip unsupported
-  usm_fixture.d_A = syclcompat::malloc_host<T>(usm_fixture.size_);
-  usm_fixture.launch_kernel();
-  usm_fixture.check_result();
-  syclcompat::free(usm_fixture.d_A);
-}
-
-template <typename T> void test_shared() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  USMTest<T> usm_fixture;
-  if (usm_fixture.skip)
-    return; // Skip unsupported
 
-  if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_shared_allocations))
-    return; // Skip unsupported
-  usm_fixture.d_A = syclcompat::malloc_shared<T>(usm_fixture.size_);
+  usm_fixture.data = syclcompat::malloc_host<T>(usm_fixture.size_);
   usm_fixture.launch_kernel();
   usm_fixture.check_result();
-  syclcompat::free(usm_fixture.d_A);
+  syclcompat::free(usm_fixture.data);
 }
 
 void test_non_templated_malloc() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
   USMTest<int> usm_fixture;
 
-  usm_fixture.d_A =
+  usm_fixture.data =
       static_cast<int *>(syclcompat::malloc(usm_fixture.size_ * sizeof(int)));
   usm_fixture.launch_kernel();
   usm_fixture.check_result();
-  syclcompat::free(usm_fixture.d_A);
+  syclcompat::free(usm_fixture.data);
 }
 
 void test_non_templated_host() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
   USMTest<int> usm_fixture;
-
   if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_host_allocations))
     return; // Skip unsupported
 
-  usm_fixture.d_A = static_cast<int *>(
+  usm_fixture.data = static_cast<int *>(
       syclcompat::malloc_host(usm_fixture.size_ * sizeof(int)));
   usm_fixture.launch_kernel();
   usm_fixture.check_result();
-  syclcompat::free(usm_fixture.d_A);
-}
-
-void test_non_templated_shared() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  USMTest<int> usm_fixture;
-
-  if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_shared_allocations))
-    return;
-
-  usm_fixture.d_A = static_cast<int *>(
-      syclcompat::malloc_shared(usm_fixture.size_ * sizeof(int)));
-  usm_fixture.launch_kernel();
-  usm_fixture.check_result();
-  syclcompat::free(usm_fixture.d_A);
+  syclcompat::free(usm_fixture.data);
 }
 
 // Test deduce direction
@@ -120,11 +90,12 @@ void test_deduce() {
 
   using memcpy_direction = syclcompat::detail::memcpy_direction;
   auto default_queue = syclcompat::get_default_queue();
+  if (!default_queue.get_device().has(sycl::aspect::usm_host_allocations))
+    return; // Skip unsupported
 
   int *h_ptr = (int *)syclcompat::malloc_host(sizeof(int));
   int *sys_ptr = (int *)std::malloc(sizeof(int));
   int *d_ptr = (int *)syclcompat::malloc(sizeof(int));
-  int *s_ptr = (int *)syclcompat::malloc_shared(sizeof(int));
 
   // * to host
   assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr,
@@ -135,9 +106,6 @@ void test_deduce() {
   assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr,
                                                      d_ptr) ==
          memcpy_direction::device_to_device);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr,
-                                                     s_ptr) ==
-         memcpy_direction::device_to_device);
 
   // * to sys
   assert(syclcompat::detail::deduce_memcpy_direction(
@@ -148,8 +116,6 @@ void test_deduce() {
   assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr,
                                                      d_ptr) ==
          memcpy_direction::device_to_host);
-  assert(syclcompat::detail::deduce_memcpy_direction(
-             default_queue, sys_ptr, s_ptr) == memcpy_direction::host_to_host);
 
   // * to dev
   assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr,
@@ -161,39 +127,20 @@ void test_deduce() {
   assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr,
                                                      d_ptr) ==
          memcpy_direction::device_to_device);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr,
-                                                     s_ptr) ==
-         memcpy_direction::device_to_device);
-
-  // * to shared
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
-                                                     h_ptr) ==
-         memcpy_direction::device_to_device);
-  assert(syclcompat::detail::deduce_memcpy_direction(
-             default_queue, s_ptr, sys_ptr) == memcpy_direction::host_to_host);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
-                                                     d_ptr) ==
-         memcpy_direction::device_to_device);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
-                                                     s_ptr) ==
-         memcpy_direction::device_to_device);
 
   std::free(sys_ptr);
   syclcompat::free(h_ptr);
   syclcompat::free(d_ptr);
-  syclcompat::free(s_ptr);
 }
 
 int main() {
   INSTANTIATE_ALL_TYPES(value_type_list, test_malloc);
   INSTANTIATE_ALL_TYPES(value_type_list, test_host);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_shared);
 
   // Avoid combinatorial explosion by only testing non-templated
   // syclcompat::malloc with int type
   test_non_templated_malloc();
   test_non_templated_host();
-  test_non_templated_shared();
 
   test_deduce();
 
diff --git a/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp
new file mode 100644
index 000000000000..6e5f7bcfef34
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp
@@ -0,0 +1,113 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  usm_allocations.cpp
+ *
+ *  Description:
+ *    USM allocation tests
+ **************************************************************************/
+
+// REQUIRES: usm_shared_allocations
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <cassert>
+#include <numeric>
+
+#include <sycl/sycl.hpp>
+
+#include <syclcompat/memory.hpp>
+
+#include "../common.hpp"
+#include "memory_common.hpp"
+#include "memory_fixt.hpp"
+
+template <typename T> void test_shared() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+  USMTest<T> usm_fixture;
+
+  if (usm_fixture.skip)
+    return; // Skip unsupported
+
+  usm_fixture.data = syclcompat::malloc_shared<T>(usm_fixture.size_);
+  usm_fixture.launch_kernel();
+  usm_fixture.check_result();
+  syclcompat::free(usm_fixture.data);
+}
+
+void test_non_templated_shared() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+  USMTest<int> usm_fixture;
+
+  usm_fixture.data = static_cast<int *>(
+      syclcompat::malloc_shared(usm_fixture.size_ * sizeof(int)));
+  usm_fixture.launch_kernel();
+  usm_fixture.check_result();
+  syclcompat::free(usm_fixture.data);
+}
+
+// Test deduce direction
+void test_deduce_shared() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  using memcpy_direction = syclcompat::detail::memcpy_direction;
+  auto default_queue = syclcompat::get_default_queue();
+
+  int *h_ptr = (int *)syclcompat::malloc_host(sizeof(int));
+  int *sys_ptr = (int *)std::malloc(sizeof(int));
+  int *d_ptr = (int *)syclcompat::malloc(sizeof(int));
+  int *s_ptr = (int *)syclcompat::malloc_shared(sizeof(int));
+
+  // * to host
+  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr,
+                                                     s_ptr) ==
+         memcpy_direction::device_to_device);
+
+  // * to sys
+  assert(syclcompat::detail::deduce_memcpy_direction(
+             default_queue, sys_ptr, s_ptr) == memcpy_direction::host_to_host);
+
+  // * to dev
+  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr,
+                                                     s_ptr) ==
+         memcpy_direction::device_to_device);
+
+  // * to shared
+  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
+                                                     h_ptr) ==
+         memcpy_direction::device_to_device);
+  assert(syclcompat::detail::deduce_memcpy_direction(
+             default_queue, s_ptr, sys_ptr) == memcpy_direction::host_to_host);
+  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
+                                                     d_ptr) ==
+         memcpy_direction::device_to_device);
+  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
+                                                     s_ptr) ==
+         memcpy_direction::device_to_device);
+
+  syclcompat::free(s_ptr);
+  std::free(sys_ptr);
+  syclcompat::free(h_ptr);
+  syclcompat::free(d_ptr);
+}
+
+int main() {
+  INSTANTIATE_ALL_TYPES(value_type_list, test_shared);
+  test_non_templated_shared();
+  test_deduce_shared();
+
+  return 0;
+}

From 05644a470303c2af3385b9533b8d23ebdea99eb7 Mon Sep 17 00:00:00 2001
From: Chunyang Dai <chunyang.dai@intel.com>
Date: Thu, 11 Apr 2024 23:51:33 +0800
Subject: [PATCH 2/3] [OpenCL] Config dependent-load flag to exclude CWD from
 DLL search path (#13327)

This change is to avoid DLL hijacking security issue.
---
 opencl/opencl-aot/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/opencl/opencl-aot/CMakeLists.txt b/opencl/opencl-aot/CMakeLists.txt
index ff905fd883e6..a05f094df971 100644
--- a/opencl/opencl-aot/CMakeLists.txt
+++ b/opencl/opencl-aot/CMakeLists.txt
@@ -12,6 +12,11 @@ set(OPENCL_AOT_PROJECT_NAME opencl-aot)
 
 add_llvm_tool(${OPENCL_AOT_PROJECT_NAME} ${TARGET_SOURCES})
 
+if (WIN32)
+  # 0x2000: exclude CWD from DLL loading path
+  target_link_options(${OPENCL_AOT_PROJECT_NAME} PRIVATE "/DEPENDENTLOADFLAG:0x2000")
+endif()
+
 if(NOT MSVC)
   # FIXME: when built with clang it produces a warning.
   target_compile_options(${OPENCL_AOT_PROJECT_NAME} PRIVATE "-Wno-unused-parameter")

From 5332773b17efbf10e1b72cd633c1d7e2b4f75125 Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Fri, 12 Apr 2024 01:51:19 +0900
Subject: [PATCH 3/3] [SYCL][ESIMD] atomic_update with data size less than 4
 bytes should use LSC atomics (#13340)

SVM doesn't support less than 4 bytes on Gen12, we either get an error
or the wrong answer.

---------

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 sycl/include/sycl/ext/intel/esimd/memory.hpp |  32 ++---
 sycl/test/esimd/memory_properties.cpp        | 126 ++++++++++++++++++-
 2 files changed, 142 insertions(+), 16 deletions(-)

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 7ef701b7edc8..188bce9d59b6 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -5862,8 +5862,8 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
 template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
-  // supported only by LSC.
+  // 2 byte, 8 byte types, non-power of two, and operations wider than
+  // 32 are supported only by LSC.
   if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     return slm_atomic_update_impl<Op, T, N,
@@ -5942,8 +5942,8 @@ template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
                   simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
-  // supported only by LSC.
+  // 2 byte, 8 byte types, non-power of two, and operations wider than
+  // 32 are supported only by LSC.
   if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     // half and short are supported in LSC.
@@ -6031,8 +6031,8 @@ template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
                   simd<T, N> src1, simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
-  // supported only by LSC.
+  // 2 byte, 8 byte types, non-power of two, and operations wider than
+  // 32 are supported only by LSC.
   if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
@@ -6417,7 +6417,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd_mask<N> mask,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
         p, byte_offset, mask);
@@ -6640,7 +6640,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
                 (Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
                 (Op == atomic_op::fadd) || (Op == atomic_op::fsub) ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
         p, byte_offset, src0, mask);
@@ -6888,9 +6888,11 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 
   // Use LSC atomic when cache hints are present, FP atomics is used,
-  // non-power of two length is used, or operation width greater than 32.
+  // non-power of two length is used, or operation width greater than 32, or the
+  // data size is less than 4 bytes.
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
+                sizeof(T) < 4) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
     // expected value first, then new value. But atomic_update uses reverse
     // order, hence the src1/src0 swap.
@@ -7116,7 +7118,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                !detail::isPowerOf2(N, 32)) {
+                !detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
         acc, byte_offset, mask);
@@ -7384,7 +7386,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
                 Op == atomic_op::fmin || Op == atomic_op::fmax ||
                 Op == atomic_op::fadd || Op == atomic_op::fsub ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
         acc, byte_offset, src0, mask);
@@ -7681,9 +7683,11 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
   static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
   // Use LSC atomic when cache hints are present, FP atomics is used,
-  // non-power of two length is used, or operation width greater than 32.
+  // non-power of two length is used, operation width greater than 32, or the
+  // data size is less than 4 bytes,
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
+                sizeof(T) < 4) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
     // expected value first, then new value. But atomic_update uses reverse
     // order, hence the src1/src0 swap.
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 2c69d3a69d78..b23697d91922 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -322,6 +322,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
           atomic_update<atomic_op::inc, int, VL>(ptr, offsets, pred);
     }
 
+    // Try with int16_t to check that LSC atomic is generated
+    // The result is later casted to int16, not captured here.
+    // CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef)
+    {
+      int16_t *ptr = 0;
+      constexpr int VL = 8;
+      simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+      auto atomic_res =
+          atomic_update<atomic_op::inc, int16_t, VL>(ptr, offsets);
+    }
+
     // Accessor
 
     // CHECK-STATEFUL:  call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
@@ -377,6 +388,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       auto atomic_res_acc =
           atomic_update<atomic_op::inc, int, VL>(acc, offsets, pred);
     }
+    // Try with int16_t to check that LSC atomic is generated
+    // The result is later casted to int16, not captured here.
+    // CHECK-STATEFUL:  call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 {{[^)]+}}, <8 x i32> undef)
+    // CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef)
+    {
+      using AccType =
+          sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
+      AccType *acc = nullptr;
+      constexpr int VL = 8;
+      simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+      auto atomic_res =
+          atomic_update<atomic_op::inc, int16_t, VL>(*acc, offsets);
+    }
   }
 
   // Test atomic update with one operand.
@@ -432,6 +456,18 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     auto res_atomic_8 =
         atomic_update<atomic_op::add, int>(ptr, offsets, add, pred);
 
+    // Try with int16_t to check that LSC atomic is generated
+    // The result is later casted to int16, not captured here.
+    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32>{{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    {
+      int16_t *ptr = 0;
+      constexpr int VL = 4;
+      simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+      auto add = simd<int16_t, VL>(5);
+      auto atomic_res =
+          atomic_update<atomic_op::add, int16_t, VL>(ptr, offsets, add);
+    }
+
     // Accessors
 
     // CHECK-STATEFUL-COUNT-14:  call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
@@ -483,6 +519,21 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.sub.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
     auto res_atomic_17 =
         atomic_update<atomic_op::sub, int>(acc, offsets, add, pred);
+
+    // Try with int16_t to check that LSC atomic is generated
+    // The result is later casted to int16, not captured here.
+    // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
+    // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    {
+      using AccType =
+          sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
+      AccType *acc = nullptr;
+      constexpr int VL = 4;
+      simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+      auto add = simd<int16_t, VL>(5);
+      auto atomic_res =
+          atomic_update<atomic_op::add, int16_t, VL>(*acc, offsets, add);
+    }
   }
 
   // Test atomic update with two operands.
@@ -626,6 +677,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     auto res_atomic_100 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets, swap, compare, pred);
 
+    // Try with int16_t to check that LSC atomic is generated
+    // The result is later casted to int16, not captured here.
+    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    {
+      int16_t *ptr = 0;
+      constexpr int VL = 4;
+      simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+      simd<int16_t, VL> swap = simd<int16_t, VL>(1) * sizeof(int);
+      auto compare = swap * 2;
+      auto atomic_res = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
+          ptr, offsets, swap, compare);
+    }
+
     // Accessors
 
     // CHECK-STATEFUL-COUNT-30:  call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
@@ -751,6 +815,22 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.cmpxchg.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
     auto res_atomic_33 = atomic_update<atomic_op::cmpxchg, int, VL>(
         acc, offsets, swap, compare, pred);
+
+    // Try with int16_t to check that LSC atomic is generated
+    // The result is later casted to int16, not captured here.
+    // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
+    // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    {
+      using AccType =
+          sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
+      AccType *acc = nullptr;
+      constexpr int VL = 4;
+      simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+      simd<int16_t, VL> swap = simd<int16_t, VL>(1) * sizeof(int);
+      auto compare = swap * 2;
+      auto atomic_res = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
+          *acc, offsets, compare, swap);
+    }
   }
 
   // Test slm_atomic_update without operands.
@@ -824,12 +904,11 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     {
       constexpr int VL = 16;
       simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
-      auto pred = simd_mask<VL>(1);
       simd<int16_t, VL> add = simd<int16_t, VL>(1) * sizeof(int);
 
       // CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> undef, i32 0, <16 x i32> undef)
       auto res_slm_atomic_0 =
-          slm_atomic_update<atomic_op::add, int16_t>(offsets, add, pred);
+          slm_atomic_update<atomic_op::add, int16_t>(offsets, add);
     }
     // Expect DWORD for fmin.
     {
@@ -934,6 +1013,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
         offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
         compare_view.select<VL, 1>());
 
+    // Expect LSC for short.
+    {
+      constexpr int VL = 16;
+      simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+      auto compare = simd<int16_t, VL>(VL, 1);
+      auto swap = compare * 2;
+
+      // CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, i32 0, <16 x i32> undef)
+      auto res_slm_atomic_0 =
+          slm_atomic_update<atomic_op::cmpxchg, int16_t, VL>(offsets, swap,
+                                                             compare);
+    }
+
     // Expect LSC for int64_t.
     {
       constexpr int VL = 16;
@@ -964,6 +1056,15 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
         local_acc, offsets_view.select<VL, 1>(), pred);
     auto res_slm_atomic_6 = atomic_update<atomic_op::inc, int, VL>(
         local_acc, offsets_view.select<VL, 1>());
+
+    // Expect LSC for short.
+    {
+      using LocalAccType = sycl::local_accessor<int16_t, 1>;
+      LocalAccType *local_acc = nullptr;
+      // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
+      auto res_slm_atomic_1 =
+          atomic_update<atomic_op::inc, int16_t>(*local_acc, offsets);
+    }
   }
   // One operand atomic.
   {
@@ -997,6 +1098,16 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
         pred);
     res_slm_atomic_8 = atomic_update<atomic_op::add, int, VL>(
         local_acc, offsets_view.select<VL, 1>(), add_view.select<VL, 1>());
+
+    // Expect LSC for short.
+    {
+      using LocalAccType = sycl::local_accessor<int16_t, 1>;
+      LocalAccType *local_acc = nullptr;
+      simd<int16_t, VL> add = simd<int16_t, VL>(1) * sizeof(int);
+      // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+      auto res_slm_atomic_1 =
+          atomic_update<atomic_op::add, int16_t>(*local_acc, offsets, add);
+    }
   }
   // Two operand atomic.
   {
@@ -1069,6 +1180,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     res_slm_atomic_16 = atomic_update<atomic_op::cmpxchg, int, VL>(
         local_acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
         compare_view.select<VL, 1>());
+
+    // Expect LSC for short.
+    {
+      using LocalAccType = sycl::local_accessor<int16_t, 1>;
+      LocalAccType *local_acc = nullptr;
+      auto compare = simd<int16_t, VL>(VL, 1);
+      auto swap = compare * 2;
+      // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+      auto res_slm_atomic_1 = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
+          *local_acc, offsets, swap, compare);
+    }
   }
 }