Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/sycl' into duncan/remove-extra-p…
Browse files Browse the repository at this point in the history
…i_mem
  • Loading branch information
DBDuncan committed Apr 16, 2024
2 parents ba78536 + 5332773 commit b4c0e15
Show file tree
Hide file tree
Showing 9 changed files with 372 additions and 131 deletions.
5 changes: 5 additions & 0 deletions opencl/opencl-aot/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ set(OPENCL_AOT_PROJECT_NAME opencl-aot)

add_llvm_tool(${OPENCL_AOT_PROJECT_NAME} ${TARGET_SOURCES})

if (WIN32)
# 0x2000: exclude CWD from DLL loading path
target_link_options(${OPENCL_AOT_PROJECT_NAME} PRIVATE "/DEPENDENTLOADFLAG:0x2000")
endif()

if(NOT MSVC)
# FIXME: when built with clang it produces a warning.
target_compile_options(${OPENCL_AOT_PROJECT_NAME} PRIVATE "-Wno-unused-parameter")
Expand Down
32 changes: 18 additions & 14 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5862,8 +5862,8 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
// supported only by LSC.
// 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
return slm_atomic_update_impl<Op, T, N,
Expand Down Expand Up @@ -5942,8 +5942,8 @@ template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
// supported only by LSC.
// 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
// half and short are supported in LSC.
Expand Down Expand Up @@ -6031,8 +6031,8 @@ template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
simd<T, N> src1, simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
// supported only by LSC.
// 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
Expand Down Expand Up @@ -6417,7 +6417,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd_mask<N> mask,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

if constexpr (detail::has_cache_hints<PropertyListT>() ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
!__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
p, byte_offset, mask);
Expand Down Expand Up @@ -6640,7 +6640,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
if constexpr (detail::has_cache_hints<PropertyListT>() ||
(Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
(Op == atomic_op::fadd) || (Op == atomic_op::fsub) ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
!__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
p, byte_offset, src0, mask);
Expand Down Expand Up @@ -6888,9 +6888,11 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

// Use LSC atomic when cache hints are present, FP atomics is used,
// non-power of two length is used, or operation width greater than 32.
// non-power of two length is used, or operation width greater than 32, or the
// data size is less than 4 bytes.
if constexpr (detail::has_cache_hints<PropertyListT>() ||
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
sizeof(T) < 4) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
// expected value first, then new value. But atomic_update uses reverse
// order, hence the src1/src0 swap.
Expand Down Expand Up @@ -7116,7 +7118,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

if constexpr (detail::has_cache_hints<PropertyListT>() ||
!detail::isPowerOf2(N, 32)) {
!detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
acc, byte_offset, mask);
Expand Down Expand Up @@ -7384,7 +7386,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
if constexpr (detail::has_cache_hints<PropertyListT>() ||
Op == atomic_op::fmin || Op == atomic_op::fmax ||
Op == atomic_op::fadd || Op == atomic_op::fsub ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
!__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
acc, byte_offset, src0, mask);
Expand Down Expand Up @@ -7681,9 +7683,11 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
// Use LSC atomic when cache hints are present, FP atomics is used,
// non-power of two length is used, or operation width greater than 32.
// non-power of two length is used, operation width greater than 32, or the
// data size is less than 4 bytes,
if constexpr (detail::has_cache_hints<PropertyListT>() ||
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
sizeof(T) < 4) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
// expected value first, then new value. But atomic_update uses reverse
// order, hence the src1/src0 swap.
Expand Down
11 changes: 6 additions & 5 deletions sycl/test-e2e/syclcompat/math/math_complex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@
//===---------------------------------------------------------------===//

// REQUIRES: aspect-fp64
// REQUIRES: usm_shared_allocations
// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
// RUN: %{run} %t.out

#include <complex>
Expand Down Expand Up @@ -72,18 +71,20 @@ template <auto F> class ComplexLauncher {
protected:
int *result_;
int cpu_result_{0};
int h_result_;

public:
ComplexLauncher() {
result_ = (int *)syclcompat::malloc_shared(sizeof(int));
*result_ = 0;
result_ = (int *)syclcompat::malloc(sizeof(int));
syclcompat::memset(result_, 0, sizeof(int));
};
~ComplexLauncher() { syclcompat::free(result_); }
void launch() {
F(&cpu_result_); // Run on host
syclcompat::launch<F>(1, 1, result_); // Run on device
syclcompat::wait();
assert(*result_ == 1);
syclcompat::memcpy<int>(&h_result_, result_, 1);
assert(h_result_ == 1);
assert(cpu_result_ == 1);
}
};
Expand Down
10 changes: 5 additions & 5 deletions sycl/test-e2e/syclcompat/memory/memory_fixt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ template <typename T> struct USMTest {
skip{should_skip<T>(syclcompat::get_current_device())} {}

void launch_kernel() {
auto &dd_A = d_A;
auto &dd_A = data;
return q_
.submit([&](sycl::handler &cgh) {
cgh.parallel_for(
Expand All @@ -135,15 +135,15 @@ template <typename T> struct USMTest {
// Check result is identity vector
// Handles memcpy for USM device alloc
void check_result() {
sycl::usm::alloc ptr_type = sycl::get_pointer_type(d_A, q_.get_context());
sycl::usm::alloc ptr_type = sycl::get_pointer_type(data, q_.get_context());
assert(ptr_type != sycl::usm::alloc::unknown);

T *result;
if (ptr_type == sycl::usm::alloc::device) {
result = static_cast<T *>(std::malloc(sizeof(T) * size_));
syclcompat::memcpy(result, d_A, sizeof(T) * size_);
syclcompat::memcpy(result, data, sizeof(T) * size_);
} else {
result = d_A;
result = data;
}

for (size_t i = 0; i < size_; i++) {
Expand All @@ -157,7 +157,7 @@ template <typename T> struct USMTest {
sycl::queue q_;
syclcompat::dim3 const grid_;
syclcompat::dim3 const thread_;
T *d_A;
T *data;
size_t size_;
bool skip;
};
Expand Down
91 changes: 91 additions & 0 deletions sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/***************************************************************************
*
* Copyright (C) Codeplay Software Ltd.
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM
* Exceptions. See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SYCLcompat API
*
* memory_management_shared.cpp
*
* Description:
* memory operations tests with shared memory
**************************************************************************/

// The original source was under the license below:
// ====------ memory_management_test2.cpp---------- -*- C++ -* ----===////
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//
// ===----------------------------------------------------------------------===//

// REQUIRES: usm_shared_allocations
// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
// RUN: %{run} %t.out

#include <sycl/sycl.hpp>

#include <syclcompat/memory.hpp>

#include "../common.hpp"
#include "memory_common.hpp"

constexpr size_t DataW = 100;
constexpr size_t DataH = 100;

void test_shared_memory() {
std::cout << __PRETTY_FUNCTION__ << std::endl;

syclcompat::shared_memory<float, 1> s_A(DataW);
syclcompat::shared_memory<float, 1> s_B(DataW);
syclcompat::shared_memory<float, 1> s_C(DataW);

s_A.init();
s_B.init();
s_C.init();

for (int i = 0; i < DataW; i++) {
s_A[i] = 1.0f;
s_B[i] = 2.0f;
}

{
syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
float *d_A = s_A.get_ptr();
float *d_B = s_B.get_ptr();
float *d_C = s_C.get_ptr();
cgh.parallel_for(sycl::range<1>(DataW), [=](sycl::id<1> id) {
int i = id[0];
float *A = d_A;
float *B = d_B;
float *C = d_C;
C[i] = A[i] + B[i];
});
});
syclcompat::get_default_queue().wait_and_throw();
}

// verify hostD
for (int i = 0; i < DataW; i++) {
for (int j = 0; j < DataH; j++) {
assert(fabs(s_C[i] - s_A[i] - s_B[i]) <= 1e-5);
}
}
}

int main() {
test_shared_memory();

return 0;
}
42 changes: 0 additions & 42 deletions sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
//
// ===----------------------------------------------------------------------===//

// REQUIRES: usm_shared_allocations
// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
// RUN: %{run} %t.out

Expand Down Expand Up @@ -217,46 +216,6 @@ void test_global_memory() {
}
}

void test_shared_memory() {
std::cout << __PRETTY_FUNCTION__ << std::endl;

syclcompat::shared_memory<float, 1> s_A(DataW);
syclcompat::shared_memory<float, 1> s_B(DataW);
syclcompat::shared_memory<float, 1> s_C(DataW);

s_A.init();
s_B.init();
s_C.init();

for (int i = 0; i < DataW; i++) {
s_A[i] = 1.0f;
s_B[i] = 2.0f;
}

{
syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
float *d_A = s_A.get_ptr();
float *d_B = s_B.get_ptr();
float *d_C = s_C.get_ptr();
cgh.parallel_for(sycl::range<1>(DataW), [=](sycl::id<1> id) {
int i = id[0];
float *A = d_A;
float *B = d_B;
float *C = d_C;
C[i] = A[i] + B[i];
});
});
syclcompat::get_default_queue().wait_and_throw();
}

// verify hostD
for (int i = 0; i < DataW; i++) {
for (int j = 0; j < DataH; j++) {
assert(fabs(s_C[i] - s_A[i] - s_B[i]) <= 1e-5);
}
}
}

void test_constant_memory() {
std::cout << __PRETTY_FUNCTION__ << std::endl;

Expand Down Expand Up @@ -366,7 +325,6 @@ int main() {
test_memcpy_pitched_q();

test_global_memory();
test_shared_memory();
test_constant_memory();
return 0;
}
Loading

0 comments on commit b4c0e15

Please sign in to comment.