Skip to content

Commit

Permalink
rm dpct warning, use ref of uninitialized_array, reduce test code dup
Browse files Browse the repository at this point in the history
Co-authored-by: Terry Cojean <terry.cojean@kit.edu>
Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
  • Loading branch information
3 people committed Aug 3, 2021
1 parent f888bd1 commit a5b12b6
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 200 deletions.
47 changes: 9 additions & 38 deletions cuda/test/solver/idr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,6 @@ class Idr : public ::testing::Test {
ref = gko::ReferenceExecutor::create();
cuda = gko::CudaExecutor::create(0, ref);

mtx = gen_mtx(123, 123);
d_mtx = Mtx::create(cuda);
d_mtx->copy_from(mtx.get());
cuda_idr_factory =
Solver::build()
.with_deterministic(true)
Expand Down Expand Up @@ -100,11 +97,11 @@ class Idr : public ::testing::Test {
std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
}

void initialize_data()
void initialize_data(int size = 597, int input_nrhs = 17)
{
int size = 597;
nrhs = 17;
nrhs = input_nrhs;
int s = 4;
mtx = gen_mtx(size, size);
x = gen_mtx(size, nrhs);
b = gen_mtx(size, nrhs);
r = gen_mtx(size, nrhs);
Expand All @@ -125,6 +122,7 @@ class Idr : public ::testing::Test {
stop_status->get_data()[i].reset();
}

d_mtx = Mtx::create(cuda);
d_x = Mtx::create(cuda);
d_b = Mtx::create(cuda);
d_r = Mtx::create(cuda);
Expand All @@ -142,6 +140,7 @@ class Idr : public ::testing::Test {
d_stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
new gko::Array<gko::stopping_status>(cuda));

d_mtx->copy_from(mtx.get());
d_x->copy_from(x.get());
d_b->copy_from(b.get());
d_r->copy_from(r.get());
Expand Down Expand Up @@ -291,16 +290,9 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef)

TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef)
{
int m = 123;
int n = 1;
initialize_data(123, 1);
auto ref_solver = ref_idr_factory->generate(mtx);
auto cuda_solver = cuda_idr_factory->generate(d_mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(cuda);
auto d_x = Mtx::create(cuda);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
cuda_solver->apply(d_b.get(), d_x.get());
Expand All @@ -312,8 +304,7 @@ TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef)

TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
{
int m = 123;
int n = 1;
initialize_data(123, 1);
cuda_idr_factory =
Solver::build()
.with_deterministic(true)
Expand All @@ -330,12 +321,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
.on(ref);
auto ref_solver = ref_idr_factory->generate(mtx);
auto cuda_solver = cuda_idr_factory->generate(d_mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(cuda);
auto d_x = Mtx::create(cuda);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
cuda_solver->apply(d_b.get(), d_x.get());
Expand All @@ -347,16 +332,9 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)

TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef)
{
int m = 123;
int n = 16;
initialize_data(123, 16);
auto cuda_solver = cuda_idr_factory->generate(d_mtx);
auto ref_solver = ref_idr_factory->generate(mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(cuda);
auto d_x = Mtx::create(cuda);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
cuda_solver->apply(d_b.get(), d_x.get());
Expand All @@ -368,8 +346,7 @@ TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef)

TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef)
{
int m = 123;
int n = 16;
initialize_data(123, 16);
cuda_idr_factory =
Solver::build()
.with_deterministic(true)
Expand All @@ -386,12 +363,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef)
.on(ref);
auto cuda_solver = cuda_idr_factory->generate(d_mtx);
auto ref_solver = ref_idr_factory->generate(mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(cuda);
auto d_x = Mtx::create(cuda);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
cuda_solver->apply(d_b.get(), d_x.get());
Expand Down
63 changes: 15 additions & 48 deletions dpcpp/solver/idr_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,44 +111,31 @@ template <size_type block_size, typename ValueType>
void orthonormalize_subspace_vectors_kernel(
size_type num_rows, size_type num_cols, ValueType *__restrict__ values,
size_type stride, sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, block_size> *reduction_helper_array,
UninitializedArray<ValueType, block_size> &reduction_helper_array,
remove_complex<ValueType> *reduction_helper_real)
{
const auto tidx = thread::get_thread_id_flat(item_ct1);


ValueType *__restrict__ reduction_helper = (*reduction_helper_array);

ValueType *__restrict__ reduction_helper = reduction_helper_array;

for (size_type row = 0; row < num_rows; row++) {
for (size_type i = 0; i < row; i++) {
auto dot = zero<ValueType>();
// TODO: check with intel why we need this here.
// Is it from we use updated the value even if it is on the same
// thread?
item_ct1.barrier();
for (size_type j = tidx; j < num_cols; j += block_size) {
/*
DPCT1007:5: Migration of this CUDA API is not supported by the
Intel(R) DPC++ Compatibility Tool.
*/
dot += values[row * stride + j] * conj(values[i * stride + j]);
}
// TODO: check with intel why we need this here.
item_ct1.barrier();

reduction_helper[tidx] = dot;

/*
DPCT1065:3: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
better performance, if there is no access to global memory.
*/
item_ct1.barrier();
item_ct1.barrier(sycl::access::fence_space::local_space);
::gko::kernels::dpcpp::reduce(
group::this_thread_block(item_ct1), reduction_helper,
[](const ValueType &a, const ValueType &b) { return a + b; });
/*
DPCT1065:4: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
better performance, if there is no access to global memory.
*/
item_ct1.barrier();
item_ct1.barrier(sycl::access::fence_space::local_space);

dot = reduction_helper[0];
for (size_type j = tidx; j < num_cols; j += block_size) {
Expand All @@ -163,22 +150,12 @@ void orthonormalize_subspace_vectors_kernel(

reduction_helper_real[tidx] = norm;

/*
DPCT1065:1: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
better performance, if there is no access to global memory.
*/
item_ct1.barrier();
item_ct1.barrier(sycl::access::fence_space::local_space);
::gko::kernels::dpcpp::reduce(
group::this_thread_block(item_ct1), reduction_helper_real,
[](const remove_complex<ValueType> &a,
const remove_complex<ValueType> &b) { return a + b; });
/*
DPCT1065:2: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
better performance, if there is no access to global memory.
*/
item_ct1.barrier();
item_ct1.barrier(sycl::access::fence_space::local_space);

norm = std::sqrt(reduction_helper_real[0]);
for (size_type j = tidx; j < num_cols; j += block_size) {
Expand Down Expand Up @@ -206,10 +183,8 @@ void orthonormalize_subspace_vectors_kernel(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
orthonormalize_subspace_vectors_kernel<block_size>(
num_rows, num_cols, values, stride, item_ct1,
(UninitializedArray<ValueType, block_size> *)
reduction_helper_array_acc_ct1.get_pointer(),
(remove_complex<ValueType> *)
reduction_helper_real_acc_ct1.get_pointer());
*reduction_helper_array_acc_ct1.get_pointer(),
reduction_helper_real_acc_ct1.get_pointer().get());
});
});
}
Expand Down Expand Up @@ -377,7 +352,6 @@ void multidot_kernel(
: (item_ct1.get_group(1) + 1) * num;
// Used that way to get around dynamic initialization warning and
// template error when using `reduction_helper_array` directly in `reduce`

ValueType *__restrict__ reduction_helper = (*reduction_helper_array);

ValueType local_res = zero<ValueType>();
Expand All @@ -389,12 +363,7 @@ void multidot_kernel(
}
}
reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
/*
DPCT1065:10: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
performance, if there is no access to global memory.
*/
item_ct1.barrier();
item_ct1.barrier(sycl::access::fence_space::local_space);
local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
const auto tile_block = group::tiled_partition<default_dot_dim>(
group::this_thread_block(item_ct1));
Expand Down Expand Up @@ -426,9 +395,7 @@ void multidot_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory,
multidot_kernel(
num_rows, nrhs, p_i, g_k, g_k_stride, alpha, stop_status,
item_ct1,
(UninitializedArray<ValueType, default_dot_dim *(
default_dot_dim + 1)> *)
reduction_helper_array_acc_ct1.get_pointer());
reduction_helper_array_acc_ct1.get_pointer().get());
});
});
}
Expand Down
47 changes: 9 additions & 38 deletions dpcpp/test/solver/idr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,6 @@ class Idr : public ::testing::Test {
ref = gko::ReferenceExecutor::create();
dpcpp = gko::DpcppExecutor::create(0, ref);

mtx = gen_mtx(123, 123);
d_mtx = Mtx::create(dpcpp);
d_mtx->copy_from(mtx.get());
dpcpp_idr_factory =
Solver::build()
.with_deterministic(true)
Expand Down Expand Up @@ -110,11 +107,11 @@ class Idr : public ::testing::Test {
rand_engine, ref);
}

void initialize_data()
void initialize_data(int size = 597, int input_nrhs = 17)
{
int size = 597;
nrhs = 17;
nrhs = input_nrhs;
int s = 4;
mtx = gen_mtx(size, size);
x = gen_mtx(size, nrhs);
b = gen_mtx(size, nrhs);
r = gen_mtx(size, nrhs);
Expand All @@ -135,6 +132,7 @@ class Idr : public ::testing::Test {
stop_status->get_data()[i].reset();
}

d_mtx = Mtx::create(dpcpp);
d_x = Mtx::create(dpcpp);
d_b = Mtx::create(dpcpp);
d_r = Mtx::create(dpcpp);
Expand All @@ -152,6 +150,7 @@ class Idr : public ::testing::Test {
d_stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
new gko::Array<gko::stopping_status>(dpcpp));

d_mtx->copy_from(mtx.get());
d_x->copy_from(x.get());
d_b->copy_from(b.get());
d_r->copy_from(r.get());
Expand Down Expand Up @@ -301,16 +300,9 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef)

TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef)
{
int m = 123;
int n = 1;
initialize_data(123, 1);
auto ref_solver = ref_idr_factory->generate(mtx);
auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(dpcpp);
auto d_x = Mtx::create(dpcpp);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
dpcpp_solver->apply(d_b.get(), d_x.get());
Expand All @@ -322,8 +314,7 @@ TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef)

TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
{
int m = 123;
int n = 1;
initialize_data(123, 1);
dpcpp_idr_factory =
Solver::build()
.with_deterministic(true)
Expand All @@ -340,12 +331,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
.on(ref);
auto ref_solver = ref_idr_factory->generate(mtx);
auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(dpcpp);
auto d_x = Mtx::create(dpcpp);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
dpcpp_solver->apply(d_b.get(), d_x.get());
Expand All @@ -357,16 +342,9 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)

TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef)
{
int m = 123;
int n = 16;
initialize_data(123, 16);
auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx);
auto ref_solver = ref_idr_factory->generate(mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(dpcpp);
auto d_x = Mtx::create(dpcpp);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
dpcpp_solver->apply(d_b.get(), d_x.get());
Expand All @@ -378,8 +356,7 @@ TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef)

TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef)
{
int m = 123;
int n = 16;
initialize_data(123, 6);
dpcpp_idr_factory =
Solver::build()
.with_deterministic(true)
Expand All @@ -396,12 +373,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef)
.on(ref);
auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx);
auto ref_solver = ref_idr_factory->generate(mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(dpcpp);
auto d_x = Mtx::create(dpcpp);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
dpcpp_solver->apply(d_b.get(), d_x.get());
Expand Down
Loading

0 comments on commit a5b12b6

Please sign in to comment.