From ef44f3653037e36ede7ebb963cd77d008be67fd7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Jun 2021 00:36:58 +0200 Subject: [PATCH] add make_temporary_output_clone This provides an alternative to make_temporary_clone that doesn't initialize the content for output-parameters --- core/matrix/dense.cpp | 79 ++++++++++---------- core/test/base/array.cpp | 29 +++++++ core/test/base/utils.cpp | 26 +++++++ include/ginkgo/core/base/array.hpp | 11 ++- include/ginkgo/core/base/temporary_clone.hpp | 38 ++++++++-- include/ginkgo/core/matrix/dense.hpp | 37 +++++++-- reference/test/matrix/dense_kernels.cpp | 22 ++++++ 7 files changed, 188 insertions(+), 54 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index f8d8421a9dd..bbee89b1dc1 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -331,16 +331,19 @@ void Dense::compute_norm2_impl(LinOp *result) const template void Dense::convert_to(Dense *result) const { - if (result->get_size() && result->get_size() == this->get_size()) { - // it's important we clone the const *this instead of result, since - // otherwise the copy_back_deleter of the temporary_clone calls - // convert_to again with the same dimensions, causing an infinite - // recursion. For the same reason we need to check that the result size - // is not zero, since otherwise it could come from the initial - // convert_to call in make_temporary_clone. - auto exec = result->get_executor(); - exec->run( - dense::make_copy(make_temporary_clone(exec, this).get(), result)); + if (result->get_size() == this->get_size()) { + // we need to create a executor-local clone of the target data, that + // will be copied back later. + auto exec = this->get_executor(); + auto result_array = make_temporary_output_clone(exec, &result->values_); + // create a (value, not pointer to avoid allocation overhead) view + // matrix on the array to avoid special-casing cross-executor copies + auto tmp_result = + Dense{exec, result->get_size(), + Array::view(exec, result_array->get_num_elems(), + result_array->get_data()), + result->get_stride()}; + exec->run(dense::make_copy(this, &tmp_result)); } else { result->values_ = this->values_; result->stride_ = this->stride_; @@ -360,12 +363,10 @@ template void Dense::convert_to( Dense> *result) const { - if (result->get_size() && result->get_size() == this->get_size()) { - // here we don't need to use the target executor, since the copy back - // invokes the non-converting convert_to(Dense*). + if (result->get_size() == this->get_size()) { auto exec = this->get_executor(); - exec->run( - dense::make_copy(this, make_temporary_clone(exec, result).get())); + exec->run(dense::make_copy( + this, make_temporary_output_clone(exec, result).get())); } else { result->values_ = this->values_; result->stride_ = this->stride_; @@ -696,8 +697,8 @@ void Dense::transpose(Dense *output) const { GKO_ASSERT_EQUAL_DIMENSIONS(output, gko::transpose(this->get_size())); auto exec = this->get_executor(); - exec->run( - dense::make_transpose(this, make_temporary_clone(exec, output).get())); + exec->run(dense::make_transpose( + this, make_temporary_output_clone(exec, output).get())); } @@ -707,7 +708,7 @@ void Dense::conj_transpose(Dense *output) const GKO_ASSERT_EQUAL_DIMENSIONS(output, gko::transpose(this->get_size())); auto exec = this->get_executor(); exec->run(dense::make_conj_transpose( - this, make_temporary_clone(exec, output).get())); + this, make_temporary_output_clone(exec, output).get())); } @@ -742,7 +743,7 @@ void Dense::permute(const Array *permutation_indices, exec->run(dense::make_symm_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -757,7 +758,7 @@ void Dense::permute(const Array *permutation_indices, exec->run(dense::make_symm_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -792,7 +793,7 @@ void Dense::inverse_permute(const Array *permutation_indices, exec->run(dense::make_inv_symm_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -807,7 +808,7 @@ void Dense::inverse_permute(const Array *permutation_indices, exec->run(dense::make_inv_symm_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -841,7 +842,7 @@ void Dense::row_permute(const Array *permutation_indices, exec->run(dense::make_row_gather( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -855,7 +856,7 @@ void Dense::row_permute(const Array *permutation_indices, exec->run(dense::make_row_gather( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -893,7 +894,7 @@ void Dense::row_gather(const Array *row_indices, exec->run(dense::make_row_gather( make_temporary_clone(exec, row_indices).get(), this, - make_temporary_clone(exec, row_gathered).get())); + make_temporary_output_clone(exec, row_gathered).get())); } @@ -908,7 +909,7 @@ void Dense::row_gather(const Array *row_indices, this->get_executor()->run(dense::make_row_gather( make_temporary_clone(exec, row_indices).get(), this, - make_temporary_clone(exec, row_gathered).get())); + make_temporary_output_clone(exec, row_gathered).get())); } @@ -922,7 +923,7 @@ void Dense::column_permute(const Array *permutation_indices, exec->run(dense::make_column_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -936,7 +937,7 @@ void Dense::column_permute(const Array *permutation_indices, exec->run(dense::make_column_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -970,7 +971,7 @@ void Dense::inverse_row_permute( exec->run(dense::make_inverse_row_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -984,7 +985,7 @@ void Dense::inverse_row_permute( exec->run(dense::make_inverse_row_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -1018,7 +1019,7 @@ void Dense::inverse_column_permute( exec->run(dense::make_inverse_column_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -1032,7 +1033,7 @@ void Dense::inverse_column_permute( exec->run(dense::make_inverse_column_permute( make_temporary_clone(exec, permutation_indices).get(), this, - make_temporary_clone(exec, output).get())); + make_temporary_output_clone(exec, output).get())); } @@ -1064,7 +1065,7 @@ void Dense::extract_diagonal(Diagonal *output) const GKO_ASSERT_EQ(output->get_size()[0], diag_size); exec->run(dense::make_extract_diagonal( - this, make_temporary_clone(exec, output).get())); + this, make_temporary_output_clone(exec, output).get())); } @@ -1104,7 +1105,7 @@ void Dense::compute_absolute( auto exec = this->get_executor(); exec->run(dense::make_outplace_absolute_dense( - this, make_temporary_clone(exec, output).get())); + this, make_temporary_output_clone(exec, output).get())); } @@ -1126,7 +1127,7 @@ void Dense::make_complex( auto exec = this->get_executor(); exec->run(dense::make_make_complex( - this, make_temporary_clone(exec, result).get())); + this, make_temporary_output_clone(exec, result).get())); } @@ -1147,8 +1148,8 @@ void Dense::get_real( GKO_ASSERT_EQUAL_DIMENSIONS(this, result); auto exec = this->get_executor(); - exec->run( - dense::make_get_real(this, make_temporary_clone(exec, result).get())); + exec->run(dense::make_get_real( + this, make_temporary_output_clone(exec, result).get())); } @@ -1169,8 +1170,8 @@ void Dense::get_imag( GKO_ASSERT_EQUAL_DIMENSIONS(this, result); auto exec = this->get_executor(); - exec->run( - dense::make_get_imag(this, make_temporary_clone(exec, result).get())); + exec->run(dense::make_get_imag( + this, make_temporary_output_clone(exec, result).get())); } diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp index 91c5e8867b9..708bfcf5dad 100644 --- a/core/test/base/array.cpp +++ b/core/test/base/array.cpp @@ -272,6 +272,14 @@ TYPED_TEST(Array, CanCreateTemporaryCloneOnSameExecutor) } +TYPED_TEST(Array, CanCreateTemporaryOutputCloneOnSameExecutor) +{ + auto tmp_clone = make_temporary_output_clone(this->exec, &this->x); + + ASSERT_EQ(tmp_clone.get(), &this->x); +} + + // For tests between different memory, check cuda/test/base/array.cu TYPED_TEST(Array, DoesNotCreateATemporaryCloneBetweenSameMemory) { @@ -299,6 +307,27 @@ TYPED_TEST(Array, DoesNotCopyBackTemporaryCloneBetweenSameMemory) } +TYPED_TEST(Array, CanCreateTemporaryOutputCloneOnDifferentExecutors) +{ + auto other = gko::OmpExecutor::create(); + + { + auto tmp_clone = make_temporary_output_clone(other, &this->x); + tmp_clone->get_data()[0] = 4; + tmp_clone->get_data()[1] = 5; + + // there is no reliable way to check the memory is uninitialized + ASSERT_EQ(tmp_clone->get_num_elems(), this->x.get_num_elems()); + ASSERT_EQ(tmp_clone->get_executor(), other); + ASSERT_EQ(this->x.get_executor(), this->exec); + ASSERT_EQ(this->x.get_data()[0], TypeParam{5}); + ASSERT_EQ(this->x.get_data()[1], TypeParam{2}); + } + ASSERT_EQ(this->x.get_data()[0], TypeParam{4}); + ASSERT_EQ(this->x.get_data()[1], TypeParam{5}); +} + + TYPED_TEST(Array, CanBeCleared) { this->x.clear(); diff --git a/core/test/base/utils.cpp b/core/test/base/utils.cpp index b66a1260a3a..6d8985e1dff 100644 --- a/core/test/base/utils.cpp +++ b/core/test/base/utils.cpp @@ -410,13 +410,39 @@ TEST_F(TemporaryClone, DoesNotCopyToSameMemory) } +TEST_F(TemporaryClone, OutputDoesNotCopyToSameMemory) +{ + auto other = gko::ReferenceExecutor::create(); + auto clone = make_temporary_output_clone(other, gko::lend(obj)); + + ASSERT_NE(clone.get()->get_executor(), other); + ASSERT_EQ(obj->get_executor(), ref); +} + + TEST_F(TemporaryClone, CopiesBackAfterLeavingScope) { + obj->data = 4; { auto clone = make_temporary_clone(omp, gko::lend(obj)); clone.get()->data = 7; + + ASSERT_EQ(obj->data, 4); } + ASSERT_EQ(obj->get_executor(), ref); + ASSERT_EQ(obj->data, 7); +} + +TEST_F(TemporaryClone, OutputCopiesBackAfterLeavingScope) +{ + obj->data = 4; + { + auto clone = make_temporary_output_clone(omp, gko::lend(obj)); + clone.get()->data = 7; + + ASSERT_EQ(obj->data, 4); + } ASSERT_EQ(obj->get_executor(), ref); ASSERT_EQ(obj->data, 7); } diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index c9f4aaa57c4..c4aa2323417 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -576,16 +576,21 @@ namespace detail { template struct temporary_clone_helper> { static std::unique_ptr> create( - std::shared_ptr exec, Array *ptr) + std::shared_ptr exec, Array *ptr, bool copy_data) { - return std::make_unique>(std::move(exec), *ptr); + if (copy_data) { + return std::make_unique>(std::move(exec), *ptr); + } else { + return std::make_unique>(std::move(exec), + ptr->get_num_elems()); + } } }; template struct temporary_clone_helper> { static std::unique_ptr> create( - std::shared_ptr exec, const Array *ptr) + std::shared_ptr exec, const Array *ptr, bool) { return std::make_unique>(std::move(exec), *ptr); } diff --git a/include/ginkgo/core/base/temporary_clone.hpp b/include/ginkgo/core/base/temporary_clone.hpp index c3309d8736a..8f1f059a647 100644 --- a/include/ginkgo/core/base/temporary_clone.hpp +++ b/include/ginkgo/core/base/temporary_clone.hpp @@ -108,7 +108,7 @@ class copy_back_deleter { template struct temporary_clone_helper { static std::unique_ptr create(std::shared_ptr exec, - T *ptr) + T *ptr, bool) { return gko::clone(std::move(exec), ptr); } @@ -137,8 +137,11 @@ class temporary_clone { * * @param exec the executor where the clone will be created * @param ptr a pointer to the object of which the clone will be created + * @param copy_data should the data be copied to the executor, or should + * only the result be copied back afterwards? */ - explicit temporary_clone(std::shared_ptr exec, pointer ptr) + explicit temporary_clone(std::shared_ptr exec, pointer ptr, + bool copy_data = true) { if (ptr->get_executor()->memory_accessible(exec)) { // just use the object we already have @@ -146,10 +149,10 @@ class temporary_clone { } else { // clone the object to the new executor and make sure it's copied // back before we delete it - handle_ = handle_type( - temporary_clone_helper::create(std::move(exec), ptr) - .release(), - copy_back_deleter(ptr)); + handle_ = handle_type(temporary_clone_helper::create( + std::move(exec), ptr, copy_data) + .release(), + copy_back_deleter(ptr)); } } @@ -196,6 +199,29 @@ detail::temporary_clone make_temporary_clone( } +/** + * Creates a uninitialized temporary_clone that will be copied back to the input + * afterwards. It can be used for output parameters to avoid an unnecessary copy + * in make_temporary_clone. + * + * This is a helper function which avoids the need to explicitly specify the + * type of the object, as would be the case if using the constructor of + * temporary_clone. + * + * @param exec the executor where the uninitialized clone will be created + * @param ptr a pointer to the object of which the clone will be created + */ +template +detail::temporary_clone make_temporary_output_clone( + std::shared_ptr exec, T *ptr) +{ + static_assert( + !std::is_const::value, + "make_temporary_output_clone should only be used on non-const objects"); + return detail::temporary_clone(std::move(exec), ptr); +} + + } // namespace gko diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index cde4a8d91cf..25ed80c7b85 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -660,7 +660,7 @@ class Dense { auto exec = this->get_executor(); this->compute_dot_impl(make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, result).get()); + make_temporary_output_clone(exec, result).get()); } /** @@ -674,8 +674,9 @@ class Dense void compute_conj_dot(const LinOp *b, LinOp *result) const { auto exec = this->get_executor(); - this->compute_conj_dot_impl(make_temporary_clone(exec, b).get(), - make_temporary_clone(exec, result).get()); + this->compute_conj_dot_impl( + make_temporary_clone(exec, b).get(), + make_temporary_output_clone(exec, result).get()); } /** @@ -688,7 +689,8 @@ class Dense void compute_norm2(LinOp *result) const { auto exec = this->get_executor(); - this->compute_norm2_impl(make_temporary_clone(exec, result).get()); + this->compute_norm2_impl( + make_temporary_output_clone(exec, result).get()); } /** @@ -818,8 +820,10 @@ class Dense values_{exec, std::forward(values)}, stride_{stride} { - GKO_ENSURE_IN_BOUNDS((size[0] - 1) * stride + size[1] - 1, - values_.get_num_elems()); + if (size[0] > 0 && size[1] > 0) { + GKO_ENSURE_IN_BOUNDS((size[0] - 1) * stride + size[1] - 1, + values_.get_num_elems()); + } } /** @@ -938,6 +942,27 @@ class Dense } // namespace matrix +namespace detail { + + +template +struct temporary_clone_helper> { + static std::unique_ptr> create( + std::shared_ptr exec, matrix::Dense *ptr, + bool copy_data) + { + if (copy_data) { + return gko::clone(std::move(exec), ptr); + } else { + return matrix::Dense::create(exec, ptr->get_size()); + } + } +}; + + +} // namespace detail + + /** * Creates and initializes a column-vector. * diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 36abd6a02c2..4da0248b196 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -135,6 +135,28 @@ TYPED_TEST(Dense, CopyRespectsStride) } +TYPED_TEST(Dense, TemporaryOutputCloneWorks) +{ + using value_type = typename TestFixture::value_type; + auto other = gko::OmpExecutor::create(); + auto m = + gko::initialize>({1.0, 2.0}, this->exec); + + { + auto clone = gko::make_temporary_output_clone(other, m.get()); + clone->at(0) = 4.0; + clone->at(1) = 5.0; + + ASSERT_EQ(m->at(0), value_type{1.0}); + ASSERT_EQ(m->at(1), value_type{2.0}); + ASSERT_EQ(clone->get_size(), m->get_size()); + ASSERT_EQ(clone->get_executor(), other); + } + ASSERT_EQ(m->at(0), value_type{4.0}); + ASSERT_EQ(m->at(1), value_type{5.0}); +} + + TYPED_TEST(Dense, CanBeFilledWithValue) { using value_type = typename TestFixture::value_type;