Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Dense stride handling #774

Merged
merged 17 commits into from
Jun 3, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
315 changes: 167 additions & 148 deletions core/matrix/dense.cpp

Large diffs are not rendered by default.

29 changes: 29 additions & 0 deletions core/test/base/array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,14 @@ TYPED_TEST(Array, CanCreateTemporaryCloneOnSameExecutor)
}


TYPED_TEST(Array, CanCreateTemporaryOutputCloneOnSameExecutor)
{
auto tmp_clone = make_temporary_output_clone(this->exec, &this->x);

ASSERT_EQ(tmp_clone.get(), &this->x);
}


// For tests between different memory, check cuda/test/base/array.cu
TYPED_TEST(Array, DoesNotCreateATemporaryCloneBetweenSameMemory)
{
Expand Down Expand Up @@ -299,6 +307,27 @@ TYPED_TEST(Array, DoesNotCopyBackTemporaryCloneBetweenSameMemory)
}


TYPED_TEST(Array, CanCreateTemporaryOutputCloneOnDifferentExecutors)
{
auto other = gko::OmpExecutor::create();
yhmtsai marked this conversation as resolved.
Show resolved Hide resolved

{
auto tmp_clone = make_temporary_output_clone(other, &this->x);
tmp_clone->get_data()[0] = 4;
tmp_clone->get_data()[1] = 5;

// there is no reliable way to check the memory is uninitialized
ASSERT_EQ(tmp_clone->get_num_elems(), this->x.get_num_elems());
ASSERT_EQ(tmp_clone->get_executor(), other);
ASSERT_EQ(this->x.get_executor(), this->exec);
ASSERT_EQ(this->x.get_data()[0], TypeParam{5});
ASSERT_EQ(this->x.get_data()[1], TypeParam{2});
}
ASSERT_EQ(this->x.get_data()[0], TypeParam{4});
ASSERT_EQ(this->x.get_data()[1], TypeParam{5});
}


TYPED_TEST(Array, CanBeCleared)
{
this->x.clear();
Expand Down
26 changes: 26 additions & 0 deletions core/test/base/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,13 +410,39 @@ TEST_F(TemporaryClone, DoesNotCopyToSameMemory)
}


TEST_F(TemporaryClone, OutputDoesNotCopyToSameMemory)
{
auto other = gko::ReferenceExecutor::create();
auto clone = make_temporary_output_clone(other, gko::lend(obj));

ASSERT_NE(clone.get()->get_executor(), other);
ASSERT_EQ(obj->get_executor(), ref);
}


TEST_F(TemporaryClone, CopiesBackAfterLeavingScope)
{
obj->data = 4;
{
auto clone = make_temporary_clone(omp, gko::lend(obj));
clone.get()->data = 7;

ASSERT_EQ(obj->data, 4);
}
ASSERT_EQ(obj->get_executor(), ref);
ASSERT_EQ(obj->data, 7);
}


TEST_F(TemporaryClone, OutputCopiesBackAfterLeavingScope)
{
obj->data = 4;
{
auto clone = make_temporary_output_clone(omp, gko::lend(obj));
clone.get()->data = 7;

ASSERT_EQ(obj->data, 4);
}
ASSERT_EQ(obj->get_executor(), ref);
ASSERT_EQ(obj->data, 7);
}
Expand Down
4 changes: 3 additions & 1 deletion cuda/test/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ class Dense : public ::testing::Test {
TEST_F(Dense, CudaCopyRespectsStride)
{
set_up_vector_data(3);
auto result = Mtx::create(cuda, dx->get_size(), dx->get_size()[1] + 1);
auto stride = dx->get_size()[1] + 1;
auto result = Mtx::create(cuda, dx->get_size(), stride);
double val = 123456789.0;
auto original_data = result->get_values();
auto padding_ptr = original_data + dx->get_size()[1];
Expand All @@ -203,6 +204,7 @@ TEST_F(Dense, CudaCopyRespectsStride)
dx->convert_to(result.get());

GKO_ASSERT_MTX_NEAR(result, dx, 0);
upsj marked this conversation as resolved.
Show resolved Hide resolved
ASSERT_EQ(result->get_stride(), stride);
ASSERT_EQ(cuda->copy_val_to_host(padding_ptr), val);
ASSERT_EQ(result->get_values(), original_data);
}
Expand Down
4 changes: 3 additions & 1 deletion hip/test/matrix/dense_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@ class Dense : public ::testing::Test {
TEST_F(Dense, HipCopyRespectsStride)
{
set_up_vector_data(3);
auto result = Mtx::create(hip, dx->get_size(), dx->get_size()[1] + 1);
auto stride = dx->get_size()[1] + 1;
auto result = Mtx::create(hip, dx->get_size(), stride);
double val = 123456789.0;
auto original_data = result->get_values();
auto padding_ptr = original_data + dx->get_size()[1];
Expand All @@ -198,6 +199,7 @@ TEST_F(Dense, HipCopyRespectsStride)
dx->convert_to(result.get());

GKO_ASSERT_MTX_NEAR(result, dx, 0);
ASSERT_EQ(result->get_stride(), stride);
ASSERT_EQ(hip->copy_val_to_host(padding_ptr), val);
ASSERT_EQ(result->get_values(), original_data);
}
Expand Down
11 changes: 8 additions & 3 deletions include/ginkgo/core/base/array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -576,16 +576,21 @@ namespace detail {
template <typename T>
struct temporary_clone_helper<Array<T>> {
static std::unique_ptr<Array<T>> create(
std::shared_ptr<const Executor> exec, Array<T> *ptr)
std::shared_ptr<const Executor> exec, Array<T> *ptr, bool copy_data)
upsj marked this conversation as resolved.
Show resolved Hide resolved
{
return std::make_unique<Array<T>>(std::move(exec), *ptr);
if (copy_data) {
return std::make_unique<Array<T>>(std::move(exec), *ptr);
} else {
return std::make_unique<Array<T>>(std::move(exec),
ptr->get_num_elems());
}
}
};

template <typename T>
struct temporary_clone_helper<const Array<T>> {
static std::unique_ptr<const Array<T>> create(
std::shared_ptr<const Executor> exec, const Array<T> *ptr)
std::shared_ptr<const Executor> exec, const Array<T> *ptr, bool)
{
return std::make_unique<const Array<T>>(std::move(exec), *ptr);
}
Expand Down
38 changes: 32 additions & 6 deletions include/ginkgo/core/base/temporary_clone.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class copy_back_deleter<const T> {
template <typename T>
struct temporary_clone_helper {
static std::unique_ptr<T> create(std::shared_ptr<const Executor> exec,
T *ptr)
T *ptr, bool)
{
return gko::clone(std::move(exec), ptr);
}
Expand Down Expand Up @@ -137,19 +137,22 @@ class temporary_clone {
*
* @param exec the executor where the clone will be created
* @param ptr a pointer to the object of which the clone will be created
* @param copy_data should the data be copied to the executor, or should
* only the result be copied back afterwards?
*/
explicit temporary_clone(std::shared_ptr<const Executor> exec, pointer ptr)
explicit temporary_clone(std::shared_ptr<const Executor> exec, pointer ptr,
bool copy_data = true)
{
if (ptr->get_executor()->memory_accessible(exec)) {
// just use the object we already have
handle_ = handle_type(ptr, null_deleter<T>());
} else {
// clone the object to the new executor and make sure it's copied
// back before we delete it
handle_ = handle_type(
temporary_clone_helper<T>::create(std::move(exec), ptr)
.release(),
copy_back_deleter<T>(ptr));
handle_ = handle_type(temporary_clone_helper<T>::create(
std::move(exec), ptr, copy_data)
.release(),
copy_back_deleter<T>(ptr));
}
}

Expand Down Expand Up @@ -196,6 +199,29 @@ detail::temporary_clone<T> make_temporary_clone(
}


/**
* Creates a uninitialized temporary_clone that will be copied back to the input
* afterwards. It can be used for output parameters to avoid an unnecessary copy
* in make_temporary_clone.
*
* This is a helper function which avoids the need to explicitly specify the
* type of the object, as would be the case if using the constructor of
* temporary_clone.
*
* @param exec the executor where the uninitialized clone will be created
* @param ptr a pointer to the object of which the clone will be created
*/
template <typename T>
detail::temporary_clone<T> make_temporary_output_clone(
std::shared_ptr<const Executor> exec, T *ptr)
{
static_assert(
!std::is_const<T>::value,
"make_temporary_output_clone should only be used on non-const objects");
return detail::temporary_clone<T>(std::move(exec), ptr, false);
}


} // namespace gko


Expand Down
33 changes: 24 additions & 9 deletions include/ginkgo/core/base/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,15 +527,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
#endif


/**
* Instantiates a template for each value type conversion pair compiled by
* Ginkgo.
*
* @param _macro A macro which expands the template instantiation
* (not including the leading `template` specifier).
* Should take two arguments `src` and `dst`, which
* are replaced by the source and destination value type.
*/
#if GINKGO_DPCPP_SINGLE_MODE
#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \
template <> \
Expand All @@ -546,6 +537,8 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
_macro(std::complex<float>, std::complex<double>) GKO_NOT_IMPLEMENTED; \
template <> \
_macro(std::complex<double>, std::complex<float>) GKO_NOT_IMPLEMENTED


#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \
upsj marked this conversation as resolved.
Show resolved Hide resolved
GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \
template <> \
Expand All @@ -557,11 +550,33 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
template <> \
_macro(std::complex<double>, std::complex<double>) GKO_NOT_IMPLEMENTED
#else


/**
* Instantiates a template for each value type conversion pair compiled by
* Ginkgo.
*
* @param _macro A macro which expands the template instantiation
* (not including the leading `template` specifier).
* Should take two arguments `src` and `dst`, which
* are replaced by the source and destination value type.
*/
#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \
template _macro(float, double); \
template _macro(double, float); \
template _macro(std::complex<float>, std::complex<double>); \
template _macro(std::complex<double>, std::complex<float>)


/**
* Instantiates a template for each value type conversion or copy pair compiled
* by Ginkgo.
*
* @param _macro A macro which expands the template instantiation
* (not including the leading `template` specifier).
* Should take two arguments `src` and `dst`, which
* are replaced by the source and destination value type.
*/
#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \
GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \
template _macro(float, float); \
Expand Down
68 changes: 60 additions & 8 deletions include/ginkgo/core/matrix/dense.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ class Dense
const Array<int64> *permutation_indices) const override;

/**
* Writes the symetrically permuted matrix into the given output matrix.
* Writes the symmetrically permuted matrix into the given output matrix.
*
* @param permutation_indices The array containing permutation indices.
* It must have `this->get_size()[0]` elements.
Expand All @@ -319,7 +319,7 @@ class Dense
const Array<int64> *permutation_indices) const override;

/**
* Writes the inverse symetrically permuted matrix into the given output
* Writes the inverse symmetrically permuted matrix into the given output
* matrix.
*
* @param permutation_indices The array containing permutation indices.
Expand Down Expand Up @@ -660,7 +660,7 @@ class Dense
{
auto exec = this->get_executor();
this->compute_dot_impl(make_temporary_clone(exec, b).get(),
make_temporary_clone(exec, result).get());
make_temporary_output_clone(exec, result).get());
}

/**
Expand All @@ -674,8 +674,9 @@ class Dense
void compute_conj_dot(const LinOp *b, LinOp *result) const
{
auto exec = this->get_executor();
this->compute_conj_dot_impl(make_temporary_clone(exec, b).get(),
make_temporary_clone(exec, result).get());
this->compute_conj_dot_impl(
make_temporary_clone(exec, b).get(),
make_temporary_output_clone(exec, result).get());
}

/**
Expand All @@ -688,7 +689,8 @@ class Dense
void compute_norm2(LinOp *result) const
{
auto exec = this->get_executor();
this->compute_norm2_impl(make_temporary_clone(exec, result).get());
this->compute_norm2_impl(
make_temporary_output_clone(exec, result).get());
}

/**
Expand Down Expand Up @@ -818,8 +820,10 @@ class Dense
values_{exec, std::forward<ValuesArray>(values)},
stride_{stride}
{
GKO_ENSURE_IN_BOUNDS((size[0] - 1) * stride + size[1] - 1,
values_.get_num_elems());
if (size[0] > 0 && size[1] > 0) {
GKO_ENSURE_IN_BOUNDS((size[0] - 1) * stride + size[1] - 1,
values_.get_num_elems());
}
}

/**
Expand Down Expand Up @@ -929,6 +933,33 @@ class Dense
idx % this->get_size()[1]);
}

template <typename IndexType>
void permute_impl(const Array<IndexType> *permutation, Dense *output) const;

template <typename IndexType>
void inverse_permute_impl(const Array<IndexType> *permutation,
Dense *output) const;

template <typename IndexType>
void row_permute_impl(const Array<IndexType> *permutation,
Dense *output) const;

template <typename IndexType>
void inverse_row_permute_impl(const Array<IndexType> *permutation,
Dense *output) const;

template <typename IndexType>
void row_gather_impl(const Array<IndexType> *row_indices,
Dense *output) const;

template <typename IndexType>
void column_permute_impl(const Array<IndexType> *permutation,
Dense *output) const;

template <typename IndexType>
void inverse_column_permute_impl(const Array<IndexType> *permutation,
Dense *output) const;

private:
Array<value_type> values_;
size_type stride_;
Expand All @@ -938,6 +969,27 @@ class Dense
} // namespace matrix


namespace detail {


template <typename ValueType>
struct temporary_clone_helper<matrix::Dense<ValueType>> {
static std::unique_ptr<matrix::Dense<ValueType>> create(
std::shared_ptr<const Executor> exec, matrix::Dense<ValueType> *ptr,
bool copy_data)
{
if (copy_data) {
return gko::clone(std::move(exec), ptr);
} else {
return matrix::Dense<ValueType>::create(exec, ptr->get_size());
}
}
};


} // namespace detail


/**
* Creates and initializes a column-vector.
*
Expand Down
Loading