Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
yhmtsai committed Jul 15, 2021
1 parent 696bf73 commit fe9b218
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 11 deletions.
115 changes: 112 additions & 3 deletions dpcpp/matrix/dense_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,9 @@ constexpr auto kcfg_1d_list =
KCFG_1D::encode(512, 32), KCFG_1D::encode(512, 16),
KCFG_1D::encode(256, 32), KCFG_1D::encode(256, 16),
KCFG_1D::encode(256, 8)>();
constexpr auto subgroup_list = syn::value_list<std::uint32_t, 64, 32, 16, 8>();
constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
constexpr auto subgroup_list =
syn::value_list<std::uint32_t, 64, 32, 16, 8, 4>();
constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list);
constexpr auto default_block_size = 256;


Expand Down Expand Up @@ -675,6 +676,104 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols,
GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols,
kcfg_1d_list)

template <std::uint32_t sg_size, typename ValueType, typename Closure>
void transpose(const size_type nrows, const size_type ncols,
const ValueType *__restrict__ in, const size_type in_stride,
ValueType *__restrict__ out, const size_type out_stride,
Closure op, sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, sg_size *(sg_size + 1)> *space)
{
auto local_x = item_ct1.get_local_id(2) % sg_size;
auto local_y = item_ct1.get_local_id(2) / sg_size;
auto x = item_ct1.get_group(2) * sg_size + local_x;
auto y = item_ct1.get_group(1) * sg_size + local_y;
if (y < nrows && x < ncols) {
(*space)[local_y * (sg_size + 1) + local_x] = op(in[y * in_stride + x]);
}
/*
DPCT1065:0: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
performance, if there is no access to global memory.
*/
item_ct1.barrier();
x = item_ct1.get_group(1) * sg_size + local_x;
y = item_ct1.get_group(2) * sg_size + local_y;
if (y < nrows && x < ncols) {
out[y * out_stride + x] = (*space)[local_x * sg_size + local_y];
}
}

template <std::uint32_t sg_size, typename ValueType>
void transpose(const size_type nrows, const size_type ncols,
const ValueType *__restrict__ in, const size_type in_stride,
ValueType *__restrict__ out, const size_type out_stride,
sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, sg_size *(sg_size + 1)> *space)
{
transpose<sg_size>(
nrows, ncols, in, in_stride, out, out_stride,
[](ValueType val) { return val; }, item_ct1, space);
}

template <std::uint32_t sg_size = 32, typename ValueType>
void transpose(dim3 grid, dim3 block, size_t dynamic_shared_memory,
sycl::queue *stream, const size_type nrows,
const size_type ncols, const ValueType *in,
const size_type in_stride, ValueType *out,
const size_type out_stride)
{
stream->submit([&](sycl::handler &cgh) {
sycl::accessor<UninitializedArray<ValueType, sg_size *(sg_size + 1)>, 0,
sycl::access_mode::read_write,
sycl::access::target::local>
space_acc_ct1(cgh);

cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
transpose<sg_size>(
nrows, ncols, in, in_stride, out, out_stride, item_ct1,
(UninitializedArray<ValueType, sg_size *(sg_size + 1)> *)
space_acc_ct1.get_pointer());
});
});
}

template <std::uint32_t sg_size, typename ValueType>
void conj_transpose(
const size_type nrows, const size_type ncols,
const ValueType *__restrict__ in, const size_type in_stride,
ValueType *__restrict__ out, const size_type out_stride,
sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, sg_size *(sg_size + 1)> *space)
{
transpose<sg_size>(
nrows, ncols, in, in_stride, out, out_stride,
[](ValueType val) { return conj(val); }, item_ct1, space);
}

template <std::uint32_t sg_size = 32, typename ValueType>
void conj_transpose(dim3 grid, dim3 block, size_t dynamic_shared_memory,
sycl::queue *stream, const size_type nrows,
const size_type ncols, const ValueType *in,
const size_type in_stride, ValueType *out,
const size_type out_stride)
{
stream->submit([&](sycl::handler &cgh) {
sycl::accessor<UninitializedArray<ValueType, sg_size *(sg_size + 1)>, 0,
sycl::access_mode::read_write,
sycl::access::target::local>
space_acc_ct1(cgh);

cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
conj_transpose<sg_size>(
nrows, ncols, in, in_stride, out, out_stride, item_ct1,
(UninitializedArray<ValueType, sg_size *(sg_size + 1)> *)
space_acc_ct1.get_pointer());
});
});
}


} // namespace kernel

Expand Down Expand Up @@ -1202,7 +1301,17 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
template <typename ValueType>
void transpose(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Dense<ValueType> *orig,
matrix::Dense<ValueType> *trans) GKO_NOT_IMPLEMENTED;
matrix::Dense<ValueType> *trans)
{
auto size = orig->get_size();
auto sg_array = syn::as_array(subgroup_list);
const std::uint32_t cfg = 8;
dim3 grid(ceildiv(size[1], cfg), ceildiv(size[0], cfg));
dim3 block(cfg, cfg);
kernel::transpose<cfg>(grid, block, 0, exec->get_queue(), size[0], size[1],
orig->get_const_values(), orig->get_stride(),
trans->get_values(), trans->get_stride());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);

Expand Down
16 changes: 8 additions & 8 deletions dpcpp/test/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -550,16 +550,16 @@ TEST_F(Dense, ComputeConjDotComplexIsEquivalentToRef)
}


// TEST_F(Dense, IsTransposable)
// {
// set_up_apply_data();
TEST_F(Dense, IsTransposable)
{
set_up_apply_data();

// auto trans = x->transpose();
// auto dtrans = dx->transpose();
auto trans = x->transpose();
auto dtrans = dx->transpose();

// GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(dtrans.get()),
// static_cast<Mtx *>(trans.get()), 0);
// }
GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(dtrans.get()),
static_cast<Mtx *>(trans.get()), 0);
}


// TEST_F(Dense, IsConjugateTransposable)
Expand Down

0 comments on commit fe9b218

Please sign in to comment.