Merge Port Dense to Dpcpp
This PR adds DPC++ support for the current Ginkgo dense functionality.

Summary:
- port the dense kernels
- port some components, such as reduction/thread_ids ...
- use config selection to select the kernel
- add some macros/functions for the usual DPC++ cases
- add oneMKL CMake configuration
Note: there is some porting-related code that we need to revisit afterward.

Related PR: #710
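
For readers new to the porting pattern, here is a minimal sketch (hypothetical example, not code from this PR) of how a CUDA-style dense kernel maps to DPC++: the CUDA thread grid becomes a SYCL parallel_for over a range, submitted to a queue.

// Minimal sketch of the CUDA -> DPC++ porting pattern for a dense kernel.
// Hypothetical example, not code from this PR.
#include <CL/sycl.hpp>

// CUDA version (conceptually):
//   __global__ void scale(int n, double alpha, double* x) {
//       auto i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < n) x[i] *= alpha;
//   }
void scale(sycl::queue& q, int n, double alpha, double* x)
{
    // one work-item per entry replaces the CUDA thread grid
    q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
        x[i] *= alpha;
    }).wait();
}

int main()
{
    sycl::queue q;  // default device selection
    const int n = 1024;
    auto* x = sycl::malloc_shared<double>(n, q);
    for (int i = 0; i < n; ++i) x[i] = 1.0;
    scale(q, n, 2.0, x);
    sycl::free(x, q);
}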
yhmtsai committed Jul 21, 2021
2 parents c101790 + d175116 commit 47ba37d
Showing 34 changed files with 3,216 additions and 92 deletions.
4 changes: 2 additions & 2 deletions .gitlab-ci.yml
@@ -754,7 +754,7 @@ build/dpcpp/opencl_igpu/release/static:
SYCL_DEVICE_FILTER: "OpenCL"
SYCL_DEVICE_TYPE: "GPU"

-build/dpcpp/level_zero_igpu/debug/static:
+build/dpcpp/level_zero_igpu/debug/shared:
<<: *default_build_with_test
extends:
- .full_test_condition
@@ -765,7 +765,7 @@ build/dpcpp/level_zero_igpu/debug/static:
CXX_COMPILER: "dpcpp"
BUILD_DPCPP: "ON"
BUILD_TYPE: "Debug"
-BUILD_SHARED_LIBS: "OFF"
+BUILD_SHARED_LIBS: "ON"
DPCPP_SINGLE_MODE: "ON"
SYCL_DEVICE_FILTER: "Level_Zero:GPU"

5 changes: 5 additions & 0 deletions cmake/GinkgoConfig.cmake.in
@@ -78,6 +78,7 @@ set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@)

set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@)
set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@)
+set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@)

set(GINKGO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@)

@@ -168,4 +169,8 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP)
find_package(rocrand REQUIRED)
endif()

+if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_DPCPP)
+    find_package(MKL CONFIG REQUIRED HINTS "${GINKGO_MKL_ROOT}")
+endif()

include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake)
8 changes: 6 additions & 2 deletions cmake/create_test.cmake
@@ -42,6 +42,10 @@ function(ginkgo_create_dpcpp_test test_name)
target_compile_options(${test_target_name} PRIVATE "${GINKGO_DPCPP_FLAGS}")
target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel)
ginkgo_set_test_target_properties(${test_name} ${test_target_name})
+    # Note: MKL_ENV is empty on Linux. We may need to apply MKL_ENV to all tests.
+    if (MKL_ENV)
+        set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}")
+    endif()
endfunction(ginkgo_create_dpcpp_test)

function(ginkgo_create_thread_test test_name)
@@ -165,7 +169,7 @@ function(ginkgo_create_common_test test_name)
# use float for DPC++ if necessary
if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE)
target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1)
-endif()
+endif()
ginkgo_set_test_target_properties(${test_name}_${exec} ${test_target_name})
endforeach()
-endfunction(ginkgo_create_common_test)
+endfunction(ginkgo_create_common_test)
2 changes: 1 addition & 1 deletion common/components/prefix_sum.hpp.inc
@@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element,
total_sum = element;
#pragma unroll
// hypercube prefix sum
-for (auto step = 1; step < subwarp.size(); step *= 2) {
+for (int step = 1; step < subwarp.size(); step *= 2) {
auto neighbor = subwarp.shfl_xor(total_sum, step);
total_sum += neighbor;
prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0;
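
With `auto step = 1` the deduced type is already `int`, so this change spells the signed index type out explicitly, as the PR does consistently across the ported kernels. For context, below is a host-side sketch of the hypercube prefix sum this loop implements, with the shfl_xor exchange simulated on a plain array (illustrative only, assuming a power-of-two subwarp size; not the Ginkgo code).

#include <cstdio>

int main()
{
    const int width = 8;                       // stands in for subwarp.size()
    int total[width], prefix[width];
    for (int lane = 0; lane < width; ++lane) {
        total[lane] = lane + 1;                // element owned by each lane
        prefix[lane] = 0;                      // exclusive prefix sum
    }
    for (int step = 1; step < width; step *= 2) {
        int next[width];
        for (int lane = 0; lane < width; ++lane) {
            int neighbor = total[lane ^ step];  // shfl_xor(total_sum, step)
            next[lane] = total[lane] + neighbor;
            if (lane & step) prefix[lane] += neighbor;
        }
        for (int lane = 0; lane < width; ++lane) total[lane] = next[lane];
    }
    // prefix now holds exclusive prefix sums, total the full reduction
    for (int lane = 0; lane < width; ++lane)
        std::printf("lane %d: prefix=%d total=%d\n", lane, prefix[lane], total[lane]);
}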
12 changes: 6 additions & 6 deletions common/components/sorting.hpp.inc
@@ -70,7 +70,7 @@ struct bitonic_local {
bool reverse)
{
auto els_mid = els + (num_elements / 2);
-for (auto i = 0; i < num_elements / 2; ++i) {
+for (int i = 0; i < num_elements / 2; ++i) {
bitonic_cas(els[i], els_mid[i], reverse);
}
half::merge(els, reverse);
@@ -131,7 +131,7 @@ struct bitonic_warp {
auto tile =
group::tiled_partition<num_threads>(group::this_thread_block());
auto new_reverse = reverse != upper_half();
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
auto other = tile.shfl_xor(els[i], num_threads / 2);
bitonic_cas(els[i], other, new_reverse);
}
@@ -206,7 +206,7 @@ struct bitonic_global {
auto upper_shared_els = shared_els + (num_groups * num_threads / 2);
// only the lower group executes the CAS
if (!upper_half()) {
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
auto j = shared_idx(i);
bitonic_cas(shared_els[j], upper_shared_els[j], reverse);
}
@@ -241,11 +241,11 @@ struct bitonic_global<ValueType, num_local, num_threads, 1, num_total_threads> {
bool reverse)
{
group::this_thread_block().sync();
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
local_els[i] = shared_els[shared_idx(i)];
}
warp::merge(local_els, reverse);
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
shared_els[shared_idx(i)] = local_els[i];
}
}
@@ -258,7 +258,7 @@ struct bitonic_global<ValueType, num_local, num_threads, 1, num_total_threads> {
// This is the first step, so we don't need to load from shared memory
warp::sort(local_els, reverse);
// store the sorted elements in shared memory
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
shared_els[shared_idx(i)] = local_els[i];
}
}
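
The loops changed above all drive bitonic sorting networks built from compare-and-swap (CAS) steps. As a rough host-side illustration of the same structure (not the Ginkgo implementation), a bitonic sort first sorts the two halves in opposite directions, then merges the resulting bitonic sequence with CASes between elements half a sequence apart:

#include <algorithm>
#include <cstdio>

void bitonic_cas(int& a, int& b, bool reverse)
{
    // swap so that a <= b, or a >= b when reverse is set
    if ((a > b) != reverse) std::swap(a, b);
}

void bitonic_merge(int* els, int n, bool reverse)
{
    if (n == 1) return;
    for (int i = 0; i < n / 2; ++i) bitonic_cas(els[i], els[i + n / 2], reverse);
    bitonic_merge(els, n / 2, reverse);
    bitonic_merge(els + n / 2, n / 2, reverse);
}

void bitonic_sort(int* els, int n, bool reverse)
{
    if (n == 1) return;
    bitonic_sort(els, n / 2, false);         // ascending lower half
    bitonic_sort(els + n / 2, n / 2, true);  // descending upper half
    bitonic_merge(els, n, reverse);          // merge the bitonic sequence
}

int main()
{
    int data[8] = {5, 1, 7, 3, 8, 2, 6, 4};  // size must be a power of two
    bitonic_sort(data, 8, false);
    for (int v : data) std::printf("%d ", v);  // 1 2 3 4 5 6 7 8
}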
7 changes: 4 additions & 3 deletions common/components/uninitialized_array.hpp.inc
@@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**
* Stores an array with uninitialized contents.
*
- * This class needed for datatypes that do have a non-empty constructor when`
+ * This class is needed for datatypes that do have a non-empty constructor when
* using them as shared memory, for example `thrust::complex<float>`.
*
* @tparam ValueType the type of values
@@ -49,7 +49,7 @@ public:
*
* @return the constexpr pointer to the first entry of the array.
*/
-constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept
+constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept
{
return &(*this)[0];
}
@@ -70,7 +70,8 @@ public:
*
* @return a reference to the array entry at the given index.
*/
-constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept
+constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const
+    noexcept
{
return reinterpret_cast<const ValueType *>(data_)[pos];
}
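
For context, here is a minimal sketch of such an uninitialized-array wrapper with the const-qualified accessors added above (simplified host-only version; names and details differ from the Ginkgo class):

#include <cstddef>

template <typename ValueType, std::size_t size>
class uninitialized_array {
public:
    operator const ValueType*() const noexcept { return &(*this)[0]; }
    operator ValueType*() noexcept { return &(*this)[0]; }

    const ValueType& operator[](std::size_t pos) const noexcept
    {
        return reinterpret_cast<const ValueType*>(data_)[pos];
    }
    ValueType& operator[](std::size_t pos) noexcept
    {
        return reinterpret_cast<ValueType*>(data_)[pos];
    }

private:
    // raw, suitably aligned bytes: no ValueType constructor runs, which is
    // what makes such a type usable as shared memory for non-trivial
    // ValueTypes like thrust::complex<float>
    alignas(ValueType) unsigned char data_[sizeof(ValueType) * size];
};

int main()
{
    uninitialized_array<double, 4> arr;
    arr[0] = 3.5;  // assign before any read; storage starts uninitialized
    return arr[0] == 3.5 ? 0 : 1;
}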
4 changes: 2 additions & 2 deletions common/factorization/par_ilut_filter_kernels.hpp.inc
@@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs,
auto end = row_ptrs[row + 1];
begin_cb(row);
auto num_steps = ceildiv(end - begin, subwarp_size);
-for (auto step = 0; step < num_steps; ++step) {
+for (IndexType step = 0; step < num_steps; ++step) {
auto idx = begin + lane + step * subwarp_size;
auto keep = idx < end && pred(idx, begin, end);
auto mask = subwarp.ballot(keep);
@@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter(
}


-} // namespace kernel
+} // namespace kernel
8 changes: 4 additions & 4 deletions common/factorization/par_ilut_select_kernels.hpp.inc
@@ -62,7 +62,7 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree(
// assuming rounding towards zero
auto stride = double(size) / sample_size;
#pragma unroll
-for (auto i = 0; i < sampleselect_oversampling; ++i) {
+for (int i = 0; i < sampleselect_oversampling; ++i) {
auto lidx = idx * sampleselect_oversampling + i;
auto val = input[static_cast<IndexType>(lidx * stride)];
samples[i] = abs(val);
@@ -119,7 +119,7 @@ __global__ __launch_bounds__(default_block_size) void count_buckets(
auto el = abs(input[i]);
IndexType tree_idx{};
#pragma unroll
-for (auto level = 0; level < sampleselect_searchtree_height; ++level) {
+for (int level = 0; level < sampleselect_searchtree_height; ++level) {
auto cmp = !(el < sh_tree[tree_idx]);
tree_idx = 2 * tree_idx + 1 + cmp;
}
@@ -168,7 +168,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum(
// compute prefix sum over warp-sized blocks
IndexType total{};
auto base_idx = warp_idx * work_per_warp * warp.size();
-for (auto step = 0; step < work_per_warp; ++step) {
+for (IndexType step = 0; step < work_per_warp; ++step) {
auto idx = warp_lane + step * warp.size() + base_idx;
auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
IndexType warp_total{};
@@ -207,7 +207,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum(
// add block prefix sum to each warp's block of data
block.sync();
auto warp_prefixsum = warp_sums[warp_idx];
-for (auto step = 0; step < work_per_warp; ++step) {
+for (IndexType step = 0; step < work_per_warp; ++step) {
auto idx = warp_lane + step * warp.size() + base_idx;
auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
if (idx < num_blocks) {
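
The count_buckets loop above descends an implicit complete binary tree of splitters stored in an array: the children of node i live at 2*i+1 and 2*i+2, and the comparison result is folded into the index arithmetic so the descent is branch-free. A host-side sketch with made-up values (not code from the PR):

#include <cstdio>

int main()
{
    // 3-level tree of 7 sorted splitters; its leaves are 8 buckets
    const double tree[] = {4.0, 2.0, 6.0, 1.0, 3.0, 5.0, 7.0};
    const int height = 3;
    const double el = 5.5;

    int tree_idx = 0;
    for (int level = 0; level < height; ++level) {
        // cmp is 0 for the left child, 1 for the right child
        const int cmp = !(el < tree[tree_idx]);
        tree_idx = 2 * tree_idx + 1 + cmp;
    }
    // map the leaf index back to a bucket number in [0, 8)
    const int bucket = tree_idx - 7;  // subtract the number of inner nodes
    std::printf("element %.1f falls into bucket %d\n", el, bucket);  // bucket 5
}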
2 changes: 1 addition & 1 deletion common/matrix/dense_kernels.hpp.inc
@@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(

if (tidx < num_rows) {
auto write_to = row_ptrs[tidx];
-for (auto i = 0; i < num_cols; i++) {
+for (size_type i = 0; i < num_cols; i++) {
if (source[stride * tidx + i] != zero<ValueType>()) {
values[write_to] = source[stride * tidx + i];
col_idxs[write_to] = i;
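
To show the data flow of the fill_in_csr kernel above: each row writes its nonzero values and column indices starting at the offset recorded in row_ptrs, which a prior prefix sum over the per-row nonzero counts produced. A serial host-side equivalent with made-up values (illustrative only):

#include <cstdio>

int main()
{
    const int num_rows = 3, num_cols = 3, stride = 3;
    const double source[] = {1.0, 0.0, 2.0,   // row 0
                             0.0, 0.0, 0.0,   // row 1
                             0.0, 3.0, 0.0};  // row 2
    const int row_ptrs[] = {0, 2, 2, 3};      // from a prefix sum of row counts
    double values[3];
    int col_idxs[3];

    for (int row = 0; row < num_rows; ++row) {  // one thread per row on device
        int write_to = row_ptrs[row];
        for (int i = 0; i < num_cols; ++i) {
            if (source[stride * row + i] != 0.0) {
                values[write_to] = source[stride * row + i];
                col_idxs[write_to] = i;
                ++write_to;
            }
        }
    }
    for (int k = 0; k < row_ptrs[num_rows]; ++k)
        std::printf("(%d, %g) ", col_idxs[k], values[k]);  // (0, 1) (2, 2) (1, 3)
}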
4 changes: 2 additions & 2 deletions common/matrix/ell_kernels.hpp.inc
@@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense(
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < num_rows) {
-for (auto col = 0; col < nnz; col++) {
+for (size_type col = 0; col < nnz; col++) {
result[tidx * result_stride +
col_idxs[tidx + col * source_stride]] +=
values[tidx + col * source_stride];
@@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(

if (tidx < num_rows) {
auto write_to = result_row_ptrs[tidx];
-for (auto i = 0; i < max_nnz_per_row; i++) {
+for (size_type i = 0; i < max_nnz_per_row; i++) {
const auto source_idx = tidx + stride * i;
if (source_values[source_idx] != zero<ValueType>()) {
result_values[write_to] = source_values[source_idx];
2 changes: 1 addition & 1 deletion common/matrix/hybrid_kernels.hpp.inc
@@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(

if (tidx < num_rows) {
auto write_to = result_row_ptrs[tidx];
-for (auto i = 0; i < max_nnz_per_row; i++) {
+for (size_type i = 0; i < max_nnz_per_row; i++) {
const auto source_idx = tidx + stride * i;
if (ell_val[source_idx] != zero<ValueType>()) {
result_values[write_to] = ell_val[source_idx];
6 changes: 3 additions & 3 deletions cuda/components/prefix_sum.cu
@@ -49,16 +49,16 @@ template <typename IndexType>
void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts,
size_type num_entries)
{
-// prefix_sum should be on the valid array
+// prefix_sum should only be performed on a valid array
if (num_entries > 0) {
auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
Array<IndexType> block_sum_array(exec, num_blocks - 1);
auto block_sums = block_sum_array.get_data();
start_prefix_sum<prefix_sum_block_size>
<<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
block_sums);
-// add the total sum of the previous block only when the number of block
-// is larger than 1.
+// add the total sum of the previous block only when the number of
+// blocks is larger than 1.
if (num_blocks > 1) {
finalize_prefix_sum<prefix_sum_block_size>
<<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
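
The two kernels here split the scan into two phases: start_prefix_sum scans each block locally and records the block totals, and finalize_prefix_sum adds the accumulated totals of the preceding blocks, which is only needed when there is more than one block. A serial sketch of that decomposition (illustrative, not the device code):

#include <cstdio>
#include <vector>

int main()
{
    const int block_size = 4;
    std::vector<int> counts = {1, 2, 1, 3, 2, 2, 1, 1, 4};
    const int n = static_cast<int>(counts.size());
    const int num_blocks = (n + block_size - 1) / block_size;

    // phase 1 (start_prefix_sum): exclusive scan inside each block,
    // remembering every block's total
    std::vector<int> block_sums(num_blocks, 0);
    for (int b = 0; b < num_blocks; ++b) {
        int sum = 0;
        for (int i = b * block_size; i < (b + 1) * block_size && i < n; ++i) {
            const int v = counts[i];
            counts[i] = sum;
            sum += v;
        }
        block_sums[b] = sum;
    }
    // phase 2 (finalize_prefix_sum): only needed when num_blocks > 1
    int offset = 0;
    for (int b = 1; b < num_blocks; ++b) {
        offset += block_sums[b - 1];
        for (int i = b * block_size; i < (b + 1) * block_size && i < n; ++i) {
            counts[i] += offset;
        }
    }
    for (const int v : counts) std::printf("%d ", v);  // 0 1 3 4 7 9 11 12 13
}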
2 changes: 1 addition & 1 deletion cuda/test/components/sorting_kernels.cu
@@ -99,7 +99,7 @@ protected:
{
// we want some duplicate elements
std::uniform_int_distribution<gko::int32> dist(0, num_elements / 2);
-for (auto i = 0; i < num_elements; ++i) {
+for (int i = 0; i < num_elements; ++i) {
ref_shared.get_data()[i] = dist(rng);
}
ddata = gko::Array<gko::int32>{cuda, ref_shared};
2 changes: 1 addition & 1 deletion cuda/test/matrix/dense_kernels.cpp
@@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef)
&dnnz_per_row);

auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
-for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) {
+for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
}
}
2 changes: 1 addition & 1 deletion cuda/test/matrix/ell_kernels.cpp
@@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef)
&dnnz_per_row);

auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
-for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) {
+for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
}
}
1 change: 0 additions & 1 deletion dev_tools/scripts/format_header.sh
@@ -121,7 +121,6 @@ GINKGO_LICENSE_BEACON="******************************<GINKGO LICENSE>***********

CONTENT="content.cpp" # Store the residual part (start from namespace)
BEFORE="before.cpp" # Store the main header and the #ifdef/#define of header file
-BEGIN="begin.cpp" # Store the header before license
HAS_HIP_RUNTIME="false"
DURING_LICENSE="false"
INCLUDE_REGEX="^#include.*"
2 changes: 1 addition & 1 deletion dev_tools/scripts/regroup
@@ -2,7 +2,7 @@ IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<(rapidjson|gflags|gtest|papi).*'
Priority: 3
-- Regex: '^<(omp|cu|hip|thrust|CL/|cooperative).*'
+- Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi).*'
Priority: 2
- Regex: '^<ginkgo.*'
Priority: 5
19 changes: 17 additions & 2 deletions dpcpp/CMakeLists.txt
@@ -6,11 +6,15 @@ endif()
ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION)
set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE)

+find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}")
+set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE)

add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> "")
target_sources(ginkgo_dpcpp
PRIVATE
base/version.dp.cpp
base/executor.dp.cpp
+base/helper.dp.cpp
components/absolute_array.dp.cpp
components/fill_array.dp.cpp
components/prefix_sum.dp.cpp
@@ -54,13 +58,24 @@ target_sources(ginkgo_dpcpp
ginkgo_compile_features(ginkgo_dpcpp)
target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP)

set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -fsycl)
+set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE)
target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}")
+# Note: add MKL as PRIVATE, not PUBLIC (as the MKL example shows), to avoid
+# propagating find_package(MKL) everywhere when linking ginkgo (see the MKL example
+# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html)
+target_compile_options(ginkgo_dpcpp PRIVATE $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_COMPILE_OPTIONS>)
target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17)
+target_include_directories(ginkgo_dpcpp PRIVATE $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_INCLUDE_DIRECTORIES>)
target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all)
-target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel)
+# When building ginkgo as a static library, users need to link with dpcpp
+# and the per_kernel link option when their program uses a DPC++-related function.
+if (BUILD_SHARED_LIBS)
+    target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel)
+else ()
+    target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel)
+endif()
target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
+target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP)
if (GINKGO_DPCPP_SINGLE_MODE)
target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
endif()
