Skip to content

Commit

Permalink
Merge branch 'master' into mixed_precision_gemm
Browse files Browse the repository at this point in the history
  • Loading branch information
OuadiElfarouki committed May 7, 2024
2 parents 8e2930b + 628017f commit d0a64b9
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 16 deletions.
25 changes: 22 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ set(PORTBLAS_GENERATED_SRC ${CMAKE_CURRENT_BINARY_DIR}/generated_src)
set(PORTBLAS_INCLUDE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
set(PORTBLAS_COMMON_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common/include)
# Location of the library sources for header-only installs, expressed as a
# two-element list: the source tree's `src` directory in the build interface,
# and `src` (relative to the install prefix) in the install interface.
if(INSTALL_HEADER_ONLY)
  set(PORTBLAS_INSTALL_SRC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
    $<INSTALL_INTERFACE:src>
  )
endif()
set(PORTBLAS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/src)
set(PORTBLAS_SRC_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/python_generator)
list(APPEND THIRD_PARTIES_INCLUDE ${CBLAS_INCLUDE})
Expand Down Expand Up @@ -142,8 +145,24 @@ include(CmakeFunctionHelper)
if (INSTALL_HEADER_ONLY)
add_library(portblas INTERFACE)
set_target_properties(portblas PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${PORTBLAS_INCLUDE};$<INSTALL_INTERFACE:src>"
)
INTERFACE_INCLUDE_DIRECTORIES "${PORTBLAS_INCLUDE};${PORTBLAS_INSTALL_SRC}")
# Usage requirements attached to the header-only `portblas` INTERFACE target.
# Everything below is INTERFACE-scoped, so it applies to targets that link
# against `portblas`.
include(CheckCXXCompilerFlag)
# `is_dpcpp` records whether the C++ compiler accepts `-fsycl`; SB_ENABLE_USM
# is defined only in that case.
check_cxx_compiler_flag("-fsycl" is_dpcpp)
if(is_dpcpp)
  target_compile_definitions(portblas INTERFACE "SB_ENABLE_USM")
endif()
# Test the option by name: an unset or empty BLAS_ENABLE_COMPLEX is simply
# false and its value is never re-interpreted as another variable name.
if(BLAS_ENABLE_COMPLEX)
  target_compile_definitions(portblas INTERFACE "BLAS_ENABLE_COMPLEX")
endif()
target_compile_definitions(portblas INTERFACE ${TUNING_TARGET})
target_compile_options(portblas INTERFACE
  -Wno-deprecated-declarations
  -Wno-deprecated-copy-with-user-provided-copy
)
# Both operands are quoted so the condition stays well-formed when
# TUNING_TARGET is empty (an unquoted empty expansion would leave a dangling
# STREQUAL and abort configuration).
if(("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM") AND NOT
   ("${TUNING_TARGET}" STREQUAL "INTEL_GPU"))
  target_compile_options(portblas INTERFACE
    -fno-fast-math
    -mllvm=-loopopt=0
  )
  message(STATUS "Adding -fno-fast-math -mllvm=-loopopt=0 to portblas")
endif()
else()
add_subdirectory(src)
build_library(portblas ${BLAS_ENABLE_EXTENSIONS})
Expand Down Expand Up @@ -202,7 +221,7 @@ install(DIRECTORY ${PORTBLAS_INCLUDE}
FILES_MATCHING PATTERN "*.h"
)
if (INSTALL_HEADER_ONLY)
install(DIRECTORY ${PORTBLAS_SRC}
install(DIRECTORY ${PORTBLAS_INSTALL_SRC}
DESTINATION ${CMAKE_INSTALL_PREFIX}
COMPONENT portblas
FILES_MATCHING PATTERN "*.hpp"
Expand Down
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -401,8 +401,7 @@ to clone submodule(s).

### Compile with DPC++
```bash
export CC=[path/to/intel/clang]
export CXX=[path/to/intel/clang++]
export CXX=[path/to/intel/icpx]
cd build
cmake -GNinja ../ -DSYCL_COMPILER=dpcpp
ninja
Expand All @@ -417,6 +416,13 @@ advisable for NVIDIA and **mandatory for AMD** to provide the specific device
architecture through `-DDPCPP_SYCL_ARCH=<arch>`, e.g., `<arch>` can be `sm_80`
for NVIDIA or `gfx908` for AMD.

#### DPC++ Compiler Support

The project is fully compatible with `icpx`, the DPC++ SYCL compiler provided by the
Intel [oneAPI base-toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html#gs.7t6x52),
which is the recommended compiler. PortBLAS can also be compiled with the [open source intel/llvm](https://github.com/intel/llvm)
compiler, but not all of the latest changes are tested with it.

### Compile with AdaptiveCpp *(Formerly hipSYCL)*
The following instructions concern the **generic** *(clang-based)* flow supported
by AdaptiveCpp.
Expand Down
6 changes: 6 additions & 0 deletions benchmark/portblas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,12 @@ foreach(portblas_bench ${sources})
target_link_libraries(bench_${bench_exec} PRIVATE blas::blas)
endif()

# CMake reports the IntelLLVM compiler ID only for compilers distributed with
# Intel oneAPI releases, so this flag is not added when building with the
# open source intel/llvm compiler.
# The compiler ID is quoted so the comparison stays well-formed if it expands
# to an empty string, and so its value is not re-read as a variable name.
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
  target_compile_options(bench_${bench_exec} PRIVATE "-fno-fast-math")
endif()

message(STATUS "Created benchmark: ${bench_exec}")
install(TARGETS bench_${bench_exec}
RUNTIME
Expand Down
8 changes: 8 additions & 0 deletions cmake/Modules/FindDPCPP.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ function(add_sycl_to_target)
"${multi_value_args}"
${ARGN}
)
# CMake reports the IntelLLVM compiler ID only for compilers distributed with
# Intel oneAPI releases, so these flags are not added when building with the
# open source intel/llvm compiler. They are also skipped when the tuning
# target is INTEL_GPU.
# Both operands are quoted so the condition stays well-formed when
# TUNING_TARGET is empty (an unquoted empty expansion would leave a dangling
# STREQUAL and abort configuration).
if(("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM") AND NOT
   ("${TUNING_TARGET}" STREQUAL "INTEL_GPU"))
  target_compile_options(${SB_ADD_SYCL_TARGET} PRIVATE
    -fno-fast-math
    -mllvm=-loopopt=0
  )
  message(STATUS "Adding -fno-fast-math -mllvm=-loopopt=0 to target ${SB_ADD_SYCL_TARGET}")
endif()
target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE)
if (NOT target_type STREQUAL "OBJECT_LIBRARY")
Expand Down
4 changes: 2 additions & 2 deletions src/operations/blas1/IndexMaxMin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ PORTBLAS_INLINE void IndexMaxMin<is_max, is_step0, lhs_t, rhs_t>::eval(
// reduction within the sub_group
for (index_t i = sg_local_range >> 1; i > 0; i >>= 1) {
if (sg_local_id < i) {
element_t shfl_val = sg.shuffle_down(val.get_value(), i);
index_t shfl_idx = sg.shuffle_down(val.get_index(), i);
element_t shfl_val = sycl::shift_group_left(sg, val.get_value(), i);
index_t shfl_idx = sycl::shift_group_left(sg, val.get_index(), i);
value_t shfl{shfl_idx, shfl_val};
val = op::eval(val, shfl);
}
Expand Down
12 changes: 3 additions & 9 deletions test/unittest/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,6 @@ if(is_dpcpp)
)
endif()


# Contains tests that fail if compiled with -ffast-math
set(SYCL_UNITTEST_NOFASTMATH
${PORTBLAS_UNITTEST}/blas1/blas1_rotg_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_rotmg_test.cpp
)

if(GEMM_TALL_SKINNY_SUPPORT)
list(APPEND SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas3/blas3_gemm_tall_skinny_test.cpp)
endif()
Expand Down Expand Up @@ -142,8 +135,9 @@ foreach(blas_test ${SYCL_UNITTEST_SRCS})
target_link_libraries(${test_exec} PRIVATE gtest_main Clara::Clara blas::blas portblas)
target_include_directories(${test_exec} PRIVATE ${CBLAS_INCLUDE} ${PORTBLAS_COMMON_INCLUDE_DIR})

list (FIND SYCL_UNITTEST_NOFASTMATH ${blas_test} _index)
if (${_index} GREATER -1)
# CMake reports the IntelLLVM compiler ID only for compilers distributed with
# Intel oneAPI releases, so this flag is not added when building with the
# open source intel/llvm compiler.
# The compiler ID is quoted so the comparison stays well-formed if it expands
# to an empty string, and so its value is not re-read as a variable name.
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
  target_compile_options(${test_exec} PRIVATE "-fno-fast-math")
endif()

Expand Down

0 comments on commit d0a64b9

Please sign in to comment.