Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Anton/experimental fpga #298

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
e762183
Enable selecting hipSYCL as SYCL implementation (#296)
fodinabor May 11, 2021
fdbb3af
Fix race condition in ReductionPartialRows kernel (#299)
KumudhaN Aug 12, 2021
be3ff4e
Fix invalid memory access issue in ReductionPartialRows kernel (#300)
KumudhaN Aug 18, 2021
5b9b967
Rebase on master (#305)
isaacault Sep 24, 2021
03c040a
Add export files for build and install (#312)
DuncanMcBain Oct 25, 2021
b9c086b
Fix how libraries propagate in SYCL-BLAS lib (#314)
DuncanMcBain Nov 9, 2021
e2bef35
Add support for partial column reduction (#309)
sgeor255 Nov 10, 2021
c6e7588
Add cmake option to disable extensions (#310)
sgeor255 Nov 11, 2021
add0eb8
Add support for Mean reduction (#311)
sgeor255 Nov 12, 2021
fcdd153
Improve testing of Reduction (#315)
sgeor255 Nov 12, 2021
2ae4b24
Experimental support for running on fpga
ShanoToni Apr 21, 2021
43f982d
Extended header only macro to build benchmarks as well
ShanoToni Apr 22, 2021
6888705
Modified cmake to allow c++ standard definition from command line
ShanoToni Apr 23, 2021
5324481
New queue constructor in runtime for enqueueing dummy kernel
ShanoToni May 6, 2021
e8d001a
Header only fpga support for benchmarks
ShanoToni May 10, 2021
734e3d0
Experimental support for running on fpga
ShanoToni Apr 21, 2021
5e5a761
Removed use of new queue constructors for fpga
ShanoToni May 17, 2021
6cdf3a6
Removed copy kernel from policy handler
ShanoToni May 25, 2021
b18ee8c
Added flag for enqueueing new kernel
ShanoToni May 26, 2021
63e6900
Fixed duplication in cmake
ShanoToni Dec 6, 2021
e89e2f4
Fixed duplication in cmake 2
ShanoToni Dec 6, 2021
e56d7b2
Revert naming change
ShanoToni Dec 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 8 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ if(EXISTS ${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
endif()

set(BUILD_SHARED_LIBS ON CACHE BOOL "")
set(CMAKE_CXX_STANDARD 11)
# Fall back to C++11 only when CMAKE_CXX_STANDARD does not already hold a
# truthy value, so a standard chosen by the user (for example
# -DCMAKE_CXX_STANDARD=17 on the command line) is not overwritten.
if(NOT "${CMAKE_CXX_STANDARD}")
set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ${BUILD_SHARED_LIBS})
Expand Down Expand Up @@ -195,6 +197,11 @@ option(BLAS_HEADER_ONLY_TESTING "Whether to use sycl-blas testing in header-only
if(BLAS_HEADER_ONLY_TESTING)
add_definitions(-DBLAS_HEADER_ONLY_TESTING)
endif()
# Opt-in switch (OFF by default) for enqueueing a dummy kernel in the queue
# constructor. When turned on, the ENQUEUE_FIRST_KERNEL preprocessor symbol
# is passed to the compiler through add_definitions.
option(ENQUEUE_FIRST_KERNEL "Whether to enqueue dummy kernel in queue constructor" OFF)
if(ENQUEUE_FIRST_KERNEL)
add_definitions(-DENQUEUE_FIRST_KERNEL)
endif()

if(${BLAS_ENABLE_TESTING})
enable_testing()
Expand Down
27 changes: 21 additions & 6 deletions benchmark/syclblas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ if(BLAS_VERIFY_BENCHMARK)
find_package(SystemBLAS REQUIRED)
endif()

# Header-only builds locate sycl-blas through find_package; SyclBLAS_DIR is
# pointed two directories up from this one so the package config is searched
# for there.
# The variable is tested by name (not as ${...}) so that its value is not
# dereferenced a second time by if().
if(BLAS_HEADER_ONLY_TESTING)
  set(SyclBLAS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../..)
  find_package(SyclBLAS REQUIRED)
endif()

set(sources
# Level 1 blas
blas1/axpy.cpp
Expand All @@ -27,12 +32,22 @@ endif()
foreach(syclblas_bench ${sources})
get_filename_component(bench_exec ${syclblas_bench} NAME_WE)
add_executable(bench_${bench_exec} ${syclblas_bench} main.cpp)
target_link_libraries(bench_${bench_exec} PRIVATE benchmark Clara::Clara sycl_blas)
add_sycl_to_target(
TARGET bench_${bench_exec}
SOURCES ${syclblas_bench}
)
target_include_directories(bench_${bench_exec} PRIVATE ${SYCLBLAS_INCLUDE} ${CBLAS_INCLUDE} ${SYCLBLAS_COMMON_INCLUDE_DIR})

if(${BLAS_HEADER_ONLY_TESTING})
target_link_libraries(bench_${bench_exec} PRIVATE benchmark Clara::Clara SyclBLAS::SyclBLAS)
add_sycl_to_target(
TARGET bench_${bench_exec}
SOURCES ${syclblas_bench}
)
target_include_directories(bench_${bench_exec} PRIVATE ${CBLAS_INCLUDE} ${SYCLBLAS_COMMON_INCLUDE_DIR})
else()
target_link_libraries(bench_${bench_exec} PRIVATE benchmark Clara::Clara sycl_blas)
add_sycl_to_target(
TARGET bench_${bench_exec}
SOURCES ${syclblas_bench}
)
target_include_directories(bench_${bench_exec} PRIVATE ${SYCLBLAS_INCLUDE} ${CBLAS_INCLUDE} ${SYCLBLAS_COMMON_INCLUDE_DIR})
endif()

if(BLAS_VERIFY_BENCHMARK)
target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_VERIFY_BENCHMARK)
Expand Down
31 changes: 31 additions & 0 deletions benchmark/syclblas/blas1/asum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,25 @@ std::string get_name(int size) {
}

template <typename scalar_t>
#ifdef SYCL_BLAS_FPGA
void run(benchmark::State& state, index_t size,
bool* success) {
#else
void run(benchmark::State& state, ExecutorType* executorPtr, index_t size,
bool* success) {
#endif
// Google-benchmark counters are double.
double size_d = static_cast<double>(size);
state.counters["size"] = size_d;
state.counters["n_fl_ops"] = 2.0 * size_d;
state.counters["bytes_processed"] = size_d * sizeof(scalar_t);

#ifdef SYCL_BLAS_FPGA
auto q = blas_benchmark::utils::make_queue();
ExecutorType ex(q);
#else
ExecutorType& ex = *executorPtr;
#endif

using data_t = utils::data_storage_t<scalar_t>;

Expand Down Expand Up @@ -106,23 +116,44 @@ void run(benchmark::State& state, ExecutorType* executorPtr, index_t size,
}

// Registers one Google Benchmark entry per problem size for `scalar_t`.
//
// The sizes are obtained from blas_benchmark::utils::get_blas1_params(args).
// Each entry is named by get_name<scalar_t>(size) and, when executed, calls
// run<scalar_t>. In SYCL_BLAS_FPGA builds no executor is passed through;
// otherwise `exPtr` is handed on to run<scalar_t>.
//
// `success` is passed unchanged to every registered benchmark.
template <typename scalar_t>
#ifdef SYCL_BLAS_FPGA
void register_benchmark(blas_benchmark::Args& args, bool* success) {
#else
void register_benchmark(blas_benchmark::Args& args, ExecutorType* exPtr,
                        bool* success) {
#endif
  auto blas1_params = blas_benchmark::utils::get_blas1_params(args);

  for (auto size : blas1_params) {
    // The lambdas below capture nothing: the benchmark library stores the
    // callable past the end of this function, and every value it needs is
    // supplied as an explicit argument to RegisterBenchmark instead.
#ifdef SYCL_BLAS_FPGA
    auto BM_lambda = [](benchmark::State& st, index_t bench_size,
                        bool* bench_success) {
      run<scalar_t>(st, bench_size, bench_success);
    };
    benchmark::RegisterBenchmark(get_name<scalar_t>(size).c_str(), BM_lambda,
                                 size, success);
#else
    auto BM_lambda = [](benchmark::State& st, ExecutorType* bench_ex,
                        index_t bench_size, bool* bench_success) {
      run<scalar_t>(st, bench_ex, bench_size, bench_success);
    };
    benchmark::RegisterBenchmark(get_name<scalar_t>(size).c_str(), BM_lambda,
                                 exPtr, size, success);
#endif
  }
}

namespace blas_benchmark {
// Entry point used to set up the benchmarks of this translation unit. It
// hands its arguments straight to the BLAS_REGISTER_BENCHMARK macro (defined
// outside this file). The executor pointer is only part of the signature
// when SYCL_BLAS_FPGA is not defined.
#ifndef SYCL_BLAS_FPGA
void create_benchmark(blas_benchmark::Args& args, ExecutorType* exPtr,
                      bool* success) {
  BLAS_REGISTER_BENCHMARK(args, exPtr, success);
}
#else
void create_benchmark(Args& args, bool* success) {
  BLAS_REGISTER_BENCHMARK(args, success);
}
#endif
}  // namespace blas_benchmark
31 changes: 31 additions & 0 deletions benchmark/syclblas/blas1/axpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,25 @@ std::string get_name(int size) {
}

template <typename scalar_t>
#ifdef SYCL_BLAS_FPGA
void run(benchmark::State& state, index_t size,
bool* success) {
#else
void run(benchmark::State& state, ExecutorType* executorPtr, index_t size,
bool* success) {
#endif
// Google-benchmark counters are double.
double size_d = static_cast<double>(size);
state.counters["size"] = size_d;
state.counters["n_fl_ops"] = 2.0 * size_d;
state.counters["bytes_processed"] = 3.0 * size_d * sizeof(scalar_t);

#ifdef SYCL_BLAS_FPGA
auto q = blas_benchmark::utils::make_queue();
ExecutorType ex(q);
#else
ExecutorType& ex = *executorPtr;
#endif

using data_t = utils::data_storage_t<scalar_t>;

Expand Down Expand Up @@ -103,23 +113,44 @@ void run(benchmark::State& state, ExecutorType* executorPtr, index_t size,
}

// Registers one Google Benchmark entry per problem size for `scalar_t`.
//
// The sizes are obtained from blas_benchmark::utils::get_blas1_params(args).
// Each entry is named by get_name<scalar_t>(size) and, when executed, calls
// run<scalar_t>. In SYCL_BLAS_FPGA builds no executor is passed through;
// otherwise `exPtr` is handed on to run<scalar_t>.
//
// `success` is passed unchanged to every registered benchmark.
template <typename scalar_t>
#ifdef SYCL_BLAS_FPGA
void register_benchmark(blas_benchmark::Args& args, bool* success) {
#else
void register_benchmark(blas_benchmark::Args& args, ExecutorType* exPtr,
                        bool* success) {
#endif
  auto blas1_params = blas_benchmark::utils::get_blas1_params(args);

  for (auto size : blas1_params) {
    // The lambdas below capture nothing: the benchmark library stores the
    // callable past the end of this function, and every value it needs is
    // supplied as an explicit argument to RegisterBenchmark instead.
#ifdef SYCL_BLAS_FPGA
    auto BM_lambda = [](benchmark::State& st, index_t bench_size,
                        bool* bench_success) {
      run<scalar_t>(st, bench_size, bench_success);
    };
    benchmark::RegisterBenchmark(get_name<scalar_t>(size).c_str(), BM_lambda,
                                 size, success);
#else
    auto BM_lambda = [](benchmark::State& st, ExecutorType* bench_ex,
                        index_t bench_size, bool* bench_success) {
      run<scalar_t>(st, bench_ex, bench_size, bench_success);
    };
    benchmark::RegisterBenchmark(get_name<scalar_t>(size).c_str(), BM_lambda,
                                 exPtr, size, success);
#endif
  }
}

namespace blas_benchmark {
// Entry point used to set up the benchmarks of this translation unit. It
// hands its arguments straight to the BLAS_REGISTER_BENCHMARK macro (defined
// outside this file). The executor pointer is only part of the signature
// when SYCL_BLAS_FPGA is not defined.
#ifndef SYCL_BLAS_FPGA
void create_benchmark(blas_benchmark::Args& args, ExecutorType* exPtr,
                      bool* success) {
  BLAS_REGISTER_BENCHMARK(args, exPtr, success);
}
#else
void create_benchmark(Args& args, bool* success) {
  BLAS_REGISTER_BENCHMARK(args, success);
}
#endif
}  // namespace blas_benchmark
31 changes: 31 additions & 0 deletions benchmark/syclblas/blas1/dot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,25 @@ std::string get_name(int size) {
}

template <typename scalar_t>
#ifdef SYCL_BLAS_FPGA
void run(benchmark::State& state, index_t size,
bool* success) {
#else
void run(benchmark::State& state, ExecutorType* executorPtr, index_t size,
bool* success) {
#endif
// Google-benchmark counters are double.
double size_d = static_cast<double>(size);
state.counters["size"] = size_d;
state.counters["n_fl_ops"] = 2 * size_d;
state.counters["bytes_processed"] = 2 * size_d * sizeof(scalar_t);

#ifdef SYCL_BLAS_FPGA
auto q = blas_benchmark::utils::make_queue();
ExecutorType ex(q);
#else
ExecutorType& ex = *executorPtr;
#endif

using data_t = utils::data_storage_t<scalar_t>;

Expand Down Expand Up @@ -105,23 +115,44 @@ void run(benchmark::State& state, ExecutorType* executorPtr, index_t size,
}

// Registers one Google Benchmark entry per problem size for `scalar_t`.
//
// The sizes are obtained from blas_benchmark::utils::get_blas1_params(args).
// Each entry is named by get_name<scalar_t>(size) and, when executed, calls
// run<scalar_t>. In SYCL_BLAS_FPGA builds no executor is passed through;
// otherwise `exPtr` is handed on to run<scalar_t>.
//
// `success` is passed unchanged to every registered benchmark.
template <typename scalar_t>
#ifdef SYCL_BLAS_FPGA
void register_benchmark(blas_benchmark::Args& args, bool* success) {
#else
void register_benchmark(blas_benchmark::Args& args, ExecutorType* exPtr,
                        bool* success) {
#endif
  auto blas1_params = blas_benchmark::utils::get_blas1_params(args);

  for (auto size : blas1_params) {
    // The lambdas below capture nothing: the benchmark library stores the
    // callable past the end of this function, and every value it needs is
    // supplied as an explicit argument to RegisterBenchmark instead.
#ifdef SYCL_BLAS_FPGA
    auto BM_lambda = [](benchmark::State& st, index_t bench_size,
                        bool* bench_success) {
      run<scalar_t>(st, bench_size, bench_success);
    };
    benchmark::RegisterBenchmark(get_name<scalar_t>(size).c_str(), BM_lambda,
                                 size, success);
#else
    auto BM_lambda = [](benchmark::State& st, ExecutorType* bench_ex,
                        index_t bench_size, bool* bench_success) {
      run<scalar_t>(st, bench_ex, bench_size, bench_success);
    };
    benchmark::RegisterBenchmark(get_name<scalar_t>(size).c_str(), BM_lambda,
                                 exPtr, size, success);
#endif
  }
}

namespace blas_benchmark {
// Entry point used to set up the benchmarks of this translation unit. It
// hands its arguments straight to the BLAS_REGISTER_BENCHMARK macro (defined
// outside this file). The executor pointer is only part of the signature
// when SYCL_BLAS_FPGA is not defined.
#ifndef SYCL_BLAS_FPGA
void create_benchmark(blas_benchmark::Args& args, ExecutorType* exPtr,
                      bool* success) {
  BLAS_REGISTER_BENCHMARK(args, exPtr, success);
}
#else
void create_benchmark(Args& args, bool* success) {
  BLAS_REGISTER_BENCHMARK(args, success);
}
#endif
}  // namespace blas_benchmark
31 changes: 31 additions & 0 deletions benchmark/syclblas/blas1/iamax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,25 @@ std::string get_name(int size) {
}

template <typename scalar_t>
#ifdef SYCL_BLAS_FPGA
void run(benchmark::State& state, index_t size,
bool* success) {
#else
void run(benchmark::State& state, ExecutorType* executorPtr, index_t size,
bool* success) {
#endif
// Google-benchmark counters are double.
double size_d = static_cast<double>(size);
state.counters["size"] = size_d;
state.counters["n_fl_ops"] = 2 * size_d;
state.counters["bytes_processed"] = size_d * sizeof(scalar_t);

#ifdef SYCL_BLAS_FPGA
auto q = blas_benchmark::utils::make_queue();
ExecutorType ex(q);
#else
ExecutorType& ex = *executorPtr;
#endif

using data_t = utils::data_storage_t<scalar_t>;
using tuple_scalar_t = blas::IndexValueTuple<index_t, scalar_t>;
Expand Down Expand Up @@ -107,23 +117,44 @@ void run(benchmark::State& state, ExecutorType* executorPtr, index_t size,
}

// Registers one Google Benchmark entry per problem size for `scalar_t`.
//
// The sizes are obtained from blas_benchmark::utils::get_blas1_params(args).
// Each entry is named by get_name<scalar_t>(size) and, when executed, calls
// run<scalar_t>. In SYCL_BLAS_FPGA builds no executor is passed through;
// otherwise `exPtr` is handed on to run<scalar_t>.
//
// `success` is passed unchanged to every registered benchmark.
template <typename scalar_t>
#ifdef SYCL_BLAS_FPGA
void register_benchmark(blas_benchmark::Args& args, bool* success) {
#else
void register_benchmark(blas_benchmark::Args& args, ExecutorType* exPtr,
                        bool* success) {
#endif
  auto blas1_params = blas_benchmark::utils::get_blas1_params(args);

  for (auto size : blas1_params) {
    // The lambdas below capture nothing: the benchmark library stores the
    // callable past the end of this function, and every value it needs is
    // supplied as an explicit argument to RegisterBenchmark instead.
#ifdef SYCL_BLAS_FPGA
    auto BM_lambda = [](benchmark::State& st, index_t bench_size,
                        bool* bench_success) {
      run<scalar_t>(st, bench_size, bench_success);
    };
    benchmark::RegisterBenchmark(get_name<scalar_t>(size).c_str(), BM_lambda,
                                 size, success);
#else
    auto BM_lambda = [](benchmark::State& st, ExecutorType* bench_ex,
                        index_t bench_size, bool* bench_success) {
      run<scalar_t>(st, bench_ex, bench_size, bench_success);
    };
    benchmark::RegisterBenchmark(get_name<scalar_t>(size).c_str(), BM_lambda,
                                 exPtr, size, success);
#endif
  }
}

namespace blas_benchmark {
// Entry point used to set up the benchmarks of this translation unit. It
// hands its arguments straight to the BLAS_REGISTER_BENCHMARK macro (defined
// outside this file). The executor pointer is only part of the signature
// when SYCL_BLAS_FPGA is not defined.
#ifndef SYCL_BLAS_FPGA
void create_benchmark(blas_benchmark::Args& args, ExecutorType* exPtr,
                      bool* success) {
  BLAS_REGISTER_BENCHMARK(args, exPtr, success);
}
#else
void create_benchmark(Args& args, bool* success) {
  BLAS_REGISTER_BENCHMARK(args, success);
}
#endif
}  // namespace blas_benchmark