From 1695f2c7625fa8a0a01e12cd2edb89b47e1d8aec Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Sat, 22 May 2021 22:25:07 +0200
Subject: [PATCH 1/3] add BLAS benchmark

---
 benchmark/CMakeLists.txt      |   1 +
 benchmark/blas/CMakeLists.txt |   1 +
 benchmark/blas/blas.cpp       | 540 ++++++++++++++++++++++++++++++++++
 3 files changed, 542 insertions(+)
 create mode 100644 benchmark/blas/CMakeLists.txt
 create mode 100644 benchmark/blas/blas.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index ad719467c2b..da1c80b31e1 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -97,6 +97,7 @@ function(ginkgo_add_typed_benchmark_executables name use_lib_linops)
 endfunction(ginkgo_add_typed_benchmark_executables)
 
 
+add_subdirectory(blas)
 add_subdirectory(conversions)
 add_subdirectory(matrix_generator)
 add_subdirectory(matrix_statistics)
diff --git a/benchmark/blas/CMakeLists.txt b/benchmark/blas/CMakeLists.txt
new file mode 100644
index 00000000000..c3e40e80bc2
--- /dev/null
+++ b/benchmark/blas/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_add_typed_benchmark_executables(blas "NO" blas.cpp)
diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
new file mode 100644
index 00000000000..1f9a7325a2f
--- /dev/null
+++ b/benchmark/blas/blas.cpp
@@ -0,0 +1,540 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <algorithm>
+#include <cstdlib>
+#include <exception>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <string>
+
+
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/timer.hpp"
+#include "benchmark/utils/types.hpp"
+
+
+// Command-line arguments
+DEFINE_string(
+    operations, "copy,axpy,scal",
+    "A comma-separated list of BLAS operations to benchmark.\nCandidates are"
+    " copy (y = x),\n"
+    " axpy (y = y + a * x),\n"
+    " multiaxpy (like axpy, but a has one entry per column),\n"
+    " scal (y = a * y),\n"
+    " multiscal (like scal, but a has one entry per column),\n"
+    " dot (a = x' * y),\n"
+    " norm (a = sqrt(x' * x)),\n"
+    " mm (C = A * B),\n"
+    " gemm (C = a * A * B + b * C)\n"
+    "where A has dimensions n x k, B has dimensions k x m,\n"
+    "C has dimensions n x m and x and y have dimensions n x r");
+
+
+class BenchmarkOperation {
+public:
+    virtual ~BenchmarkOperation() = default;
+
+    virtual void write_error(rapidjson::Value &blas_case,
+                             rapidjson::MemoryPoolAllocator<> &allocator)
+    {}
+    virtual gko::size_type get_flops() const = 0;
+    virtual gko::size_type get_memory() const = 0;
+    virtual void prepare(){};
+    virtual void run() = 0;
+};
+
+
+class CopyOperation : public BenchmarkOperation {
+public:
+    CopyOperation(std::shared_ptr<const gko::Executor> exec,
+                  gko::size_type rows, gko::size_type cols,
+                  gko::size_type istride, gko::size_type ostride)
+    {
+        in_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
+                                                istride);
+        out_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
+                                                 ostride);
+        in_->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return in_->get_size()[0] * in_->get_size()[1];
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return in_->get_size()[0] * in_->get_size()[1] * sizeof(etype) * 2;
+    }
+
+    void run() override { in_->convert_to(lend(out_)); }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> in_;
+    std::unique_ptr<gko::matrix::Dense<etype>> out_;
+};
+
+
+class AxpyOperation : public BenchmarkOperation {
+public:
+    AxpyOperation(std::shared_ptr<const gko::Executor> exec,
+                  gko::size_type rows, gko::size_type cols,
+                  gko::size_type stride_in, gko::size_type stride_out,
+                  bool multi)
+    {
+        alpha_ = gko::matrix::Dense<etype>::create(
+            exec, gko::dim<2>{1, multi ? cols : 1});
+        x_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
+                                               stride_in);
+        y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
+                                               stride_out);
+        alpha_->fill(1);
+        x_->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * 2;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 3;
+    }
+
+    void prepare() override { y_->fill(1); }
+
+    void run() override { y_->add_scaled(lend(alpha_), lend(x_)); }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::matrix::Dense<etype>> x_;
+    std::unique_ptr<gko::matrix::Dense<etype>> y_;
+};
+
+
+class ScalOperation : public BenchmarkOperation {
+public:
+    ScalOperation(std::shared_ptr<const gko::Executor> exec,
+                  gko::size_type rows, gko::size_type cols,
+                  gko::size_type stride, bool multi)
+    {
+        alpha_ = gko::matrix::Dense<etype>::create(
+            exec, gko::dim<2>{1, multi ? cols : 1});
+        y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
+                                               stride);
+        alpha_->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1];
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2;
+    }
+
+    void prepare() override { y_->fill(1); }
+
+    void run() override { y_->scale(lend(alpha_)); }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::matrix::Dense<etype>> y_;
+};
+
+
+class DotOperation : public BenchmarkOperation {
+public:
+    DotOperation(std::shared_ptr<const gko::Executor> exec,
+                 gko::size_type rows, gko::size_type cols,
+                 gko::size_type stride_x, gko::size_type stride_y)
+    {
+        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
+        x_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
+                                               stride_x);
+        y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
+                                               stride_y);
+        x_->fill(1);
+        y_->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * 2;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 2;
+    }
+
+    void run() override { x_->compute_dot(lend(y_), lend(alpha_)); }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::matrix::Dense<etype>> x_;
+    std::unique_ptr<gko::matrix::Dense<etype>> y_;
+};
+
+
+class NormOperation : public BenchmarkOperation {
+public:
+    NormOperation(std::shared_ptr<const gko::Executor> exec,
+                  gko::size_type rows, gko::size_type cols,
+                  gko::size_type stride)
+    {
+        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
+        y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
+                                               stride);
+        y_->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * 2;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype);
+    }
+
+    void run() override { y_->compute_norm2(lend(alpha_)); }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::matrix::Dense<etype>> y_;
+};
+
+
+class ApplyOperation : public BenchmarkOperation {
+public:
+    ApplyOperation(std::shared_ptr<const gko::Executor> exec,
+                   gko::size_type n, gko::size_type k, gko::size_type m,
+                   gko::size_type stride_A, gko::size_type stride_B,
+                   gko::size_type stride_C)
+    {
+        A_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{n, k},
+                                               stride_A);
+        B_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{k, m},
+                                               stride_B);
+        C_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{n, m},
+                                               stride_C);
+        A_->fill(1);
+        B_->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return (A_->get_size()[0] * A_->get_size()[1] +
+                B_->get_size()[0] * B_->get_size()[1] +
+                C_->get_size()[0] * C_->get_size()[1]) *
+               sizeof(etype);
+    }
+
+    void run() override { A_->apply(lend(B_), lend(C_)); }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> A_;
+    std::unique_ptr<gko::matrix::Dense<etype>> B_;
+    std::unique_ptr<gko::matrix::Dense<etype>> C_;
+};
+
+
+class AdvancedApplyOperation : public BenchmarkOperation {
+public:
+    AdvancedApplyOperation(std::shared_ptr<const gko::Executor> exec,
+                           gko::size_type n, gko::size_type k,
+                           gko::size_type m, gko::size_type stride_A,
+                           gko::size_type stride_B, gko::size_type stride_C)
+    {
+        A_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{n, k},
+                                               stride_A);
+        B_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{k, m},
+                                               stride_B);
+        C_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{n, m},
+                                               stride_C);
+        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, 1});
+        beta_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, 1});
+        A_->fill(1);
+        B_->fill(1);
+        alpha_->fill(1);
+        beta_->fill(1);
+    }
+
+    gko::size_type get_flops() const override
+    {
+        return A_->get_size()[0] * A_->get_size()[1] * B_->get_size()[1] * 2 +
+               C_->get_size()[0] * C_->get_size()[1] * 3;
+    }
+
+    gko::size_type get_memory() const override
+    {
+        return (A_->get_size()[0] * A_->get_size()[1] +
+                B_->get_size()[0] * B_->get_size()[1] +
+                C_->get_size()[0] * C_->get_size()[1] * 2) *
+               sizeof(etype);
+    }
+
+    void run() override
+    {
+        A_->apply(lend(alpha_), lend(B_), lend(beta_), lend(C_));
+    }
+
+private:
+    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    std::unique_ptr<gko::matrix::Dense<etype>> beta_;
+    std::unique_ptr<gko::matrix::Dense<etype>> A_;
+    std::unique_ptr<gko::matrix::Dense<etype>> B_;
+    std::unique_ptr<gko::matrix::Dense<etype>> C_;
+};
+
+
+struct dimensions {
+    gko::size_type n;
+    gko::size_type k;
+    gko::size_type m;
+    gko::size_type r;
+    gko::size_type stride_x;
+    gko::size_type stride_y;
+    gko::size_type stride_A;
+    gko::size_type stride_B;
+    gko::size_type stride_C;
+};
+
+
+gko::size_type get_optional(rapidjson::Value &obj, const char *name,
+                            gko::size_type default_value)
+{
+    if (obj.HasMember(name)) {
+        return obj[name].GetUint64();
+    } else {
+        return default_value;
+    }
+}
+
+
+dimensions parse_dims(rapidjson::Value &test_case)
+{
+    dimensions result;
+    result.n = test_case["n"].GetInt64();
+    result.k = get_optional(test_case, "k", result.n);
+    result.m = get_optional(test_case, "m", result.n);
+    result.r = get_optional(test_case, "r", 1);
+    if (test_case.HasMember("stride")) {
+        result.stride_x = test_case["stride"].GetInt64();
+        result.stride_y = result.stride_x;
+    } else {
+        result.stride_x = get_optional(test_case, "stride_x", result.r);
+        result.stride_y = get_optional(test_case, "stride_y", result.r);
+    }
+    result.stride_A = get_optional(test_case, "stride_A", result.k);
+    result.stride_B = get_optional(test_case, "stride_B", result.m);
+    result.stride_C = get_optional(test_case, "stride_C", result.m);
+    return result;
+}
+
+
+std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
+                          std::shared_ptr<const gko::Executor>, dimensions)>>
+    operation_map{
+        {"copy",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<CopyOperation>(
+                 exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
+         }},
+        {"axpy",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<AxpyOperation>(
+                 exec, dims.n, dims.r, dims.stride_x, dims.stride_y, false);
+         }},
+        {"multiaxpy",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<AxpyOperation>(
+                 exec, dims.n, dims.r, dims.stride_x, dims.stride_y, true);
+         }},
+        {"scal",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<ScalOperation>(exec, dims.n, dims.r,
+                                                    dims.stride_y, false);
+         }},
+        {"multiscal",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<ScalOperation>(exec, dims.n, dims.r,
+                                                    dims.stride_y, true);
+         }},
+        {"dot",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<DotOperation>(
+                 exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
+         }},
+        {"norm",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<NormOperation>(exec, dims.n, dims.r,
+                                                    dims.stride_y);
+         }},
+        {"mm",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<ApplyOperation>(
+                 exec, dims.n, dims.k, dims.m, dims.stride_A, dims.stride_B,
+                 dims.stride_C);
+         }},
+        {"gemm",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<AdvancedApplyOperation>(
+                 exec, dims.n, dims.k, dims.m, dims.stride_A, dims.stride_B,
+                 dims.stride_C);
+         }}};
+
+
+void apply_blas(const char *operation_name, std::shared_ptr<const gko::Executor> exec,
+                rapidjson::Value &test_case,
+                rapidjson::MemoryPoolAllocator<> &allocator)
+{
+    try {
+        auto &blas_case = test_case["blas"];
+        add_or_set_member(blas_case, operation_name,
+                          rapidjson::Value(rapidjson::kObjectType), allocator);
+
+        auto op = operation_map[operation_name](exec, parse_dims(test_case));
+
+        // warm run
+        for (unsigned int i = 0; i < FLAGS_warmup; i++) {
+            exec->synchronize();
+            op->run();
+            exec->synchronize();
+        }
+
+        // timed run
+        auto timer = get_timer(exec, FLAGS_gpu_timer);
+        for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
+            exec->synchronize();
+            timer->tic();
+            op->run();
+            timer->toc();
+        }
+        auto runtime = timer->compute_average_time();
+        auto flops = static_cast<double>(op->get_flops());
+        auto mem = static_cast<double>(op->get_memory());
+        add_or_set_member(blas_case[operation_name], "time", runtime,
+                          allocator);
+        add_or_set_member(blas_case[operation_name], "flops", flops / runtime,
+                          allocator);
+        add_or_set_member(blas_case[operation_name], "bandwidth",
+                          mem / runtime, allocator);
+
+        // compute and write benchmark data
+        add_or_set_member(blas_case[operation_name], "completed", true,
+                          allocator);
+    } catch (const std::exception &e) {
+        add_or_set_member(test_case["blas"][operation_name], "completed",
+                          false, allocator);
+        std::cerr << "Error when processing test case " << test_case << "\n"
+                  << "what(): " << e.what() << std::endl;
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    std::string header =
+        "A benchmark for measuring performance of Ginkgo's BLAS-like "
+        "operations.\n";
+    std::string format = std::string() + "  [\n    { \"n\": 100 },\n" +
+                         "    { \"n\": 200, \"m\": 200, \"k\": 200 }\n" +
+                         "  ]\n\n";
+    initialize_argument_parsing(&argc, &argv, header, format);
+
+    std::string extra_information = "The operations are " + FLAGS_operations;
+    print_general_information(extra_information);
+
+    auto exec = executor_factory.at(FLAGS_executor)();
+    auto engine = get_engine();
+    auto operations = split(FLAGS_operations, ',');
+
+    rapidjson::IStreamWrapper jcin(std::cin);
+    rapidjson::Document test_cases;
+    test_cases.ParseStream(jcin);
+    if (!test_cases.IsArray()) {
+        std::cerr
+            << "Input has to be a JSON array of benchmark configurations:\n"
+            << format;
+        std::exit(1);
+    }
+
+    auto &allocator = test_cases.GetAllocator();
+
+    for (auto &test_case : test_cases.GetArray()) {
+        try {
+            // set up benchmark
+            if (!test_case.HasMember("blas")) {
+                test_case.AddMember("blas",
+                                    rapidjson::Value(rapidjson::kObjectType),
+                                    allocator);
+            }
+            auto &blas_case = test_case["blas"];
+            if (!FLAGS_overwrite &&
+                all_of(begin(operations), end(operations),
+                       [&blas_case](const std::string &s) {
+                           return blas_case.HasMember(s.c_str());
+                       })) {
+                continue;
+            }
+            std::clog << "Running test case: " << test_case << std::endl;
+
+            for (const auto &operation_name : operations) {
+                apply_blas(operation_name.c_str(), exec, test_case, allocator);
+                std::clog << "Current state:" << std::endl
+                          << test_cases << std::endl;
+                backup_results(test_cases);
+            }
+        } catch (const std::exception &e) {
+            std::cerr << "Error setting up benchmark, what(): " << e.what()
+                      << std::endl;
+        }
+    }
+
+    std::cout << test_cases << std::endl;
+}
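
A note on the shape of this benchmark: BenchmarkOperation and operation_map
are the only two extension points, so a new operation needs one subclass and
one map entry. The sketch below is illustrative and not part of the patch.
The name "axpby" and the class AxpbyOperation are invented; the
implementation composes the scale() and add_scaled() calls that the patch
already benchmarks, and the flop and byte counts follow from that two-kernel
implementation.

    // Hypothetical example, not in the patch: y = b * y + a * x,
    // built from two Dense kernels used by the existing operations.
    class AxpbyOperation : public BenchmarkOperation {
    public:
        AxpbyOperation(std::shared_ptr<const gko::Executor> exec,
                       gko::size_type rows, gko::size_type cols)
        {
            alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, 1});
            beta_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, 1});
            x_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols});
            y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols});
            alpha_->fill(1);
            beta_->fill(1);
            x_->fill(1);
        }

        // one multiply for the scal part, a multiply-add for the axpy part
        gko::size_type get_flops() const override
        {
            return y_->get_size()[0] * y_->get_size()[1] * 3;
        }

        // scal reads and writes y (2 accesses per entry),
        // axpy reads x and y and writes y (3 accesses per entry)
        gko::size_type get_memory() const override
        {
            return y_->get_size()[0] * y_->get_size()[1] * sizeof(etype) * 5;
        }

        void prepare() override { y_->fill(1); }

        void run() override
        {
            y_->scale(lend(beta_));
            y_->add_scaled(lend(alpha_), lend(x_));
        }

    private:
        std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
        std::unique_ptr<gko::matrix::Dense<etype>> beta_;
        std::unique_ptr<gko::matrix::Dense<etype>> x_;
        std::unique_ptr<gko::matrix::Dense<etype>> y_;
    };

    // ...plus the corresponding operation_map entry:
    {"axpby",
     [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
         return std::make_unique<AxpbyOperation>(exec, dims.n, dims.r);
     }},

With that entry in place, apply_blas picks the operation up through
--operations=axpby without further changes.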
From ba54b82dadab247531a4ebfdd8304afb5a4512c6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Sun, 23 May 2021 21:24:05 +0200
Subject: [PATCH 2/3] fix windows range build issues

---
 include/ginkgo/core/base/range.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp
index 2fe87a364a7..e0ea8bcf284 100644
--- a/include/ginkgo/core/base/range.hpp
+++ b/include/ginkgo/core/base/range.hpp
@@ -342,7 +342,7 @@ class range {
         const -> decltype(std::declval<Accessor>()(
             std::forward<DimensionTypes>(dimensions)...))
     {
-        static_assert(sizeof...(dimensions) <= dimensionality,
+        static_assert(sizeof...(DimensionTypes) <= dimensionality,
                       "Too many dimensions in range call");
         return accessor_(std::forward<DimensionTypes>(dimensions)...);
     }

From aca683078274592519ea440caf73fd1c687c565c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Tue, 25 May 2021 11:57:28 +0200
Subject: [PATCH 3/3] review updates

Co-authored-by: Yuhsiang Tsai
---
 benchmark/blas/blas.cpp | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
index 1f9a7325a2f..1934aac853a 100644
--- a/benchmark/blas/blas.cpp
+++ b/benchmark/blas/blas.cpp
@@ -69,9 +69,6 @@ class BenchmarkOperation {
 public:
     virtual ~BenchmarkOperation() = default;
 
-    virtual void write_error(rapidjson::Value &blas_case,
-                             rapidjson::MemoryPoolAllocator<> &allocator)
-    {}
     virtual gko::size_type get_flops() const = 0;
     virtual gko::size_type get_memory() const = 0;
     virtual void prepare(){};
@@ -442,6 +439,7 @@ void apply_blas(const char *operation_name, std::shared_ptr<const gko::Executor> exec,
 
         // warm run
         for (unsigned int i = 0; i < FLAGS_warmup; i++) {
+            op->prepare();
             exec->synchronize();
             op->run();
             exec->synchronize();
@@ -450,6 +448,7 @@ void apply_blas(const char *operation_name, std::shared_ptr<const gko::Executor> exec,
         // timed run
         auto timer = get_timer(exec, FLAGS_gpu_timer);
         for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
+            op->prepare();
             exec->synchronize();
             timer->tic();
             op->run();
             timer->toc();
@@ -481,7 +480,17 @@ int main(int argc, char *argv[])
 {
     std::string header =
         "A benchmark for measuring performance of Ginkgo's BLAS-like "
-        "operations.\n";
+        "operations.\nParameters for a benchmark case are:\n"
+        "    n: number of rows for vectors and gemm output (required)\n"
+        "    r: number of columns for vectors (optional, default 1)\n"
+        "    m: number of columns for gemm output (optional, default n)\n"
+        "    k: inner dimension of the gemm (optional, default n)\n"
+        "    stride: storage stride for both vectors (optional, default r)\n"
+        "    stride_x: stride for input vector x (optional, default r)\n"
+        "    stride_y: stride for in/out vector y (optional, default r)\n"
+        "    stride_A: stride for A matrix in gemm (optional, default k)\n"
+        "    stride_B: stride for B matrix in gemm (optional, default m)\n"
+        "    stride_C: stride for C matrix in gemm (optional, default m)\n";
     std::string format = std::string() + "  [\n    { \"n\": 100 },\n" +
                          "    { \"n\": 200, \"m\": 200, \"k\": 200 }\n" +
                          "  ]\n\n";
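
To connect the parameter list documented in the last hunk with the JSON input
format, here is a minimal example session. It is a sketch, not part of the
patches: the file names and the --executor value are arbitrary, and the exact
binary name produced by ginkgo_add_typed_benchmark_executables depends on the
value types configured for the build.

    $ cat cases.json
    [
      { "n": 1000000, "r": 4 },
      { "n": 1000000, "r": 4, "stride_x": 4, "stride_y": 8 },
      { "n": 2000, "k": 2000, "m": 2000, "stride_A": 2048 }
    ]
    $ ./benchmark/blas/blas --executor=omp --operations=axpy,dot,gemm \
          <cases.json >results.json

For every requested operation, apply_blas annotates each case in place under
its "blas" member with "time", "flops", "bandwidth", and "completed", and
main writes the annotated array to stdout, so results.json contains the
input cases together with the measurements.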