Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

459 lines (358 sloc) 10.868 kB
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include <vexcl/vexcl.hpp>
// Pull the VexCL names used below (profiler, Reductor, SUM, Filter, copy, ...)
// into the global namespace.
using namespace vex;
// Scalar type used for every host and device vector in the benchmarks.
// NOTE(review): main() selects devices with Filter::DoublePrecision, so
// switching this to float would presumably want that filter revisited.
typedef double real;
// Compile-time switches. main() tests BENCHMARK_VECTOR, BENCHMARK_REDUCTOR and
// BENCHMARK_SPMAT with #ifdef to decide which benchmarks run; each benchmark
// tests BENCHMARK_CPU to decide whether to also time a plain C++ loop on the
// host and print the residual against the OpenCL result.
#define BENCHMARK_VECTOR
#define BENCHMARK_REDUCTOR
#define BENCHMARK_SPMAT
#define BENCHMARK_CPU
#ifdef WIN32
// MSVC C4267: conversion from size_t to a smaller type (possible loss of data).
// The benchmarks store size_t indices into narrower integer vectors.
# pragma warning(disable : 4267)
#endif
//---------------------------------------------------------------------------
// Benchmarks the element-wise update  a += b + c * d  on the given queues.
//
// queue - command queues the device vectors are allocated on.
// prof  - profiler that receives the "OpenCL" (and, with BENCHMARK_CPU,
//         "C++") timing sections.
//
// Prints GFLOPS and bandwidth figures to std::cout and returns the
// (GFLOPS, bandwidth in GB/s) pair measured for the OpenCL run.
std::pair<double,double> benchmark_vector(
        const std::vector<cl::CommandQueue> &queue, profiler &prof
        )
{
    const size_t N = 1024 * 1024;   // elements per vector
    const size_t M = 1024;          // timed repetitions

    // Work performed by the whole timed loop. Each element update is counted
    // as 3 floating point operations and 5 values moved through memory.
    const double total_flops = 3.0 * N * M;
    const double total_bytes = 5.0 * N * M * sizeof(real);

    // Uniform sample in [0, 1].
    auto unit_random = [](){ return static_cast<double>(rand()) / RAND_MAX; };

    // Host-side data: A is the accumulator, B, C, D are random operands.
    std::vector<real> A(N, 0);
    std::vector<real> B(N);
    std::vector<real> C(N);
    std::vector<real> D(N);

    std::generate(B.begin(), B.end(), unit_random);
    std::generate(C.begin(), C.end(), unit_random);
    std::generate(D.begin(), D.end(), unit_random);

    // Device-side copies.
    vex::vector<real> a(queue, A);
    vex::vector<real> b(queue, B);
    vex::vector<real> c(queue, C);
    vex::vector<real> d(queue, D);

    // Evaluate the expression once outside the timed section (presumably so
    // that one-time setup cost is not measured), then reset the accumulator.
    a += b + c * d;
    a = 0;

    prof.tic_cl("OpenCL");
    for(size_t rep = 0; rep < M; ++rep)
        a += b + c * d;
    const double cl_seconds = prof.toc("OpenCL");

    const double cl_gflops = total_flops / cl_seconds / 1e9;
    const double cl_bwidth = total_bytes / cl_seconds / 1e9;

    std::cout
        << "Vector arithmetic\n"
        << " OpenCL"
        << "\n GFLOPS: " << cl_gflops
        << "\n Bandwidth: " << cl_bwidth
        << std::endl;

#ifdef BENCHMARK_CPU
    // Same computation as a plain host loop.
    prof.tic_cpu("C++");
    for(size_t rep = 0; rep < M; ++rep) {
        for(size_t k = 0; k < N; ++k)
            A[k] += B[k] + C[k] * D[k];
    }
    const double cpu_seconds = prof.toc("C++");

    const double cpu_gflops = total_flops / cpu_seconds / 1e9;
    const double cpu_bwidth = total_bytes / cpu_seconds / 1e9;

    std::cout
        << " C++"
        << "\n GFLOPS: " << cpu_gflops
        << "\n Bandwidth: " << cpu_bwidth
        << std::endl;

    // Residual: upload the host result into b (no longer needed as an operand)
    // and print the sum of squared differences against the device result.
    vex::copy(A, b);
    Reductor<real,SUM> sum(queue);
    a -= b;

    std::cout << " res = " << sum(a * a)
        << std::endl << std::endl;
#endif

    return std::make_pair(cl_gflops, cl_bwidth);
}
//---------------------------------------------------------------------------
// Benchmarks the reduction sum(a * b), i.e. a dot product, on the given queues.
//
// queue - command queues the device vectors and the reductor are created on.
// prof  - profiler that receives the "OpenCL" (and, with BENCHMARK_CPU,
//         "C++") timing sections.
//
// Prints GFLOPS and bandwidth figures to std::cout and returns the
// (GFLOPS, bandwidth in GB/s) pair measured for the OpenCL run.
std::pair<double, double> benchmark_reductor(
        const std::vector<cl::CommandQueue> &queue, profiler &prof
        )
{
    const size_t N = 16 * 1024 * 1024;  // elements per vector
    const size_t M = 1024 / 16;         // timed repetitions
    double time_elapsed;

    // Random operands on the host.
    std::vector<real> A(N);
    std::vector<real> B(N);

    std::generate(A.begin(), A.end(), [](){ return (double)rand() / RAND_MAX; });
    std::generate(B.begin(), B.end(), [](){ return (double)rand() / RAND_MAX; });

    // Device-side copies and the sum reductor.
    vex::vector<real> a(queue, A);
    vex::vector<real> b(queue, B);

    Reductor<real,SUM> sum(queue);

    // Evaluate the reduction once outside the timed section (presumably so
    // that one-time setup cost is not measured), then discard the result.
    double sum_cl = sum(a * b);
    sum_cl = 0;

    prof.tic_cl("OpenCL");
    for(size_t i = 0; i < M; i++)
        sum_cl += sum(a * b);
    time_elapsed = prof.toc("OpenCL");

    // Each element is counted as 2 floating point operations and 2 values read.
    double gflops = 2.0 * N * M / time_elapsed / 1e9;
    double bwidth = 2.0 * N * M * sizeof(real) / time_elapsed / 1e9;

    std::cout
        << "Reduction\n"
        << " OpenCL"
        << "\n GFLOPS: " << gflops
        << "\n Bandwidth: " << bwidth
        << std::endl;

#ifdef BENCHMARK_CPU
    // Same accumulation on the host with std::inner_product.
    double sum_cpp = 0;
    prof.tic_cpu("C++");
    for(size_t i = 0; i < M; i++)
        sum_cpp += std::inner_product(A.begin(), A.end(), B.begin(), 0.0);
    time_elapsed = prof.toc("C++");

    {
        // Host figures are only printed; the function returns the OpenCL ones.
        double gflops = 2.0 * N * M / time_elapsed / 1e9;
        double bwidth = 2.0 * N * M * sizeof(real) / time_elapsed / 1e9;

        std::cout
            << " C++"
            << "\n GFLOPS: " << gflops
            << "\n Bandwidth: " << bwidth
            << std::endl;
    }

    // std::fabs (from <cmath>) guarantees the floating point overload is used
    // rather than whatever unqualified fabs happens to be visible.
    std::cout << " res = " << std::fabs(sum_cl - sum_cpp)
        << std::endl << std::endl;
#endif

    return std::make_pair(gflops, bwidth);
}
//---------------------------------------------------------------------------
// Benchmarks sparse matrix-vector products  y += A * x  where A is the
// 7-point finite-difference matrix of a 3D Poisson problem on an n^3 grid,
// stored in CSR form (row pointers, column indices, values).
//
// queue - command queues the matrix and the vectors are created on.
// prof  - profiler that receives the "OpenCL" (and, with BENCHMARK_CPU,
//         "C++") timing sections.
//
// Prints GFLOPS and bandwidth figures to std::cout and returns the
// (GFLOPS, bandwidth in GB/s) pair measured for the OpenCL run.
std::pair<double,double> benchmark_spmv(
        const std::vector<cl::CommandQueue> &queue, profiler &prof
        )
{
    // Construct matrix for 3D Poisson problem in cubic domain.
    const size_t n = 128;       // grid points per dimension
    const size_t N = n * n * n; // number of unknowns (matrix rows)
    const size_t M = 1024;      // timed repetitions

    double time_elapsed;

    // Inverse squared grid spacing for a unit cube with n points per side.
    const real h2i = (n - 1) * (n - 1);

    std::vector<size_t> row;
    std::vector<uint> col;
    std::vector<real> val;
    std::vector<real> X(n * n * n, 1e-2);
    std::vector<real> Y(n * n * n, 0);

    // Boundary rows hold 1 entry, interior rows hold 7:
    // n^3 + 6 * (n - 2)^3 nonzeros in total.
    row.reserve(n * n * n + 1);
    col.reserve(6 * (n - 2) * (n - 2) * (n - 2) + n * n * n);
    val.reserve(6 * (n - 2) * (n - 2) * (n - 2) + n * n * n);

    row.push_back(0);
    for(size_t k = 0, idx = 0; k < n; k++) {
        for(size_t j = 0; j < n; j++) {
            for(size_t i = 0; i < n; i++, idx++) {
                if (
                        i == 0 || i == (n - 1) ||
                        j == 0 || j == (n - 1) ||
                        k == 0 || k == (n - 1)
                   )
                {
                    // Boundary point: identity row.
                    col.push_back(idx);
                    val.push_back(1);

                    row.push_back(row.back() + 1);
                } else {
                    // Interior point: 7-point stencil, columns in ascending
                    // order (z-1, y-1, x-1, centre, x+1, y+1, z+1).
                    col.push_back(idx - n * n);
                    val.push_back(-h2i);

                    col.push_back(idx - n);
                    val.push_back(-h2i);

                    col.push_back(idx - 1);
                    val.push_back(-h2i);

                    col.push_back(idx);
                    val.push_back(6 * h2i);

                    col.push_back(idx + 1);
                    val.push_back(-h2i);

                    col.push_back(idx + n);
                    val.push_back(-h2i);

                    col.push_back(idx + n * n);
                    val.push_back(-h2i);

                    row.push_back(row.back() + 7);
                }
            }
        }
    }

    size_t nnz = row.back();

    // Work performed by the whole timed loop. The byte count is accumulated
    // in double: evaluated in size_t it exceeds 32 bits and would wrap on
    // targets where size_t is 32 bits wide.
    // NOTE(review): the per-nonzero index is counted as sizeof(size_t) although
    // col holds uint; kept as is so reported figures stay comparable -- confirm
    // which width is intended.
    const double total_flops = (2.0 * nnz + N) * M;
    const double total_bytes = M * (
            static_cast<double>(nnz) * (2 * sizeof(real) + sizeof(size_t))
            + 4.0 * N * sizeof(real));

    // Transfer data to compute devices.
    vex::SpMat<real,uint> A(queue, n * n * n, row.data(), col.data(), val.data());

    vex::vector<real> x(queue, X);
    vex::vector<real> y(queue, Y);

    // Get timings. The product is evaluated once outside the timed section
    // (presumably so that one-time setup cost is not measured), then y is reset.
    y += A * x;
    y = 0;

    prof.tic_cl("OpenCL");
    for(size_t i = 0; i < M; i++)
        y += A * x;
    time_elapsed = prof.toc("OpenCL");

    double gflops = total_flops / time_elapsed / 1e9;
    double bwidth = total_bytes / time_elapsed / 1e9;

    std::cout
        << "SpMV\n"
        << " OpenCL"
        << "\n GFLOPS: " << gflops
        << "\n Bandwidth: " << bwidth
        << std::endl;

#ifdef BENCHMARK_CPU
    // Same accumulation as a plain CSR loop on the host.
    prof.tic_cpu("C++");
    for(size_t k = 0; k < M; k++)
        for(size_t i = 0; i < N; i++) {
            real s = 0;
            for(size_t j = row[i]; j < row[i + 1]; j++)
                s += val[j] * X[col[j]];
            Y[i] += s;
        }
    time_elapsed = prof.toc("C++");

    {
        // Host figures are only printed; the function returns the OpenCL ones.
        double gflops = total_flops / time_elapsed / 1e9;
        double bwidth = total_bytes / time_elapsed / 1e9;

        std::cout
            << " C++"
            << "\n GFLOPS: " << gflops
            << "\n Bandwidth: " << bwidth
            << std::endl;
    }

    // Residual: upload the host result into x (no longer needed as an operand)
    // and print the sum of squared differences against the device result.
    copy(Y, x);
    y -= x;

    Reductor<real,SUM> sum(queue);

    std::cout << " res = " << sum(y * y) << std::endl << std::endl;
#endif

    return std::make_pair(gflops, bwidth);
}
//---------------------------------------------------------------------------
// Benchmarks sparse matrix-vector products  y += A * x  for the same 3D
// Poisson matrix as benchmark_spmv, but stored in compressed form: only the
// two distinct row patterns are kept (boundary row and interior stencil row),
// with column indices expressed as offsets relative to the row number, and
// idx selects the pattern used by each matrix row.
//
// queue - command queues; only queue[0] is used, so it must not be empty.
// prof  - profiler that receives the "OpenCL" (and, with BENCHMARK_CPU,
//         "C++") timing sections.
//
// Prints GFLOPS and bandwidth figures to std::cout and returns the
// (GFLOPS, bandwidth in GB/s) pair measured for the OpenCL run.
std::pair<double,double> benchmark_spmv_ccsr(
        const std::vector<cl::CommandQueue> &queue, profiler &prof
        )
{
    // Construct matrix for 3D Poisson problem in cubic domain.
    const uint n = 128;         // grid points per dimension
    const uint N = n * n * n;   // number of unknowns (matrix rows)
    const uint M = 1024;        // timed repetitions

    double time_elapsed;

    // Inverse squared grid spacing for a unit cube with n points per side.
    const real h2i = (n - 1) * (n - 1);

    std::vector<size_t> idx;        // pattern index for every matrix row
    std::vector<size_t> row(3);     // pattern row pointers
    std::vector<int> col(8);        // relative column offsets
    std::vector<real> val(8);       // pattern values
    std::vector<real> X(n * n * n, 1e-2);
    std::vector<real> Y(n * n * n, 0);

    idx.reserve(n * n * n);

    // Pattern 0 occupies entry [0, 1), pattern 1 occupies entries [1, 8).
    row[0] = 0;
    row[1] = 1;
    row[2] = 8;

    // Pattern 0 (boundary point): identity row.
    col[0] = 0;
    val[0] = 1;

    // Pattern 1 (interior point): 7-point stencil, offsets in ascending order.
    col[1] = -static_cast<int>(n * n);
    col[2] = -static_cast<int>(n);
    col[3] = -1;
    col[4] = 0;
    col[5] = 1;
    col[6] = n;
    col[7] = (n * n);

    val[1] = -h2i;
    val[2] = -h2i;
    val[3] = -h2i;
    val[4] = h2i * 6;
    val[5] = -h2i;
    val[6] = -h2i;
    val[7] = -h2i;

    for(size_t k = 0; k < n; k++) {
        for(size_t j = 0; j < n; j++) {
            for(size_t i = 0; i < n; i++) {
                if (
                        i == 0 || i == (n - 1) ||
                        j == 0 || j == (n - 1) ||
                        k == 0 || k == (n - 1)
                   )
                {
                    idx.push_back(0);
                } else {
                    idx.push_back(1);
                }
            }
        }
    }

    // Nonzeros of the equivalent uncompressed matrix: 1 per boundary row,
    // 7 per interior row.
    size_t nnz = 6 * (n - 2) * (n - 2) * (n - 2) + n * n * n;

    // Work performed by the whole timed loop. The byte count is accumulated
    // in double: evaluated in integer arithmetic it exceeds 32 bits and would
    // wrap on targets where size_t is 32 bits wide.
    const double total_flops = (2.0 * nnz + N) * M;
    const double total_bytes = M * (
            static_cast<double>(nnz) * (2 * sizeof(real) + sizeof(int))
            + 4.0 * N * sizeof(real));

    // Transfer data to compute devices.
    vex::SpMatCCSR<real,int> A(queue[0], n * n * n, 2,
            idx.data(), row.data(), col.data(), val.data());

    // The matrix lives on a single queue, so the vectors do as well.
    std::vector<cl::CommandQueue> q1(1, queue[0]);
    vex::vector<real> x(q1, X);
    vex::vector<real> y(q1, Y);

    // Get timings. The product is evaluated once outside the timed section
    // (presumably so that one-time setup cost is not measured), then y is reset.
    y += A * x;
    y = 0;

    prof.tic_cl("OpenCL");
    for(size_t i = 0; i < M; i++)
        y += A * x;
    time_elapsed = prof.toc("OpenCL");

    double gflops = total_flops / time_elapsed / 1e9;
    double bwidth = total_bytes / time_elapsed / 1e9;

    std::cout
        << "SpMV (CCSR)\n"
        << " OpenCL"
        << "\n GFLOPS: " << gflops
        << "\n Bandwidth: " << bwidth
        << std::endl;

#ifdef BENCHMARK_CPU
    // Same accumulation on the host, walking the compressed patterns.
    prof.tic_cpu("C++");
    for(size_t k = 0; k < M; k++)
        for(size_t i = 0; i < N; i++) {
            real s = 0;
            // col[j] is a signed offset; adding it to the unsigned row number
            // relies on modular arithmetic to step backwards. Negative offsets
            // only occur in the interior pattern, where i + col[j] stays in
            // range.
            for(size_t j = row[idx[i]]; j < row[idx[i] + 1]; j++)
                s += val[j] * X[i + col[j]];
            Y[i] += s;
        }
    time_elapsed = prof.toc("C++");

    {
        // Host figures are only printed; the function returns the OpenCL ones.
        double gflops = total_flops / time_elapsed / 1e9;
        double bwidth = total_bytes / time_elapsed / 1e9;

        std::cout
            << " C++"
            << "\n GFLOPS: " << gflops
            << "\n Bandwidth: " << bwidth
            << std::endl;
    }

    // Residual: upload the host result into x (no longer needed as an operand)
    // and print the sum of squared differences against the device result.
    copy(Y, x);
    y -= x;

    Reductor<real,SUM> sum(q1);

    std::cout << " res = " << sum(y * y) << std::endl << std::endl;
#endif

    return std::make_pair(gflops, bwidth);
}
//---------------------------------------------------------------------------
// Runs the benchmarks enabled by the BENCHMARK_* macros on the selected
// compute devices, prints the results and the profiler summary to std::cout,
// and appends one record to "profiling.dat":
//
//   <ctx.size()> <gflops bwidth for each enabled, logged benchmark>
//
// The CCSR SpMV benchmark is run and profiled but its figures are not logged.
//
// Returns 1 if no device was selected or an OpenCL error was caught, and 0
// otherwise.
int main() {
    try {
        vex::Context ctx(
                Filter::DoublePrecision && Filter::Env,
                CL_QUEUE_PROFILING_ENABLE
                );

        std::cout << ctx << std::endl;

        // Bail out before touching the log when nothing was selected:
        // benchmark_spmv_ccsr indexes queue[0], which is invalid on an empty
        // queue list.
        if (ctx.size() == 0) {
            std::cerr << "No suitable compute devices found." << std::endl;
            return 1;
        }

        std::ofstream log("profiling.dat", std::ios::app);

        log << ctx.size() << " ";

        double gflops, bwidth;

        profiler prof(ctx.queue());

#ifdef BENCHMARK_VECTOR
        prof.tic_cpu("Vector arithmetic");
        std::tie(gflops, bwidth) = benchmark_vector(ctx.queue(), prof);
        prof.toc("Vector arithmetic");

        log << gflops << " " << bwidth << " ";
#endif

#ifdef BENCHMARK_REDUCTOR
        prof.tic_cpu("Reduction");
        std::tie(gflops, bwidth) = benchmark_reductor(ctx.queue(), prof);
        prof.toc("Reduction");

        log << gflops << " " << bwidth << " ";
#endif

#ifdef BENCHMARK_SPMAT
        prof.tic_cpu("SpMV");
        std::tie(gflops, bwidth) = benchmark_spmv(ctx.queue(), prof);
        prof.toc("SpMV");

        // Last logged column pair; this also terminates the record.
        log << gflops << " " << bwidth << std::endl;

        prof.tic_cpu("SpMV (CCSR)");
        std::tie(gflops, bwidth) = benchmark_spmv_ccsr(ctx.queue(), prof);
        prof.toc("SpMV (CCSR)");
#else
        // Still terminate the record so successive runs do not end up on the
        // same line of the log.
        log << std::endl;
#endif

        std::cout << prof << std::endl;
    } catch (const cl::Error &e) {
        std::cerr << e << std::endl;
        return 1;
    }
}
Jump to Line
Something went wrong with that request. Please try again.