In [1]:
#include <sycl/sycl.hpp>

#include <chrono>
#include <cstring>
#include <iostream>
#include <memory>
#include <new>
#include <stdexcept>
#include <vector>

In [2]:
/// Time a single-threaded AXPY-style update: OUT[i] = A[i] + 2 * B[i].
///
/// Only the first N elements of each vector are touched; anything past N in
/// OUT is left as it was.
///
/// @param N    Number of leading elements to process.
/// @param A    First input vector; must hold at least N elements.
/// @param B    Second input vector (scaled by 2); must hold at least N elements.
/// @param OUT  Destination vector; must hold at least N elements.
/// @return     Wall-clock duration of the loop, in seconds.
/// @throws std::invalid_argument if N exceeds the size of A, B or OUT.
double run_axpy_cpu(std::size_t N,
                    const std::vector<float>& A,
                    const std::vector<float>& B,
                    std::vector<float>& OUT)
{
    // operator[] is unchecked, so reject an N that would run past any buffer
    // instead of silently reading/writing out of bounds.
    if (N > A.size() || N > B.size() || N > OUT.size())
        throw std::invalid_argument("run_axpy_cpu: N exceeds the size of A, B or OUT");

    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // wall-clock adjustments (high_resolution_clock may alias system_clock).
    const auto start = std::chrono::steady_clock::now();

    for (std::size_t i = 0; i < N; ++i)
        OUT[i] = A[i] + 2.0f * B[i];

    const auto end = std::chrono::steady_clock::now();
    return std::chrono::duration<double>(end - start).count();
}

In [3]:
double run_axpy_sycl(std::size_t N,
                     const std::vector<float>& A,
                     const std::vector<float>& B,
                     std::vector<float>& OUT,
                     sycl::queue& q)
{
    float* dA = sycl::malloc_shared<float>(N, q);
    float* dB = sycl::malloc_shared<float>(N, q);
    float* dOUT = sycl::malloc_shared<float>(N, q);

    std::memcpy(dA, A.data(), N * sizeof(float));
    std::memcpy(dB, B.data(), N * sizeof(float));

    auto start = std::chrono::high_resolution_clock::now();

    q.submit([&](sycl::handler& h) {
        h.parallel_for(N, [=](sycl::id<1> i) {
            dOUT[i] = dA[i] + 2.0f * dB[i];
        });
    }).wait();

    auto end = std::chrono::high_resolution_clock::now();
    double t = std::chrono::duration<double>(end - start).count();

    std::memcpy(OUT.data(), dOUT, N * sizeof(float));

    sycl::free(dA, q);
    sycl::free(dB, q);
    sycl::free(dOUT, q);

    return t;
}

In [4]:
// Problem size shared by the benchmark cells that follow.
// NOTE(review): every vector below holds N floats (about 8 GB each if float
// is 4 bytes), so this cell needs a machine with a lot of RAM.
std::size_t N = 2'000'000'000;

// Constant-filled inputs.
std::vector<float> A(N, 1.0f);
std::vector<float> B(N, 2.0f);

// Zero-initialised result buffers: OUT for the serial run, OUT2 for the SYCL run.
std::vector<float> OUT(N);
std::vector<float> OUT2(N);

In [8]:
// Baseline: time the plain serial loop on the host; results are written to OUT.
double cpu_time = run_axpy_cpu(N, A, B, OUT);

std::cout << "CPU time: "
          << cpu_time
          << " s\n";

CPU time: 0.118838 s


In [9]:
// Same computation, this time launched as a SYCL kernel on a queue bound to
// the CPU device; results are written to OUT2.
sycl::queue q_cpu(sycl::cpu_selector_v);
double sycl_time_cpu = run_axpy_sycl(N, A, B, OUT2, q_cpu);

std::cout << "SYCL CPU time: "
          << sycl_time_cpu
          << " s\n";

SYCL CPU time: 0.0438269 s


