In [None]:
!apt update
!apt install -y ocl-icd-opencl-dev pocl-opencl-icd clinfo


In [None]:
!clinfo


In [10]:
%%writefile kernel.cl
__kernel void vector_add(__global const float* A,
                         __global const float* B,
                         __global float* C) {
    int id = get_global_id(0);
    C[id] = A[id] + B[id];
}


Overwriting kernel.cl


In [7]:
%%writefile vector_add_opencl.cpp
#include <CL/cl.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <chrono>

using namespace std;
using namespace chrono;

const int N = 1'000'000;

string loadKernel(const char* filename) {
    ifstream file(filename);
    return string((istreambuf_iterator<char>(file)),
                  istreambuf_iterator<char>());
}

int main() {
    vector<float> A(N), B(N), C(N);

    for (int i = 0; i < N; i++) {
        A[i] = 1.0f;
        B[i] = 2.0f;
    }

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;

    clGetPlatformIDs(1, &platform, nullptr);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, nullptr);

    context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
    queue = clCreateCommandQueue(context, device, 0, nullptr);

    string source = loadKernel("kernel.cl");
    const char* src = source.c_str();
    size_t length = source.size();

    program = clCreateProgramWithSource(context, 1, &src, &length, nullptr);
    clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);

    kernel = clCreateKernel(program, "vector_add", nullptr);

    cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 sizeof(float) * N, A.data(), nullptr);
    cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 sizeof(float) * N, B.data(), nullptr);
    cl_mem bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 sizeof(float) * N, nullptr, nullptr);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);

    size_t globalSize = N;

    auto startCL = high_resolution_clock::now();
    clEnqueueNDRangeKernel(queue, kernel, 1, nullptr,
                           &globalSize, nullptr, 0, nullptr, nullptr);
    clFinish(queue);
    auto endCL = high_resolution_clock::now();

    clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
                        sizeof(float) * N, C.data(), 0, nullptr, nullptr);

    auto startCPU = high_resolution_clock::now();
    for (int i = 0; i < N; i++) {
        C[i] = A[i] + B[i];
    }
    auto endCPU = high_resolution_clock::now();

    cout << "OpenCL (CPU) время: "
         << duration<double, milli>(endCL - startCL).count() << " мс\n";
    cout << "Обычный C++ CPU: "
         << duration<double, milli>(endCPU - startCPU).count() << " мс\n";

    clReleaseMemObject(bufA);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufC);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}


Writing vector_add_opencl.cpp


Writing kernel.cl


In [None]:
!g++ vector_add_opencl.cpp -lOpenCL -o vector_add


In [11]:
!./vector_add



OpenCL (CPU) время: 0.000228 мс
Обычный C++ CPU: 7.36635 мс


In [12]:
%%writefile matrix_mul.cl
__kernel void matrix_mul(__global const float* A,
                         __global const float* B,
                         __global float* C,
                         int N,
                         int M,
                         int K) {

    int row = get_global_id(0);
    int col = get_global_id(1);

    if (row < N && col < K) {
        float sum = 0.0f;

        for (int i = 0; i < M; i++) {
            sum += A[row * M + i] * B[i * K + col];
        }

        C[row * K + col] = sum;
    }
}


Writing matrix_mul.cl


In [13]:
%%writefile matrix_mul_opencl.cpp
#include <CL/cl.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <chrono>

using namespace std;
using namespace chrono;

// Размеры матриц
const int N = 256;
const int M = 256;
const int K = 256;

string loadKernel(const char* filename) {
    ifstream file(filename);
    return string((istreambuf_iterator<char>(file)),
                  istreambuf_iterator<char>());
}

int main() {
    vector<float> A(N * M);
    vector<float> B(M * K);
    vector<float> C(N * K);
    vector<float> C_cpu(N * K);

    // Инициализация матриц
    for (int i = 0; i < N * M; i++) {
        A[i] = 1.0f;
    }
    for (int i = 0; i < M * K; i++) {
        B[i] = 2.0f;
    }

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;

    clGetPlatformIDs(1, &platform, nullptr);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, nullptr);

    context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
    queue = clCreateCommandQueue(context, device, 0, nullptr);

    string source = loadKernel("matrix_mul.cl");
    const char* src = source.c_str();
    size_t length = source.size();

    program = clCreateProgramWithSource(context, 1, &src, &length, nullptr);
    clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);

    kernel = clCreateKernel(program, "matrix_mul", nullptr);

    cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 sizeof(float) * A.size(), A.data(), nullptr);
    cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 sizeof(float) * B.size(), B.data(), nullptr);
    cl_mem bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 sizeof(float) * C.size(), nullptr, nullptr);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);
    clSetKernelArg(kernel, 3, sizeof(int), &N);
    clSetKernelArg(kernel, 4, sizeof(int), &M);
    clSetKernelArg(kernel, 5, sizeof(int), &K);

    size_t globalSize[2] = { (size_t)N, (size_t)K };

    auto startCL = high_resolution_clock::now();
    clEnqueueNDRangeKernel(queue, kernel, 2, nullptr,
                           globalSize, nullptr,
                           0, nullptr, nullptr);
    clFinish(queue);
    auto endCL = high_resolution_clock::now();

    clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
                        sizeof(float) * C.size(),
                        C.data(), 0, nullptr, nullptr);

    // Последовательное умножение на CPU
    auto startCPU = high_resolution_clock::now();
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < K; j++) {
            float sum = 0.0f;
            for (int t = 0; t < M; t++) {
                sum += A[i * M + t] * B[t * K + j];
            }
            C_cpu[i * K + j] = sum;
        }
    }
    auto endCPU = high_resolution_clock::now();

    cout << "OpenCL (CPU) время: "
         << duration<double, milli>(endCL - startCL).count() << " мс\n";
    cout << "CPU последовательное время: "
         << duration<double, milli>(endCPU - startCPU).count() << " мс\n";

    // Проверка корректности
    bool correct = true;
    for (int i = 0; i < N * K; i++) {
        if (abs(C[i] - C_cpu[i]) > 1e-5) {
            correct = false;
            break;
        }
    }

    cout << "Корректность результата: "
         << (correct ? "верно" : "ошибка") << endl;

    clReleaseMemObject(bufA);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufC);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}


Writing matrix_mul_opencl.cpp


In [15]:
!./matrix_mul


OpenCL (CPU) время: 0.000188 мс
CPU последовательное время: 94.3417 мс
Корректность результата: ошибка
