In [None]:
!nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-bq7jqjjl
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-bq7jqjjl
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4305 sha256=59d56d3af435c16d4dd401e7f7969e9c

In [None]:
%%cu

#include <cublas_v2.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

#include <chrono>

/**
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, size).
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
 * index and advances by the total number of launched threads, so any
 * grid/block size covers the whole range. No shared memory is used.
 *
 * @param c    device pointer to the output array (at least `size` elements)
 * @param a    device pointer to the first input array
 * @param b    device pointer to the second input array
 * @param size number of elements to process
 */
__global__ void addKernel(double* c, const double* a, const double* b, unsigned int size) {
  // Index and stride are size_t: a signed int index would wrap negative once
  // `size` exceeds INT_MAX and then index out of bounds.
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < size; i += stride)
    c[i] = a[i] + b[i];
}

// Prints "<what>: <CUDA error string>" to stderr when `status` is not
// cudaSuccess. Returns true on success so call sites can write
// `if (!cudaOk(...)) return EXIT_FAILURE;`.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Owns every host buffer, device buffer and CUDA event used by main() and
// releases them in the destructor, so early error returns do not leak.
struct VectorAddResources {
    double* a = NULL;     // host input
    double* b = NULL;     // host input
    double* c = NULL;     // host copy of the GPU result
    double* c_ = NULL;    // host reference result computed on the CPU
    double* adev = NULL;  // device copy of a
    double* bdev = NULL;  // device copy of b
    double* cdev = NULL;  // device output
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;

    ~VectorAddResources() {
        // Events are only destroyed if they were actually created.
        if (start != NULL) cudaEventDestroy(start);
        if (stop != NULL) cudaEventDestroy(stop);
        // cudaFree and free both accept a null pointer.
        cudaFree(adev);
        cudaFree(bdev);
        cudaFree(cdev);
        free(a);
        free(b);
        free(c);
        free(c_);
    }
};

/**
 * Adds two vectors of n doubles on the CPU and on the GPU, prints the time of
 * each, prints the first five results and checks the GPU output against the
 * CPU reference.
 *
 * Returns 0 on success and EXIT_FAILURE if an allocation, a CUDA call or the
 * result check fails (the reason is written to stderr).
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    const int GRID_DIM = 2048;
    const int BLOCK_DIM = 64;
    const int n = 2700000;
    printf("n = %d\n", n);
    printf("BLOCK_DIM = %d, GRID_DIM = %d\n", BLOCK_DIM, GRID_DIM);
    // Byte count is computed in size_t so it cannot be narrowed to int.
    const size_t n2b = (size_t)n * sizeof(double);

    VectorAddResources res;
    res.a = (double*)calloc(n, sizeof(double));
    res.b = (double*)calloc(n, sizeof(double));
    res.c = (double*)calloc(n, sizeof(double));
    res.c_ = (double*)calloc(n, sizeof(double));
    if (res.a == NULL || res.b == NULL || res.c == NULL || res.c_ == NULL) {
        fprintf(stderr, "Cannot allocate host arrays\n");
        return EXIT_FAILURE;
    }
    double* a = res.a;
    double* b = res.b;
    double* c = res.c;
    double* c_ = res.c_;

    for (int i = 0; i < n; i++) {
        a[i] = double(i);
        b[i] = double(i);
    }

    // Sequential reference, timed with a host clock: CUDA events timestamp
    // work on the GPU stream and are not a reliable timer for a CPU loop.
    const std::chrono::steady_clock::time_point cpuBegin = std::chrono::steady_clock::now();
    for (int i = 0; i < n; i++) {
        c_[i] = a[i] + b[i];
    }
    const std::chrono::steady_clock::time_point cpuEnd = std::chrono::steady_clock::now();
    const double cpuSeconds = std::chrono::duration<double>(cpuEnd - cpuBegin).count();

    if (!cudaOk(cudaMalloc((void**)&res.adev, n2b), "Cannot allocate device array for a")) {
        return EXIT_FAILURE;
    }
    if (!cudaOk(cudaMalloc((void**)&res.bdev, n2b), "Cannot allocate device array for b")) {
        return EXIT_FAILURE;
    }
    if (!cudaOk(cudaMalloc((void**)&res.cdev, n2b), "Cannot allocate device array for c")) {
        return EXIT_FAILURE;
    }

    float gpuTime = 0.0f;
    if (!cudaOk(cudaEventCreate(&res.start), "Cannot create CUDA start event")) {
        return EXIT_FAILURE;
    }
    if (!cudaOk(cudaEventCreate(&res.stop), "Cannot create CUDA end event")) {
        return EXIT_FAILURE;
    }

    if (!cudaOk(cudaMemcpy(res.adev, a, n2b, cudaMemcpyHostToDevice),
                "Cannot copy a array from host to device")) {
        return EXIT_FAILURE;
    }
    if (!cudaOk(cudaMemcpy(res.bdev, b, n2b, cudaMemcpyHostToDevice),
                "Cannot copy b array from host to device")) {
        return EXIT_FAILURE;
    }

    if (!cudaOk(cudaEventRecord(res.start, 0), "Cannot record start CUDA event")) {
        return EXIT_FAILURE;
    }
    addKernel <<< GRID_DIM, BLOCK_DIM >>> (res.cdev, res.adev, res.bdev, n);
    // Launch-configuration errors are only visible through cudaGetLastError().
    if (!cudaOk(cudaGetLastError(), "Cannot launch CUDA kernel")) {
        return EXIT_FAILURE;
    }
    // The stop event is recorded right behind the kernel so the measured
    // interval is not inflated by a host-side device synchronization.
    if (!cudaOk(cudaEventRecord(res.stop, 0), "Cannot record stop CUDA event")) {
        return EXIT_FAILURE;
    }
    // Waiting on the stop event also surfaces errors raised while the kernel ran.
    if (!cudaOk(cudaEventSynchronize(res.stop), "Cannot synchronize CUDA kernel")) {
        return EXIT_FAILURE;
    }
    if (!cudaOk(cudaEventElapsedTime(&gpuTime, res.start, res.stop),
                "Cannot measure CUDA kernel time")) {
        return EXIT_FAILURE;
    }

    if (!cudaOk(cudaMemcpy(c, res.cdev, n2b, cudaMemcpyDeviceToHost),
                "Cannot copy c array from device to host")) {
        return EXIT_FAILURE;
    }

    printf("seq time: %.9f seconds\n", cpuSeconds);
    // cudaEventElapsedTime reports milliseconds.
    printf("time spent executing %s: %.9f seconds\n", "kernel", gpuTime / 1000);
    for (int i = 0; i < 5 && i < n; i++) {
        printf("a: %.2f b: %.2f c: %.2f\n", a[i], b[i], c[i]);
    }

    // Check the GPU result against the CPU reference. Exact comparison is
    // used because each element is a single double-precision addition of the
    // same two operands on both sides.
    for (int i = 0; i < n; i++) {
        if (c[i] != c_[i]) {
            fprintf(stderr, "Result mismatch at index %d: GPU %.2f, CPU %.2f\n",
                i, c[i], c_[i]);
            return EXIT_FAILURE;
        }
    }

    return 0;
}

n = 2700000
BLOCK_DIM = 64, GRID_DIM = 2048
seq time: 0.000000000 seconds
time spent executing kernel: 0.000270464 seconds
a: 0.00 b: 0.00 c: 0.00
a: 1.00 b: 1.00 c: 2.00
a: 2.00 b: 2.00 c: 4.00
a: 3.00 b: 3.00 c: 6.00
a: 4.00 b: 4.00 c: 8.00

