In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [4]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-nj1p1pwz
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-nj1p1pwz
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=06d7f648f5192efedc37ca9810c6a33de990f7f28b0d232c9930cfcf288324bb
  Stored in directory: /tmp/pip-ephem-wheel-cache-r8bfawtx/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [5]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [6]:
%%cu
#include <iostream>
// Smoke test for the notebook toolchain: writes one fixed line to stdout and
// exits with status 0, proving the %%cu cell compiled and ran.
int main()
{
    const char *const banner = "Compiling a C++ program in a Notbook\n";
    std::cout << banner;
    return 0;
}

Compiling a C++ program in a Notbook



In [13]:
%%cu
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

// Element-wise integer addition: c[i] = a[i] + b[i] for every i in [0, size).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
// index and then advances by the total number of launched threads
// (grid-stride loop), so the result is correct for any grid size, including
// grids that launch fewer threads than `size`.
// Shared memory: none.
//
// Parameters:
//   a, b  - input arrays, only read by the kernel; dereferenced on the
//           device, so they must be device-accessible and hold at least
//           `size` ints.
//   c     - output array with room for at least `size` ints. Not marked
//           __restrict__ so that in-place launches (c aliasing a or b)
//           remain valid.
//   size  - number of elements to process; a non-positive value is a no-op.
__global__ void add_arrays(const int *a, const int *b, int *c, int size) {
    if (size <= 0) {
        return;
    }

    // Index math is done in size_t so blockIdx.x * blockDim.x cannot wrap
    // when the launch covers close to INT_MAX elements.
    const size_t n = static_cast<size_t>(size);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < n; i += stride) {
        c[i] = a[i] + b[i];
    }
}

// Terminate the program with a readable diagnostic when a CUDA runtime call
// does not return cudaSuccess. Called through CUDA_CHECK so the failing
// expression and its source location are reported.
static void checkCuda(cudaError_t status, const char *expr, const char *file, int line) {
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error at %s:%d: %s failed: %s\n",
                     file, line, expr, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}
#define CUDA_CHECK(call) checkCuda((call), #call, __FILE__, __LINE__)

// Print `label` on its own line, then every element followed by a comma,
// then a newline (the trailing comma is intentional: it is the established
// output format of this demo).
static void printArray(const char *label, const std::vector<int> &values) {
    std::cout << label << std::endl;
    for (std::size_t i = 0; i < values.size(); i++) {
        std::cout << values[i] << ",";
    }
    std::cout << std::endl;
}

// Vector-add demo: fills two host arrays, adds them on the GPU with
// add_arrays, copies the result back and verifies it against a CPU
// reference.
//
// Returns 0 when every element matches, EXIT_FAILURE when verification finds
// a mismatch. Any failing CUDA runtime call aborts the program with a
// diagnostic via CUDA_CHECK.
int main() {
    // const int N = 1 << 20;  // Example: ~1 million elements
    const int N = 6;

    // Byte count is kept in size_t so N * sizeof(int) cannot overflow an int
    // when N is raised.
    const std::size_t bytes = static_cast<std::size_t>(N) * sizeof(int);

    // 1. Host buffers (std::vector releases them on every exit path).
    std::vector<int> h_a(N), h_b(N), h_c(N);

    // 2. Initialize inputs: a[i] = i, b[i] = 2 * i.
    for (int i = 0; i < N; i++) {
        h_a[i] = i;
        h_b[i] = 2 * i;
    }

    printArray("array a:", h_a);
    printArray("array b:", h_b);

    // 3. Device buffers.
    int *d_a = nullptr;
    int *d_b = nullptr;
    int *d_c = nullptr;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    // 4. Copy inputs host -> device.
    CUDA_CHECK(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice));

    // 5. Kernel launch: ceil-div so the last, partially filled block is
    //    still launched.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    add_arrays<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);
    // A launch does not return a status; a bad configuration is only
    // visible through cudaGetLastError(), and faults during execution
    // surface at the next synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // 6. Copy the result device -> host.
    CUDA_CHECK(cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost));

    // Verify against the CPU reference, echoing each element as it is checked.
    bool ok = true;
    std::cout << "verifying....array addition..." << std::endl;
    for (int i = 0; i < N; i++) {
        std::cout << h_c[i] << std::endl;
        const int expected = h_a[i] + h_b[i];
        if (h_c[i] != expected) {
            std::cerr << "Error: mismatch at position " << i << ". Got: " << h_c[i] << ", Expected: " << expected << std::endl;
            ok = false;
            break;
        }
    }
    // Only claim success when no mismatch was found.
    if (ok) {
        std::cout << "Verification complete!" << std::endl;
    } else {
        std::cerr << "Verification failed!" << std::endl;
    }

    // 7. Cleanup of device memory (host vectors clean up themselves).
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return ok ? 0 : EXIT_FAILURE;
}



array a:
0,1,2,3,4,5,
array b:
0,2,4,6,8,10,
verifying....array addition...
0
3
6
9
12
15
Verification complete!

