In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
%%writefile array_sum.cu

#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

/**********/
/* iDivUp */
/**********/
/*
 * Integer division of a by b, bumped up by one whenever the division leaves
 * a remainder. Used by the host code to compute how many blocks of b threads
 * are needed to cover a elements. b must be non-zero.
 */
int iDivUp(int a, int b) {
    const int quotient  = a / b;
    const int remainder = a % b;

    if (remainder == 0) {
        return quotient;
    }
    return quotient + 1;
}

/********************/
/* CUDA ERROR CHECK */
/********************/
/*
 * gpuErrchk(call): evaluates a CUDA runtime call and hands its status, plus
 * the call site's file and line, to gpuAssert().
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement. With a bare { } block, the caller's trailing ';' would terminate
 * an enclosing if and make
 *     if (cond) gpuErrchk(call); else ...
 * a syntax error.
 */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/*
 * Reports a failed CUDA call on stderr and optionally terminates the process.
 *
 * code  - status returned by the CUDA runtime call
 * file  - source file of the call site (for the diagnostic)
 * line  - source line of the call site (for the diagnostic)
 * abort - when true (the default) the process exits with `code` as its
 *         status; when false the error is only logged and control returns
 *         to the caller
 *
 * Does nothing when code is cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}

/********************/
/* CPU SUM FUNCTION */
/********************/
/*
 * Host reference implementation of the element-wise sum.
 *
 * For each of the first N positions, stores a[i] + b[i] into c[i] and prints
 * the stored value to stdout, one "c[i] = value" line per element.
 */
void sumCPU(int *a, int *b, int *c, int N) {
    int idx = 0;
    while (idx < N) {
        const int total = a[idx] + b[idx];
        c[idx] = total;
        printf("c[%i] = %i\n", idx, total);
        ++idx;
    }
}

/********************/
/* GPU SUM FUNCTION */
/********************/
/*
 * Element-wise sum on the device: d_c[i] = d_a[i] + d_b[i] for 0 <= i < N.
 *
 * d_a, d_b - input arrays of at least N ints (read only)
 * d_c      - output array of at least N ints
 * N        - number of elements; N <= 0 makes the kernel a no-op
 *
 * Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
 * global index and advances by the total number of launched threads, so the
 * result is correct whether the grid has more, exactly as many, or fewer
 * threads than N. No shared memory is used.
 *
 * Indices are computed in size_t: the original int index could wrap negative
 * for global thread indices >= 2^31 and then pass the `tid < N` guard.
 *
 * The pointers are deliberately not marked __restrict__, so callers may still
 * pass an output buffer that aliases one of the inputs.
 */
__global__ void sumGPU(const int *d_a, const int *d_b, int *d_c, int N) {
    const size_t count  = (N > 0) ? (size_t)N : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride)
        d_c[i] = d_a[i] + d_b[i];
}

/********************/
/* MAIN PROGRAM     */
/********************/
/*
 * Host driver: adds two 17-element integer vectors on the GPU and prints the
 * result, one "c[k] = value" line per element.
 *
 * Every CUDA runtime call, including the frees, is routed through gpuErrchk
 * so a failure is reported with its call site instead of being dropped.
 * Returns 0 on success and EXIT_FAILURE if a host allocation fails.
 */
int main() {

    // N is deliberately not a multiple of the block size, so the last block
    // is only partially used and the kernel's bounds handling is exercised.
    const int N = 17;
    const int threadsPerBlock = 8;
    const int blocksPerGrid = iDivUp(N, threadsPerBlock);
    const size_t bytes = N * sizeof(int);

    printf("Launching %d blocks of %d threads (total %d threads)\n",
           blocksPerGrid, threadsPerBlock, blocksPerGrid * threadsPerBlock);

    // Host allocation
    int *a = (int *)malloc(bytes);
    int *b = (int *)malloc(bytes);
    int *c = (int *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        // Bail out before the init loop dereferences a NULL buffer.
        fprintf(stderr, "Host allocation failed (%zu bytes per array)\n", bytes);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Device allocation
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    gpuErrchk(cudaMalloc(&d_a, bytes));
    gpuErrchk(cudaMalloc(&d_b, bytes));
    gpuErrchk(cudaMalloc(&d_c, bytes));

    // Initialize input data: a[k] = k, b[k] = 2k, so the expected sum is 3k
    for (int k = 0; k < N; k++) {
        a[k] = k;
        b[k] = 2 * k;
    }

    // Copy inputs to device
    gpuErrchk(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // Launch kernel. The launch itself returns nothing: the peek reports a
    // bad launch configuration, the synchronize reports execution faults.
    sumGPU<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // Copy result back to host
    gpuErrchk(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost));

    // Print results
    for (int k = 0; k < N; k++) printf("c[%d] = %d\n", k, c[k]);

    // Free resources
    free(a);
    free(b);
    free(c);
    gpuErrchk(cudaFree(d_a));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_c));

    // Final sync so any error still pending on the device is reported
    // before the process exits.
    gpuErrchk(cudaDeviceSynchronize());

    return 0;
}


Writing array_sum.cu


In [3]:
!nvcc -arch=sm_75 array_sum.cu -o array_sum

In [4]:
!./array_sum

Launching 3 blocks of 8 threads (total 24 threads)
c[0] = 0
c[1] = 3
c[2] = 6
c[3] = 9
c[4] = 12
c[5] = 15
c[6] = 18
c[7] = 21
c[8] = 24
c[9] = 27
c[10] = 30
c[11] = 33
c[12] = 36
c[13] = 39
c[14] = 42
c[15] = 45
c[16] = 48
