<a href="https://colab.research.google.com/github/clashingera/LP/blob/main/cuda_vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile vector_add.cu
#include <stdio.h>      // For printf, fprintf
#include <stdlib.h>     // For malloc, free

// Element-wise vector sum kernel: stores a[k] + b[k] into c[k] for k in [0, n).
// Expects a 1D launch; every thread handles at most one element, so the
// launch must provide at least n threads in total to cover the whole vector.
__global__ void vectorAdd(float *a, float *b, float *c, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the final block fall past the end and do nothing.
    if (idx >= n) {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

// Evaluates a CUDA runtime expression that yields a cudaError_t exactly once.
// If the result is not cudaSuccess, prints the file, line and CUDA error
// string to stderr and terminates the process with EXIT_FAILURE.
//
// The argument is parenthesized so any expression can be passed safely, and
// the temporary uses a macro-specific name so it cannot shadow a caller's
// variable (e.g. one named `err`) that appears inside the argument.
// Wrapped in do { ... } while (0) so it behaves as a single statement.
#define CHECK_CUDA_ERROR(call) \
    do { \
        cudaError_t checkCudaErrorStatus_ = (call); \
        if (checkCudaErrorStatus_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error in %s at line %d: %s\n", __FILE__, __LINE__, cudaGetErrorString(checkCudaErrorStatus_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Host driver for the vector-addition demo.
//
// Builds two input vectors on the host (a[i] = i, b[i] = 2*i), copies them to
// the device, launches vectorAdd, copies the result back, prints the first few
// elements, and checks every element against a host-computed reference.
//
// Returns 0 when all elements match; EXIT_FAILURE when a host allocation fails
// or any element differs. CUDA API failures terminate the process through
// CHECK_CUDA_ERROR.
int main() {
    int n = 1 << 20; // 2^20 = 1048576 elements
    // Widen before multiplying so the byte count is computed in size_t.
    size_t size = (size_t)n * sizeof(float);

    // Allocate memory on the host; malloc may return NULL, so check before use.
    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_c = (float *)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_a); // free(NULL) is a no-op, so partial failure is handled too
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Initialize input vectors
    for (int i = 0; i < n; i++) {
        h_a[i] = float(i);
        h_b[i] = float(i * 2);
    }

    // Allocate memory on the device
    float *d_a, *d_b, *d_c;
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_a, size));
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_b, size));
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_c, size));

    // Copy vectors from host to device
    CHECK_CUDA_ERROR(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CHECK_CUDA_ERROR(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    // Launch kernel with 256 threads per block; ceil-divide so the grid
    // covers all n elements even when n is not a multiple of the block size.
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

    // Check for kernel launch errors (e.g. an invalid launch configuration)
    CHECK_CUDA_ERROR(cudaGetLastError());
    // Wait for the kernel so execution errors are reported here rather than
    // being attributed to the cudaMemcpy below.
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());

    // Copy result vector from device to host
    CHECK_CUDA_ERROR(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));

    // Print the first few results (never more than n of them)
    int numToPrint = n < 9 ? n : 9;
    for (int i = 0; i < numToPrint; i++) {
        printf("c[%d] = %f\n", i, h_c[i]);
    }

    // Verify every element against a host reference. The inputs and their
    // sums are integers below 2^24, which float represents exactly, so an
    // exact comparison is valid for this data set.
    int mismatches = 0;
    for (int i = 0; i < n; i++) {
        float expected = h_a[i] + h_b[i];
        if (h_c[i] != expected) {
            if (mismatches < 10) { // cap the report so a bad run stays readable
                fprintf(stderr, "Mismatch at index %d: got %f, expected %f\n",
                        i, h_c[i], expected);
            }
            mismatches++;
        }
    }
    if (mismatches != 0) {
        fprintf(stderr, "Verification failed: %d of %d elements differ\n",
                mismatches, n);
    }

    // Free memory
    CHECK_CUDA_ERROR(cudaFree(d_a));
    CHECK_CUDA_ERROR(cudaFree(d_b));
    CHECK_CUDA_ERROR(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c);

    return mismatches == 0 ? 0 : EXIT_FAILURE;
}


Writing vector_add.cu


In [2]:

!nvcc -arch=sm_75 vector_add.cu -o vector_add
!./vector_add


c[0] = 0.000000
c[1] = 3.000000
c[2] = 6.000000
c[3] = 9.000000
c[4] = 12.000000
c[5] = 15.000000
c[6] = 18.000000
c[7] = 21.000000
c[8] = 24.000000
