In [1]:
%%writefile vector_addition.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <time.h>

#define N 10000000  // Adjust to test with different values (e.g., 10^5, 10^6, 10^7)

// CUDA Kernel for Vector Addition
__global__ void vectorAddCUDA(float *A, float *B, float *C, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}

// CPU-based Vector Addition
void vectorAddCPU(float *A, float *B, float *C, int n) {
    for (int i = 0; i < n; i++) {
        C[i] = A[i] + B[i];
    }
}

// Helper function to initialize vector with random values
void initializeVector(float *vec, int n) {
    for (int i = 0; i < n; i++) {
        vec[i] = (float)rand() / RAND_MAX;
    }
}

// Helper function to get the current time in seconds
double getTime() {
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main() {
    // Allocate host memory
    float *h_A = (float *)malloc(N * sizeof(float));
    float *h_B = (float *)malloc(N * sizeof(float));
    float *h_C = (float *)malloc(N * sizeof(float));

    // Initialize vectors A and B with random values
    initializeVector(h_A, N);
    initializeVector(h_B, N);

    // Measure CPU execution time
    double startCPU = getTime();
    vectorAddCPU(h_A, h_B, h_C, N);
    double endCPU = getTime();
    double cpuTime = endCPU - startCPU;

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, N * sizeof(float));
    cudaMalloc((void **)&d_B, N * sizeof(float));
    cudaMalloc((void **)&d_C, N * sizeof(float));

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, N * sizeof(float), cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Measure GPU execution time
    double startGPU = getTime();
    vectorAddCUDA<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    cudaDeviceSynchronize();
    double endGPU = getTime();
    double gpuTime = endGPU - startGPU;

    // Copy result back to host
    cudaMemcpy(h_C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);

    // Calculate speedup
    double speedup = cpuTime / gpuTime;

    // Output results
    printf("CPU Execution Time: %f seconds\n", cpuTime);
    printf("GPU Execution Time: %f seconds\n", gpuTime);
    printf("Speedup: %f\n", speedup);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}




Writing vector_addition.cu


In [2]:
!nvcc vector_addition.cu -o vector_addition


In [3]:
!./vector_addition

CPU Execution Time: 0.047567 seconds
GPU Execution Time: 0.110654 seconds
Speedup: 0.429871
