In [1]:
%%writefile dot-product.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <time.h>

#define THREADS_PER_BLOCK 256

// CUDA Kernel for Dot Product using Atomic Operations
__global__ void dotProductCUDA(float *A, float *B, float *result, int N) {
    __shared__ float cache[THREADS_PER_BLOCK];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int cacheIdx = threadIdx.x;
    float tempSum = 0;

    // Compute partial dot product for this block
    while (tid < N) {
        tempSum += A[tid] * B[tid];
        tid += blockDim.x * gridDim.x;
    }

    // Store the partial result in shared memory
    cache[cacheIdx] = tempSum;

    // Synchronize threads in this block
    __syncthreads();

    // Perform reduction in shared memory
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIdx < i) {
            cache[cacheIdx] += cache[cacheIdx + i];
        }
        __syncthreads();
        i /= 2;
    }

    // Store the result from this block to global memory
    if (cacheIdx == 0) atomicAdd(result, cache[0]);
}

// CPU-based Dot Product
float dotProductCPU(float *A, float *B, int N) {
    float sum = 0;
    for (int i = 0; i < N; i++) {
        sum += A[i] * B[i];
    }
    return sum;
}

// Helper function to initialize vector with random values
void initializeVector(float *vector, int size) {
    for (int i = 0; i < size; i++) {
        vector[i] = (float)rand() / RAND_MAX;
    }
}

// Helper function to get current time in seconds
double getTime() {
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main() {
    int N = 1000000; // Vector size (can be adjusted)

    // Allocate host memory
    float *h_A = (float *)malloc(N * sizeof(float));
    float *h_B = (float *)malloc(N * sizeof(float));
    float h_resultCPU, h_resultGPU;

    // Initialize vectors A and B with random values
    initializeVector(h_A, N);
    initializeVector(h_B, N);

    // CPU Dot Product
    double startCPU = getTime();
    h_resultCPU = dotProductCPU(h_A, h_B, N);
    double endCPU = getTime();
    double cpuTime = endCPU - startCPU;

    // Allocate device memory
    float *d_A, *d_B, *d_result;
    cudaMalloc((void **)&d_A, N * sizeof(float));
    cudaMalloc((void **)&d_B, N * sizeof(float));
    cudaMalloc((void **)&d_result, sizeof(float));

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemset(d_result, 0, sizeof(float));

    // Define grid and block dimensions
    int blocksPerGrid = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    // Set up CUDA events for accurate GPU timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Start recording time for GPU
    cudaEventRecord(start);
    dotProductCUDA<<<blocksPerGrid, THREADS_PER_BLOCK>>>(d_A, d_B, d_result, N);
    cudaEventRecord(stop);

    // Wait for the kernel to complete
    cudaEventSynchronize(stop);

    // Calculate GPU execution time
    float gpuTime = 0;
    cudaEventElapsedTime(&gpuTime, start, stop);
    gpuTime /= 1000;  // Convert from milliseconds to seconds

    // Copy result back to host
    cudaMemcpy(&h_resultGPU, d_result, sizeof(float), cudaMemcpyDeviceToHost);

    // Calculate speedup
    double speedup = cpuTime / gpuTime;

    // Output results
    printf("CPU Result: %f\n", h_resultCPU);
    printf("GPU Result: %f\n", h_resultGPU);
    printf("CPU Execution Time: %f seconds\n", cpuTime);
    printf("GPU Execution Time: %f seconds\n", gpuTime);
    printf("Speedup: %f\n", speedup);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_result);

    // Free host memory
    free(h_A);
    free(h_B);

    // Destroy CUDA events
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Writing dot-product.cu


In [2]:
!nvcc dot-product.cu -o dot-product

In [3]:
!./dot-product

CPU Result: 249863.093750
GPU Result: 249902.562500
CPU Execution Time: 0.003804 seconds
GPU Execution Time: 0.114311 seconds
Speedup: 0.033280
