In [1]:
%%writefile matrix_addition.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <time.h>

#define M 1000  // Matrix rows
#define N 1000  // Matrix columns

// CUDA Kernel for Matrix Addition
__global__ void matrixAddCUDA(float *A, float *B, float *C, int rows, int cols) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int index = row * cols + col;

    if (row < rows && col < cols) {
        C[index] = A[index] + B[index];
    }
}

// CPU-based Matrix Addition
void matrixAddCPU(float *A, float *B, float *C, int rows, int cols) {
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
        }
    }
}

// Helper function to initialize matrix with random values
void initializeMatrix(float *matrix, int rows, int cols) {
    for (int i = 0; i < rows * cols; i++) {
        matrix[i] = (float)rand() / RAND_MAX;
    }
}

// Helper function to get current time in seconds
double getTime() {
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main() {
    // Allocate host memory
    float *h_A = (float *)malloc(M * N * sizeof(float));
    float *h_B = (float *)malloc(M * N * sizeof(float));
    float *h_C = (float *)malloc(M * N * sizeof(float));

    // Initialize matrices A and B with random values
    initializeMatrix(h_A, M, N);
    initializeMatrix(h_B, M, N);

    // Measure CPU execution time
    double startCPU = getTime();
    matrixAddCPU(h_A, h_B, h_C, M, N);
    double endCPU = getTime();
    double cpuTime = endCPU - startCPU;

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, M * N * sizeof(float));
    cudaMalloc((void **)&d_B, M * N * sizeof(float));
    cudaMalloc((void **)&d_C, M * N * sizeof(float));

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, M * N * sizeof(float), cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    dim3 threadsPerBlock(16, 16);  // 16x16 threads per block
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
    (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Set up CUDA events for accurate GPU timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Start recording time for the GPU
    cudaEventRecord(start);
    matrixAddCUDA<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, M, N);
    cudaEventRecord(stop);

    // Wait for the kernel to complete
    cudaEventSynchronize(stop);

    // Calculate the GPU time
    float gpuTime = 0;
    cudaEventElapsedTime(&gpuTime, start, stop);
    gpuTime /= 1000;  // Convert from milliseconds to seconds

    // Copy result back to host
    cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);

    // Calculate speedup
    double speedup = cpuTime / gpuTime;

    // Output results
    printf("CPU Execution Time: %f seconds\n", cpuTime);
    printf("GPU Execution Time: %f seconds\n", gpuTime);
    printf("Speedup: %f\n", speedup);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    // Destroy CUDA events
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Writing matrix_addition.cu


In [2]:
!nvcc -o matrix_addition matrix_addition.cu

In [4]:
!./matrix_addition

CPU Execution Time: 0.005141 seconds
GPU Execution Time: 0.101856 seconds
Speedup: 0.050478
