<a href="https://colab.research.google.com/github/gusanitor8/matrix-multiplication-cuda/blob/main/matrix_multiplication_cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpnc6tv_ty".


In [None]:
%%cuda
#include <cstdio>
#include <cstdlib>
#include <iostream>
// CUDA kernel to perform matrix multiplication: C = A * B.
//
// A is read as an m x n matrix, B as n x p, and C is written as m x p; all
// three are indexed row-major and must be device-accessible pointers.
//
// Launch layout: a 2D grid of 2D blocks in which the x dimension spans the
// columns of C and the y dimension spans its rows. Each thread computes one
// element of C. Threads that fall outside the m x p result do nothing, so the
// grid may overshoot the matrix. No shared memory is used.
__global__ void matrixMultiplyKernel(float* A, float* B, float* C, int m, int n, int p) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < m && col < p) {
        // Flat offsets are computed in size_t: `row * n`, `k * p` and `row * p`
        // are int products that overflow once a matrix holds more than
        // INT_MAX elements.
        const size_t aRowStart = static_cast<size_t>(row) * n;

        float value = 0.0f;
        for (int k = 0; k < n; ++k) {
            value += A[aRowStart + k] * B[static_cast<size_t>(k) * p + col];
        }
        C[static_cast<size_t>(row) * p + col] = value;
    }
}

// Reports a failed CUDA runtime call on stderr and terminates the process.
// `what` names the step being checked so the message identifies where it failed.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Host code to manage memory and kernel invocation.
//
// Computes C = A * B on the GPU, where A is m x n, B is n x p and C is m x p,
// all row-major host arrays. The kernel's execution time (measured with CUDA
// events around the launch only, excluding transfers) is printed to stdout.
//
// Parameters:
//   A, B    - host input matrices (read only by this function)
//   C       - host output matrix, overwritten with the product
//   m, n, p - matrix dimensions as described above
//
// Any CUDA runtime failure is reported on stderr and terminates the process.
void matrixMultiply(float* A, float* B, float* C, int m, int n, int p) {
    // Degenerate shapes never reach the GPU: a grid with a zero dimension is
    // an invalid launch configuration. With an empty inner dimension every
    // element of C is an empty sum, i.e. zero.
    if (m <= 0 || n <= 0 || p <= 0) {
        if (m > 0 && p > 0) {
            const size_t count = static_cast<size_t>(m) * p;
            for (size_t i = 0; i < count; ++i) {
                C[i] = 0.0f;
            }
        }
        std::cout << "Time elapsed: " << 0.0f << " ms" << std::endl;
        return;
    }

    // Size of matrices in bytes. Widen before multiplying: `m * n` alone is an
    // int product and would overflow for large matrices.
    const size_t size_A = static_cast<size_t>(m) * n * sizeof(float);
    const size_t size_B = static_cast<size_t>(n) * p * sizeof(float);
    const size_t size_C = static_cast<size_t>(m) * p * sizeof(float);

    // Allocate memory on the device (GPU)
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    checkCuda(cudaMalloc(&d_A, size_A), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_B, size_B), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc(&d_C, size_C), "cudaMalloc(d_C)");

    // Copy matrices A and B from host (CPU) to device (GPU)
    checkCuda(cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice), "cudaMemcpy(A -> d_A)");
    checkCuda(cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice), "cudaMemcpy(B -> d_B)");

    // Define block and grid sizes; ceil-division so partial tiles are covered
    // (the kernel bounds-checks the overshoot).
    const int blockSize = 16;
    const dim3 block(blockSize, blockSize);
    const dim3 grid((p + blockSize - 1) / blockSize, (m + blockSize - 1) / blockSize);

    // Create CUDA events to measure time
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Record the start event
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    // Launch the CUDA kernel. A launch reports configuration errors only
    // through cudaGetLastError(), so query it immediately afterwards.
    matrixMultiplyKernel<<<grid, block>>>(d_A, d_B, d_C, m, n, p);
    checkCuda(cudaGetLastError(), "matrixMultiplyKernel launch");

    // Record the stop event
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");

    // Wait for the event to complete; errors raised while the kernel was
    // executing surface at this synchronizing call.
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    // Calculate elapsed time in milliseconds
    float milliseconds = 0.0f;
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    // Copy the result matrix C from device (GPU) to host (CPU)
    checkCuda(cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost), "cudaMemcpy(d_C -> C)");

    // Free device memory
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    // Destroy events
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    // Output the elapsed time
    std::cout << "Time elapsed: " << milliseconds << " ms" << std::endl;
}

// Program entry point: builds two random matrices on the host, multiplies
// them on the GPU via matrixMultiply, and releases the host buffers.
int main() {
    // Shapes: A is rows x inner, B is inner x cols, C is rows x cols.
    const int rows = 500;
    const int inner = 300;
    const int cols = 200;

    const int countA = rows * inner;
    const int countB = inner * cols;
    const int countC = rows * cols;

    // Host-side storage for the operands and the result.
    float* hostA = new float[countA];
    float* hostB = new float[countB];
    float* hostC = new float[countC];

    // Fill A, then B, with pseudo-random values in [0, 1].
    for (float* it = hostA; it != hostA + countA; ++it) {
        *it = static_cast<float>(rand()) / RAND_MAX;
    }
    for (float* it = hostB; it != hostB + countB; ++it) {
        *it = static_cast<float>(rand()) / RAND_MAX;
    }

    // Run the multiplication on the GPU; the result lands in hostC.
    matrixMultiply(hostA, hostB, hostC, rows, inner, cols);

    // Release host buffers.
    delete[] hostA;
    delete[] hostB;
    delete[] hostC;

    return 0;
}

Time elapsed: 0.468512 ms

