In [None]:
# A quick check that there is a CUDA device associated with this session (if not, use Runtime => Change runtime type and choose GPU)
# Also checks that the CUDA environment (nvcc) is correctly set up
!nvidia-smi
!nvcc --version

In [None]:
# A CUDA Matrix Multiplication source code
# You can change the size of the Matrix in the main()
#     constexpr int N = 1000;
#
%%writefile matrix_multiply.cu

#include <iostream>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

using namespace std;

// CUDA kernel for matrix multiplication: writes c = a * b, where a, b and c
// are N x N matrices of int stored row-major in flat arrays.
//
// Launch layout: a 2D grid of 2D blocks in which each thread produces one
// element of c (the x index selects the column, the y index selects the row).
// Threads that land outside the N x N output return immediately, so the grid
// only has to cover the matrix; it does not have to divide it evenly.
// No shared memory is used.
//
// Preconditions: a, b and c each address at least N * N ints that the device
// can access, and c must not overlap a or b (the pointers are declared
// __restrict__).
__global__ void matrixMultiplyKernel(const int* __restrict__ a,
                                     const int* __restrict__ b,
                                     int* __restrict__ c,
                                     int N) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail threads of the last blocks (and every thread when N <= 0) have no
    // output element to compute.
    if (row >= N || col >= N) {
        return;
    }

    // Index with size_t: row * N + k overflows a 32-bit int once N exceeds 46340.
    const size_t n = static_cast<size_t>(N);
    const size_t rowStart = static_cast<size_t>(row) * n;

    int sum = 0;
    for (size_t k = 0; k < n; ++k) {
        sum += a[rowStart + k] * b[k * n + col];
    }
    c[rowStart + col] = sum;
}

// Fills `matrix` with N * N random integers drawn uniformly from [1, 100].
// The vector is resized to N * N elements first, and every element is then
// overwritten. The Mersenne Twister engine is seeded from random_device.
void generateRandomMatrix(vector<int>& matrix, int N) {
    random_device seedSource;
    mt19937 engine(seedSource());
    uniform_int_distribution<int> valueInRange(1, 100);

    matrix.resize(N * N);
    for (int& cell : matrix) {
        cell = valueInRange(engine);
    }
}

// Prints the top-left corner of an N x N row-major matrix to cout: at most the
// first 10 rows and the first 10 columns, each value followed by a tab, one
// matrix row per output line. When N is larger than 10, every printed row is
// terminated with "..." to show that it was cut short.
void displayMatrix(const vector<int>& matrix, int N) {
    const int MAX_DISPLAY_SIZE = 10;
    const int shown = (N < MAX_DISPLAY_SIZE) ? N : MAX_DISPLAY_SIZE;
    const bool truncated = N > MAX_DISPLAY_SIZE;

    for (int row = 0; row < shown; ++row) {
        const int rowOffset = row * N;
        for (int col = 0; col < shown; ++col) {
            cout << matrix[rowOffset + col] << "\t";
        }
        if (truncated) {
            cout << "...";
        }
        cout << endl;
    }
}

namespace {

// Throws std::runtime_error naming the failed step when a CUDA runtime call
// did not return cudaSuccess.
void throwOnCudaFailure(cudaError_t status, const char* step) {
    if (status != cudaSuccess) {
        throw runtime_error(string("CUDA failure during ") + step + ": " +
                            cudaGetErrorString(status));
    }
}

// Owns one cudaMalloc allocation of ints and frees it on scope exit, so the
// device buffers are released even when a later CUDA call throws.
struct DeviceIntBuffer {
    int* ptr = nullptr;

    DeviceIntBuffer(size_t bytes, const char* step) {
        throwOnCudaFailure(cudaMalloc(reinterpret_cast<void**>(&ptr), bytes), step);
    }

    // The cudaFree status is deliberately dropped: a destructor must not throw.
    ~DeviceIntBuffer() { cudaFree(ptr); }

    DeviceIntBuffer(const DeviceIntBuffer&) = delete;
    DeviceIntBuffer& operator=(const DeviceIntBuffer&) = delete;
};

}  // namespace

// Function to perform matrix multiplication on GPU: computes C = A * B for
// N x N row-major int matrices.
//
// Parameters:
//   A, B - input matrices; each must hold at least N * N elements.
//   C    - output matrix; grown to N * N elements if it is smaller, and its
//          first N * N elements are overwritten with the product.
//   N    - matrix dimension. N == 0 is a no-op that leaves C untouched.
//
// Throws:
//   std::invalid_argument - N is negative, or A or B is shorter than N * N.
//   std::runtime_error    - a CUDA allocation, copy or the kernel launch failed.
void matrixMultiplyCUDA(const vector<int>& A, const vector<int>& B, vector<int>& C, int N) {
    if (N < 0) {
        throw invalid_argument("matrixMultiplyCUDA: N must not be negative");
    }
    if (N == 0) {
        return;  // Nothing to compute; a launch with an empty grid would be rejected.
    }

    // Widen before multiplying: N * N evaluated in int overflows for N > 46340.
    const size_t count = static_cast<size_t>(N) * static_cast<size_t>(N);
    if (A.size() < count || B.size() < count) {
        throw invalid_argument(
            "matrixMultiplyCUDA: A and B must each hold at least N * N elements");
    }
    if (C.size() < count) {
        C.resize(count);  // Otherwise the device-to-host copy would overrun C.
    }
    const size_t bytes = count * sizeof(int);

    // Allocate device memory
    DeviceIntBuffer d_A(bytes, "device allocation for A");
    DeviceIntBuffer d_B(bytes, "device allocation for B");
    DeviceIntBuffer d_C(bytes, "device allocation for C");

    // Copy input matrices from host to device
    throwOnCudaFailure(cudaMemcpy(d_A.ptr, A.data(), bytes, cudaMemcpyHostToDevice),
                       "host-to-device copy of A");
    throwOnCudaFailure(cudaMemcpy(d_B.ptr, B.data(), bytes, cudaMemcpyHostToDevice),
                       "host-to-device copy of B");

    // One thread per output element; round the grid up so it covers all of C.
    const dim3 threadsPerBlock(16, 16);
    const dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                             (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch kernel. A launch does not return a status itself, so a rejected
    // launch configuration has to be picked up through cudaGetLastError().
    matrixMultiplyKernel<<<blocksPerGrid, threadsPerBlock>>>(d_A.ptr, d_B.ptr, d_C.ptr, N);
    throwOnCudaFailure(cudaGetLastError(), "kernel launch");

    // Copy result matrix from device to host. This blocking copy waits for the
    // kernel to finish and can also report errors raised while it was running.
    throwOnCudaFailure(cudaMemcpy(C.data(), d_C.ptr, bytes, cudaMemcpyDeviceToHost),
                       "device-to-host copy of C");
}

// Entry point: builds two random N x N matrices, multiplies them on the GPU
// and prints a preview of both inputs and of the product.
int main() {
    constexpr int N = 1000;  // Matrix dimension; edit here to try other sizes.

    vector<int> A;
    vector<int> B;
    generateRandomMatrix(A, N);
    generateRandomMatrix(B, N);

    // Zero-filled output buffer, sized up front for the GPU result.
    vector<int> C(N * N);

    // Prints a heading line followed by the preview of one matrix.
    const auto show = [&](const char* heading, const vector<int>& m) {
        cout << heading << endl;
        displayMatrix(m, N);
    };

    show("Matrix A:", A);
    show("\nMatrix B:", B);

    // Perform matrix multiplication on GPU
    matrixMultiplyCUDA(A, B, C, N);

    show("\nResult of Matrix Multiplication:", C);

    return 0;
}

In [None]:
# Compile
!nvcc -o matrix_multiply matrix_multiply.cu

In [None]:
# Run the Matrix Multiplication; it should take very little time using CUDA compared to running without CUDA
!time ./matrix_multiply