In [1]:
# Quick check that a CUDA device is attached to this session (if not, use Runtime => Change runtime type and choose GPU)
# Also checking that the CUDA env (nvcc) is correctly set up
!nvidia-smi
!nvcc --version

Wed Apr 24 09:14:42 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# CUDA matrix multiplication source code
# You can change the size of the Matrix in the main()
#     constexpr int N = 1000;
#
%%writefile matrix_multiply.cu

#include <iostream>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

using namespace std;

// CUDA kernel computing C = A * B for square N x N integer matrices.
//
// Launch layout: 2D. The x dimension of the grid/block selects the output
// column and the y dimension selects the output row; each thread that lands
// inside the N x N range produces exactly one element of c. Threads outside
// that range do nothing, so the grid may overshoot N.
//
// Parameters:
//   a, b - input matrices, read as a[row * N + k] and b[k * N + col].
//   c    - output matrix, written at c[row * N + col].
//   N    - number of rows and columns of every matrix.
//
// No shared memory is used.
__global__ void matrixMultiplyKernel(int *a, int *b, int *c, int N) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail threads of a grid that does not divide N evenly have no work.
    if (row >= N || col >= N) {
        return;
    }

    // Offset of the first element of this thread's row in a and c.
    const int rowStart = row * N;

    int acc = 0;
    for (int k = 0; k < N; ++k) {
        acc += a[rowStart + k] * b[k * N + col];
    }

    c[rowStart + col] = acc;
}

// Fills `matrix` with an N x N block of pseudo-random integers stored as one
// flat array of N * N elements.
//
// Parameters:
//   matrix - output; resized to N * N elements, all of which are overwritten.
//   N      - matrix dimension. A value <= 0 yields an empty matrix.
//
// Every element is drawn from a uniform distribution over [1, 100] using a
// mt19937 engine seeded from random_device (no fixed seed is used).
void generateRandomMatrix(vector<int>& matrix, int N) {
    if (N <= 0) {
        matrix.clear();
        return;
    }

    // Widen before multiplying: N * N evaluated in int overflows once N
    // exceeds 46340.
    const size_t count = static_cast<size_t>(N) * static_cast<size_t>(N);

    random_device rd;
    mt19937 gen(rd());
    uniform_int_distribution<int> dis(1, 100); // Random numbers between 1 and 100

    matrix.resize(count);
    for (int& value : matrix) {
        value = dis(gen);
    }
}

// Prints the top-left corner of an N x N matrix to standard output.
//
// At most 10 rows and 10 columns are printed. Every value is followed by a
// tab; when N is larger than 10 each printed row is terminated with "..." to
// signal the hidden columns. Elements are read as matrix[row * N + col].
//
// Parameters:
//   matrix - flat storage of the matrix.
//   N      - number of rows and columns.
void displayMatrix(const vector<int>& matrix, int N) {
    const int MAX_DISPLAY_SIZE = 10;
    const int shown = (N < MAX_DISPLAY_SIZE) ? N : MAX_DISPLAY_SIZE;
    const bool truncated = N > MAX_DISPLAY_SIZE;

    for (int row = 0; row < shown; ++row) {
        const int rowStart = row * N;
        for (int col = 0; col < shown; ++col) {
            cout << matrix[rowStart + col] << "\t";
        }
        if (truncated) {
            cout << "...";
        }
        cout << endl;
    }
}

// Throws std::runtime_error naming the failed step when a CUDA runtime call
// did not return cudaSuccess.
static void throwOnCudaError(cudaError_t status, const char* step) {
    if (status != cudaSuccess) {
        throw runtime_error(string("CUDA failure in ") + step + ": " +
                            cudaGetErrorString(status));
    }
}

// Function to perform matrix multiplication on GPU: computes C = A * B for
// square N x N integer matrices stored as flat arrays.
//
// Parameters:
//   A, B - input matrices; each must hold at least N * N elements.
//   C    - output matrix; grown to N * N elements if it is smaller, then the
//          first N * N elements are overwritten with the product.
//   N    - number of rows and columns. N == 0 is a no-op.
//
// Throws:
//   std::invalid_argument - N is negative, or A or B is too small.
//   std::runtime_error    - a CUDA allocation, copy, or the kernel launch
//                           failed.
//
// Device buffers are released on every exit path, including exceptions.
void matrixMultiplyCUDA(const vector<int>& A, const vector<int>& B, vector<int>& C, int N) {
    if (N < 0) {
        throw invalid_argument("matrixMultiplyCUDA: N must be non-negative");
    }

    // Widen before multiplying so the element count cannot overflow int.
    const size_t count = static_cast<size_t>(N) * static_cast<size_t>(N);
    if (A.size() < count || B.size() < count) {
        throw invalid_argument("matrixMultiplyCUDA: input matrix smaller than N * N");
    }
    if (count == 0) {
        return;
    }

    // The device-to-host copy below writes `count` ints into C, so make sure
    // the destination is large enough instead of trusting the caller.
    if (C.size() < count) {
        C.resize(count);
    }
    const size_t bytes = count * sizeof(int);

    // Owns the device allocations; cudaFree on a null pointer is a no-op, so
    // the destructor is safe even when an allocation never happened.
    struct DeviceBuffers {
        int *a = nullptr;
        int *b = nullptr;
        int *c = nullptr;
        ~DeviceBuffers() {
            cudaFree(a);
            cudaFree(b);
            cudaFree(c);
        }
    } dev;

    // Allocate device memory
    throwOnCudaError(cudaMalloc((void **)&dev.a, bytes), "cudaMalloc for A");
    throwOnCudaError(cudaMalloc((void **)&dev.b, bytes), "cudaMalloc for B");
    throwOnCudaError(cudaMalloc((void **)&dev.c, bytes), "cudaMalloc for C");

    // Copy input matrices from host to device
    throwOnCudaError(cudaMemcpy(dev.a, A.data(), bytes, cudaMemcpyHostToDevice),
                     "host-to-device copy of A");
    throwOnCudaError(cudaMemcpy(dev.b, B.data(), bytes, cudaMemcpyHostToDevice),
                     "host-to-device copy of B");

    // 16 x 16 threads per block; the grid is rounded up so it covers all N
    // rows and columns (the kernel ignores the overshoot).
    const dim3 threadsPerBlock(16, 16);
    const dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                             (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch kernel; a launch does not return a status, so query it explicitly.
    matrixMultiplyKernel<<<blocksPerGrid, threadsPerBlock>>>(dev.a, dev.b, dev.c, N);
    throwOnCudaError(cudaGetLastError(), "kernel launch");

    // Copy result matrix from device to host. This blocking copy is also the
    // point where an error raised while the kernel was running is reported.
    throwOnCudaError(cudaMemcpy(C.data(), dev.c, bytes, cudaMemcpyDeviceToHost),
                     "device-to-host copy of C");
}

// Program entry point: generates two random N x N matrices, multiplies them
// on the GPU, and prints a preview of both inputs and of the product.
int main() {
    constexpr int N = 1000;

    // Prints a heading line followed by the preview of one matrix.
    const auto showMatrix = [](const char* heading, const vector<int>& values, int dim) {
        cout << heading << endl;
        displayMatrix(values, dim);
    };

    vector<int> A;
    vector<int> B;
    generateRandomMatrix(A, N);
    generateRandomMatrix(B, N);

    // Result storage, sized up front for the N x N product.
    vector<int> C(N * N);

    showMatrix("Matrix A:", A, N);
    showMatrix("\nMatrix B:", B, N);

    // Perform matrix multiplication on GPU
    matrixMultiplyCUDA(A, B, C, N);

    showMatrix("\nResult of Matrix Multiplication:", C, N);

    return 0;
}

Writing matrix_multiply.cu


In [3]:
# Compile
!nvcc -o matrix_multiply matrix_multiply.cu

In [4]:
# Run the matrix multiplication; with CUDA it should take almost no time compared to running without CUDA
!time ./matrix_multiply

Matrix A:
63	51	95	31	7	24	17	35	35	3	...
71	37	5	57	44	96	97	71	71	85	...
88	79	96	84	83	61	95	75	2	31	...
87	29	57	3	39	29	20	52	92	52	...
16	66	69	81	61	6	75	58	27	65	...
12	64	65	1	16	32	30	79	48	12	...
74	74	91	61	51	68	52	56	50	24	...
75	42	62	31	17	43	54	74	95	93	...
67	49	7	5	17	97	3	97	46	93	...
13	89	49	63	75	18	47	97	98	76	...

Matrix B:
45	33	22	39	9	51	47	55	85	25	...
8	57	59	96	98	48	26	94	79	18	...
10	69	3	93	40	25	29	58	66	16	...
93	56	23	2	73	75	66	71	18	77	...
80	74	85	3	79	86	9	86	90	64	...
61	27	100	96	43	22	1	54	94	2	...
31	65	85	39	51	68	76	7	98	96	...
46	78	38	2	14	36	57	72	11	2	...
75	36	18	49	77	96	60	47	39	33	...
91	89	5	59	96	40	45	61	83	65	...

Result of Matrix Multiplication:
2514812	2538732	2497897	2461036	2574760	2579635	2502016	2574394	2527094	2523245	...
2561442	2551079	2532669	2508844	2635120	2618876	2599383	2664020	2610235	2577856	...
2516753	2523094	2492636	2500269	2578648	2570720	2529936	2560041	2531841	2573958	...
2458204	2483105	2407646	2463575	25