In [1]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpitq5ogn2".


In [3]:
%%cuda
#include <stdio.h>


// Matrix-multiply kernel for square, row-major int matrices: c = a * b.
// Each thread derives one (row, col) position from its 2D block/thread indices
// and writes the single output element c[row * width + col].
// Threads whose position lies outside the width x width matrix do nothing.
__global__ void matrixMultiply(int *a, int *b, int *c, int width) {
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    const int col = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard clause: the launch may contain more threads than matrix elements.
    if (row >= width || col >= width) {
        return;
    }

    const int rowStart = row * width;  // offset of this thread's row in a and c
    int acc = 0;
    int bIndex = col;                  // walks down column `col` of b
    for (int k = 0; k < width; ++k) {
        acc += a[rowStart + k] * b[bIndex];
        bIndex += width;
    }
    c[rowStart + col] = acc;
}


// Print a width x width row-major int matrix to stdout.
// Every element is followed by a tab and every row ends with a newline.
void displayMatrix(int *matrix, int width) {
    for (int r = 0; r < width; ++r) {
        const int *rowPtr = matrix + r * width;  // first element of row r
        int c = 0;
        while (c < width) {
            printf("%d\t", rowPtr[c]);
            ++c;
        }
        printf("\n");
    }
}


// Check the status of a CUDA runtime call.
// Returns true when `status` is cudaSuccess. Otherwise prints a diagnostic
// naming the operation (`what`) to stderr and returns false.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}


// Multiply two fixed 3x3 int matrices on the GPU and print the product.
// Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
// fails; in the failure case nothing is printed to stdout.
int main() {
    const int width = 3;
    const int size = width * width * sizeof(int);
    // Threads per block along each axis; the grid size is derived from it so
    // the two can never drift apart.
    const int blockEdge = 16;


    // Host matrices
    int h_mat1[width][width] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
    int h_mat2[width][width] = {{9, 8, 7}, {6, 5, 4}, {3, 2, 1}};
    int h_result[width][width];


    // Device matrices. Initialised to NULL so the cudaFree calls at the end
    // are harmless even if an allocation failed part-way through.
    int *d_mat1 = NULL, *d_mat2 = NULL, *d_result = NULL;


    // Allocate device buffers and upload the inputs. The && chain stops at the
    // first failing call, so later steps never run on invalid state.
    bool ok =
        cudaSucceeded(cudaMalloc((void **)&d_mat1, size), "cudaMalloc(d_mat1)") &&
        cudaSucceeded(cudaMalloc((void **)&d_mat2, size), "cudaMalloc(d_mat2)") &&
        cudaSucceeded(cudaMalloc((void **)&d_result, size), "cudaMalloc(d_result)") &&
        cudaSucceeded(cudaMemcpy(d_mat1, h_mat1, size, cudaMemcpyHostToDevice),
                      "copy of first matrix to device") &&
        cudaSucceeded(cudaMemcpy(d_mat2, h_mat2, size, cudaMemcpyHostToDevice),
                      "copy of second matrix to device");


    if (ok) {
        // Define grid and block dimensions (ceil-div so the grid covers width).
        dim3 threadsPerBlock(blockEdge, blockEdge);
        const int blocksPerAxis = (width + blockEdge - 1) / blockEdge;
        dim3 blocksPerGrid(blocksPerAxis, blocksPerAxis);

        // Launch kernel
        matrixMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_mat1, d_mat2, d_result, width);

        // A kernel launch returns no status itself: cudaGetLastError reports
        // launch/configuration errors, and the blocking device-to-host copy
        // reports errors raised while the kernel was executing.
        ok = cudaSucceeded(cudaGetLastError(), "matrixMultiply launch") &&
             cudaSucceeded(cudaMemcpy(h_result, d_result, size, cudaMemcpyDeviceToHost),
                           "copy of result to host");
    }


    // Display the result only when h_result was actually filled in.
    if (ok) {
        printf("Result of matrix multiplication:\n");
        displayMatrix((int *)h_result, width);
    }


    // Free device memory (cudaFree on a NULL pointer performs no operation).
    cudaFree(d_mat1);
    cudaFree(d_mat2);
    cudaFree(d_result);


    return ok ? 0 : 1;
}


Result of matrix multiplication:
30	24	18	
84	69	54	
138	114	90	

