<a href="https://colab.research.google.com/github/giuseppeegentile/2d-convolution-cuda/blob/main/CUDA_2d_convolution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!nvidia-smi

Sat Oct 22 08:28:43 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [5]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-t_lump8n
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-t_lump8n
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=4751ad02cf296d0be50e4a70e1e6f2bafe586c56cee2674d7797add4281b2d10
  Stored in directory: /tmp/pip-ephem-wheel-cache-ofeb5n49/wheels/ca/33/8d/3c86eb85e97d2b6169d95c6e8f2c297fdec60db6e84cb56f5e
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [7]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [19]:
%%cu
#include <stdio.h>
#include<stdlib.h>
#include<time.h>

#define TILE_SIZE 32
#define OUT_TILE_SIZE 32
#define MAX_MASK_WIDTH 32
#define MATRIX_SIZE 1024


__global__ void convolution_2D_tiled(unsigned char * in, const unsigned char * __restrict__ mask, unsigned char * out, int mask_width , int w, int h, int pitch) {
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int col = blockIdx.x * blockDim.x + tx;
    int row = blockIdx.y * blockDim.y + ty;

    __shared__ unsigned char tile[TILE_SIZE + MAX_MASK_WIDTH -1][TILE_SIZE + MAX_MASK_WIDTH -1];
    
    //do math if thread id is in the tile size
    if (col < w && row < h){
        //picking the starting indexes of input matrix inside the mask
        //(TOP-LEFT of the mask)
        int inputRow = row - (mask_width/2);
        int inputCol = col - (mask_width/2);

        // Load tile elements
        if(inputRow >= 0 && inputRow < h && inputCol >= 0 && inputCol < w)
            tile[ty][tx] = in[inputRow*pitch + inputCol]; //linearized memory access, I consider the pitch instead of the w, because there are padded elements
        else
            tile[ty][tx] = 0.0;

        // Wait until all tile elements are loaded
        __syncthreads();

        //where to write the modified pixel
        int outputPixel = 0;

        //get the neighbour in the mask
        for(int i = 0; i < mask_width; ++i){
            int currRow = inputRow + i;
            for(int j = 0; j < mask_width; ++j){ //(Mask_Width^2) access for each thread in block -> for each block (Mask_Width^2) * (Block_width^2)
                int currCol = inputCol + j;

                // Verify we have a valid image pixel
                if(currRow > 1 && currRow < h && currCol > 1 && currCol < w) {
                    outputPixel += tile[i + ty][j + tx] * mask[mask_width + j];
                }
            }
        }
        out[(row * w) + col] = (unsigned char)(outputPixel); //back using width instead of pitch
    }
}

void launch_tests(int mask_size){
    int block_size;
    for(block_size= 4; block_size <= 32; block_size *= 2) {
        unsigned char *a, *b, *c;
        const int mask_width = 5;
        cudaMallocManaged((void **) &a, sizeof(unsigned char)*MATRIX_SIZE*MATRIX_SIZE);
        cudaMallocManaged((void **) &b, sizeof(unsigned char)*mask_width*mask_width);
        cudaMallocManaged((void **) &c, sizeof(unsigned char)*MATRIX_SIZE*MATRIX_SIZE);

        srand(time(0));
        //initialize matrix A
        for (int i = 0; i < MATRIX_SIZE; ++i) {
            for (int j = 0; j < MATRIX_SIZE; ++j) {
                a[i * MATRIX_SIZE + j] = 1 + (rand() % 10); //random values between 1-10
            }
        }

        srand(time(0));
        // initialize matrix B
        for (int i = 0; i < mask_width; ++i) {
            for (int j = 0; j < mask_width; ++j) {
                b[i * mask_width + j] =  1 + (rand() % 5); //random values between 1-5
            }
        }

        float  naive_gpu_elapsed_time_ms;

        // some events to count the execution time
        //clock_t st, end;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        
        unsigned int grid_rows = (MATRIX_SIZE + block_size - 1) / block_size;
        unsigned int grid_cols = (MATRIX_SIZE + block_size - 1) / block_size;
        dim3 dimGrid(grid_cols, grid_rows);
        dim3 dimBlock(block_size, block_size);
        //convolution_2D_tiled( unsigned char * in, const unsigned char * __restrict__ mask, unsigned char * out, int mask_width , int w, int h, int pitch) {
        
        cudaEventRecord(start, 0);
        convolution_2D_tiled<<<dimGrid, dimBlock>>>(a, b, c, mask_width, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE);
        cudaThreadSynchronize();

        //time counting terminate
       
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);

        //compute time elapsed on GPU computing
        cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
        printf("Time elapsed on naive GPU convolution 2d tiled ( %d ) block %f ms.\n\n", block_size, naive_gpu_elapsed_time_ms);
        
      
        //free memory
        cudaFree(a);
        cudaFree(b);
        cudaFree(c);
    }
}

int main(int argc, char const *argv[])
{
    

    int nDevices;
    cudaGetDeviceCount(&nDevices);
    for (int i = 0; i < nDevices; i++) {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, i);
      printf("Device Number: %d\n", i);
      printf("  Device name: %s\n", prop.name);
      printf("  max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
      printf("  max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
      printf("  max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
      printf("  num SM: %d\n", prop.multiProcessorCount);
      printf("  num bytes sharedMem Per Block: %d\n", prop.sharedMemPerBlock);
      printf("  num bytes sharedMem Per Multiprocessor: %d\n", prop.sharedMemPerMultiprocessor);
      printf("  Memory Clock Rate (KHz): %d\n",
           prop.memoryClockRate);
      printf("  Memory Bus Width (bits): %d\n",
           prop.memoryBusWidth);
      printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
           2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
 
    for(int ms = 3; ms <= 9; ms+=2){
        printf("Testing with mask size = %d\n\n", ms);
        launch_tests(ms);
        printf("________________________________________________________________________\n\n");
    }

    
 
    return 0;
}

Device Number: 0
  Device name: Tesla T4
  max Blocks Per MultiProcessor: 16
  max Threads Per MultiProcessor: 1024
  max Threads Per Block: 1024
  num SM: 40
  num bytes sharedMem Per Block: 49152
  num bytes sharedMem Per Multiprocessor: 65536
  Memory Clock Rate (KHz): 5001000
  Memory Bus Width (bits): 256
  Peak Memory Bandwidth (GB/s): 320.064000

Testing with mask size = 3

Time elapsed on naive GPU convolution 2d tiled ( 4 ) block 1.748960 ms.

Time elapsed on naive GPU convolution 2d tiled ( 8 ) block 1.255424 ms.

Time elapsed on naive GPU convolution 2d tiled ( 16 ) block 1.126400 ms.

Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 1.202240 ms.

________________________________________________________________________

Testing with mask size = 5

Time elapsed on naive GPU convolution 2d tiled ( 4 ) block 1.413824 ms.

Time elapsed on naive GPU convolution 2d tiled ( 8 ) block 1.178496 ms.

Time elapsed on naive GPU convolution 2d tiled ( 16 ) block 1.298848 ms.

