<a href="https://colab.research.google.com/github/giuseppeegentile/2d-convolution-cuda/blob/main/CUDA_2d_convolution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!nvidia-smi

Fri Oct 28 08:21:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [3]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-8n_diho3
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-8n_diho3
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=261f808586e59c064856261c633888916249313d8bec45af65ebd37f65768167
  Stored in directory: /tmp/pip-ephem-wheel-cache-m2jb0gc_/wheels/ca/33/8d/3c86eb85e97d2b6169d95c6e8f2c297fdec60db6e84cb56f5e
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [7]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MATRIX_SIZE 1024
#define BLOCK_WIDTH 32


/**
 * Naive (untiled) 2D convolution of an 8-bit, row-major image with a square mask.
 *
 * Launch layout: a 2D grid of 2D blocks in which each thread produces one
 * output pixel; the grid must cover at least w x h threads. Threads that fall
 * outside the image return without doing any work. No shared memory is used.
 *
 * @param in          input image, w * h elements, row-major
 * @param mask        convolution mask, mask_width * mask_width elements, row-major
 * @param out         output image, w * h elements, row-major
 * @param mask_width  side length of the square mask
 * @param w           image width in pixels
 * @param h           image height in pixels
 *
 * Input samples that would fall outside the image are skipped, i.e. they
 * contribute zero. The accumulated sum is truncated to unsigned char on store,
 * so sums above 255 wrap.
 *
 * Precondition: w, h and mask_width fit in an int (indices are computed in
 * signed arithmetic so that the negative halo coordinates are well defined).
 */
__global__ void convolution_2D_untiled(unsigned char * in, const unsigned char * mask, unsigned char * out, size_t mask_width, size_t w, size_t h) {
    // Convert once to signed values: mixing int and size_t in "row - mask_width/2"
    // would silently go through unsigned wrap-around.
    const int width  = (int)w;
    const int height = (int)h;
    const int maskW  = (int)mask_width;
    const int radius = maskW / 2;

    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads of the last blocks may lie outside the image.
    if (col >= width || row >= height)
        return;

    int outputPixel = 0;

    for (int i = 0; i < maskW; ++i) {
        // Input row under mask row i (the mask is centred on (row, col)).
        const int currRow = row - radius + i;
        if (currRow < 0 || currRow >= height)
            continue;

        for (int j = 0; j < maskW; ++j) {
            const int currCol = col - radius + j;
            if (currCol < 0 || currCol >= width)
                continue;

            // The mask is row-major: element (i, j) lives at i * maskW + j.
            outputPixel += in[(size_t)currRow * w + currCol] * mask[i * maskW + j];
        }
    }

    out[(size_t)row * w + col] = (unsigned char)(outputPixel);
}



// Aborts with a readable message when a CUDA runtime call fails.
// Guarded so that an identical definition elsewhere does not clash.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

/**
 * Runs and times one untiled convolution of a MATRIX_SIZE x MATRIX_SIZE image
 * with a mask_width x mask_width mask, then prints the elapsed GPU time.
 *
 * All buffers are allocated with cudaMallocManaged, filled on the host with
 * deterministic patterns and released before returning. Any CUDA failure
 * terminates the program through CUDA_CHECK.
 *
 * @param mask_width  side length of the square mask; must be positive
 */
void launch_tests(int mask_width){
    if (mask_width <= 0) {
        fprintf(stderr, "launch_tests: mask_width must be positive (got %d)\n", mask_width);
        return;
    }

    const size_t image_bytes = sizeof(unsigned char) * MATRIX_SIZE * MATRIX_SIZE;
    const size_t mask_bytes  = sizeof(unsigned char) * mask_width * mask_width;

    unsigned char *a = NULL, *b = NULL, *c = NULL;
    CUDA_CHECK(cudaMallocManaged((void **) &a, image_bytes));
    CUDA_CHECK(cudaMallocManaged((void **) &b, mask_bytes));
    CUDA_CHECK(cudaMallocManaged((void **) &c, image_bytes));

    // initialize matrix A (values are deliberately truncated to 8 bits)
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        for (int j = 0; j < MATRIX_SIZE; ++j) {
            a[i * MATRIX_SIZE + j] = (unsigned char)(1 + ((i+j) / 2) + (3*j % 20));
        }
    }

    // initialize matrix B (the mask)
    for (int i = 0; i < mask_width; ++i) {
        for (int j = 0; j < mask_width; ++j) {
            b[i * mask_width + j] = (unsigned char)(1 + (((2*i) + j) % mask_width));
        }
    }

    float naive_gpu_elapsed_time_ms = 0.0f;

    // events used to measure the kernel execution time
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // one thread per output pixel; ceil-div so the grid covers the whole image
    unsigned int grid_rows = (MATRIX_SIZE + BLOCK_WIDTH - 1) / BLOCK_WIDTH;
    unsigned int grid_cols = (MATRIX_SIZE + BLOCK_WIDTH - 1) / BLOCK_WIDTH;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH);

    CUDA_CHECK(cudaEventRecord(start, 0));
    convolution_2D_untiled<<<dimGrid, dimBlock>>>(a, b, c, mask_width, MATRIX_SIZE, MATRIX_SIZE);
    // launch-configuration errors are only reported through cudaGetLastError
    CUDA_CHECK(cudaGetLastError());

    // The stop event is enqueued right behind the kernel so that no host-side
    // synchronisation latency is included in the measurement; waiting on it
    // also guarantees the kernel has finished and surfaces execution errors.
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // compute time elapsed on GPU computing
    CUDA_CHECK(cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop));
    printf("Time elapsed on naive GPU convolution 2d untiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);

    // release the events (previously leaked on every call) and the buffers
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));
}

/**
 * Entry point: prints a summary of every visible CUDA device, then runs the
 * convolution benchmark for the odd mask sizes 3, 5, 7 and 9.
 *
 * @return 0 on success, 1 if the device count cannot be queried.
 */
int main(int argc, char const *argv[]) {
    (void)argc;   // command-line arguments are not used
    (void)argv;

    int nDevices = 0;   // stays 0 if the query below fails
    cudaError_t status = cudaGetDeviceCount(&nDevices);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for (int i = 0; i < nDevices; i++) {
      cudaDeviceProp prop;
      status = cudaGetDeviceProperties(&prop, i);
      if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString(status));
        continue;
      }
      printf("Device Number: %d\n", i);
      printf("  Device name: %s\n", prop.name);
      printf("  max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
      printf("  max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
      printf("  max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
      printf("  num SM: %d\n", prop.multiProcessorCount);
      // both shared-memory sizes are size_t, so they need %zu rather than %d
      printf("  num bytes sharedMem Per Block: %zu\n", prop.sharedMemPerBlock);
      printf("  num bytes sharedMem Per Multiprocessor: %zu\n", prop.sharedMemPerMultiprocessor);
      printf("  Memory Clock Rate (KHz): %d\n",
           prop.memoryClockRate);
      printf("  Memory Bus Width (bits): %d\n",
           prop.memoryBusWidth);
      // factor 2.0: two transfers per clock; bus width converted from bits to bytes
      printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
           2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }

    for(int ms = 3; ms <= 9; ms+=2){
        printf("Testing with mask size = %d\n\n", ms);
        launch_tests(ms);
        printf("________________________________________________________________________\n\n");
    }
    return 0;
}





In [8]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MATRIX_SIZE 1024
#define BLOCK_WIDTH 32

/**
 * Tiled 2D convolution of an 8-bit, row-major image with a square mask.
 *
 * Launch layout: 2D blocks of exactly BLOCK_WIDTH x BLOCK_WIDTH threads. Each
 * block stages a BLOCK_WIDTH x BLOCK_WIDTH patch of the input (output tile
 * plus halo) in shared memory and produces a tile_width x tile_width patch of
 * the output, where tile_width = BLOCK_WIDTH - mask_width + 1. The grid must
 * therefore contain ceil(w / tile_width) x ceil(h / tile_width) blocks.
 * Static shared memory: BLOCK_WIDTH * BLOCK_WIDTH bytes per block.
 *
 * @param in          input image, w * h elements, row-major
 * @param mask        convolution mask, mask_width * mask_width elements, row-major
 * @param out         output image, w * h elements, row-major
 * @param mask_width  side length of the square mask (1 <= mask_width <= BLOCK_WIDTH)
 * @param w           image width in pixels
 * @param h           image height in pixels
 *
 * Input samples outside the image are treated as zero. The accumulated sum is
 * truncated to unsigned char on store, so sums above 255 wrap.
 */
__global__ void convolution_2D_tiled(unsigned char * in, const unsigned char * __restrict__ mask, unsigned char * out, int mask_width , int w, int h) {
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    const int tile_width = BLOCK_WIDTH - mask_width + 1; //since BLOCK_WIDTH = TILE_WIDTH + MASK_WIDTH - 1
    const int radius = mask_width / 2;

    // Output pixel this thread is responsible for (only if it is an inner thread).
    const int col = blockIdx.x * tile_width + tx;
    const int row = blockIdx.y * tile_width + ty;

    // Input pixel this thread stages: the patch starts at the top-left corner
    // of the mask footprint of the block's first output pixel.
    const int inputRow = row - radius;
    const int inputCol = col - radius;

    __shared__ unsigned char tile[BLOCK_WIDTH][BLOCK_WIDTH];

    // Every thread stages one element, including threads whose own output
    // pixel is outside the image: their input pixel can still be a halo value
    // needed by a neighbour. Out-of-image inputs are zero-padded.
    unsigned char staged = 0;
    if (inputRow >= 0 && inputRow < h && inputCol >= 0 && inputCol < w)
        staged = in[(size_t)inputRow * w + inputCol];
    tile[ty][tx] = staged;

    // The barrier must be reached by all threads of the block, so it is kept
    // outside of any thread-dependent branch.
    __syncthreads();

    // Only the inner tile_width x tile_width threads own an output pixel; the
    // remaining threads exist solely to load the halo and must not write,
    // otherwise they would overwrite pixels that belong to neighbouring blocks.
    if (ty < tile_width && tx < tile_width && row < h && col < w) {
        int outputPixel = 0;

        for (int i = 0; i < mask_width; ++i) {
            for (int j = 0; j < mask_width; ++j) {
                outputPixel += tile[i + ty][j + tx] * mask[i * mask_width + j];
            }
        }

        out[(size_t)row * w + col] = (unsigned char)(outputPixel);
    }
}

// Aborts with a readable message when a CUDA runtime call fails.
// Guarded so that an identical definition elsewhere does not clash.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

/**
 * Runs and times one tiled convolution of a MATRIX_SIZE x MATRIX_SIZE image
 * with a mask_width x mask_width mask, then prints the elapsed GPU time.
 *
 * All buffers are allocated with cudaMallocManaged, filled on the host with
 * deterministic patterns and released before returning. Any CUDA failure
 * terminates the program through CUDA_CHECK.
 *
 * @param mask_width  side length of the square mask; must satisfy
 *                    1 <= mask_width <= BLOCK_WIDTH so that each block still
 *                    produces at least one output pixel
 */
void launch_tests(int mask_width){
    // Each block yields a tile_width x tile_width patch of the output; the
    // other threads of the block only load the halo.
    const int tile_width = BLOCK_WIDTH - mask_width + 1;
    if (mask_width <= 0 || tile_width <= 0) {
        fprintf(stderr, "launch_tests: mask_width must be in [1, %d] (got %d)\n", BLOCK_WIDTH, mask_width);
        return;
    }

    const size_t image_bytes = sizeof(unsigned char) * MATRIX_SIZE * MATRIX_SIZE;
    const size_t mask_bytes  = sizeof(unsigned char) * mask_width * mask_width;

    unsigned char *a = NULL, *b = NULL, *c = NULL;
    CUDA_CHECK(cudaMallocManaged((void **) &a, image_bytes));
    CUDA_CHECK(cudaMallocManaged((void **) &b, mask_bytes));
    CUDA_CHECK(cudaMallocManaged((void **) &c, image_bytes));

    // initialize matrix A (values are deliberately truncated to 8 bits)
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        for (int j = 0; j < MATRIX_SIZE; ++j) {
            a[i * MATRIX_SIZE + j] = (unsigned char)(1 + ((i+j) / 2) + (3*j % 20));
        }
    }

    // initialize matrix B (the mask)
    for (int i = 0; i < mask_width; ++i) {
        for (int j = 0; j < mask_width; ++j) {
            b[i * mask_width + j] = (unsigned char)(1 + (((2*i) + j) % mask_width));
        }
    }

    float naive_gpu_elapsed_time_ms = 0.0f;

    // events used to measure the kernel execution time
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Ceil-div: MATRIX_SIZE is generally not a multiple of tile_width
    // (e.g. 1024 / 30), and rounding down would leave the last rows and
    // columns of the output uncomputed.
    unsigned int grid_size = (MATRIX_SIZE + tile_width - 1) / tile_width;
    dim3 dimGrid(grid_size, grid_size);
    dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH);

    CUDA_CHECK(cudaEventRecord(start, 0));
    convolution_2D_tiled<<<dimGrid, dimBlock>>>(a, b, c, mask_width, MATRIX_SIZE, MATRIX_SIZE);
    // launch-configuration errors are only reported through cudaGetLastError
    CUDA_CHECK(cudaGetLastError());

    // The stop event is enqueued right behind the kernel so that no host-side
    // synchronisation latency is included in the measurement; waiting on it
    // also guarantees the kernel has finished and surfaces execution errors.
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // compute time elapsed on GPU computing
    CUDA_CHECK(cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop));
    printf("Time elapsed on naive GPU convolution 2d tiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);

    // release the events (previously leaked on every call) and the buffers
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));
}

/**
 * Entry point: prints a summary of every visible CUDA device, then runs the
 * convolution benchmark for the odd mask sizes 3, 5, 7 and 9.
 *
 * @return 0 on success, 1 if the device count cannot be queried.
 */
int main(int argc, char const *argv[]) {
    (void)argc;   // command-line arguments are not used
    (void)argv;

    int nDevices = 0;   // stays 0 if the query below fails
    cudaError_t status = cudaGetDeviceCount(&nDevices);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for (int i = 0; i < nDevices; i++) {
      cudaDeviceProp prop;
      status = cudaGetDeviceProperties(&prop, i);
      if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString(status));
        continue;
      }
      printf("Device Number: %d\n", i);
      printf("  Device name: %s\n", prop.name);
      printf("  max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
      printf("  max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
      printf("  max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
      printf("  num SM: %d\n", prop.multiProcessorCount);
      // both shared-memory sizes are size_t, so they need %zu rather than %d
      printf("  num bytes sharedMem Per Block: %zu\n", prop.sharedMemPerBlock);
      printf("  num bytes sharedMem Per Multiprocessor: %zu\n", prop.sharedMemPerMultiprocessor);
      printf("  Memory Clock Rate (KHz): %d\n",
           prop.memoryClockRate);
      printf("  Memory Bus Width (bits): %d\n",
           prop.memoryBusWidth);
      // factor 2.0: two transfers per clock; bus width converted from bits to bytes
      printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
           2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }

    for(int ms = 3; ms <= 9; ms+=2){
        printf("Testing with mask size = %d\n\n", ms);
        launch_tests(ms);
        printf("________________________________________________________________________\n\n");
    }
    return 0;
}


