<a href="https://colab.research.google.com/github/giuseppeegentile/2d-convolution-cuda/blob/main/Working_2d_conv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-1s9l2o4x
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-1s9l2o4x
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=7e424705fee0681728504eb4c4023c5bb91a993502a6ffb5a7c90421b7a5ed54
  Stored in directory: /tmp/pip-ephem-wheel-cache-44wytniy/wheels/ca/33/8d/3c86eb85e97d2b6169d95c6e8f2c297fdec60db6e84cb56f5e
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [14]:
%%cu
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define BLOCK_WIDTH 32

// kernel
__global__ void tiledConvolution_2D_Kernel(float* d_m, const float* __restrict__ d_mask, float* d_n, size_t a, size_t b, size_t maskWidth)
{
    float result = 0;
   
    // indexing variables
    int n_row = blockIdx.y * blockDim.y + threadIdx.y;
    int n_col = blockIdx.x * blockDim.x + threadIdx.x;
    
    int m_row = n_row - maskWidth / 2;
    int m_col = n_col - maskWidth / 2;
  
    
    // thread boundary check for calculation
    if( n_row < a && n_col < b)
    {
        for(int i = 0; i < maskWidth; ++i)
        {
            for(int j = 0; j < maskWidth; ++j)
            {
                int curCol = m_col + j;
                int curRow = m_row + i;
                if (curCol < a && curRow < b && curCol > -1 && curRow > -1) 
                  result += d_m[curRow * b + curCol] * d_mask[i * maskWidth + j];
            }
        }
        
        // write result 
        d_n[n_row * b + n_col] = result;
    }
}

// CUDA error checking
void errorCheck(unsigned int line)
{
    cudaError_t cudaError = cudaGetLastError();

    if(cudaError != cudaSuccess)
    {
        printf("CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(cudaError));
        exit(EXIT_FAILURE);
    }
}

// host function containing kernel call
void convolution_2D(float* m, float* mask, float* n, size_t a, size_t b, size_t maskWidth)
{
    

    float  naive_gpu_elapsed_time_ms;

    // some events to count the execution time
    //clock_t st, end;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    dim3 numOfBlocks(ceil(b + BLOCK_WIDTH - 1/BLOCK_WIDTH), ceil(a + BLOCK_WIDTH - 1/  BLOCK_WIDTH), 1);
    dim3 numOfThreads(BLOCK_WIDTH, BLOCK_WIDTH, 1);

    size_t bytes_m = a * b * sizeof(float);
    size_t bytes_mask = maskWidth * maskWidth * sizeof(float);

    float* d_m;
    float* d_mask;
    float* d_n;

    cudaMalloc((void**) &d_m, bytes_m);
    errorCheck(__LINE__);
    cudaMalloc((void**) &d_mask, bytes_mask);
    errorCheck(__LINE__);
    cudaMalloc((void**) &d_n, bytes_m);
    errorCheck(__LINE__);
    
    cudaMemcpy(d_m, m, bytes_m, cudaMemcpyHostToDevice);
    errorCheck(__LINE__);
    cudaMemcpy(d_mask, mask, bytes_mask, cudaMemcpyHostToDevice);
    errorCheck(__LINE__);
    cudaEventRecord(start, 0);
    tiledConvolution_2D_Kernel<<<numOfBlocks, numOfThreads>>>(d_m, d_mask, d_n, a, b, maskWidth);
    errorCheck(__LINE__);
    

    cudaThreadSynchronize();

    //time counting terminate
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
    printf("Time elapsed on naive GPU convolution 2d untiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);
    cudaMemcpy(n, d_n, bytes_m, cudaMemcpyDeviceToHost);
    errorCheck(__LINE__);

    cudaFree(d_m);
    errorCheck(__LINE__);
    cudaFree(d_mask);
    errorCheck(__LINE__);
    cudaFree(d_n);
    errorCheck(__LINE__);
}

int main()
{
    struct timespec start, end;
    
    srand(time(NULL));
    
    size_t a = 1024;
    size_t b = 1024;
    size_t maskWidth = 11;
    

    float* m = (float*) malloc(a * b * sizeof(float));
    float* mask = (float*) malloc(maskWidth * maskWidth * sizeof(float));
    float* n = (float*) malloc(a * b * sizeof(float));

    for(int i = 0; i < a * b; ++i)
    {
        m[i] = (float)(1 +(3 * i % 20));
    }
  
    for(int j = 0; j < maskWidth * maskWidth; ++j)
    {
        mask[j] = (float)(1 + (((2 * j)) % maskWidth));
    }

    // do convolution
    convolution_2D(m, mask, n, a, b, maskWidth);


    return 0;
}

Time elapsed on naive GPU convolution 2d tiled ( 32 ) block 20.066240 ms.




In [25]:
%%cu
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define BLOCK_WIDTH 32
#define MATRIX_SIZE 1024
// kernel
__global__ void tiledConvolution_2D_Kernel(float* inp, const float* __restrict__ mask, float* d_n, size_t maskWidth, int N_TILE_WIDTH)
{
    float result = 0;
   
    // indexing variables
    int n_row = blockIdx.y * N_TILE_WIDTH + threadIdx.y;
    int n_col = blockIdx.x * N_TILE_WIDTH + threadIdx.x;
    
    int m_row = n_row - maskWidth / 2;
    int m_col = n_col - maskWidth / 2;
    
    __shared__ float tile_m[BLOCK_WIDTH][BLOCK_WIDTH];
    
    // thread boundary check for loading input tiles
    if(m_row >= 0 && m_row < MATRIX_SIZE && m_col >= 0 && m_col < MATRIX_SIZE)
    {
        tile_m[threadIdx.y][threadIdx.x] = inp[m_row * MATRIX_SIZE + m_col];
    }
    else
    {
        tile_m[threadIdx.y][threadIdx.x] = 0;
    }
    
    __syncthreads();
    
    // thread boundary check for calculation
    if(threadIdx.y < N_TILE_WIDTH && threadIdx.x < N_TILE_WIDTH && n_row < MATRIX_SIZE && n_col < MATRIX_SIZE)
    {
        for(int i = 0; i < maskWidth; ++i)
        {
            for(int j = 0; j < maskWidth; ++j)
            {
                result += mask[i * maskWidth + j] * tile_m[threadIdx.y + i][threadIdx.x + j];
            }
        }
        
        // write result 
        d_n[n_row * MATRIX_SIZE + n_col] = result;
    }
}

// CUDA error checking
void errorCheck(unsigned int line)
{
    cudaError_t cudaError = cudaGetLastError();

    if(cudaError != cudaSuccess)
    {
        printf("CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(cudaError));
        exit(EXIT_FAILURE);
    }
}

// host function containing kernel call
void convolution_2D(float* m, float* mask, float* n, size_t maskWidth, int N_TILE_WIDTH)
{
    
     float  naive_gpu_elapsed_time_ms;

    // some events to count the execution time
    //clock_t st, end;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    dim3 numOfBlocks(ceil(MATRIX_SIZE / (float) N_TILE_WIDTH), ceil(MATRIX_SIZE / (float) N_TILE_WIDTH), 1);
    dim3 numOfThreads(BLOCK_WIDTH, BLOCK_WIDTH, 1);

    float* d_m;
    float* d_mask;
    float* d_n;

    cudaMalloc((void**) &d_m, MATRIX_SIZE * MATRIX_SIZE * sizeof(float));
    errorCheck(__LINE__);
    cudaMalloc((void**) &d_mask, maskWidth * maskWidth * sizeof(float));
    errorCheck(__LINE__);
    cudaMalloc((void**) &d_n, MATRIX_SIZE * MATRIX_SIZE * sizeof(float));
    errorCheck(__LINE__);
    
    cudaMemcpy(d_m, m, maskWidth * maskWidth * sizeof(float), cudaMemcpyHostToDevice);
    errorCheck(__LINE__);
    cudaMemcpy(d_mask, mask, maskWidth * maskWidth * sizeof(float), cudaMemcpyHostToDevice);
    errorCheck(__LINE__);

    cudaEventRecord(start, 0);

    tiledConvolution_2D_Kernel<<<numOfBlocks, numOfThreads>>>(d_m, d_mask, d_n, maskWidth,  N_TILE_WIDTH);
    errorCheck(__LINE__);
 
     cudaThreadSynchronize();

    //time counting terminate
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
    printf("Mask size: %d\n", maskWidth);
    printf("Time elapsed on naive GPU convolution 2d untiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);
    printf("______________________________________________________________________________\n");
    cudaMemcpy(n, d_n, maskWidth * maskWidth * sizeof(float), cudaMemcpyDeviceToHost);
    errorCheck(__LINE__);

    cudaFree(d_m);
    errorCheck(__LINE__);
    cudaFree(d_mask);
    errorCheck(__LINE__);
    cudaFree(d_n);
    errorCheck(__LINE__);
}

int main()
{
    struct timespec start, end;
    
    for(size_t maskWidth = 11; maskWidth <= 17; maskWidth+=2){
      int N_TILE_WIDTH = BLOCK_WIDTH - (maskWidth - 1);
      float* inp = (float*) malloc(MATRIX_SIZE * MATRIX_SIZE * sizeof(float));
      float* mask = (float*) malloc(maskWidth * maskWidth * sizeof(float));
      float* out = (float*) malloc(MATRIX_SIZE * MATRIX_SIZE * sizeof(float));

      for(int i = 0; i < MATRIX_SIZE * MATRIX_SIZE; ++i) {
          inp[i] = (float)(1 +(3 * i % 20));
      }
    
      for(int j = 0; j < maskWidth * maskWidth; ++j) {
          mask[j] = (float)(1 + (((2 * j)) % maskWidth));
      }
      
      // do convolution
      convolution_2D(inp, mask, out, maskWidth, N_TILE_WIDTH);
    }


    return 0;
}

Mask size: 11
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 1.241056 ms.

______________________________________________________________________________
Mask size: 13
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 1.841952 ms.

______________________________________________________________________________
Mask size: 15
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 2.666240 ms.

______________________________________________________________________________
Mask size: 17
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 3.776864 ms.

______________________________________________________________________________

