<a href="https://colab.research.google.com/github/giuseppeegentile/2d-convolution-cuda/blob/main/Working_2d_conv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-u_h1lo3m
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-u_h1lo3m
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=ed688ccaa9467464b4bc73757d1a5ed79c3a281baad47f895c828d73901031e2
  Stored in directory: /tmp/pip-ephem-wheel-cache-4yzurwit/wheels/ca/33/8d/3c86eb85e97d2b6169d95c6e8f2c297fdec60db6e84cb56f5e
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [25]:
%%cu
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define BLOCK_WIDTH 32
#define MATRIX_SIZE 1024

// kernel
__global__ void tiledConvolution_2D_Kernel(float* inp, const float* d_mask, float* out, size_t maskWidth)
{
    float result = 0;
   
    // indexing variables
    int outRow = blockIdx.y * blockDim.y + threadIdx.y;
    int outCol = blockIdx.x * blockDim.x + threadIdx.x;
    
    int inputRow = outRow - maskWidth / 2;
    int inputCol = outCol - maskWidth / 2;
  
    
    // thread boundary check for calculation
    if(outRow < MATRIX_SIZE && outCol < MATRIX_SIZE)
    {
        for(int i = 0; i < maskWidth; ++i)
        {
            for(int j = 0; j < maskWidth; ++j)
            {
                int curCol = inputCol + j;
                int curRow = inputRow + i;
                if (curCol < MATRIX_SIZE && curRow < MATRIX_SIZE && curCol > -1 && curRow > -1) 
                  result += inp[curRow * MATRIX_SIZE + curCol] * d_mask[i * maskWidth + j];
            }
        }
        
        // write result 
        out[outRow * MATRIX_SIZE + outCol] = result;
    }
}

// CUDA error checking
void errorCheck(unsigned int line)
{
    cudaError_t cudaError = cudaGetLastError();

    if(cudaError != cudaSuccess)
    {
        printf("CUDA error in line %u in file %s: %s\n", line - 1, __FILE__, cudaGetErrorString(cudaError));
        exit(EXIT_FAILURE);
    }
}

// host function containing kernel call
void convolution_2D(float* m, float* mask, float* n, size_t maskWidth) {
    float  naive_gpu_elapsed_time_ms;
    //clock_t st, end;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    dim3 dimGrid(ceil(MATRIX_SIZE + BLOCK_WIDTH - 1/BLOCK_WIDTH), ceil(MATRIX_SIZE + BLOCK_WIDTH - 1/  BLOCK_WIDTH), 1);
    dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH, 1);

    float* d_m;
    float* d_mask;
    float* d_n;

    cudaMalloc((void**) &d_m, MATRIX_SIZE * MATRIX_SIZE * sizeof(float));
    cudaMalloc((void**) &d_mask, maskWidth * maskWidth * sizeof(float));
    cudaMalloc((void**) &d_n, MATRIX_SIZE * MATRIX_SIZE * sizeof(float));
    
    cudaMemcpy(d_m, m, MATRIX_SIZE * MATRIX_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_mask, mask, maskWidth * maskWidth * sizeof(float), cudaMemcpyHostToDevice);
    cudaEventRecord(start, 0);
    tiledConvolution_2D_Kernel<<<dimGrid, dimBlock>>>(d_m, d_mask, d_n, maskWidth);
    

    cudaThreadSynchronize();

    //time counting terminate
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
    printf("Mask size: %d\n", maskWidth);
    printf("Time elapsed on naive GPU convolution 2d untiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);
    printf("______________________________________________________________________________\n");
    cudaMemcpy(n, d_n, MATRIX_SIZE * MATRIX_SIZE * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_m);
    cudaFree(d_mask);
    cudaFree(d_n);
}

int main()
{
     for(size_t maskWidth = 11; maskWidth <= 17; maskWidth+=2){
      float* inp = (float*) malloc(MATRIX_SIZE * MATRIX_SIZE * sizeof(float));
      float* mask = (float*) malloc(maskWidth * maskWidth * sizeof(float));
      float* out = (float*) malloc(MATRIX_SIZE * MATRIX_SIZE * sizeof(float));

      for(int i = 0; i < MATRIX_SIZE * MATRIX_SIZE; ++i) {
          inp[i] = (float)(1 +(3 * i % 20));
      }
    
      for(int j = 0; j < maskWidth * maskWidth; ++j) {
          mask[j] = (float)(1 + (((2 * j)) % maskWidth));
      }
      
      // do convolution
      convolution_2D(inp, mask, out, maskWidth);
    }


    return 0;
}

Mask size: 11
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 18.565121 ms.

______________________________________________________________________________
Mask size: 13
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 18.814432 ms.

______________________________________________________________________________
Mask size: 15
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 19.213600 ms.

______________________________________________________________________________
Mask size: 17
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 19.587105 ms.

______________________________________________________________________________



In [26]:
%%cu
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define BLOCK_WIDTH 32
#define MATRIX_SIZE 1024
// kernel
__global__ void tiledConvolution_2D_Kernel(float* inp, const float* __restrict__ mask, float* d_n, size_t maskWidth, int N_TILE_WIDTH)
{
    float result = 0;
   
    int outputRow = blockIdx.y * N_TILE_WIDTH + threadIdx.y;
    int outputCol = blockIdx.x * N_TILE_WIDTH + threadIdx.x;
    
    int inputRow = outputRow - maskWidth / 2;
    int inputCol = outputCol - maskWidth / 2;
    
    __shared__ float tile[BLOCK_WIDTH][BLOCK_WIDTH];
    
    // thread boundary check for loading input tiles
    if(inputRow >= 0 && inputRow < MATRIX_SIZE && inputCol >= 0 && inputCol < MATRIX_SIZE){
        tile[threadIdx.y][threadIdx.x] = inp[inputRow * MATRIX_SIZE + inputCol];
    }
    else {
        tile[threadIdx.y][threadIdx.x] = 0;
    }
    
    __syncthreads();
    
    // thread boundary check for calculation
    if(threadIdx.y < N_TILE_WIDTH && threadIdx.x < N_TILE_WIDTH && outputRow < MATRIX_SIZE && outputCol < MATRIX_SIZE){
        for(int i = 0; i < maskWidth; ++i){
            for(int j = 0; j < maskWidth; ++j){
                result += mask[i * maskWidth + j] * tile[threadIdx.y + i][threadIdx.x + j];
            }
        }
        
        // write result 
        d_n[outputRow * MATRIX_SIZE + outputCol] = result;
    }
}

// host function containing kernel call
void convolution_2D(float* m, float* mask, float* n, size_t maskWidth, int N_TILE_WIDTH) {
    
     float  naive_gpu_elapsed_time_ms;

    // some events to count the execution time
    //clock_t st, end;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    dim3 dimGrid(ceil(MATRIX_SIZE / (float) N_TILE_WIDTH), ceil(MATRIX_SIZE / (float) N_TILE_WIDTH), 1);
    dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH, 1);

    float* d_m;
    float* d_mask;
    float* d_n;

    cudaMalloc((void**) &d_m, MATRIX_SIZE * MATRIX_SIZE * sizeof(float));
    
    cudaMalloc((void**) &d_mask, maskWidth * maskWidth * sizeof(float));
    
    cudaMalloc((void**) &d_n, MATRIX_SIZE * MATRIX_SIZE * sizeof(float));
    
    
    cudaMemcpy(d_m, m, MATRIX_SIZE * MATRIX_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    
    cudaMemcpy(d_mask, mask, maskWidth * maskWidth * sizeof(float), cudaMemcpyHostToDevice);
    

    cudaEventRecord(start, 0);

    tiledConvolution_2D_Kernel<<<dimGrid, dimBlock>>>(d_m, d_mask, d_n, maskWidth,  N_TILE_WIDTH);
    
 
     cudaThreadSynchronize();

    //time counting terminate
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
    printf("Mask size: %d\n", maskWidth);
    printf("Time elapsed on naive GPU convolution 2d untiled ( %d ) block %f ms.\n\n", BLOCK_WIDTH, naive_gpu_elapsed_time_ms);
    printf("______________________________________________________________________________\n");
    cudaMemcpy(n, d_n, maskWidth * maskWidth * sizeof(float), cudaMemcpyDeviceToHost);


    cudaFree(d_m);
    
    cudaFree(d_mask);
    
    cudaFree(d_n);
    
}
int main() {
    
    for(size_t maskWidth = 11; maskWidth <= 17; maskWidth+=2){
      int N_TILE_WIDTH = BLOCK_WIDTH - (maskWidth - 1);
      float* inp = (float*) malloc(MATRIX_SIZE * MATRIX_SIZE * sizeof(float));
      float* mask = (float*) malloc(maskWidth * maskWidth * sizeof(float));
      float* out = (float*) malloc(MATRIX_SIZE * MATRIX_SIZE * sizeof(float));

      for(int i = 0; i < MATRIX_SIZE * MATRIX_SIZE; ++i) {
          inp[i] = (float)(1 +(3 * i % 20));
      }
    
      for(int j = 0; j < maskWidth * maskWidth; ++j) {
          mask[j] = (float)(1 + (((2 * j)) % maskWidth));
      }
      
      // do convolution
      convolution_2D(inp, mask, out, maskWidth, N_TILE_WIDTH);
    }


    return 0;
}

Mask size: 11
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 1.236800 ms.

______________________________________________________________________________
Mask size: 13
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 1.863456 ms.

______________________________________________________________________________
Mask size: 15
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 2.660992 ms.

______________________________________________________________________________
Mask size: 17
Time elapsed on naive GPU convolution 2d untiled ( 32 ) block 3.772160 ms.

______________________________________________________________________________

