<a href="https://colab.research.google.com/github/choudry467/CS309CUDA/blob/main/CS309_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up the environment

In [None]:
# Remove any CUDA / NVIDIA packages already present so the pinned toolkit below installs cleanly.
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update

# Download the CUDA 9.2 local repository package, register its key and install the toolkit.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2
# Install the plugin that provides the %%cu cell magic used below.
# Fetched over https: GitHub no longer serves the unauthenticated git:// protocol.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

# Grayscale linear vs parallel

In [7]:
%%cu
#include <stdio.h>
#include <time.h>
#define STB_IMAGE_IMPLEMENTATION
#include "/content/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "/content/stb_image_write.h"



// Converts an interleaved 8-bit image to grayscale, one thread per pixel.
//
// Launch layout: 2D grid of 2D blocks; x maps to image columns, y to image rows.
// Threads that fall outside the width x height image return without touching memory.
//
// Parameters:
//   width, height - image dimensions in pixels
//   p             - input pixels, row-major, `channels` bytes per pixel
//   pg            - output pixels, row-major: 2 bytes per pixel (gray, alpha) when
//                   channels == 4, otherwise 1 byte per pixel (gray)
//   channels      - number of bytes per input pixel
__global__ void Kernel(int width, int height, unsigned char* p, unsigned char* pg, int channels) {

  // Getting Row and Column for each thread
  int i = blockIdx.y * blockDim.y + threadIdx.y;
  int j = blockIdx.x * blockDim.x + threadIdx.x;
  // Ignoring the threads that do not properly map onto the pixels
  if (i >= height || j >= width) return;

  int gray_channel = (channels == 4) ? 2 : 1;

  // Offsets are computed in size_t so that large images cannot overflow int arithmetic
  size_t pixel = (size_t)i * width + j;
  size_t in = pixel * channels;
  size_t out = pixel * gray_channel;

  // With fewer than three channels the first byte already is the intensity;
  // reading +1 / +2 would run into the next pixel or past the end of the buffer.
  if (channels < 3) {
    pg[out] = p[in];
    return;
  }

  unsigned char r = p[in];          //Red Channel read
  unsigned char g = p[in + 1];      //Green channel read
  unsigned char b = p[in + 2];      //Blue channel read

  // Weighted sum of the colour channels; float literals keep the math in single precision
  pg[out] = (uint8_t) (0.21f * r + 0.71f * g + 0.072f * b);

  //Copying the alpha channel
  if (channels == 4)
    pg[out + 1] = p[in + 3];
}

// Aborts the program when a CUDA runtime call reports a failure.
//   retVal - status code returned by a CUDA runtime function
// For any status other than cudaSuccess this prints "ERROR: <description>" to
// stderr and exits with status 1; on success it returns without doing anything.
void check(cudaError_t retVal) {
  if (retVal == cudaSuccess) return;

  const char* description = cudaGetErrorString(retVal);
  fprintf(stderr, "ERROR: %s\n", description);
  exit(1);
}

// Loads /content/Marbles.bmp, converts it to grayscale on the GPU and writes the
// result to /content/Marblesgray.jpg. The printed time spans device allocation,
// both host<->device copies and the kernel.
int main() {

    // Loading the image
    int width, height, channels;
    unsigned char *img = stbi_load("/content/Marbles.bmp", &width, &height, &channels, 0);
    if(img == NULL) {
        printf("Error in loading the image\n");
        exit(1);
    }
    printf("Loaded image with a width of %dpx, a height of %dpx and %d channels\n", width, height, channels);

    // The kernel writes gray + alpha (2 bytes per pixel) for 4-channel input and a
    // single gray byte otherwise, so every output buffer must be sized accordingly.
    int gray_channel = (channels==4)? 2 : 1;
    // size_t arithmetic so that large images cannot overflow int
    size_t img_size = (size_t)width * height * channels;
    size_t img_size_out = (size_t)width * height * gray_channel;

    //Creating memory block for output image
    unsigned char *gray_img = (unsigned char*) malloc(img_size_out);
    if(gray_img == NULL) {
        printf("Error in allocating the output image\n");
        stbi_image_free(img);
        exit(1);
    }

    //allocate timers
    cudaEvent_t start;
    cudaEvent_t stop;
    check(cudaEventCreate(&start));
    check(cudaEventCreate(&stop));

    //start timer
    check(cudaEventRecord(start,0));

    //Cuda input and output memory blocks
    unsigned char *cpi;
    unsigned char *cpo;
    check(cudaMalloc((void**) &cpi, img_size));
    check(cudaMalloc((void**) &cpo, img_size_out));

    check(cudaMemcpy(cpi, img, img_size, cudaMemcpyHostToDevice));

    // One thread per pixel; the grid is rounded up so partial blocks cover the image edges
    dim3 dimBlock(32, 32);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
    Kernel<<<dimGrid,dimBlock>>>(width,height,cpi,cpo,channels);
    check(cudaGetLastError());

    // Blocking copy on the default stream: it runs after the kernel has finished
    check(cudaMemcpy(gray_img, cpo, img_size_out, cudaMemcpyDeviceToHost));

    //stop timer and print time
    check(cudaEventRecord(stop,0));
    check(cudaEventSynchronize(stop));
    float diff;
    check(cudaEventElapsedTime(&diff, start, stop));

    //deallocate timers
    check(cudaEventDestroy(start));
    check(cudaEventDestroy(stop));

    printf("Time: %f ms\n", diff);

    //Writing the image to the file. The last argument is the JPEG quality (100 = highest).
    //The component count must match the layout the kernel produced.
    int status = 0;
    if(!stbi_write_jpg("/content/Marblesgray.jpg", width, height, gray_channel, gray_img, 100)) {
        printf("Error in writing the image\n");
        status = 1;
    }

    //Deallocation of memory
    stbi_image_free(img);
    free(gray_img);
    check(cudaFree(cpi));
    check(cudaFree(cpo));
    return status;
}

Loaded image with a width of 1419px, a height of 1001px and 3 channels
Time: 2.470656 ms



# Gaussian Blur

In [11]:
%%cu
#include <stdio.h>
#define STB_IMAGE_IMPLEMENTATION
#include "/content/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "/content/stb_image_write.h"



// 3x3 weighted blur of the per-pixel channel average, one thread per pixel.
//
// Launch layout: 2D grid of 2D blocks; x maps to image columns, y to image rows.
// Threads outside the width x height image return immediately.
//
// Neighbours that would fall outside the image are replaced by the nearest edge
// pixel, so every pixel (including the border) goes through the same filter and
// the same grayscale conversion.
//
// Parameters:
//   width, height - image dimensions in pixels
//   p             - input pixels, row-major, `channels` bytes per pixel
//   pg            - output, row-major, one gray byte per pixel (alpha is not carried over)
//   channels      - number of bytes per input pixel
__global__ void Kernel(int width, int height, unsigned char* p, unsigned char* pg, int channels) {

  // Getting Row and Column for each thread
  int i = blockIdx.y * blockDim.y + threadIdx.y;
  int j = blockIdx.x * blockDim.x + threadIdx.x;
  // Ignoring the threads that do not properly map onto the pixels
  if (i >= height || j >= width) return;

  // Filter weights; they add up to 15, which is the divisor applied at the end
  const float matrix[3][3] = {{1, 2, 1},
                              {2, 3, 2},
                              {1, 2, 1}};

  float pixel = 0.0f;
  for (int x = -1; x < 2; x++) {
    int row = min(max(i + x, 0), height - 1);        // clamp to the image rows
    for (int y = -1; y < 2; y++) {
      int col = min(max(j + y, 0), width - 1);       // clamp to the image columns
      // size_t so that large images cannot overflow int arithmetic
      size_t in = ((size_t)row * width + col) * channels;

      // With fewer than three channels the first byte already is the intensity;
      // reading +1 / +2 would leave the pixel (and eventually the buffer).
      int gray = (channels >= 3) ? (p[in] + p[in + 1] + p[in + 2]) / 3 : p[in];
      pixel += gray * matrix[x + 1][y + 1];
    }
  }

  pg[(size_t)i * width + j] = (uint8_t) (pixel / 15.0f);
}

// Aborts the program when a CUDA runtime call reports a failure.
//   retVal - status code returned by a CUDA runtime function
// For any status other than cudaSuccess this prints "ERROR: <description>" to
// stderr and exits with status 1; on success it returns without doing anything.
void check(cudaError_t retVal) {
  if (retVal == cudaSuccess) return;

  const char* description = cudaGetErrorString(retVal);
  fprintf(stderr, "ERROR: %s\n", description);
  exit(1);
}

// Loads /content/Marbles.bmp, blurs it on the GPU and writes the single-channel
// result to /content/Marblesblur.jpg. The printed time spans device allocation,
// both host<->device copies and the kernel.
int main() {

    // Loading the image
    int width, height, channels;
    unsigned char *img = stbi_load("/content/Marbles.bmp", &width, &height, &channels, 0);
    if(img == NULL) {
        printf("Error in loading the image\n");
        exit(1);
    }
    printf("Loaded image with a width of %dpx, a height of %dpx and %d channels\n", width, height, channels);

    // size_t arithmetic so that large images cannot overflow int
    size_t img_size = (size_t)width * height * channels;
    size_t img_size_out = (size_t)width * height;      // one gray byte per pixel

    //Creating memory block for output image
    unsigned char *blur_img = (unsigned char*) malloc(img_size_out);
    if(blur_img == NULL) {
        printf("Error in allocating the output image\n");
        stbi_image_free(img);
        exit(1);
    }

    //allocate timers
    cudaEvent_t start;
    cudaEvent_t stop;
    check(cudaEventCreate(&start));
    check(cudaEventCreate(&stop));

    //start timer
    check(cudaEventRecord(start,0));

    //Cuda input and output memory blocks
    unsigned char *cpi;
    unsigned char *cpo;
    check(cudaMalloc((void**) &cpi, img_size));
    // The kernel writes exactly one byte per pixel, so the output only needs img_size_out bytes
    check(cudaMalloc((void**) &cpo, img_size_out));

    check(cudaMemcpy(cpi, img, img_size, cudaMemcpyHostToDevice));

    // One thread per pixel; the grid is rounded up so partial blocks cover the image edges
    dim3 dimBlock(32, 32);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
    Kernel<<<dimGrid,dimBlock>>>(width,height,cpi,cpo,channels);
    check(cudaGetLastError());

    // Blocking copy on the default stream: it runs after the kernel has finished
    check(cudaMemcpy(blur_img, cpo, img_size_out, cudaMemcpyDeviceToHost));

    //stop timer and print time
    check(cudaEventRecord(stop,0));
    check(cudaEventSynchronize(stop));
    float diff;
    check(cudaEventElapsedTime(&diff, start, stop));

    //deallocate timers
    check(cudaEventDestroy(start));
    check(cudaEventDestroy(stop));

    printf("Time: %f ms\n", diff);

    //Writing the image to the file. The last argument is the JPEG quality (100 = highest).
    int status = 0;
    if(!stbi_write_jpg("/content/Marblesblur.jpg", width, height, 1, blur_img, 100)) {
        printf("Error in writing the image\n");
        status = 1;
    }

    //Deallocation of memory
    stbi_image_free(img);
    free(blur_img);
    check(cudaFree(cpi));
    check(cudaFree(cpo));
    return status;
}

Loaded image with a width of 1419px, a height of 1001px and 3 channels
Time: 2.546016 ms



# Edge Detection
## Sobel

In [17]:
%%cu
#include <stdio.h>
#define STB_IMAGE_IMPLEMENTATION
#include "/content/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "/content/stb_image_write.h"



// Sobel edge detector on the per-pixel channel average, one thread per pixel.
//
// Launch layout: 2D grid of 2D blocks; x maps to image columns, y to image rows.
// Threads outside the width x height image return immediately.
//
// Neighbours that would fall outside the image are replaced by the nearest edge
// pixel, so border pixels also receive a gradient magnitude instead of a raw
// grayscale value.
//
// Parameters:
//   width, height - image dimensions in pixels
//   p             - input pixels, row-major, `channels` bytes per pixel
//   pg            - output, row-major, one byte per pixel: |Gx| + |Gy| saturated at 255
//   channels      - number of bytes per input pixel
__global__ void Kernel(int width, int height, unsigned char* p, unsigned char* pg, int channels) {

  // Getting Row and Column for each thread
  int i = blockIdx.y * blockDim.y + threadIdx.y;
  int j = blockIdx.x * blockDim.x + threadIdx.x;
  // Ignoring the threads that do not properly map onto the pixels
  if (i >= height || j >= width) return;

  // First index is the row offset + 1, second index the column offset + 1
  const int gx[3][3] = {{-1, 0, 1},
                        {-2, 0, 2},
                        {-1, 0, 1}};
  const int gy[3][3] = {{ 1,  2,  1},
                        { 0,  0,  0},
                        {-1, -2, -1}};

  // Every term is an integer, so integer accumulators are exact
  int pixelx = 0;
  int pixely = 0;
  for (int x = -1; x < 2; x++) {
    int row = min(max(i + x, 0), height - 1);        // clamp to the image rows
    for (int y = -1; y < 2; y++) {
      int col = min(max(j + y, 0), width - 1);       // clamp to the image columns
      // size_t so that large images cannot overflow int arithmetic
      size_t in = ((size_t)row * width + col) * channels;

      // With fewer than three channels the first byte already is the intensity;
      // reading +1 / +2 would leave the pixel (and eventually the buffer).
      int gray = (channels >= 3) ? (p[in] + p[in + 1] + p[in + 2]) / 3 : p[in];
      pixelx += gray * gx[x + 1][y + 1];
      pixely += gray * gy[x + 1][y + 1];
    }
  }

  // A sum of absolute values cannot be negative, so only the upper bound needs clamping
  int sum = abs(pixelx) + abs(pixely);
  pg[(size_t)i * width + j] = (uint8_t) min(sum, 255);
}

// Aborts the program when a CUDA runtime call reports a failure.
//   retVal - status code returned by a CUDA runtime function
// For any status other than cudaSuccess this prints "ERROR: <description>" to
// stderr and exits with status 1; on success it returns without doing anything.
void check(cudaError_t retVal) {
  if (retVal == cudaSuccess) return;

  const char* description = cudaGetErrorString(retVal);
  fprintf(stderr, "ERROR: %s\n", description);
  exit(1);
}

// Loads /content/Marbles.bmp, runs the edge-detection kernel on the GPU and writes
// the single-channel result to /content/Marblesedge.jpg. The printed time spans
// device allocation, both host<->device copies and the kernel.
int main() {

    // Loading the image
    int width, height, channels;
    unsigned char *img = stbi_load("/content/Marbles.bmp", &width, &height, &channels, 0);
    if(img == NULL) {
        printf("Error in loading the image\n");
        exit(1);
    }
    printf("Loaded image with a width of %dpx, a height of %dpx and %d channels\n", width, height, channels);

    // size_t arithmetic so that large images cannot overflow int
    size_t img_size = (size_t)width * height * channels;
    size_t img_size_out = (size_t)width * height;      // one byte per pixel

    //Creating memory block for output image
    unsigned char *edge_img = (unsigned char*) malloc(img_size_out);
    if(edge_img == NULL) {
        printf("Error in allocating the output image\n");
        stbi_image_free(img);
        exit(1);
    }

    //allocate timers
    cudaEvent_t start;
    cudaEvent_t stop;
    check(cudaEventCreate(&start));
    check(cudaEventCreate(&stop));

    //start timer
    check(cudaEventRecord(start,0));

    //Cuda input and output memory blocks
    unsigned char *cpi;
    unsigned char *cpo;
    check(cudaMalloc((void**) &cpi, img_size));
    // The kernel writes exactly one byte per pixel, so the output only needs img_size_out bytes
    check(cudaMalloc((void**) &cpo, img_size_out));

    check(cudaMemcpy(cpi, img, img_size, cudaMemcpyHostToDevice));

    // One thread per pixel; the grid is rounded up so partial blocks cover the image edges
    dim3 dimBlock(32, 32);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
    Kernel<<<dimGrid,dimBlock>>>(width,height,cpi,cpo,channels);
    check(cudaGetLastError());

    // Blocking copy on the default stream: it runs after the kernel has finished
    check(cudaMemcpy(edge_img, cpo, img_size_out, cudaMemcpyDeviceToHost));

    //stop timer and print time
    check(cudaEventRecord(stop,0));
    check(cudaEventSynchronize(stop));
    float diff;
    check(cudaEventElapsedTime(&diff, start, stop));

    //deallocate timers
    check(cudaEventDestroy(start));
    check(cudaEventDestroy(stop));

    printf("Time: %f ms\n", diff);

    //Writing the image to the file. The last argument is the JPEG quality (100 = highest).
    int status = 0;
    if(!stbi_write_jpg("/content/Marblesedge.jpg", width, height, 1, edge_img, 100)) {
        printf("Error in writing the image\n");
        status = 1;
    }

    //Deallocation of memory
    stbi_image_free(img);
    free(edge_img);
    check(cudaFree(cpi));
    check(cudaFree(cpo));
    return status;
}

Loaded image with a width of 1419px, a height of 1001px and 3 channels
Time: 2.351072 ms



## Shared Version

In [22]:
%%cu
#include <stdio.h>
#define STB_IMAGE_IMPLEMENTATION
#include "/content/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "/content/stb_image_write.h"

#define tile 20



// Sobel edge detector that stages each tile of grayscale values in shared memory.
//
// Launch layout: blocks of (tile+2) x (tile+2) threads. The inner tile x tile
// threads each produce one output pixel; the outer ring of threads only loads the
// one-pixel halo the stencil needs. The grid must cover the image in steps of
// `tile` in both directions. Uses (tile+2)*(tile+2) ints of static shared memory.
//
// Coordinates are clamped to the image when loading, so the halo replicates the
// nearest edge pixel and border pixels receive a gradient magnitude as well.
//
// Parameters:
//   width, height - image dimensions in pixels
//   p             - input pixels, row-major, `channels` bytes per pixel
//   pg            - output, row-major, one byte per pixel: |Gx| + |Gy| saturated at 255
//   channels      - number of bytes per input pixel
__global__ void Kernel(int width, int height, unsigned char* p, unsigned char* pg, int channels) {

  // Indexed [y][x] so that threads adjacent in x touch adjacent shared-memory words
  __shared__ int local[tile+2][tile+2];

  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // Image coordinates of this thread before clamping (-1 for the leading halo)
  const int row = blockIdx.y * tile + ty - 1;
  const int col = blockIdx.x * tile + tx - 1;

  // Clamped coordinates used for loading only
  const int i = min(max(row, 0), height - 1);
  const int j = min(max(col, 0), width - 1);

  // size_t so that large images cannot overflow int arithmetic.
  // With fewer than three channels the first byte already is the intensity.
  const size_t in = ((size_t)i * width + j) * channels;
  local[ty][tx] = (channels >= 3) ? (p[in] + p[in + 1] + p[in + 2]) / 3 : p[in];

  // Every thread of the block must reach this barrier, so nothing returns before it
  __syncthreads();

  // Halo threads are done after loading. Threads of a partial tile whose own pixel
  // lies outside the image must not write either: after clamping they would
  // otherwise store a duplicate into the last row / column.
  const bool inner = (tx > 0) && (tx < tile + 1) && (ty > 0) && (ty < tile + 1);
  if (!inner || row >= height || col >= width) return;

  // First index is the row offset + 1, second index the column offset + 1
  const int gx[3][3] = {{-1, 0, 1},
                        {-2, 0, 2},
                        {-1, 0, 1}};
  const int gy[3][3] = {{ 1,  2,  1},
                        { 0,  0,  0},
                        {-1, -2, -1}};

  // Every term is an integer, so integer accumulators are exact
  int pixelx = 0;
  int pixely = 0;
  for (int dy = -1; dy < 2; dy++) {
    for (int dx = -1; dx < 2; dx++) {
      const int gray = local[ty + dy][tx + dx];
      pixelx += gray * gx[dy + 1][dx + 1];
      pixely += gray * gy[dy + 1][dx + 1];
    }
  }

  // A sum of absolute values cannot be negative, so only the upper bound needs clamping
  const int sum = abs(pixelx) + abs(pixely);
  pg[(size_t)row * width + col] = (uint8_t) min(sum, 255);
}

// Aborts the program when a CUDA runtime call reports a failure.
//   retVal - status code returned by a CUDA runtime function
// For any status other than cudaSuccess this prints "ERROR: <description>" to
// stderr and exits with status 1; on success it returns without doing anything.
void check(cudaError_t retVal) {
  if (retVal == cudaSuccess) return;

  const char* description = cudaGetErrorString(retVal);
  fprintf(stderr, "ERROR: %s\n", description);
  exit(1);
}

// Loads /content/Portrait.jpg, runs the shared-memory edge-detection kernel on the
// GPU and writes the single-channel result to /content/Cloudsedge.jpg. The printed
// time spans device allocation, both host<->device copies and the kernel.
int main() {

    // Loading the image
    int width, height, channels;
    unsigned char *img = stbi_load("/content/Portrait.jpg", &width, &height, &channels, 0);
    if(img == NULL) {
        printf("Error in loading the image\n");
        exit(1);
    }
    printf("Loaded image with a width of %dpx, a height of %dpx and %d channels\n", width, height, channels);

    // size_t arithmetic so that large images cannot overflow int
    size_t img_size = (size_t)width * height * channels;
    size_t img_size_out = (size_t)width * height;      // one byte per pixel

    //Creating memory block for output image
    unsigned char *edge_img = (unsigned char*) malloc(img_size_out);
    if(edge_img == NULL) {
        printf("Error in allocating the output image\n");
        stbi_image_free(img);
        exit(1);
    }

    //allocate timers
    cudaEvent_t start;
    cudaEvent_t stop;
    check(cudaEventCreate(&start));
    check(cudaEventCreate(&stop));

    //start timer
    check(cudaEventRecord(start,0));

    //Cuda input and output memory blocks
    unsigned char *cpi;
    unsigned char *cpo;
    check(cudaMalloc((void**) &cpi, img_size));
    // The kernel writes exactly one byte per pixel, so the output only needs img_size_out bytes
    check(cudaMalloc((void**) &cpo, img_size_out));

    check(cudaMemcpy(cpi, img, img_size, cudaMemcpyHostToDevice));

    // Each block carries a one-pixel halo around its tile x tile output area, hence
    // tile+2 threads per side, while the grid advances in steps of tile pixels.
    dim3 dimBlock(tile+2, tile+2);
    dim3 dimGrid((width + tile - 1) / tile, (height + tile - 1) / tile);
    Kernel<<<dimGrid,dimBlock>>>(width,height,cpi,cpo,channels);
    check(cudaGetLastError());

    // Blocking copy on the default stream: it runs after the kernel has finished
    check(cudaMemcpy(edge_img, cpo, img_size_out, cudaMemcpyDeviceToHost));

    //stop timer and print time
    check(cudaEventRecord(stop,0));
    check(cudaEventSynchronize(stop));
    float diff;
    check(cudaEventElapsedTime(&diff, start, stop));

    //deallocate timers
    check(cudaEventDestroy(start));
    check(cudaEventDestroy(stop));

    printf("Time: %f ms\n", diff);

    //Writing the image to the file. The last argument is the JPEG quality (100 = highest).
    int status = 0;
    if(!stbi_write_jpg("/content/Cloudsedge.jpg", width, height, 1, edge_img, 100)) {
        printf("Error in writing the image\n");
        status = 1;
    }

    //Deallocation of memory
    stbi_image_free(img);
    free(edge_img);
    check(cudaFree(cpi));
    check(cudaFree(cpo));
    return status;
}

Loaded image with a width of 2000px, a height of 1160px and 3 channels
Time: 2.930144 ms



# Paint Algorithm linear vs parallel

In [23]:
%%cu
#include <stdio.h>
#define STB_IMAGE_IMPLEMENTATION
#include "/content/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "/content/stb_image_write.h"


// Length of the per-thread histogram arrays; bin indices must stay below this value
#define maxIntensity 256

// Oil-paint style filter, one thread per pixel.
//
// For every pixel the kernel looks at the (2*radius+1)^2 window around it (cut off
// at the image edges), sorts the window's pixels into intensity bins, picks the
// bin that holds the most pixels and writes the average colour of that bin.
//
// Launch layout: 2D grid of 2D blocks; x maps to image columns, y to image rows.
// Threads outside the width x height image return immediately.
//
// Parameters:
//   width, height - image dimensions in pixels
//   p             - input pixels, row-major, `channels` bytes per pixel
//   pg            - output pixels, same layout and size as p
//   radius        - window radius in pixels; negative values are treated as 0
//   nBins         - highest bin index (bins run 0..nBins); limited to maxIntensity-1
//   channels      - number of bytes per pixel; a fourth channel, or the second of
//                   two, is copied through unchanged
__global__ void Kernel(int width, int height, unsigned char* p, unsigned char* pg, int radius, int nBins,int channels) {

  // Getting Row and Column for each thread
  int i = blockIdx.y * blockDim.y + threadIdx.y;
  int j = blockIdx.x * blockDim.x + threadIdx.x;
  // Ignoring the threads that do not properly map onto the pixels
  if (i >= height || j >= width) return;

  // Keep the arguments inside the range the fixed-size arrays below can hold
  nBins = min(max(nBins, 0), maxIntensity - 1);
  radius = max(radius, 0);

  // Per-bin pixel count and per-bin channel sums
  int intensityCount[maxIntensity];
  int avgR[maxIntensity];
  int avgG[maxIntensity];
  int avgB[maxIntensity];

  // Only bins 0..nBins are ever touched, so only those are cleared
  for (int k = 0; k <= nBins; k++) {
    intensityCount[k] = 0;
    avgR[k] = 0;
    avgG[k] = 0;
    avgB[k] = 0;
  }

  // With fewer than three channels the first byte is used as the intensity;
  // reading or writing +1 / +2 would leave the pixel (and eventually the buffer).
  const bool color = (channels >= 3);

  int maxIntensityCount = 0;
  int maxIntensityCountIndex = 0;

  // Window limits cut off at the image edges
  const int rowFirst = max(i - radius, 0);
  const int rowLast  = min(i + radius, height - 1);
  const int colFirst = max(j - radius, 0);
  const int colLast  = min(j + radius, width - 1);

  for (int k = rowFirst; k <= rowLast; k++) {
    for (int l = colFirst; l <= colLast; l++) {
      // size_t so that large images cannot overflow int arithmetic
      const size_t in = ((size_t)k * width + l) * channels;
      const unsigned char r = p[in];                        //Red Channel read
      const unsigned char g = color ? p[in + 1] : r;        //Green channel read
      const unsigned char b = color ? p[in + 2] : r;        //Blue channel read

      // Binned Intensity
      const int curIntensity = ((r + g + b) / 3 * nBins) / 255;

      intensityCount[curIntensity]++;

      // Checking for the most frequent bin value; on a tie the bin that reached
      // the count first is kept
      if (intensityCount[curIntensity] > maxIntensityCount) {
        maxIntensityCount = intensityCount[curIntensity];
        maxIntensityCountIndex = curIntensity;
      }

      // Summing all the pixel values that fall in the same bin. Later used to find avg pixel value
      avgR[curIntensity] += r;
      avgG[curIntensity] += g;
      avgB[curIntensity] += b;
    }
  }

  // The window always contains the pixel itself, so maxIntensityCount is at least 1
  const size_t out = ((size_t)i * width + j) * channels;
  pg[out] = (uint8_t)(avgR[maxIntensityCountIndex] / maxIntensityCount);         // red (or gray)
  if (color) {
    pg[out + 1] = (uint8_t)(avgG[maxIntensityCountIndex] / maxIntensityCount);   // green
    pg[out + 2] = (uint8_t)(avgB[maxIntensityCountIndex] / maxIntensityCount);   // blue
  }

  // Copying the alpha channel
  if (channels == 4)
    pg[out + 3] = p[out + 3];
  else if (channels == 2)
    pg[out + 1] = p[out + 1];
}

// Filter settings that main() passes to the kernel. They are defined after the
// kernel on purpose: defined earlier, the macros would replace the kernel's own
// `radius` / `nBins` parameter names and the arguments would be ignored.
#define nBins 30
#define radius 5

// Aborts the program when a CUDA runtime call reports a failure.
//   retVal - status code returned by a CUDA runtime function
// For any status other than cudaSuccess this prints "ERROR: <description>" to
// stderr and exits with status 1; on success it returns without doing anything.
void check(cudaError_t retVal) {
  if (retVal == cudaSuccess) return;

  const char* description = cudaGetErrorString(retVal);
  fprintf(stderr, "ERROR: %s\n", description);
  exit(1);
}

// Loads /content/Marbles.bmp, applies the paint filter on the GPU and writes the
// result, with the same channel count as the input, to /content/paint.jpg. The
// printed time spans device allocation, both host<->device copies and the kernel.
int main() {

    // Loading the image
    int width, height, channels;
    unsigned char *img = stbi_load("/content/Marbles.bmp", &width, &height, &channels, 0);
    if(img == NULL) {
        printf("Error in loading the image\n");
        exit(1);
    }
    printf("Loaded image with a width of %dpx, a height of %dpx and %d channels\n", width, height, channels);

    // Input and output share one layout; size_t arithmetic so large images cannot overflow int
    size_t img_size = (size_t)width * height * channels;

    //Creating memory block for output image
    unsigned char *paint_img = (unsigned char*) malloc(img_size);
    if(paint_img == NULL) {
        printf("Error in allocating the output image\n");
        stbi_image_free(img);
        exit(1);
    }

    //allocate timers
    cudaEvent_t start;
    cudaEvent_t stop;
    check(cudaEventCreate(&start));
    check(cudaEventCreate(&stop));

    //start timer
    check(cudaEventRecord(start,0));

    //Cuda input and output memory blocks
    unsigned char *cpi;
    unsigned char *cpo;
    check(cudaMalloc((void**) &cpi, img_size));
    check(cudaMalloc((void**) &cpo, img_size));

    check(cudaMemcpy(cpi, img, img_size, cudaMemcpyHostToDevice));

    // One thread per pixel; the grid is rounded up so partial blocks cover the image edges.
    // radius and nBins are the file-level macros defined alongside the kernel.
    dim3 dimBlock(32, 32);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
    Kernel<<<dimGrid,dimBlock>>>(width,height,cpi,cpo,radius,nBins,channels);
    check(cudaGetLastError());

    // Blocking copy on the default stream: it runs after the kernel has finished
    check(cudaMemcpy(paint_img, cpo, img_size, cudaMemcpyDeviceToHost));

    //stop timer and print time
    check(cudaEventRecord(stop,0));
    check(cudaEventSynchronize(stop));
    float diff;
    check(cudaEventElapsedTime(&diff, start, stop));

    //deallocate timers
    check(cudaEventDestroy(start));
    check(cudaEventDestroy(stop));

    printf("Time: %f ms\n", diff);

    //Writing the image to the file. The last argument is the JPEG quality (100 = highest).
    int status = 0;
    if(!stbi_write_jpg("/content/paint.jpg", width, height, channels, paint_img, 100)) {
        printf("Error in writing the image\n");
        status = 1;
    }

    //Deallocation of memory
    stbi_image_free(img);
    free(paint_img);
    check(cudaFree(cpi));
    check(cudaFree(cpo));
    return status;
}

Loaded image with a width of 1419px, a height of 1001px and 3 channels
Time: 116.468674 ms

