---
# **LAB 2 - Modello di programmazione CUDA**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

GPU computing notebooks download (from github)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

To also use a plugin for C++ syntax highlighting...

In [None]:
!wget -O cpp_plugin.py https://gist.github.com/akshaykhadse/7acc91dd41f52944c6150754e5530c4b/raw/cpp_plugin.py
%load_ext cpp_plugin

# ✅ Blocks and grids

**Grid 1D**: prints DIMs and IDs of grid, block and thread


In [None]:
%%cuda_group_save --name "checkIndex.cu" --group "lez2"

#include <stdio.h>

// Kernel: every thread prints its own thread/block indices together with
// the block and grid dimensions of the launch it belongs to.
__global__ void checkIndex(void) {
	// take a local copy of the built-in launch variables
	const uint3 tIdx = threadIdx;
	const uint3 bIdx = blockIdx;
	const dim3 bDim = blockDim;
	const dim3 gDim = gridDim;

	printf("threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) "
					"blockDim:(%d, %d, %d) gridDim:(%d, %d, %d)\n",
					tIdx.x, tIdx.y, tIdx.z,
					bIdx.x, bIdx.y, bIdx.z,
					bDim.x, bDim.y, bDim.z,
					gDim.x, gDim.y, gDim.z);
}

/*
 * MAIN: launches checkIndex on a 1D grid (3 blocks of 4 threads) and prints
 * the launch configuration from the host and from every device thread.
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(int argc, char **argv) {

	// grid and block definition
	dim3 block(4);
	dim3 grid(3);

	// Print from host (dim3 members are unsigned, hence %u)
	printf("Print from host:\n");
	printf("grid.x = %u\t grid.y = %u\t grid.z = %u\n", grid.x, grid.y, grid.z);
	printf("block.x = %u\t block.y = %u\t block.z = %u\n\n", block.x, block.y, block.z);

	// Print from device
	printf("Print from device:\n");
	checkIndex<<<grid, block>>>();

	// A launch reports a bad configuration only through cudaGetLastError();
	// synchronizing afterwards waits for the kernel, flushes its printf
	// output and surfaces execution errors.
	cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
		status = cudaDeviceSynchronize();
	if (status != cudaSuccess)
		fprintf(stderr, "checkIndex failed: %s\n", cudaGetErrorString(status));

	// reset device
	cudaDeviceReset();
	return (status == cudaSuccess) ? 0 : 1;
}

↩ Run...

In [None]:
!nvcc -arch=sm_75 src/lez2/checkIndex.cu -o checkIndex
!./checkIndex


Definire un kernel con block 2D e grid 2D che:

1. usando ID di thread e block calcola la seguente espressione `s = threadIdx.x * blockDim.x + threadIdx.y * blockDim.y + blockIdx.x + blockIdx.y`
2. stampa `sum =  2  <--> threadIdx:(*,*), blockIdx:(*, *), blockDim:(*, *)` se `s` è un numero della sequenza di Fibonacci


NB: Sequenza di Fibonacci ([Fibonacci-wikipedia](https://it.wikipedia.org/wiki/Successione_di_Fibonacci))
$$
\begin{align}
s_0 &= 0,\\
s_1 &= 1,\\
s_{n}&=s_{{n-1}}+s_{{n-2}},\quad \text{(per ogni $n>1$)}
\end{align}
$$


↘️ **SOL...**

In [None]:
%%cuda_group_save --name "checkIndex.cu" --group "lez2"
#include <stdio.h>

/*
 * Each thread combines its IDs into
 *   s = threadIdx.x * blockDim.x + threadIdx.y * blockDim.y + blockIdx.x + blockIdx.y
 * and prints them only when s belongs to the Fibonacci sequence.
 */
__global__ void checkIndex(void) {
  const uint tx = threadIdx.x, ty = threadIdx.y;
  const uint bx = blockIdx.x,  by = blockIdx.y;
  const uint bxd = blockDim.x, byd = blockDim.y;
  const uint s = tx * bxd + ty * byd + bx + by;

  // Walk the Fibonacci sequence 1, 2, 3, 5, ... until it reaches or passes s
  uint prev = 1;
  uint curr = 1;
  for (; curr < s; ) {
    const uint next = prev + curr;
    prev = curr;
    curr = next;
  }

  // s == 0 is the first Fibonacci term, which the walk above never visits
  if (s == 0 || s == curr)
    printf("sum = %2d  <--> threadIdx:(%d, %d), blockIdx:(%d, %d), blockDim:(%d, %d)\n", s, tx, ty, bx, by, bxd, byd);
}

/*
 * MAIN: launches checkIndex on a 2D grid (3x3 blocks of 5x5 threads) and
 * prints the launch configuration from the host.
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(int argc, char **argv) {

	// grid and block structure
	dim3 block(5,5);
	dim3 grid(3,3);

	// Print from host (dim3 members are unsigned, hence %u)
	printf("Print from host:\n");
	printf("grid.x = %u\t grid.y = %u\t grid.z = %u\n", grid.x, grid.y, grid.z);
	printf("block.x = %u\t block.y = %u\t block.z = %u\n\n", block.x, block.y, block.z);

	// Print from device
	printf("Print from device:\n");
	checkIndex<<<grid, block>>>();

	// A launch reports a bad configuration only through cudaGetLastError();
	// synchronizing afterwards waits for the kernel, flushes its printf
	// output and surfaces execution errors.
	cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
		status = cudaDeviceSynchronize();
	if (status != cudaSuccess)
		fprintf(stderr, "checkIndex failed: %s\n", cudaGetErrorString(status));

	// reset device
	cudaDeviceReset();
	return (status == cudaSuccess) ? 0 : 1;
}

In [None]:
!nvcc -arch=sm_75 src/lez2/checkIndex.cu -o checkIndex
!./checkIndex


↘️ **TODO...**

In [None]:
%%cuda
#include <stdio.h>

/*
 * Exercise stub, meant for a 2D grid of 2D blocks. Each thread should compute
 *   s = threadIdx.x * blockDim.x + threadIdx.y * blockDim.y + blockIdx.x + blockIdx.y
 * and print its thread/block IDs only when s is a Fibonacci number.
 */
__global__ void checkIndex(void) {

  // TODO: compute s, test it against the Fibonacci sequence, printf on a match

}

/*
 * MAIN (exercise skeleton): prints the launch configuration from the host
 * and is meant to launch checkIndex. It does not compile until the TODOs
 * below are completed.
 */
int main(int argc, char **argv) {

	// grid and block structure
	// TODO: declare `dim3 block(...)` and `dim3 grid(...)` (both 2D)

	// Print from host
	printf("Print from host:\n");
	printf("grid.x = %d\t grid.y = %d\t grid.z = %d\n", grid.x, grid.y, grid.z);
	printf("block.x = %d\t block.y = %d\t block.z = %d\n\n", block.x, block.y, block.z);

	// Print from device
	// TODO: launch checkIndex with the grid and block defined above

	// reset device
	cudaDeviceReset();
	return (0);
}

In [None]:
!nvcc -arch=sm_75 src/lez2/checkIndex.cu -o checkIndex
!./checkIndex


# ✅ Sum of vectors

In [None]:
%%cuda_group_save --name "vector_sum.cu" --group "lez2"
#include <stdio.h>

// CHECK(call): evaluates a CUDA runtime call once and, when it does not
// return cudaSuccess, reports file, line, error code and error string on
// stderr. Execution continues after the report. Use as a statement: CHECK(x);
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t check_status_ = (call);                                  \
    if (cudaSuccess != check_status_) {                                        \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", check_status_,               \
                cudaGetErrorString(check_status_));                            \
    }                                                                          \
} while (0)

/**
 * CUDA kernel: element-wise vector addition, C[i] = A[i] + B[i].
 * One thread per element on a 1D grid of 1D blocks; threads whose index
 * falls at or beyond numElements do nothing.
 */
__global__ void vector_sum(const float *A, const float *B, float *C, int numElements) {
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;

	// surplus threads of the last block have no element to process
	if (idx >= numElements)
		return;

	C[idx] = A[idx] + B[idx];
}

/**
 * MAIN: adds two random vectors of N floats on the GPU with vector_sum and
 * verifies the result element by element on the host.
 * Exits with EXIT_FAILURE if host allocation, the kernel launch or the
 * verification fails.
 */
int main(void) {
	// Print the vector length to be used, and compute its size
	int N = 10000000;
	size_t size = N * sizeof(float);
	printf("Vector addition of %d elements (%f MB)\n", N, size/1E6);

	// Allocate the host input vector A,B,C
	float *h_A = (float *) malloc(size);
	float *h_B = (float *) malloc(size);
	float *h_C = (float *) malloc(size);
	if (h_A == NULL || h_B == NULL || h_C == NULL) {
		fprintf(stderr, "Failed to allocate host vectors!\n");
		free(h_A);
		free(h_B);
		free(h_C);
		exit (EXIT_FAILURE);
	}

	// Initialize the host input vectors
	for (int i = 0; i < N; ++i) {
		h_A[i] = rand() % 10;
		h_B[i] = rand() % 10;
	}

	// Allocate the device input vector A,B,C
	float *d_A = NULL;
	CHECK(cudaMalloc((void **) &d_A, size));
	float *d_B = NULL;
	CHECK(cudaMalloc((void **) &d_B, size));
	float *d_C = NULL;
	CHECK(cudaMalloc((void **) &d_C, size));

	// Copy the host input vectors A and B in device memory
	printf("Copy input data from the host memory to the CUDA device\n");
	CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

	// Launch the Vector Add CUDA Kernel (ceil-div so every element gets a thread)
	int threadsPerBlock = 1024;
	int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
	printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
	vector_sum<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

	// A launch returns no status; a bad configuration is only visible
	// through cudaGetLastError(), so its result must be stored and tested.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		fprintf(stderr, "Failed to launch vector_sum kernel (error code %s)!\n",
				cudaGetErrorString(err));
		exit (EXIT_FAILURE);
	}

	// Copy the device result vector in host memory
	// (this blocking copy also waits for the kernel to finish)
	printf("Copy output data from the CUDA device to the host memory\n");
	CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

	// Verify that the result vector is correct
	for (int i = 0; i < N; ++i) {
		if (fabsf(h_A[i] + h_B[i] - h_C[i]) > 1e-5f) {
			fprintf(stderr, "Result verification failed at element %d!\n", i);
			exit (EXIT_FAILURE);
		}
	}

	printf("Test PASSED\n");

	// Free device global memory
	CHECK(cudaFree(d_A));
	CHECK(cudaFree(d_B));
	CHECK(cudaFree(d_C));

	// Free host memory
	free(h_A);
	free(h_B);
	free(h_C);

	printf("Done\n");
	return 0;
}



↩ Run...

In [None]:
!nvcc -arch=sm_75 src/lez2/vector_sum.cu -o vector_sum
!./vector_sum


# ✅ Image flip - CPU

**Visualizza immagine in python**: Librerie python per lettura/scrittura file di immagini e loro display: [openCV](https://docs.opencv.org/master/index.html) e [matplotlib](https://matplotlib.org/)

In [None]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

# cv.imread yields a NumPy array laid out as
# rows (height) x columns (width) x 3 colour channels, in BGR order
dog_bgr = cv.imread('/content/GPUcomputing/images/dog.ppm')
print('Image size: ', dog_bgr.shape)

# matplotlib expects RGB, so swap the channel order before displaying
dog = cv.cvtColor(dog_bgr, cv.COLOR_BGR2RGB)
plt.imshow(dog)
plt.show()

In [None]:
%%cpp -n flip_PPM.cpp -s xcode

#include "ppm.h"

int main(void) {
    char path[] = "GPUcomputing/images/dog.ppm";
    PPM *img = ppm_load(path);
    printf("PPM image size (w x h): %d x %d\n", img->width, img->height);

    // flip horizontally
    PPM *img2 = ppm_copy(img);
    ppm_flipH(img2);
    ppm_write(img2, "dogH.ppm");

    // flip vertically
    PPM *img3 = ppm_copy(img);
    ppm_flipV(img3);
    ppm_write(img3, "dogV.ppm");

    return 0;
}

↩ Run...

In [None]:
!g++ -I GPUcomputing/utils/PPM GPUcomputing/utils/PPM/ppm.cpp flip_PPM.cpp -o flip_PPM
!./flip_PPM

In [None]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

# Load both flipped images (NumPy arrays: rows x columns x 3) and convert
# them from OpenCV's BGR order to the RGB order matplotlib expects
dogV, dogH = (
    cv.cvtColor(cv.imread(filename), cv.COLOR_BGR2RGB)
    for filename in ('dogV.ppm', 'dogH.ppm')
)

# show the vertical flip first, then the horizontal one
for picture in (dogV, dogH):
    plt.imshow(picture)
    plt.show()

In [None]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

def _read_rgb(path):
    """Read an image file and return it as an RGB NumPy array.

    The array is laid out as rows (height) x columns (width) x 3 channels.
    cv.imread signals a missing or unreadable file by returning None instead
    of raising, so that case is turned into an explicit FileNotFoundError.
    """
    bgr = cv.imread(path)
    if bgr is None:
        raise FileNotFoundError(f"cannot read image file: {path}")
    # OpenCV loads BGR; matplotlib expects RGB
    return cv.cvtColor(bgr, cv.COLOR_BGR2RGB)


# NOTE(review): no cell visible in this notebook writes julia_jetV.bmp or
# julia_jetH.bmp, and 'images/julia_jet.bmp' is relative to the current
# working directory - confirm these files exist before running this cell.
julia_jet = _read_rgb('images/julia_jet.bmp')
print('Image size: ', julia_jet.shape)
julia_jetV = _read_rgb('julia_jetV.bmp')
julia_jetH = _read_rgb('julia_jetH.bmp')

# original first, then the vertical and horizontal flips
for picture in (julia_jet, julia_jetV, julia_jetH):
    plt.imshow(picture)
    plt.show()

# ✅ Image flip - GPU

↘️ **SOL...**

In [None]:
%%cuda_group_save --name "ppm_flipH_GPU.cu" --group "lez2"

#include <stdio.h>
#include <stdlib.h>
#include "ppm.h"
#include "../../GPUcomputing/utils/common.h"

/*
 * Kernel 1D that flips inplace the PPM image horizontally:
 * each thread swaps one pixel (R,G,B) of the left half of a row with its
 * mirror pixel in the right half of the same row.
 *
 * Launch: 1D grid of 1D blocks with at least (width/2) * height threads;
 * surplus threads return without touching memory. For an odd width the
 * middle column is its own mirror and is left untouched.
 */
__global__ void ppm_flipH_GPU(PPM ppm) {

   // ** pixel granularity **
   uint tid = blockIdx.x * blockDim.x + threadIdx.x;
   uint WIDTH = ppm.width;
   uint HEIGHT = ppm.height;
   uint HALF = WIDTH / 2;       // number of columns that have a distinct mirror

   // images narrower than 2 pixels have nothing to swap (also avoids tid / 0)
   if (HALF == 0)
      return;

   // thread ids enumerate only the left half of every row
   uint y = tid / HALF;         // row y
   uint x1 = tid % HALF;        // col x1 (left half)
   uint x2 = WIDTH - x1 - 1;    // col x2 (mirror of x1)

   // threads beyond the last row have no pixel to swap
   if (y >= HEIGHT)
      return;

   //  ** byte granularity **
   uint pel_idx1 = 3 * (x1 + y * WIDTH);
   uint pel_idx2 = 3 * (x2 + y * WIDTH);

   // save pel[y,x1]
   pel tmp;
   tmp.r = ppm.image[pel_idx1];
   tmp.g = ppm.image[pel_idx1 + 1];
   tmp.b = ppm.image[pel_idx1 + 2];

   // copy pel[y,x2] into pel[y,x1]
   ppm.image[pel_idx1]     = ppm.image[pel_idx2];
   ppm.image[pel_idx1 + 1] = ppm.image[pel_idx2 + 1];
   ppm.image[pel_idx1 + 2] = ppm.image[pel_idx2 + 2];

   // restore the saved pel[y,x1] into pel[y,x2]
   ppm.image[pel_idx2]     = tmp.r;
   ppm.image[pel_idx2 + 1] = tmp.g;
   ppm.image[pel_idx2 + 2] = tmp.b;
}

/*
 * MAIN: flips dog.ppm horizontally on the GPU with ppm_flipH_GPU, repeats
 * the flip on the CPU with ppm_flipH, reports whether the two results are
 * equal and prints both elapsed times with the resulting speedup.
 */
int main(int argc, char **argv) {

   // PPM images
   PPM *ppm, *ppm1, *ppm2;  // host images: source, GPU result, CPU result
   PPM ppm_d;               // descriptor whose image buffer lives on the GPU

   // load a PPM image from file
   char path[] = "GPUcomputing/images/dog.ppm";
   ppm = ppm_load(path);
   if (ppm == NULL) {
      // guard in case ppm_load reports failure with NULL (not visible from here)
      fprintf(stderr, "Cannot load PPM image '%s'\n", path);
      return (EXIT_FAILURE);
   }
   ppm1 = ppm_copy(ppm);
   uint WIDTH = ppm->width;
   uint HEIGHT = ppm->height;
   printf("PPM image size (w x h): %u x %u\n", WIDTH, HEIGHT);

   // set main params (widen before multiplying so large images cannot overflow)
   size_t nBytes = (size_t) WIDTH * HEIGHT * sizeof(pel);
   ppm_d.width = WIDTH;
   ppm_d.height = HEIGHT;
   ppm_d.maxval = ppm->maxval;

   // Allocate GPU buffer for the input and output images
   CHECK(cudaMalloc(&ppm_d.image, nBytes));

   // copy image from CPU to GPU
   CHECK(cudaMemcpy(ppm_d.image, ppm->image, nBytes, cudaMemcpyHostToDevice));

   // invoke kernels (define grid and block sizes):
   // one thread per pixel of the left half of the image
   uint dimBlock = 256;
   uint dimGrid = (WIDTH/2 * HEIGHT + dimBlock - 1) / dimBlock;

   double start = seconds();
   if (dimGrid > 0) {   // an empty grid is an invalid launch configuration
      ppm_flipH_GPU <<<dimGrid, dimBlock>>> (ppm_d);
      CHECK(cudaGetLastError());   // launch errors are only reported here
   }
   CHECK(cudaDeviceSynchronize());
   double stopGPU = seconds() - start;

   // copy image from GPU to CPU
   CHECK(cudaMemcpy(ppm1->image, ppm_d.image, nBytes, cudaMemcpyDeviceToHost));
   ppm_write(ppm1, "ppm_flippedH.ppm");

   // check results with CPU
   ppm2 = ppm_copy(ppm);
   start = seconds();
   ppm_flipH(ppm2);
   double stopCPU = seconds() - start;
   char res = ppm_equal(ppm1, ppm2) ? 'Y' : 'N';
   printf("Are equal? %c\n", res);
   ppm_write(ppm2, "ppm_flippedH_CPU.ppm");

   // free device memory
   CHECK(cudaFree(ppm_d.image));

   // times & speedup
   printf("CPU elapsed time: %.4f (msec) \n", stopCPU*1000);
   printf("GPU elapsed time: %.4f (msec) - Speedup %.1f\n", stopGPU*1000, stopCPU/stopGPU);

   return (EXIT_SUCCESS);
}


↩ Run...

In [None]:
!nvcc -arch=sm_75 src/lez2/ppm_flipH_GPU.cu -o flipH -I GPUcomputing/utils/PPM  GPUcomputing/utils/PPM/ppm.cpp
!./flipH

↘️ **TODO...**

In [None]:
%%cuda_group_save --name "ppm_flipH_GPU.cu" --group "lez2"

#include <stdio.h>
#include <stdlib.h>
#include "ppm.h"
#include "../../GPUcomputing/utils/common.h"

/*
 * Exercise stub: 1D kernel that flips the PPM image horizontally in place;
 * each thread should swap a single pixel (R,G,B) with its mirror pixel in
 * the same row. The parameter list is part of the exercise.
 */
__global__ void ppm_flipH_GPU() {

   // TODO: map the thread id to a pixel and swap it with its mirror

}

/*
 * MAIN (exercise skeleton): horizontal flip on the GPU, checked against the
 * CPU routine ppm_flipH. It does not compile until the TODO sections are
 * completed; they must declare ppm, ppm1, ppm2, ppm_d, start and stopGPU.
 */
int main(int argc, char **argv) {

   // PPM images
   // TODO: declare the host images (ppm, ppm1, ppm2) and the device image ppm_d

   // load a PPM image from file
   char path[] = "GPUcomputing/images/dog.ppm";
   ppm = ppm_load(path);
   ppm1 = ppm_copy(ppm);
   uint WIDTH = ppm->width;
   uint HEIGHT = ppm->height;
   printf("PPM image size (w x h): %d x %d\n", WIDTH, HEIGHT);

   // set main params
   size_t nBytes= WIDTH * HEIGHT * sizeof(pel);
   ppm_d.width = WIDTH;
   ppm_d.height = HEIGHT;
   ppm_d.maxval = ppm->maxval;

   // Allocate GPU buffer for the input and output images
   // TODO

   // copy image from CPU to GPU
   // TODO

   // invoke kernels (define grid and block sizes)
   // TODO: time the launch, e.g. `double start = seconds();` ... `double stopGPU = ...;`

   // copy image from GPU to CPU
   // TODO: copy the result into ppm1->image

   // check results with CPU
   ppm2 = ppm_copy(ppm);
   start = seconds();
   ppm_flipH(ppm2);
   double stopCPU = seconds() - start;
   char res = ppm_equal(ppm1, ppm2) ? 'Y' : 'N';
   printf("Are equal? %c\n", res);
   ppm_write(ppm2, "ppm_flippedH_CPU.ppm");

   // times & speedup
   printf("CPU elapsed time: %.4f (msec) \n", stopCPU*1000);
   printf("GPU elapsed time: %.4f (msec) - Speedup %.1f\n", stopGPU*1000, stopCPU/stopGPU);

   return (EXIT_SUCCESS);
}


↩ Run...

In [None]:
!nvcc -arch=sm_75 src/lez2/ppm_flipH_GPU.cu -o flipH -I GPUcomputing/utils/PPM  GPUcomputing/utils/PPM/ppm.cpp
!./flipH

↘️ **SOL...**

In [None]:
%%cuda_group_save --name "ppm_flipV_GPU.cu" --group "lez2"

#include <stdio.h>
#include <stdlib.h>
#include "ppm.h"
#include "../../GPUcomputing/utils/common.h"

/*
 * Kernel 1D that flips inplace the PPM image vertically:
 * each thread swaps one pixel (R,G,B) of the upper half of the image with
 * the pixel in the same column of the mirrored row. Threads that map to a
 * row outside the upper half do nothing.
 */
__global__ void ppm_flipV_GPU(PPM ppm) {

   const uint width = ppm.width;
   const uint height = ppm.height;

   // linear thread id -> (row, column) of a pixel
   const uint gid = threadIdx.x + blockDim.x * blockIdx.x;
   const uint rowTop = gid / width;
   const uint col = gid % width;

   // only rows of the upper half have a distinct mirror row to swap with
   if (rowTop < height/2) {
      const uint rowBottom = height - rowTop - 1;

      // byte offsets of the two pixels (3 bytes per pixel)
      const uint top = 3 * (col + rowTop * width);
      const uint bottom = 3 * (col + rowBottom * width);

      // save the upper pixel
      pel saved;
      saved.r = ppm.image[top];
      saved.g = ppm.image[top + 1];
      saved.b = ppm.image[top + 2];

      // lower pixel -> upper position
      ppm.image[top]     = ppm.image[bottom];
      ppm.image[top + 1] = ppm.image[bottom + 1];
      ppm.image[top + 2] = ppm.image[bottom + 2];

      // saved upper pixel -> lower position
      ppm.image[bottom]     = saved.r;
      ppm.image[bottom + 1] = saved.g;
      ppm.image[bottom + 2] = saved.b;
   }
}

/*
 * MAIN: flips dog.ppm vertically on the GPU with ppm_flipV_GPU, repeats the
 * flip on the CPU with ppm_flipV, reports whether the two results are equal
 * and prints both elapsed times with the resulting speedup.
 */
int main(int argc, char **argv) {

   // PPM images
   PPM *ppm, *ppm1, *ppm2;  // host images: source, GPU result, CPU result
   PPM ppm_d;               // descriptor whose image buffer lives on the GPU

   // load a PPM image from file
   char path[] = "GPUcomputing/images/dog.ppm";
   ppm = ppm_load(path);
   if (ppm == NULL) {
      // guard in case ppm_load reports failure with NULL (not visible from here)
      fprintf(stderr, "Cannot load PPM image '%s'\n", path);
      return (EXIT_FAILURE);
   }
   ppm1 = ppm_copy(ppm);
   uint WIDTH = ppm->width;
   uint HEIGHT = ppm->height;
   printf("PPM image size (w x h): %u x %u\n", WIDTH, HEIGHT);

   // set main params (widen before multiplying so large images cannot overflow)
   size_t nBytes = (size_t) WIDTH * HEIGHT * sizeof(pel);
   ppm_d.width = WIDTH;
   ppm_d.height = HEIGHT;
   ppm_d.maxval = ppm->maxval;

   // Allocate GPU buffer for the input and output images
   CHECK(cudaMalloc(&ppm_d.image, nBytes));

   // copy image from CPU to GPU
   CHECK(cudaMemcpy(ppm_d.image, ppm->image, nBytes, cudaMemcpyHostToDevice));

   // invoke kernels (define grid and block sizes):
   // one thread per pixel of the upper half of the image
   uint dimBlock = 256;
   uint dimGrid = (WIDTH * HEIGHT/2 + dimBlock - 1) / dimBlock;

   double start = seconds();
   if (dimGrid > 0) {   // an empty grid is an invalid launch configuration
      ppm_flipV_GPU <<<dimGrid, dimBlock>>> (ppm_d);
      CHECK(cudaGetLastError());   // launch errors are only reported here
   }
   CHECK(cudaDeviceSynchronize());
   double stopGPU = seconds() - start;

   // copy image from GPU to CPU
   CHECK(cudaMemcpy(ppm1->image, ppm_d.image, nBytes, cudaMemcpyDeviceToHost));
   ppm_write(ppm1, "ppm_flippedV_GPU.ppm");

   // check results with CPU
   ppm2 = ppm_copy(ppm);
   start = seconds();
   ppm_flipV(ppm2);
   double stopCPU = seconds() - start;
   char res = ppm_equal(ppm1, ppm2) ? 'Y' : 'N';
   printf("Are equal? %c\n", res);
   ppm_write(ppm2, "ppm_flippedV_CPU.ppm");

   // free device memory
   CHECK(cudaFree(ppm_d.image));

   // times & speedup
   printf("CPU elapsed time: %.4f (msec) \n", stopCPU*1000);
   printf("GPU elapsed time: %.4f (msec) - Speedup %.1f\n", stopGPU*1000, stopCPU/stopGPU);

   return (EXIT_SUCCESS);
}

↩ Run...

In [None]:
!nvcc -arch=sm_75 src/lez2/ppm_flipV_GPU.cu -o flipV -I GPUcomputing/utils/PPM  GPUcomputing/utils/PPM/ppm.cpp
!./flipV

In [None]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

# Both files hold a vertical flip: one computed on the CPU, one on the GPU.
# cv.imread yields rows x columns x 3 arrays in BGR order; convert to RGB
# for matplotlib.
flipV_cpu = cv.cvtColor(cv.imread('ppm_flippedV_CPU.ppm'), cv.COLOR_BGR2RGB)
flipV_gpu = cv.cvtColor(cv.imread('ppm_flippedV_GPU.ppm'), cv.COLOR_BGR2RGB)

# show the CPU result first, then the GPU result
for picture in (flipV_cpu, flipV_gpu):
    plt.imshow(picture)
    plt.show()

↘️ **TODO...**

In [None]:
%%cuda_group_save --name "ppm_flipV_GPU.cu" --group "lez2"

#include <stdio.h>
#include <stdlib.h>
#include "ppm.h"
#include "../../GPUcomputing/utils/common.h"

/*
 * Exercise stub: 1D kernel that flips the PPM image vertically in place;
 * each thread should swap a single pixel (R,G,B) with the pixel in the same
 * column of the mirrored row. The parameter list is part of the exercise.
 */
__global__ void ppm_flipV_GPU() {

  // TODO: map the thread id to a pixel and swap it with its mirror
}

/*
 * MAIN (exercise skeleton): vertical flip on the GPU, checked against the
 * CPU routine ppm_flipV. It does not compile until the TODO sections are
 * completed; they must declare ppm_d, start and stopGPU.
 */
int main(int argc, char **argv) {

   // PPM images
   PPM *ppm, *ppm1, *ppm2;  // Where images are stored in CPU
   // TODO: declare the device image ppm_d

   // load a PPM image from file
   char path[] = "GPUcomputing/images/dog.ppm";
   ppm = ppm_load(path);
   ppm1 = ppm_copy(ppm);
   uint WIDTH = ppm->width;
   uint HEIGHT = ppm->height;
   printf("PPM image size (w x h): %d x %d\n", WIDTH, HEIGHT);

   // set main params
   size_t nBytes= WIDTH * HEIGHT * sizeof(pel);
   ppm_d.width = WIDTH;
   ppm_d.height = HEIGHT;
   ppm_d.maxval = ppm->maxval;

   // Allocate GPU buffer for the input and output images
   // TODO

   // copy image from CPU to GPU
   // TODO

   // invoke kernels (define grid and block sizes)
   // TODO: time the launch, e.g. `double start = seconds();` ... `double stopGPU = ...;`

   // copy image from GPU to CPU
   // TODO: copy the result into ppm1->image

   // check results with CPU
   ppm2 = ppm_copy(ppm);
   start = seconds();
   ppm_flipV(ppm2);
   double stopCPU = seconds() - start;
   char res = ppm_equal(ppm1, ppm2) ? 'Y' : 'N';
   printf("Are equal? %c\n", res);
   ppm_write(ppm2, "output_flippedV_CPU.ppm");

   // times & speedup
   printf("CPU elapsed time: %.4f (msec) \n", stopCPU*1000);
   printf("GPU elapsed time: %.4f (msec) - Speedup %.1f\n", stopGPU*1000, stopCPU/stopGPU);

   return (EXIT_SUCCESS);
}

↩ Run...

In [None]:
!nvcc -arch=sm_75 src/lez2/ppm_flipV_GPU.cu -o flipV -I GPUcomputing/utils/PPM  GPUcomputing/utils/PPM/ppm.cpp
!./flipV

# ✅ Sum of matrices

In [None]:
%%cuda_group_save --name "matrix_sum.cu" --group "lez2"

#include <stdio.h>
#include <stdlib.h>

// CHECK(call): evaluates a CUDA runtime call once and, when it does not
// return cudaSuccess, reports file, line, error code and error string on
// stderr. Execution continues after the report. Use as a statement: CHECK(x);
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t check_status_ = (call);                                  \
    if (cudaSuccess != check_status_) {                                        \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", check_status_,               \
                cudaGetErrorString(check_status_));                            \
    }                                                                          \
} while (0)

// Dense matrix descriptor. Elements are stored in row-major order:
//   M(row, col) = *(M.elements + row * M.width + col)
struct Matrix {
  int width;        // number of columns
  int height;       // number of rows
  float* elements;  // width * height values, row after row
};

// Thread block size
#define BLOCK_SIZE 16

/* Matrix sum kernel: each thread computes one element of C as the sum of
*  the corresponding elements of A and B. Expects a 2D grid of 2D blocks
*  with x mapped to columns and y to rows; threads falling outside the
*  bounds of A do nothing.
*/
__global__ void matSumKernel(Matrix A, Matrix B, Matrix C) {

  const int col = threadIdx.x + blockIdx.x * blockDim.x;
  const int row = threadIdx.y + blockIdx.y * blockDim.y;

  // surplus threads of the border blocks have no element to process
  if (row >= A.height || col >= A.width)
    return;

  // each matrix is addressed with its own row stride
  const float a = A.elements[row * A.width + col];
  const float b = B.elements[row * B.width + col];
  C.elements[row * C.width + col] = a + b;
}

/*
 * MAIN: sums two nx x ny matrices (filled with 1.0f and 2.0f) on the GPU
 * with matSumKernel and verifies that every element of the result is 3.0f.
 * Returns EXIT_SUCCESS when the test passes, EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv) {

  // Matrix size
  int nx = 1 << 5;
  int ny = 1 << 5;
  int size = nx * ny;
  size_t nBytes = (size_t) size * sizeof(float);
  printf("Matrix size: %d x %d\n", nx, ny);

  // Matrix on the host (CPU) and device (GPU)
  Matrix A, B, C, d_A, d_B, d_C;
  A.width = B.width = C.width = nx;
  d_A.width = d_B.width = d_C.width = nx;
  A.height = B.height = C.height = ny;
  d_A.height = d_B.height = d_C.height = ny;
  A.elements = (float*)malloc(nBytes);
  B.elements = (float*)malloc(nBytes);
  C.elements = (float*)malloc(nBytes);
  if (A.elements == NULL || B.elements == NULL || C.elements == NULL) {
    fprintf(stderr, "Failed to allocate host matrices\n");
    free(A.elements);
    free(B.elements);
    free(C.elements);
    return EXIT_FAILURE;
  }

  // Initialize A and B with 1.0f and 2.0f
  for (int i = 0; i < size; i++) {
    A.elements[i] = 1.0f;
    B.elements[i] = 2.0f;
  }

  // device mem allocation & copy of A and B
  CHECK(cudaMalloc(&d_A.elements, nBytes));
  CHECK(cudaMalloc(&d_B.elements, nBytes));
  CHECK(cudaMalloc(&d_C.elements, nBytes));
  CHECK(cudaMemcpy(d_A.elements, A.elements, nBytes, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(d_B.elements, B.elements, nBytes, cudaMemcpyHostToDevice));

  // Invoke kernel (ceil-div so that the grid covers the whole matrix)
  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid((A.width + dimBlock.x - 1) / dimBlock.x, (A.height + dimBlock.y - 1) / dimBlock.y);
  matSumKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
  CHECK(cudaGetLastError());   // launch errors are only reported here

  // Read C from device memory (this blocking copy also waits for the kernel)
  CHECK(cudaMemcpy(C.elements, d_C.elements, nBytes, cudaMemcpyDeviceToHost));

  // check results
  int i;
  for (i = 0; i < size; i++) {
    if (C.elements[i] != 3.0f) {
      printf("Error: C[%d] = %f\n", i, C.elements[i]);
      break;
    }
  }
  const int passed = (i == size);
  if (passed)
    printf("Test PASSED\n");
  else
    printf("Test not PASSED\n");

  // Free device memory
  CHECK(cudaFree(d_A.elements));
  CHECK(cudaFree(d_B.elements));
  CHECK(cudaFree(d_C.elements));

  // Free host memory
  free(A.elements);
  free(B.elements);
  free(C.elements);

  return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}




↩ Run...

In [None]:
!nvcc -arch=sm_75 src/lez2/matrix_sum.cu -o matrix_sum
!./matrix_sum

# ✅ Image blurring

↘️ **SOL...**

In [None]:
%%cuda_group_save --name "ppm_blurGPU.cu" --group "lez2"

#include <stdio.h>
#include <stdlib.h>
#include "ppm.h"
#include "../../GPUcomputing/utils/common.h"

/*
 * Device helper: stores pel c (pixel element) at column x, row y of the
 * ppm image; the three channels occupy consecutive entries of ppm.image.
 */
__device__ void ppm_setGPU(PPM ppm, int x, int y, pel c) {
  const int pixel = x + y*ppm.width;   // row-major pixel index
  const int base = 3*pixel;            // entry holding the red channel
  ppm.image[base]     = c.r;
  ppm.image[base + 1] = c.g;
  ppm.image[base + 2] = c.b;
}

/*
 * Device helper: returns the pel (pixel element) found at column x, row y
 * of the ppm image; the three channels are read from consecutive entries
 * of ppm.image.
 */
__device__ pel ppm_getGPU(PPM ppm, int x, int y) {
  const int pixel = x + y*ppm.width;   // row-major pixel index
  const int base = 3*pixel;            // entry holding the red channel
  pel out;
  out.r = ppm.image[base];
  out.g = ppm.image[base + 1];
  out.b = ppm.image[base + 2];
  return out;
}

/*
 * Kernel 2D for PPM image blurring: each thread averages, channel by
 * channel, the pixels of `ppm` in a window around its own pixel and writes
 * the result to the same position of `ppm1`. Window positions that fall
 * outside the image are skipped and do not count in the average.
 *
 * NOTE(review): the window spans offsets [-MASK_SIZE/2, MASK_SIZE/2), i.e.
 * it excludes the +MASK_SIZE/2 offset and is not centred on the pixel -
 * confirm this matches the CPU reference ppm_blur before changing it.
 */
__global__ void ppm_blurGPU(PPM ppm, PPM ppm1, int MASK_SIZE) {

  const int px = blockIdx.x * blockDim.x + threadIdx.x;
  const int py = blockIdx.y * blockDim.y + threadIdx.y;

  // threads outside the image have no pixel to produce
  if (!(px < ppm.width && py < ppm.height))
    return;

  const int half = MASK_SIZE/2;
  float sumR = 0, sumG = 0, sumB = 0;
  int count = 0;

  for (int dy = -half; dy < half; ++dy) {
    const int ny = py + dy;
    // skip window rows above or below the image
    if (!(ny > -1 && ny < ppm.height))
      continue;
    for (int dx = -half; dx < half; ++dx) {
      const int nx = px + dx;
      // skip window columns left or right of the image
      if (!(nx > -1 && nx < ppm.width))
        continue;
      const int src = nx + ny*ppm.width;
      sumR += ppm.image[3*src];
      sumG += ppm.image[3*src + 1];
      sumB += ppm.image[3*src + 2];
      count++;
    }
  }

  // store the per-channel mean in the output image
  const int dst = px + py*ppm.width;
  ppm1.image[3*dst]     = (color)(sumR/count);
  ppm1.image[3*dst + 1] = (color)(sumG/count);
  ppm1.image[3*dst + 2] = (color)(sumB/count);
}

/*
 * MAIN: blurs dog.ppm on the GPU with ppm_blurGPU (window size MASK_SIZE),
 * repeats the blur on the CPU with ppm_blur, reports whether the two images
 * are equal and prints both elapsed times with the resulting speedup.
 */
int main(int argc, char **argv) {

  // PPM images
  PPM *ppm, *ppm1, *ppm2;  // host images: source, GPU result, CPU result
  PPM ppm_d, ppm1_d;       // device descriptors: input and blurred output

  // load a PPM image from file
  char path[] = "GPUcomputing/images/dog.ppm";
  ppm = ppm_load(path);
  if (ppm == NULL) {
    // guard in case ppm_load reports failure with NULL (not visible from here)
    fprintf(stderr, "Cannot load PPM image '%s'\n", path);
    return (EXIT_FAILURE);
  }
  ppm1 = ppm_copy(ppm);
  uint WIDTH = ppm->width;
  uint HEIGHT = ppm->height;
  printf("PPM image size (w x h): %u x %u\n", WIDTH, HEIGHT);

  // set main params (widen before multiplying so large images cannot overflow)
  size_t nBytes = (size_t) WIDTH * HEIGHT * sizeof(pel);
  ppm_d.width = WIDTH;
  ppm_d.height = HEIGHT;
  ppm_d.maxval = ppm->maxval;
  // the output descriptor is passed to the kernel by value as well,
  // so it must not carry uninitialized fields
  ppm1_d.width = WIDTH;
  ppm1_d.height = HEIGHT;
  ppm1_d.maxval = ppm->maxval;
  int MASK_SIZE = 21;

  // Allocate GPU buffer for the input and output images
  CHECK(cudaMalloc(&ppm_d.image, nBytes));
  CHECK(cudaMalloc(&ppm1_d.image, nBytes));

  // copy image from CPU to GPU
  CHECK(cudaMemcpy(ppm_d.image, ppm->image, nBytes, cudaMemcpyHostToDevice));

  // invoke kernels (define grid and block sizes): one thread per pixel
  uint dimBlock = 16;
  dim3 block(dimBlock, dimBlock);
  dim3 grid((WIDTH + block.x - 1) / block.x, (HEIGHT + block.y - 1) / block.y);

  double start = seconds();
  ppm_blurGPU <<<grid, block>>> (ppm_d, ppm1_d, MASK_SIZE);
  CHECK(cudaGetLastError());   // launch errors are only reported here
  CHECK(cudaDeviceSynchronize());
  double stopGPU = seconds() - start;

  // copy image from GPU to CPU
  CHECK(cudaMemcpy(ppm1->image, ppm1_d.image, nBytes, cudaMemcpyDeviceToHost));
  ppm_write(ppm1, "ppm_blurredGPU.ppm");

  // check results with CPU
  ppm2 = ppm_make(ppm->width, ppm->height, (pel){0,0,0});
  start = seconds();
  ppm_blur(ppm, ppm2, MASK_SIZE);
  double stopCPU = seconds() - start;
  ppm_write(ppm2, "ppm_blurredCPU.ppm");
  printf("PPM images are %s\n", ppm_equal(ppm1, ppm2) ? "equal" : "not equal");

  // free device memory
  CHECK(cudaFree(ppm_d.image));
  CHECK(cudaFree(ppm1_d.image));

  // times & speedup
  printf("CPU elapsed time: %.4f (msec) \n", stopCPU*1000);
  printf("GPU elapsed time: %.4f (msec) - Speedup %.1f\n", stopGPU*1000, stopCPU/stopGPU);

  return (EXIT_SUCCESS);
}



↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez2/ppm_blurGPU.cu -o blur -I GPUcomputing/utils/PPM  GPUcomputing/utils/PPM/ppm.cpp
!./blur

↘️ **TODO...**

In [None]:
%%cuda_group_save --name "ppm_blurGPU.cu" --group "lez2"

#include <stdio.h>
#include <stdlib.h>
#include "ppm.h"
#include "../../GPUcomputing/utils/common.h"

/*
 * Device helper: stores pel c (pixel element) at column x, row y of the
 * ppm image; the three channels occupy consecutive entries of ppm.image.
 */
__device__ void ppm_setGPU(PPM ppm, int x, int y, pel c) {
  const int pixel = x + y*ppm.width;   // row-major pixel index
  const int base = 3*pixel;            // entry holding the red channel
  ppm.image[base]     = c.r;
  ppm.image[base + 1] = c.g;
  ppm.image[base + 2] = c.b;
}

/*
 * Device helper: returns the pel (pixel element) found at column x, row y
 * of the ppm image; the three channels are read from consecutive entries
 * of ppm.image.
 */
__device__ pel ppm_getGPU(PPM ppm, int x, int y) {
  const int pixel = x + y*ppm.width;   // row-major pixel index
  const int base = 3*pixel;            // entry holding the red channel
  pel out;
  out.r = ppm.image[base];
  out.g = ppm.image[base + 1];
  out.b = ppm.image[base + 2];
  return out;
}

/*
 * Exercise stub: 2D kernel for PPM image blurring; each thread should
 * produce one blurred output pixel. The parameter list is part of the
 * exercise.
 */
__global__ void ppm_blurGPU() {

  // TODO: average the pixels in the window around this thread's pixel
}

/*
 * MAIN (exercise skeleton): image blurring on the GPU, checked against the
 * CPU routine ppm_blur. It does not compile until the TODO sections are
 * completed; they must provide ppm, ppm1, ppm2, KERNEL_SIZE, start and
 * stopGPU.
 */
int main(int argc, char **argv) {

  // PPM images
  // TODO: declare the host images (ppm, ppm1, ppm2) and the device images

  // load a PPM image from file
  // TODO

  // set main params
  // TODO: image size in bytes, device descriptors, KERNEL_SIZE (blur window size)

  // Allocate GPU buffer for the input and output images
  // TODO

  // copy image from CPU to GPU
  // TODO

  // invoke kernels (define grid and block sizes)
  // TODO: time the launch, e.g. `double start = seconds();` ... `double stopGPU = ...;`

  // copy image from GPU to CPU
  // TODO: copy the result into ppm1->image

  // check results with CPU
  start = seconds();
  ppm_blur(ppm, ppm2, KERNEL_SIZE);
  double stopCPU = seconds() - start;
  ppm_write(ppm2, "ppm_blurredCPU.ppm");
  printf("PPM images are %s\n", ppm_equal(ppm1, ppm2) ? "equal" : "not equal");

  // free device memory
  // TODO

  // times & speedup
  printf("CPU elapsed time: %.4f (msec) \n", stopCPU*1000);
  printf("GPU elapsed time: %.4f (msec) - Speedup %.1f\n", stopGPU*1000, stopCPU/stopGPU);

  return (EXIT_SUCCESS);
}



↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez2/ppm_blurGPU.cu -o blur -I GPUcomputing/utils/PPM  GPUcomputing/utils/PPM/ppm.cpp
!./blur

In [None]:
#include "../utils/common.h"
#include "ppm.h"

#define BLOCK_SIZE   32
#define MASK_SIZE    21
#define TILE_SIZE    (BLOCK_SIZE + MASK_SIZE - 1)

// Dense matrix descriptor; the kernels below address elements in
// row-major order: M(row, col) = M.elements[row * M.width + col]
struct Matrix {
   int width;        // number of columns
   int height;       // number of rows
   float* elements;  // element storage, row after row
};

 /*
  * 2D convolution using shared memory
  *   A: input matrix
  *   B: output matrix
  *   M: convolution mask matrix (MASK_SIZE x MASK_SIZE coefficients are read)
  *
  * Every block first stages a TILE_SIZE x TILE_SIZE tile of A (its own
  * pixels plus a halo of MASK_SIZE/2 on each side) in shared memory, using
  * zeros for positions outside A, then each thread accumulates the
  * mask-weighted sum for its own pixel.
  * Uses TILE_SIZE * TILE_SIZE floats of static shared memory per block.
  */
__global__ void conv2D(Matrix A, Matrix B, Matrix M) {

   const int tx = threadIdx.x;
   const int ty = threadIdx.y;
   const int bw = blockDim.x;
   const int bh = blockDim.y;

   const int x = blockIdx.x * bw + tx; // Column index of matrix A
   const int y = blockIdx.y * bh + ty; // Row index of matrix A

   const int radius = MASK_SIZE / 2;

   // Tile of A shared by the whole block
   __shared__ float smem[TILE_SIZE][TILE_SIZE];

   // The tile is larger than the block, so the block sweeps it in several
   // passes along each axis; each thread loads at most one entry per pass
   const int passes_y = TILE_SIZE / bh + 1;
   const int passes_x = TILE_SIZE / bw + 1;
   for (int py = 0; py < passes_y; ++py) {
      const int s_row = ty + bh * py;          // tile row
      if (s_row >= TILE_SIZE)
         continue;
      const int g_row = y + bh * py - radius;  // matching row of A
      for (int px = 0; px < passes_x; ++px) {
         const int s_col = tx + bw * px;         // tile column
         if (s_col >= TILE_SIZE)
            continue;
         const int g_col = x + bw * px - radius; // matching column of A

         // positions outside A contribute zeros
         const bool inside = g_row >= 0 && g_row < A.height &&
                             g_col >= 0 && g_col < A.width;
         smem[s_row][s_col] = inside ? A.elements[g_row * A.width + g_col] : 0.0f;
      }
   }

   // The whole tile must be written before any thread reads it
   __syncthreads();

   // Apply convolution: the window of thread (tx, ty) starts at tile entry (ty, tx)
   float acc = 0.0f;
   for (int i = 0; i < MASK_SIZE; ++i) {
      const int r = ty + i;
      if (r >= TILE_SIZE)
         continue;
      for (int j = 0; j < MASK_SIZE; ++j) {
         const int c = tx + j;
         if (c >= TILE_SIZE)
            continue;
         acc += smem[r][c] * M.elements[i * MASK_SIZE + j];
      }
   }

   // Write output (threads outside the matrix only helped loading the tile)
   if (y < A.height && x < A.width) {
      B.elements[y * B.width + x] = acc;
   }
}

/*
 * Main function: applies a MASK_SIZE x MASK_SIZE gaussian filter to dog.ppm
 * on the CPU (ppm_gaussFilter) and on the GPU (conv2D, one launch per colour
 * channel), prints both elapsed times and writes the two filtered images to
 * output_gaussian.ppm and output_gaussianGPU.ppm.
 * Returns 0 on success, 1 if the image cannot be loaded or host allocation
 * fails.
 */
int main(void) {
   // Load image
   char path[] = "../images/dog.ppm";
   PPM *img = ppm_load(path);
   if (img == NULL) {
      // guard in case ppm_load reports failure with NULL (not visible from here)
      fprintf(stderr, "Cannot load PPM image '%s'\n", path);
      return 1;
   }
   int WIDTH = img->width;
   int HEIGHT = img->height;
   printf("PPM image size (w x h): %d x %d\n", WIDTH, HEIGHT);

   const int nPixels = WIDTH * HEIGHT;
   const size_t imgBytes = (size_t) nPixels * sizeof(float);
   // conv2D reads exactly MASK_SIZE * MASK_SIZE mask coefficients
   const size_t maskBytes = (size_t) MASK_SIZE * MASK_SIZE * sizeof(float);

   // extract channels and set matrices
   Matrix R, G, B;
   R.width = WIDTH; R.height = HEIGHT;
   G.width = WIDTH; G.height = HEIGHT;
   B.width = WIDTH; B.height = HEIGHT;
   R.elements = (float *) malloc(imgBytes);
   G.elements = (float *) malloc(imgBytes);
   B.elements = (float *) malloc(imgBytes);
   if (R.elements == NULL || G.elements == NULL || B.elements == NULL) {
      fprintf(stderr, "Failed to allocate host channel buffers\n");
      free(R.elements);
      free(G.elements);
      free(B.elements);
      return 1;
   }
   color *r = ppm_extract_channel(img, 0); // get red channel
   color *g = ppm_extract_channel(img, 1); // get green channel
   color *b = ppm_extract_channel(img, 2); // get blue channel
   for (int i = 0; i < nPixels; i++) {
      R.elements[i] = (float) r[i];
      G.elements[i] = (float) g[i];
      B.elements[i] = (float) b[i];
   }

   // get gaussian filter mask: its dimensions are those of the mask,
   // not of the image
   // NOTE(review): assumes gaussMask returns MASK_SIZE * MASK_SIZE floats - confirm
   float SIGMA = 10.0f;
   Matrix M;
   M.width = MASK_SIZE; M.height = MASK_SIZE;
   M.elements = gaussMask(MASK_SIZE, SIGMA);

   // Allocate device memory: the outputs get their own buffers because
   // every block reads halo pixels owned by neighbouring blocks, so the
   // convolution must not overwrite its own input
   Matrix d_R, d_G, d_B, d_M;
   Matrix d_R1, d_G1, d_B1;
   d_R.width = d_G.width = d_B.width = WIDTH;
   d_R.height = d_G.height = d_B.height = HEIGHT;
   d_R1.width = d_G1.width = d_B1.width = WIDTH;
   d_R1.height = d_G1.height = d_B1.height = HEIGHT;
   d_M.width = M.width; d_M.height = M.height;
   CHECK(cudaMalloc(&d_R.elements, imgBytes));
   CHECK(cudaMalloc(&d_G.elements, imgBytes));
   CHECK(cudaMalloc(&d_B.elements, imgBytes));
   CHECK(cudaMalloc(&d_R1.elements, imgBytes));
   CHECK(cudaMalloc(&d_G1.elements, imgBytes));
   CHECK(cudaMalloc(&d_B1.elements, imgBytes));
   CHECK(cudaMalloc(&d_M.elements, maskBytes));

   // Copy data to device
   CHECK(cudaMemcpy(d_R.elements, R.elements, imgBytes, cudaMemcpyHostToDevice));
   CHECK(cudaMemcpy(d_G.elements, G.elements, imgBytes, cudaMemcpyHostToDevice));
   CHECK(cudaMemcpy(d_B.elements, B.elements, imgBytes, cudaMemcpyHostToDevice));
   CHECK(cudaMemcpy(d_M.elements, M.elements, maskBytes, cudaMemcpyHostToDevice));

   /***********************************************************/
   /*                    conv2D on host                       */
   /***********************************************************/
   printf("\nCPU procedure...\n");
   double start = seconds();
   PPM *img_filtered = ppm_make(WIDTH, HEIGHT, (pel) {0,0,0}); // create a new image
   ppm_gaussFilter(img, img_filtered, MASK_SIZE, SIGMA);
   double stopCPU = seconds() - start;
   // written after the timer stops so that disk I/O is not measured
   ppm_write(img_filtered, "output_gaussian.ppm");
   printf("   Host elapsed time: %f\n", stopCPU);

   /***********************************************************/
   /*                  GPU conv2D with smem                   */
   /***********************************************************/
   printf("\nGPU conv2D with smem...\n");
   dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
   dim3 dimGrid((WIDTH + BLOCK_SIZE - 1) / BLOCK_SIZE, (HEIGHT + BLOCK_SIZE - 1) / BLOCK_SIZE);
   start = seconds();
   conv2D<<<dimGrid, dimBlock>>>(d_R, d_R1, d_M);
   conv2D<<<dimGrid, dimBlock>>>(d_G, d_G1, d_M);
   conv2D<<<dimGrid, dimBlock>>>(d_B, d_B1, d_M);
   CHECK(cudaGetLastError());   // launch errors are only reported here
   CHECK(cudaDeviceSynchronize());
   double stopGPU = seconds() - start;
   printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU, stopCPU / stopGPU);

   // Copy data back to host (the host channel buffers are reused for the results)
   CHECK(cudaMemcpy(R.elements, d_R1.elements, imgBytes, cudaMemcpyDeviceToHost));
   CHECK(cudaMemcpy(G.elements, d_G1.elements, imgBytes, cudaMemcpyDeviceToHost));
   CHECK(cudaMemcpy(B.elements, d_B1.elements, imgBytes, cudaMemcpyDeviceToHost));
   for (int i = 0; i < nPixels; i++) {
      r[i] = (color) R.elements[i];
      g[i] = (color) G.elements[i];
      b[i] = (color) B.elements[i];
   }

   // rebuild the image from the GPU-filtered channels and save it
   PPM *ppm_filtered = ppm_combine_channels(r, g, b, WIDTH, HEIGHT);
   ppm_write(ppm_filtered, "output_gaussianGPU.ppm");

   // Free device memory
   CHECK(cudaFree(d_R.elements));
   CHECK(cudaFree(d_G.elements));
   CHECK(cudaFree(d_B.elements));
   CHECK(cudaFree(d_R1.elements));
   CHECK(cudaFree(d_G1.elements));
   CHECK(cudaFree(d_B1.elements));
   CHECK(cudaFree(d_M.elements));

   // Free the host buffers allocated in this function (buffers returned by
   // the PPM helpers are left alone: their allocator is not visible here)
   free(R.elements);
   free(G.elements);
   free(B.elements);

   return 0;
}

↘️ **TODO...**