---
# **LAB 2 - Modello di programmazione CUDA**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

GPU computing notebooks download (from github)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

To also use a plugin for C++ syntax highlighting...

In [None]:
!wget -O cpp_plugin.py https://gist.github.com/akshaykhadse/7acc91dd41f52944c6150754e5530c4b/raw/cpp_plugin.py
%load_ext cpp_plugin

In [None]:
# get images
%mkdir -p images
!wget -P images/ http://gpu.di.unimi.it/images/dog.bmp
!wget -P images/ http://gpu.di.unimi.it/images/julia_jet.bmp

# ✅ Sum of vectors

In [None]:
%%cuda
#include <stdio.h>

/*
 * CHECK(call): evaluate a CUDA runtime call and, when it does not return
 * cudaSuccess, print the file, line, numeric code and error string to stderr.
 * The macro only reports the failure; it does not terminate the program.
 * The do { } while (0) wrapper makes `CHECK(x);` a single statement, so it is
 * safe inside an unbraced if/else; `call` is parenthesised so that any
 * expression can be passed.
 */
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t error = (call);                                          \
    if (error != cudaSuccess) {                                                \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
    }                                                                          \
} while (0)

/**
 * CUDA kernel: element-wise vector sum C = A + B over numElements floats.
 * Each thread handles the element at its global 1D index; threads whose
 * index is not below numElements do nothing.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
	const int gid = blockDim.x * blockIdx.x + threadIdx.x;

	// guard: the grid may contain more threads than elements
	if (gid >= numElements)
		return;

	C[gid] = A[gid] + B[gid];
}

/**
 * MAIN: adds two vectors of N random floats on the GPU and checks the result
 * against the same sum computed on the host.
 * Exits with EXIT_FAILURE when a host allocation fails, when the kernel
 * launch is rejected, or when an element of the result is wrong.
 */
int main(void) {
	// Print the vector length to be used, and compute its size
	int N = 50000;
	size_t size = N * sizeof(float);
	printf("[Vector addition of %d elements]\n", N);

	// Allocate the host vectors A, B, C
	float *h_A = (float *) malloc(size);
	float *h_B = (float *) malloc(size);
	float *h_C = (float *) malloc(size);
	if (h_A == NULL || h_B == NULL || h_C == NULL) {
		fprintf(stderr, "Failed to allocate host vectors!\n");
		exit (EXIT_FAILURE);
	}

	// Initialize the host input vectors
	for (int i = 0; i < N; ++i) {
		h_A[i] = rand() % 10;
		h_B[i] = rand() % 10;
	}

	// Allocate the device vectors A, B, C
	float *d_A = NULL;
	CHECK(cudaMalloc((void **) &d_A, size));
	float *d_B = NULL;
	CHECK(cudaMalloc((void **) &d_B, size));
	float *d_C = NULL;
	CHECK(cudaMalloc((void **) &d_C, size));

	// Copy the host input vectors A and B in device memory
	printf("Copy input data from the host memory to the CUDA device\n");
	CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

	// Launch the Vector Add CUDA Kernel (block count rounded up to cover N)
	int threadsPerBlock = 256;
	int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
	printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
	vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

	// A kernel launch returns no status: the launch error has to be fetched
	// explicitly and stored, otherwise the test below can never fire
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
				cudaGetErrorString(err));
		exit (EXIT_FAILURE);
	}

	// Copy the device result vector in host memory
	// (this blocking copy also waits for the kernel to finish)
	printf("Copy output data from the CUDA device to the host memory\n");
	CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

	// Verify that the result vector is correct
	for (int i = 0; i < N; ++i) {
		if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
			fprintf(stderr, "Result verification failed at element %d!\n", i);
			exit (EXIT_FAILURE);
		}
	}

	printf("Test PASSED\n");

	// Free device global memory
	CHECK(cudaFree(d_A));
	CHECK(cudaFree(d_B));
	CHECK(cudaFree(d_C));

	// Free host memory
	free(h_A);
	free(h_B);
	free(h_C);

	printf("Done\n");
	return 0;
}



# ✅ Sum of matrices

In [None]:
%%cuda

#include <stdio.h>

/*
 * CHECK(call): evaluate a CUDA runtime call and, when it does not return
 * cudaSuccess, print the file, line, numeric code and error string to stderr.
 * The macro only reports the failure; it does not terminate the program.
 * The do { } while (0) wrapper makes `CHECK(x);` a single statement, so it is
 * safe inside an unbraced if/else; `call` is parenthesised so that any
 * expression can be passed.
 */
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t error = (call);                                          \
    if (error != cudaSuccess) {                                                \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
    }                                                                          \
} while (0)

/*
 * Fill ip[0 .. size-1] with pseudo-random values: the low 8 bits of rand()
 * divided by 100, i.e. numbers between 0.00 and 2.55.
 * The ival parameter is accepted but not used.
 */
void initialData(float *ip, const float ival, int size)
{
    float *cursor = ip;
    int remaining = size;

    while (remaining-- > 0)
        *cursor++ = (float)(rand() & 0xFF) / 100.0f;
}


// 2D grid of 2D blocks: each thread adds one element of two nx-by-ny matrices
// stored row by row (MatC = MatA + MatB). Threads that fall outside the
// matrix return without touching memory.
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny) {
    const unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int row = threadIdx.y + blockIdx.y * blockDim.y;

    if (col >= nx || row >= ny)
        return;

    const unsigned int offset = row * nx + col;   // linear index of (row, col)
    MatC[offset] = MatA[offset] + MatB[offset];
}


/*
 * MAIN: sums two nx-by-ny float matrices on the GPU with sumMatrixOnGPU2D,
 * copies the result back and compares it with the sum computed on the host.
 * Every CUDA runtime call and the kernel launch are passed through CHECK so
 * that failures are reported instead of being silently ignored.
 * Returns 0 when the two results match, 1 otherwise or on allocation failure.
 */
int main(int argc, char **argv) {

    // set up device
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // set up data size of matrix
    int nx = 1 << 14;
    int ny = 1 << 14;

    int nxy = nx * ny;
    // byte count kept in size_t: it is 1 GiB here and would not fit an int
    // for slightly larger matrices
    size_t nBytes = (size_t)nxy * sizeof(float);
    printf("Matrix size: nx %d ny %d\n", nx, ny);

    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        fprintf(stderr, "Cannot allocate host memory for the matrices\n");
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return (1);
    }

    // initialize data at host side
    initialData(h_A,  2.0f, nxy);
    initialData(h_B,  0.5f, nxy);

    // device mem allocation
    float *d_MatA = NULL, *d_MatB = NULL, *d_MatC = NULL;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMalloc((void **)&d_MatB, nBytes));
    CHECK(cudaMalloc((void **)&d_MatC, nBytes));

    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // 32x32 threads per block; grid rounded up to cover the whole matrix
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    CHECK(cudaGetLastError());        // errors in the launch configuration
    CHECK(cudaDeviceSynchronize());   // errors raised while the kernel runs

    // bring the GPU result back and compare it with the host sum
    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    const float epsilon = 1.0e-8f;
    int match = 1;
    for (int i = 0; i < nxy; i++) {
        hostRef[i] = h_A[i] + h_B[i];
        float diff = hostRef[i] - gpuRef[i];
        if (diff > epsilon || diff < -epsilon) {
            match = 0;
            printf("Arrays do not match at element %d: host %f gpu %f\n",
                   i, hostRef[i], gpuRef[i]);
            break;
        }
    }
    if (match)
        printf("Arrays match.\n");

    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return (match ? 0 : 1);
}


# ✅ Blocks and grids

**Grid 1D**: prints DIMs and IDs of grid, block and thread


In [None]:
%%cuda

#include <stdio.h>

/*
 * Kernel: every thread prints its thread and block indices together with the
 * block and grid dimensions (x, y and z component of each).
 */
__global__ void checkIndex(void) {
	const uint3 tid = threadIdx;
	const uint3 bid = blockIdx;
	const dim3 bdim = blockDim;
	const dim3 gdim = gridDim;

	printf("threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) "
					"blockDim:(%d, %d, %d) gridDim:(%d, %d, %d)\n",
					tid.x, tid.y, tid.z,
					bid.x, bid.y, bid.z,
					bdim.x, bdim.y, bdim.z,
					gdim.x, gdim.y, gdim.z);
}

/*
 * MAIN: prints the grid/block dimensions from the host, then launches
 * checkIndex on a 1D grid of 3 blocks with 4 threads each so that every
 * thread prints its own indices.
 * Returns 1 when the kernel launch or its execution fails, 0 otherwise.
 */
int main(int argc, char **argv) {

	// grid and block definition
	dim3 block(4);
	dim3 grid(3);

	// Print from host
	printf("Print from host:\n");
	printf("grid.x = %d\t grid.y = %d\t grid.z = %d\n", grid.x, grid.y, grid.z);
	printf("block.x = %d\t block.y = %d\t block.z %d\n\n", block.x, block.y, block.z);

	// Print from device
	printf("Print from device:\n");
	checkIndex<<<grid, block>>>();

	// a launch returns no status: fetch the launch error first, then wait for
	// the kernel so that execution errors are reported too
	cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
		status = cudaDeviceSynchronize();
	if (status != cudaSuccess) {
		fprintf(stderr, "checkIndex failed: %s\n", cudaGetErrorString(status));
		cudaDeviceReset();
		return(1);
	}

	// reset device
	cudaDeviceReset();
	return(0);
}

### ↘️ *`TODO...`*

Definire un kernel con block 2D e grid 2D che:

1. usando ID di thread e block calcola la seguente espressione `s = threadIdx.x * blockDim.x + threadIdx.y * blockDim.y + blockIdx.x + blockIdx.y`
2. stampa `sum =  2  <--> threadIdx:(*,*), blockIdx:(*, *), blockDim:(*, *)` se `s` è un numero della sequenza di Fibonacci


NB: Sequenza di Fibonacci ([Fibonacci-wikipedia](https://it.wikipedia.org/wiki/Successione_di_Fibonacci))
$$
\begin{align}
s_0 &= 0,\\
s_1 &= 1,\\
s_{n}&=s_{{n-1}}+s_{{n-2}},\quad \text{(per ogni $n>1$)}
\end{align}
$$


In [None]:
%%cuda
#include <stdio.h>

/*
 * Kernel for a 2D grid of 2D blocks. Each thread computes
 *   sum = threadIdx.x * blockDim.x + threadIdx.y * blockDim.y + blockIdx.x + blockIdx.y
 * and prints its indices when sum is 0 or a term of the Fibonacci sequence.
 */
__global__ void checkIndex(void) {
  const uint tx = threadIdx.x, ty = threadIdx.y;
  const uint bx = blockIdx.x, by = blockIdx.y;
  const uint bxd = blockDim.x, byd = blockDim.y;
  const uint sum = tx * bxd + ty * byd + bx + by;

  // walk the terms 1, 2, 3, 5, 8, ... and stop at the first one that is >= sum
  int prev = 1;
  int curr = 1;
  for (; curr < sum; ) {
    const int next = prev + curr;
    prev = curr;
    curr = next;
  }

  // sum is a Fibonacci number when the walk stopped exactly on it (0 is handled apart)
  const bool isFibonacci = (sum == curr) || (sum == 0);
  if (isFibonacci)
    printf("sum = %2d  <--> threadIdx:(%d, %d), blockIdx:(%d, %d), blockDim:(%d, %d)\n", sum, tx, ty, bx, by, bxd, byd);
}

/*
 * MAIN: prints the grid/block dimensions from the host, then launches
 * checkIndex on a 3x3 grid of 5x5 blocks.
 * Returns 1 when the kernel launch or its execution fails, 0 otherwise.
 */
int main(int argc, char **argv) {

	// grid and block structure
	dim3 block(5,5);
	dim3 grid(3,3);

	// Print from host
	printf("Print from host:\n");
	printf("grid.x = %d\t grid.y = %d\t grid.z = %d\n", grid.x, grid.y, grid.z);
	printf("block.x = %d\t block.y = %d\t block.z %d\n\n", block.x, block.y, block.z);

	// Print from device
	printf("Print from device:\n");
	checkIndex<<<grid, block>>>();

	// a launch returns no status: fetch the launch error first, then wait for
	// the kernel so that execution errors are reported too
	cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
		status = cudaDeviceSynchronize();
	if (status != cudaSuccess) {
		fprintf(stderr, "checkIndex failed: %s\n", cudaGetErrorString(status));
		cudaDeviceReset();
		return (1);
	}

	// reset device
	cudaDeviceReset();
	return (0);
}

Template:

In [None]:
%%cuda
#include <stdio.h>

/*
 * Exercise stub: kernel meant to be launched on a 2D grid of 2D blocks.
 * The body is intentionally left empty for the student to fill in.
 */
__global__ void checkIndex(void) {

  // TODO: compute
  //   s = threadIdx.x * blockDim.x + threadIdx.y * blockDim.y + blockIdx.x + blockIdx.y
  // and print the thread/block IDs when s is a Fibonacci number

}

/*
 * Exercise stub for the host side: it does not compile until `grid` and
 * `block` are defined and the kernel launch is added.
 */
int main(int argc, char **argv) {

	// grid and block structure
	// TODO: define `dim3 block(...)` and `dim3 grid(...)`; the printf calls below read them

	// Print from host
	printf("Print from host:\n");
	printf("grid.x = %d\t grid.y = %d\t grid.z = %d\n", grid.x, grid.y, grid.z);
	printf("block.x = %d\t block.y = %d\t block.z %d\n\n", block.x, block.y, block.z);

	// Print from device
	// TODO: launch checkIndex<<<grid, block>>>() here

	// reset device
	cudaDeviceReset();
	return (0);
}

# ✅ Image flip - CPU (multithreading)

In [None]:
%%cpp -n src/ImageStuff.h -s xcode

/*
 * ImageStuff.h - declarations shared by the BMP reader/writer and the
 * image-flip programs.
 */
#ifndef _IMAGESTUFF_H
#define _IMAGESTUFF_H

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

/* Properties of the image currently loaded (filled in by ReadBMP). */
struct ImgProp {
	int Hpixels;                    // image width in pixels
	int Vpixels;                    // image height in pixels (number of rows)
	unsigned char HeaderInfo[54];   // copy of the 54-byte BMP header, re-used when writing
	unsigned long int Hbytes;       // bytes per row: 3 * Hpixels rounded up to a multiple of 4
};

/* One pixel; used as a temporary while swapping pixels. */
struct Pixel {
	unsigned char R;
	unsigned char G;
	unsigned char B;
};

typedef unsigned char pel;    // pixel element

pel** ReadBMP(char*);         // Load a BMP image
void WriteBMP(pel**, char*);  // Store a BMP image

// the single ImgProp instance; defined by the program that includes this header
extern struct ImgProp ip;

#endif

In [None]:
%%cpp -n src/ImageStuff.c -s xcode

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#include "ImageStuff.h"

/*
 * Decode the 32-bit little-endian integer stored in the 4 bytes at p.
 * Reading byte by byte avoids the unaligned int access of a pointer cast.
 */
static int ReadLE32(const pel* p) {
	unsigned int v = (unsigned int) p[0]
			| ((unsigned int) p[1] << 8)
			| ((unsigned int) p[2] << 16)
			| ((unsigned int) p[3] << 24);
	return (int) v;
}

/*
 * Load a BMP image.
 *
 * Reads the 54-byte header, stores width, height, padded row size and the
 * header itself in the global `ip`, then reads the image one row at a time.
 * The row size is computed as 3 bytes per pixel rounded up to a multiple of 4
 * (assumes a 24-bit uncompressed BMP whose pixel data starts right after the
 * 54-byte header - TODO confirm for other inputs).
 *
 * filename: path of the BMP file to read.
 * Returns an array of ip.Vpixels row pointers, each row holding ip.Hbytes
 * bytes; the caller must free() every row and then the array.
 * Prints a message and calls exit(1) when the file cannot be opened or read,
 * when the size in the header is not positive, or when memory runs out.
 */
pel** ReadBMP(char* filename) {
	FILE* f = fopen(filename, "rb");
	if (f == NULL) {
		printf("\n\n%s NOT FOUND\n\n", filename);
		exit(1);
	}

	pel HeaderInfo[54];
	if (fread(HeaderInfo, sizeof(pel), 54, f) != 54) { // read the 54-byte header
		printf("\n\n%s: CANNOT READ THE BMP HEADER\n\n", filename);
		fclose(f);
		exit(1);
	}

	// extract image height and width from header (offsets 18 and 22)
	int width = ReadLE32(&HeaderInfo[18]);
	int height = ReadLE32(&HeaderInfo[22]);
	if (width <= 0 || height <= 0) {
		// a non-positive size would turn into a huge allocation below
		printf("\n\n%s: UNSUPPORTED IMAGE SIZE (%d x %d)\n\n", filename, width, height);
		fclose(f);
		exit(1);
	}

	//copy header for re-use
	for (int i = 0; i < 54; i++)
		ip.HeaderInfo[i] = HeaderInfo[i];

	ip.Vpixels = height;
	ip.Hpixels = width;
	int RowBytes = (width * 3 + 3) & (~3);   // rows are padded to a multiple of 4 bytes
	ip.Hbytes = RowBytes;

	printf("\n   Input BMP File name: %20s  (%d x %d)", filename, ip.Hpixels, ip.Vpixels);

	pel **TheImage = (pel **) malloc(height * sizeof(pel*));
	if (TheImage == NULL) {
		printf("\n\n%s: OUT OF MEMORY\n\n", filename);
		fclose(f);
		exit(1);
	}

	// allocate and fill one row at a time
	for (int i = 0; i < height; i++) {
		TheImage[i] = (pel *) malloc(RowBytes * sizeof(pel));
		if (TheImage[i] == NULL) {
			printf("\n\n%s: OUT OF MEMORY\n\n", filename);
			fclose(f);
			exit(1);
		}
		if (fread(TheImage[i], sizeof(unsigned char), RowBytes, f) != (size_t) RowBytes) {
			printf("\n\n%s: IMAGE DATA IS TRUNCATED\n\n", filename);
			fclose(f);
			exit(1);
		}
	}

	fclose(f);
	return TheImage;  // remember to free() it in caller!
}

/*
 * Store a BMP image.
 *
 * Writes the 54-byte header saved in the global `ip`, followed by
 * ip.Vpixels rows of ip.Hbytes bytes each taken from img.
 *
 * img:      array of row pointers as returned by ReadBMP.
 * filename: path of the BMP file to create.
 * Prints a message and calls exit(1) when the file cannot be created or a
 * write fails.
 */
void WriteBMP(pel** img, char* filename) {
	FILE* f = fopen(filename, "wb");
	if (f == NULL) {
		printf("\n\nFILE CREATION ERROR: %s\n\n", filename);
		exit(1);
	}

	//write header
	int ok = (fwrite(ip.HeaderInfo, sizeof(pel), 54, f) == 54);

	//write data, one whole row per call instead of one byte per call
	for (int row = 0; ok && row < ip.Vpixels; row++)
		ok = (fwrite(img[row], sizeof(pel), ip.Hbytes, f) == ip.Hbytes);

	if (!ok) {
		printf("\n\nFILE WRITE ERROR: %s\n\n", filename);
		fclose(f);
		exit(1);
	}

	printf("\n  Output BMP File name: %20s  (%d x %d)", filename, ip.Hpixels,
			ip.Vpixels);

	fclose(f);
}

In [None]:
%%cpp -n src/Imflip.c -s xcode

#include "ImageStuff.h"

struct ImgProp ip;

/*
 * Flip the image vertically, in place: row k is exchanged with row
 * ip.Vpixels - 1 - k, one pixel (3 bytes) at a time.
 *
 * The column loop stops at 3 * ip.Hpixels, the end of the real pixel data.
 * Using the padded row size ip.Hbytes as the bound would touch bytes past
 * the end of each row whenever the padding is 1 or 2 bytes.
 *
 * img: array of row pointers as returned by ReadBMP.
 * Returns img.
 */
pel** FlipImageV(pel** img) {
	struct Pixel pix; //temp swap pixel
	int row, col;
	const int rowLen = ip.Hpixels * 3;   // bytes of pixel data per row, padding excluded

	//vertical flip
	for (col = 0; col < rowLen; col += 3) {
		row = 0;
		while (row < ip.Vpixels / 2) {
			pel *upper = img[row] + col;
			pel *lower = img[ip.Vpixels - (row + 1)] + col;   // mirrored row

			pix.B = upper[0];
			pix.G = upper[1];
			pix.R = upper[2];

			upper[0] = lower[0];
			upper[1] = lower[1];
			upper[2] = lower[2];

			lower[0] = pix.B;
			lower[1] = pix.G;
			lower[2] = pix.R;

			row++;
		}
	}
	return img;
}

/*
 * Flip the image horizontally, in place: inside every row the pixel at byte
 * offset `left` is exchanged with the one that starts at rowLen - (left + 3).
 *
 * img: array of row pointers as returned by ReadBMP.
 * Returns img.
 */
pel** FlipImageH(pel** img) {
	const int rowLen = ip.Hpixels * 3;   // bytes of pixel data per row
	const int half = rowLen / 2;

	for (int r = 0; r < ip.Vpixels; r++) {
		pel *line = img[r];

		for (int left = 0; left < half; left += 3) {
			const int right = rowLen - (left + 3);   // start of the mirrored pixel
			struct Pixel saved;

			// remember the left pixel
			saved.B = line[left];
			saved.G = line[left + 1];
			saved.R = line[left + 2];

			// right pixel -> left position
			line[left] = line[right];
			line[left + 1] = line[right + 1];
			line[left + 2] = line[right + 2];

			// saved left pixel -> right position
			line[right] = saved.B;
			line[right + 1] = saved.G;
			line[right + 2] = saved.R;
		}
	}
	return img;
}

/*
 * MAIN: reads a BMP image, flips it vertically or horizontally, writes the
 * result and prints the time spent in the flip.
 *
 * Usage: imflip input output V|H   (the option is case-insensitive)
 * Returns 0 in every case, as before; an invalid option prints a message and
 * writes no output file.
 */
int main(int argc, char** argv) {
	if (argc < 4) {
		printf("\n\nUsage: imflipPh [input] [output] [V | H]");
		printf("\n\nExample: imflipPh square.bmp square_h.bmp h\n\n");
		return 0;
	}

	pel** data = ReadBMP(argv[1]);
	double timer = 0.0;
	clock_t start, stop;
	int valid = 1;

	start = clock();
	switch (argv[3][0]) {
	case 'v':
	case 'V':
		data = FlipImageV(data);
		break;
	case 'h':
	case 'H':
		data = FlipImageH(data);
		break;
	default:
		valid = 0;
		break;
	}
	stop = clock();

	if (valid) {
		timer = ((double)(stop-start))/(double)CLOCKS_PER_SEC;

		// merge with header and write to file
		WriteBMP(data, argv[2]);
	}

	// free() the allocated memory for the image (also on the invalid-option path)
	for (int i = 0; i < ip.Vpixels; i++)
		free(data[i]);
	free(data);

	if (!valid) {
		printf("\nINVALID OPTION\n");
		return 0;
	}

	printf("\n\nTotal execution time: %9.4f sec", timer);
	// timer is in seconds: 1e9 converts it to nanoseconds (1e6 would give microseconds)
	printf(" (%7.3f ns per pixel)\n", 1e9 * timer / ((double) ip.Hpixels * (double) ip.Vpixels));

	return 0;
}

In [None]:
!gcc src/ImageStuff.c src/Imflip.c -o imflip

In [None]:
!./imflip images/dog.bmp dogV.bmp V
!./imflip images/dog.bmp dogH.bmp H

In [None]:
!./imflip images/julia_jet.bmp julia_jetV.bmp V
!./imflip images/julia_jet.bmp julia_jetH.bmp H

Librerie python per lettura/scrittura file di immagini e loro display: [openCV](https://docs.opencv.org/master/index.html) e [matplotlib](https://matplotlib.org/). Le immagini vengono rappresentate come array multidimensionali tratti dalla libreria fondamentale per il calcolo scientifico [NumPy](https://numpy.org/)

In [None]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

# cv.imread gives a NumPy array: row (height) x column (width) x color (3)
dog = cv.imread('/content/GPUcomputing/images/dog.bmp')
print('Image size: ', dog.shape)

# OpenCV loads the channels as BGR, matplotlib expects RGB
dog = cv.cvtColor(dog, cv.COLOR_BGR2RGB)
dogV = cv.cvtColor(cv.imread('dogV.bmp'), cv.COLOR_BGR2RGB)
dogH = cv.cvtColor(cv.imread('dogH.bmp'), cv.COLOR_BGR2RGB)

# original, vertical flip, horizontal flip
for picture in (dog, dogV, dogH):
    plt.imshow(picture)
    plt.show()

Using Pthreads lib...

In [None]:
%%cpp -n src/ImflipPth.c -s xcode

#include <pthread.h>
#include <stdint.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include "ImageStuff.h"

#define MAXTHREADS   128               // capacity of the ThParam / ThHandle arrays below

int NumThreads;         		       // Total number of threads working in parallel
int ThParam[MAXTHREADS];		       // Thread parameters: slot i holds the index given to thread i
pthread_t ThHandle[MAXTHREADS];	   // Thread handles
pthread_attr_t ThAttr;			       // Pthread attributes
void (*FlipFunc)(pel** img);	     // Function pointer to a single-threaded flip (not assigned in this file)
void* (*MTFlipFunc)(void *arg);	   // Function pointer to flip the image, multi-threaded version
pel** TheImage;       			       // This is the main image
struct ImgProp ip;                 // properties of the loaded image (declared extern in ImageStuff.h)

// multi-threaded version
void *MTFlipV(void* tid) {
	struct Pixel pix; //temp swap pixel
	int row, col;

	long ts = *((int *) tid);                 	// My thread ID is stored here
	ts *= ip.Hbytes / NumThreads;               // start index
	long te = ts + ip.Hbytes / NumThreads - 1; 	// end index

	for (col = ts; col <= te; col += 3) {
		row = 0;
		while (row < ip.Vpixels / 2) {
			pix.B = TheImage[row][col];
			pix.G = TheImage[row][col + 1];
			pix.R = TheImage[row][col + 2];

			TheImage[row][col] = TheImage[ip.Vpixels - (row + 1)][col];
			TheImage[row][col + 1] = TheImage[ip.Vpixels - (row + 1)][col + 1];
			TheImage[row][col + 2] = TheImage[ip.Vpixels - (row + 1)][col + 2];

			TheImage[ip.Vpixels - (row + 1)][col] = pix.B;
			TheImage[ip.Vpixels - (row + 1)][col + 1] = pix.G;
			TheImage[ip.Vpixels - (row + 1)][col + 2] = pix.R;

			row++;
		}
	}
	pthread_exit(0);
}

// multi-threaded version
void *MTFlipH(void* tid) {
	struct Pixel pix; //temp swap pixel
	int row, col;

	long ts = *((int *) tid);         	// My thread ID is stored here
	ts *= ip.Vpixels / NumThreads;			// start index
	long te = ts + ip.Vpixels / NumThreads - 1; 	// end index

	for (row = ts; row <= te; row++) {
		col = 0;
		while (col < ip.Hpixels * 3 / 2) {
			pix.B = TheImage[row][col];
			pix.G = TheImage[row][col + 1];
			pix.R = TheImage[row][col + 2];

			TheImage[row][col] = TheImage[row][ip.Hpixels * 3 - (col + 3)];
			TheImage[row][col + 1] = TheImage[row][ip.Hpixels * 3 - (col + 2)];
			TheImage[row][col + 2] = TheImage[row][ip.Hpixels * 3 - (col + 1)];

			TheImage[row][ip.Hpixels * 3 - (col + 3)] = pix.B;
			TheImage[row][ip.Hpixels * 3 - (col + 2)] = pix.G;
			TheImage[row][ip.Hpixels * 3 - (col + 1)] = pix.R;

			col += 3;
		}
	}
	pthread_exit(NULL);
}

// MAIN
int main(int argc, char** argv) {
	char Flip;
	int a, i, ThErr;
	struct timeval t;
	double StartTime, EndTime;
	double TimeElapsed;

	switch (argc) {
	case 3:
		NumThreads = 1;
		Flip = 'V';
		break;
	case 4:
		NumThreads = 1;
		Flip = toupper(argv[3][0]);
		break;
	case 5:
		NumThreads = atoi(argv[4]);
		Flip = toupper(argv[3][0]);
		break;
	default:
		printf("\n\nUsage: imflipP input output [v/h] [thread count]");
		printf("\n\nExample: imflipP infilename.bmp outname.bmp h 8\n\n");
		return 0;
	}

  printf("\nExecuting the multi-threaded version with %d threads ...\n", NumThreads);
  MTFlipFunc = (Flip == 'V') ? MTFlipV : MTFlipH;


	// load image
	TheImage = ReadBMP(argv[1]);

	gettimeofday(&t, NULL);
	StartTime = (double) t.tv_sec * 1000000.0 + ((double) t.tv_usec);

	// loop
	int R = 10;
	for (int round = 0; round < R; round++) {
		pthread_attr_init(&ThAttr);
		pthread_attr_setdetachstate(&ThAttr, PTHREAD_CREATE_JOINABLE);
		for (i = 0; i < NumThreads; i++) {
			ThParam[i] = i;
			ThErr = pthread_create(&ThHandle[i], &ThAttr, MTFlipFunc, (void *) &ThParam[i]);
			if (ThErr != 0) {
				printf("\nThread Creation Error %d. Exiting abruptly... \n", ThErr);
				exit(EXIT_FAILURE);
			}
		}
		pthread_attr_destroy(&ThAttr);
		for (i = 0; i < NumThreads; i++) {
			pthread_join(ThHandle[i], NULL);
		}
	}
	gettimeofday(&t, NULL);
	EndTime = (double) t.tv_sec * 1000000.0 + ((double) t.tv_usec);
	TimeElapsed = (EndTime - StartTime) / (R * 1000000.00);

	//merge with header and write to file
	WriteBMP(TheImage, argv[2]);

	// free() the allocated memory for the image
	for (i = 0; i < ip.Vpixels; i++) {
		free(TheImage[i]);
	}
	free(TheImage);

	printf("\n\nTotal execution time: %9.4f sec (%s flip)", TimeElapsed,
			Flip == 'V' ? "Vertical" : "Horizontal");
	printf(" (%6.3f ns/pixel)\n",
			1000000 * TimeElapsed / (double) (ip.Hpixels * ip.Vpixels));

	return (EXIT_SUCCESS);
}

In [None]:
!gcc -o imflipPth src/ImageStuff.c src/ImflipPth.c -pthread

In [None]:
!./imflipPth images/dog.bmp dogV.bmp V 4

In [None]:
!./imflipPth images/julia_jet.bmp julia_jetV.bmp V 4
!./imflipPth images/julia_jet.bmp julia_jetH.bmp H 4

In [None]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

# cv.imread gives a NumPy array: row (height) x column (width) x color (3)
julia_jet = cv.imread('images/julia_jet.bmp')
print('Image size: ', julia_jet.shape)

# OpenCV loads the channels as BGR, matplotlib expects RGB
julia_jet = cv.cvtColor(julia_jet, cv.COLOR_BGR2RGB)
julia_jetV = cv.cvtColor(cv.imread('julia_jetV.bmp'), cv.COLOR_BGR2RGB)
julia_jetH = cv.cvtColor(cv.imread('julia_jetH.bmp'), cv.COLOR_BGR2RGB)

# original, vertical flip, horizontal flip
for picture in (julia_jet, julia_jetV, julia_jetH):
    plt.imshow(picture)
    plt.show()

### ↘️ *`TODO...`*

Individuare il numero di pthread che dà la prestazione migliore in termini di tempo impiegato.

```
best #pthread =
```

# ✅ Image flip - GPU

In [None]:
%%cuda_group_save --name "bmpUtil.h" --group "BMP"

/*
 * bmpUtil.h - types and helpers for BMP images stored as one linear array.
 */
#ifndef _BPMUTIL_H
#define _BPMUTIL_H

/*
 * Properties of the loaded image (filled in by ReadBMPlin).
 * NOTE(review): `img` is defined here, in the header, so including this file
 * from more than one translation unit would define it more than once - confirm
 * it is only ever included by a single source file.
 */
struct imgBMP {
	int width;                    // image width in pixels
	int height;                   // image height in pixels (number of rows)
	unsigned char headInfo[54];   // copy of the 54-byte BMP header, re-used when writing
	unsigned long int rowByte;    // bytes per row: 3 * width rounded up to a multiple of 4
} img;

// shorthands for the fields of the global `img`
#define	WIDTHB		img.rowByte
#define	WIDTH		  img.width
#define	HEIGHT		img.height
#define	IMAGESIZE	(WIDTHB*HEIGHT)   // size of the pixel data in bytes

/* One pixel made of three byte components. */
struct pixel {
	unsigned char R;
	unsigned char G;
	unsigned char B;
};

typedef unsigned long ulong;
typedef unsigned int uint;
typedef unsigned char pel;    // pixel element

pel *ReadBMPlin(char*);         // Load a BMP image
void WriteBMPlin(pel *, char*); // Store a BMP image

#endif

In [None]:
%%cuda_group_save --name "common.h" --group "BMP"

#ifndef _COMMON_H
#define _COMMON_H

#include <sys/time.h>

/*
 * Current wall-clock time from gettimeofday(), in seconds, with the
 * microsecond part as the fraction. Take the difference of two calls to
 * time a code section.
 */
inline double seconds() {
  struct timeval now;
  struct timezone zone;
  gettimeofday(&now, &zone);

  const double whole = (double)now.tv_sec;
  const double fraction = (double)now.tv_usec * 1.e-6;
  return (whole + fraction);
}

#endif

### ↘️ *`TODO...`*

In [None]:
%%cuda_group_save --name "ImgFlipGPU.cu" --group "BMP"

#include <stdio.h>
#include <stdlib.h>
#include "bmpUtil.h"
#include "common.h"


/*
 * Kernel 1D that flips the given image vertically:
 * each thread copies a single pixel (3 bytes) from row r of the source to
 * row h - 1 - r of the destination.
 *
 * Expected launch: a 1D grid of h * m blocks, where m = ceil(w / blockDim.x)
 * is the number of blocks that cover one row. Block i therefore works on row
 * i / m, and the threads past the end of the row do nothing.
 *
 * imgDst: destination image (device memory), same layout as imgSrc
 * imgSrc: source image (device memory), rows padded to a multiple of 4 bytes
 * w, h:   image width and height in pixels
 */
__global__ void VflipGPU(pel *imgDst, const pel *imgSrc, const uint w, const uint h) {
	// ** pixel granularity **
	uint i = blockIdx.x;               // block ID
	uint j = threadIdx.x;              // thread ID
	uint b = blockDim.x;               // block dim
	uint m = (w + b - 1) / b;          // num of blocks in a row
	uint r = i / m;                    // row of the source pixel
	// col of the source pixel: the thread index minus the m * b threads used
	// by each previous row. Subtracting r * w instead is only right when w is
	// a multiple of b.
	uint c = (i - r * m) * b + j;

	if (r >= h || c >= w)              // row or col out of range
		return;

	//  ** byte granularity **
	uint s = (w * 3 + 3) & (~3);       // num bytes x row (mult. 4)
	uint r1 = h - 1 - r;               // dest. row (mirror)
	uint p = s * r + 3*c;              // src byte position of the pixel
	uint q = s * r1 + 3*c;             // dst byte position of the pixel

	// copy the three bytes of the pixel
	imgDst[q] = imgSrc[p];
	imgDst[q + 1] = imgSrc[p + 1];
	imgDst[q + 2] = imgSrc[p + 2];
}

/*
 *  Kernel that flips the given image horizontally:
 *  each thread copies a single pixel (3 bytes) from column c of the source
 *  to column width - 1 - c of the same row in the destination.
 *  Every image row is covered by ceil(width / blockDim.x) consecutive blocks
 *  of a 1D grid; the threads past the end of the row do nothing.
 */
__global__ void HflipGPU(pel *ImgDst, pel *ImgSrc, uint width) {
	const uint threadsPerBlock = blockDim.x;
	const uint blocksPerRow = (width + threadsPerBlock - 1) / threadsPerBlock;  // ceil
	const uint globalThread = threadsPerBlock * blockIdx.x + threadIdx.x;
	const uint row = blockIdx.x / blocksPerRow;
	const uint col = globalThread - row * blocksPerRow * threadsPerBlock;

	if (col < width) {
		const uint rowBytes = (width * 3 + 3) & (~3);    // row size padded to a multiple of 4
		const uint rowStart = row * rowBytes;
		const uint src = rowStart + 3 * col;                 // pixel @col
		const uint dst = rowStart + 3 * (width - 1 - col);   // pixel @mirrored col

		for (uint k = 0; k < 3; k++)
			ImgDst[dst + k] = ImgSrc[src + k];
	}
}

/*
 *  Read a 24-bit/pixel BMP file into a 1D linear array.
 *  Allocate memory to store the 1D image and return its pointer.
 *
 *  Width, height, padded row size and the 54-byte header are stored in the
 *  global `img`. The pixel data is read from right after the header
 *  (assumes the data starts at byte 54 - TODO confirm for other inputs).
 *
 *  fn: path of the BMP file.
 *  Returns the malloc'ed image (the caller frees it), or NULL when the
 *  memory cannot be allocated.
 *  Prints a message and exits when the file cannot be opened, the header or
 *  the data cannot be read in full, or the size in the header is not positive.
 */
pel *ReadBMPlin(char* fn) {
	FILE* f = fopen(fn, "rb");
	if (f == NULL) {
		printf("\n\n%s NOT FOUND\n\n", fn);
		exit(EXIT_FAILURE);
	}

	pel HeaderInfo[54];
	if (fread(HeaderInfo, sizeof(pel), 54, f) != 54) { // read the 54-byte header
		printf("\n\n%s: CANNOT READ THE BMP HEADER\n\n", fn);
		fclose(f);
		exit(EXIT_FAILURE);
	}

	// extract image height and width from header; memcpy is used because the
	// fields at offsets 18 and 22 are not aligned for a direct int access
	int width, height;
	memcpy(&width, &HeaderInfo[18], sizeof(int));
	memcpy(&height, &HeaderInfo[22], sizeof(int));
	if (width <= 0 || height <= 0) {
		// a non-positive size would turn IMAGESIZE into a huge value
		printf("\n\n%s: UNSUPPORTED IMAGE SIZE (%d x %d)\n\n", fn, width, height);
		fclose(f);
		exit(EXIT_FAILURE);
	}
	img.width = width;
	img.height = height;
	int RowBytes = (width * 3 + 3) & (~3);  // row is padded to a multiple of 4 bytes
	img.rowByte = RowBytes;
	//save header for re-use
	memcpy(img.headInfo, HeaderInfo, 54);
	printf("\n Input File name: %5s  (%d x %d)   File Size=%lu", fn, img.width,
			img.height, IMAGESIZE);

	// allocate memory to store the main image (1 Dimensional array)
	pel *Img = (pel *) malloc(IMAGESIZE);
	if (Img == NULL) {
		fclose(f);       // do not leak the file on the failure path
		return Img;      // Cannot allocate memory
	}

	// read the image from disk
	if (fread(Img, sizeof(pel), IMAGESIZE, f) != IMAGESIZE) {
		printf("\n\n%s: IMAGE DATA IS TRUNCATED\n\n", fn);
		free(Img);
		fclose(f);
		exit(EXIT_FAILURE);
	}
	fclose(f);
	return Img;
}

/*
 *  Write the 1D linear-memory stored image into file:
 *  the 54-byte header saved in the global `img`, then IMAGESIZE bytes of
 *  pixel data taken from Img.
 *
 *  Img: image data, IMAGESIZE bytes.
 *  fn:  path of the BMP file to create.
 *  Prints a message and exits when the file cannot be created or a write
 *  is incomplete.
 */
void WriteBMPlin(pel *Img, char* fn) {
	FILE* f = fopen(fn, "wb");
	if (f == NULL) {
		printf("\n\nFILE CREATION ERROR: %s\n\n", fn);
		exit(1);
	}
	//write header, then data
	if (fwrite(img.headInfo, sizeof(pel), 54, f) != 54
			|| fwrite(Img, sizeof(pel), IMAGESIZE, f) != IMAGESIZE) {
		printf("\n\nFILE WRITE ERROR: %s\n\n", fn);
		fclose(f);
		exit(1);
	}
	// width and height are int, hence %d (as in ReadBMPlin)
	printf("\nOutput File name: %5s  (%d x %d)   File Size=%lu", fn, img.width,
			img.height, IMAGESIZE);
	fclose(f);
}

/*
 * Print a message and terminate when a CUDA runtime call did not succeed.
 * status: value returned by the call; what: short description for the message.
 */
static void cudaCheck(cudaError_t status, const char *what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * MAIN: flips a BMP image on the GPU.
 *
 * Usage: imflipGPU InputFilename OutputFilename [V/H] [dimBlock]
 *   - flip direction defaults to V, block size to 256 threads
 *
 * The image is copied to the device, one of the two flip kernels is launched
 * on a 1D grid with ceil(WIDTH / dimBlock) blocks per image row, and the
 * result is copied back and written to disk. The time printed covers the
 * kernel launch and its completion only.
 * Every CUDA call is checked: a failure stops the program instead of
 * producing a garbage output image.
 */
int main(int argc, char **argv) {
	char flip = 'V';
	uint dimBlock = 256, dimGrid;
	pel *imgSrc, *imgDst;		 // Where images are stored in CPU
	pel *imgSrcGPU, *imgDstGPU;	 // Where images are stored in GPU

	if (argc > 4) {
		int requested = atoi(argv[4]);
		if (requested <= 0) {
			// 0 would divide by zero below when computing the blocks per row
			printf("Invalid block size '%s'. Must be a positive integer... \n", argv[4]);
			exit(EXIT_FAILURE);
		}
		dimBlock = requested;
		flip = argv[3][0];
	}
	else if (argc > 3) {
		flip = argv[3][0];
	}
	else if (argc < 3) {
		printf("\n\nUsage:   imflipGPU InputFilename OutputFilename [V/H] [dimBlock]");
		exit(EXIT_FAILURE);
	}
	if ((flip != 'V') && (flip != 'H')) {
		printf("Invalid flip option '%c'. Must be 'V','H'... \n",flip);
		exit(EXIT_FAILURE);
	}

	// Create CPU memory to store the input and output images
	imgSrc = ReadBMPlin(argv[1]); // Read the input image if memory can be allocated
	if (imgSrc == NULL) {
		printf("Cannot allocate memory for the input image...\n");
		exit(EXIT_FAILURE);
	}
	imgDst = (pel *) malloc(IMAGESIZE);
	if (imgDst == NULL) {
		free(imgSrc);
		printf("Cannot allocate memory for the output image...\n");
		exit(EXIT_FAILURE);
	}

	// Allocate GPU buffer for the input and output images
	cudaCheck(cudaMalloc((void**) &imgSrcGPU, IMAGESIZE), "cudaMalloc (source image)");
	cudaCheck(cudaMalloc((void**) &imgDstGPU, IMAGESIZE), "cudaMalloc (destination image)");

	// Copy input vectors from host memory to GPU buffers.
	cudaCheck(cudaMemcpy(imgSrcGPU, imgSrc, IMAGESIZE, cudaMemcpyHostToDevice),
			"cudaMemcpy (host to device)");

	// invoke kernels (define grid and block sizes)
	int rowBlock = (WIDTH + dimBlock - 1) / dimBlock;   // blocks needed to cover one row
	dimGrid = HEIGHT * rowBlock;

	double start = seconds();   // start time

	switch (flip) {
	case 'H':
		HflipGPU<<<dimGrid, dimBlock>>>(imgDstGPU, imgSrcGPU, WIDTH);
		break;
	case 'V':
		VflipGPU<<<dimGrid, dimBlock>>>(imgDstGPU, imgSrcGPU, WIDTH, HEIGHT);
		break;
	}
	// a launch returns no status: an invalid configuration (e.g. a block
	// size the device does not support) is reported by cudaGetLastError
	cudaCheck(cudaGetLastError(), "kernel launch");

	// cudaDeviceSynchronize waits for the kernel to finish, and returns
	// any errors encountered while it was running.
	cudaCheck(cudaDeviceSynchronize(), "kernel execution");

	double stop = seconds();   // elapsed time

	// Copy output (results) from GPU buffer to host (CPU) memory.
	cudaCheck(cudaMemcpy(imgDst, imgDstGPU, IMAGESIZE, cudaMemcpyDeviceToHost),
			"cudaMemcpy (device to host)");

	// Write the flipped image back to disk
	WriteBMPlin(imgDst, argv[2]);

	printf("\nKernel elapsed time %f sec \n\n", stop - start);

	// Deallocate GPU memory.
	cudaCheck(cudaFree(imgSrcGPU), "cudaFree (source image)");
	cudaCheck(cudaFree(imgDstGPU), "cudaFree (destination image)");

	// cudaDeviceReset must be called before exiting in order for profiling and
	// tracing tools such as Parallel Nsight and Visual Profiler to show complete traces.
	cudaError_t	cudaStatus = cudaDeviceReset();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceReset failed!");
		free(imgSrc);
		free(imgDst);
		exit(EXIT_FAILURE);
	}
	free(imgSrc);
	free(imgDst);
	return (EXIT_SUCCESS);
}



Template...

In [None]:
%%cuda_group_save --name "ImgFlipGPU.cu" --group "BMP"

#include <stdio.h>
#include <stdlib.h>
#include "bmpUtil.h"
#include "common.h"


/*
 * Exercise stub.
 * Kernel 1D that flips the given image vertically
 * each thread only flips a single pixel (R,G,B)
 *
 * imgDst / imgSrc: destination and source image; w, h: width and height in pixels.
 */
__global__ void VflipGPU(pel *imgDst, const pel *imgSrc, const uint w, const uint h) {

  // TODO: copy the pixel handled by this thread to the mirrored row of imgDst

}

/*
 *  Exercise stub.
 *  Kernel that flips the given image horizontally
 *  each thread only flips a single pixel (R,G,B)
 *
 *  ImgDst / ImgSrc: destination and source image; width: image width in pixels.
 */
__global__ void HflipGPU(pel *ImgDst, pel *ImgSrc, uint width) {

        // TODO: copy the pixel handled by this thread to the mirrored column of ImgDst
}

/*
 *  Read a 24-bit/pixel BMP file into a 1D linear array.
 *  Allocate memory to store the 1D image and return its pointer.
 *
 *  Width, height, padded row size and the 54-byte header are stored in the
 *  global `img`. The pixel data is read from right after the header
 *  (assumes the data starts at byte 54 - TODO confirm for other inputs).
 *
 *  fn: path of the BMP file.
 *  Returns the malloc'ed image (the caller frees it), or NULL when the
 *  memory cannot be allocated.
 *  Prints a message and exits when the file cannot be opened, the header or
 *  the data cannot be read in full, or the size in the header is not positive.
 */
pel *ReadBMPlin(char* fn) {
	FILE* f = fopen(fn, "rb");
	if (f == NULL) {
		printf("\n\n%s NOT FOUND\n\n", fn);
		exit(EXIT_FAILURE);
	}

	pel HeaderInfo[54];
	if (fread(HeaderInfo, sizeof(pel), 54, f) != 54) { // read the 54-byte header
		printf("\n\n%s: CANNOT READ THE BMP HEADER\n\n", fn);
		fclose(f);
		exit(EXIT_FAILURE);
	}

	// extract image height and width from header; memcpy is used because the
	// fields at offsets 18 and 22 are not aligned for a direct int access
	int width, height;
	memcpy(&width, &HeaderInfo[18], sizeof(int));
	memcpy(&height, &HeaderInfo[22], sizeof(int));
	if (width <= 0 || height <= 0) {
		// a non-positive size would turn IMAGESIZE into a huge value
		printf("\n\n%s: UNSUPPORTED IMAGE SIZE (%d x %d)\n\n", fn, width, height);
		fclose(f);
		exit(EXIT_FAILURE);
	}
	img.width = width;
	img.height = height;
	int RowBytes = (width * 3 + 3) & (~3);  // row is padded to a multiple of 4 bytes
	img.rowByte = RowBytes;

	//save header for re-use
	memcpy(img.headInfo, HeaderInfo, 54);
	printf("\n Input File name: %5s  (%d x %d)   File Size=%lu", fn, img.width,
			img.height, IMAGESIZE);

	// allocate memory to store the main image (1 Dimensional array)
	pel *Img = (pel *) malloc(IMAGESIZE);
	if (Img == NULL) {
		fclose(f);       // do not leak the file on the failure path
		return Img;      // Cannot allocate memory
	}

	// read the image from disk
	if (fread(Img, sizeof(pel), IMAGESIZE, f) != IMAGESIZE) {
		printf("\n\n%s: IMAGE DATA IS TRUNCATED\n\n", fn);
		free(Img);
		fclose(f);
		exit(EXIT_FAILURE);
	}
	fclose(f);
	return Img;
}

/*
 *  Write the 1D linear-memory stored image into file:
 *  the 54-byte header saved in the global `img`, then IMAGESIZE bytes of
 *  pixel data taken from Img.
 *
 *  Img: image data, IMAGESIZE bytes.
 *  fn:  path of the BMP file to create.
 *  Prints a message and exits when the file cannot be created or a write
 *  is incomplete.
 */
void WriteBMPlin(pel *Img, char* fn) {
	FILE* f = fopen(fn, "wb");
	if (f == NULL) {
		printf("\n\nFILE CREATION ERROR: %s\n\n", fn);
		exit(1);
	}
	//write header, then data
	if (fwrite(img.headInfo, sizeof(pel), 54, f) != 54
			|| fwrite(Img, sizeof(pel), IMAGESIZE, f) != IMAGESIZE) {
		printf("\n\nFILE WRITE ERROR: %s\n\n", fn);
		fclose(f);
		exit(1);
	}
	// width and height are int, hence %d (as in ReadBMPlin)
	printf("\nOutput File name: %5s  (%d x %d)   File Size=%lu", fn, img.width,
			img.height, IMAGESIZE);
	fclose(f);
}

/*
 * MAIN (exercise template)
 *
 * Usage: imflipGPU InputFilename OutputFilename [V/H] [dimBlock]
 *
 * The argument parsing and the host-side image handling are complete; the
 * sections marked TODO are left to the student. As written the file does not
 * compile: `start` and `stop` are used below but never declared, and
 * `imgSrcGPU` / `imgDstGPU` are never allocated.
 */
int main(int argc, char **argv) {
	char flip = 'V';
	uint dimBlock = 256, dimGrid;
	pel *imgSrc, *imgDst;		 // Where images are stored in CPU
	pel *imgSrcGPU, *imgDstGPU;	 // Where images are stored in GPU

	// optional arguments: argv[3] is the flip direction, argv[4] the block size
	if (argc > 4) {
		dimBlock = atoi(argv[4]);
		flip = argv[3][0];
	}
	else if (argc > 3) {
		flip = argv[3][0];
	}
	else if (argc < 3) {
		printf("\n\nUsage:   imflipGPU InputFilename OutputFilename [V/H] [dimBlock]");
		exit(EXIT_FAILURE);
	}
	if ((flip != 'V') && (flip != 'H')) {
		printf("Invalid flip option '%c'. Must be 'V','H'... \n",flip);
		exit(EXIT_FAILURE);
	}

	// Create CPU memory to store the input and output images
	imgSrc = ReadBMPlin(argv[1]); // Read the input image if memory can be allocated
	if (imgSrc == NULL) {
		printf("Cannot allocate memory for the input image...\n");
		exit(EXIT_FAILURE);
	}
	// host buffer that will receive the flipped image
	imgDst = (pel *) malloc(IMAGESIZE);
	if (imgDst == NULL) {
		free(imgSrc);
		printf("Cannot allocate memory for the input image...\n");
		exit(EXIT_FAILURE);
	}

	// TODO: Allocate GPU buffer for the input and output images


 	// TODO: Copy input vectors from host memory to GPU buffers.


	// TODO: invoke kernels (define grid and block sizes); record `start` before the launch


	// TODO: cudaDeviceSynchronize waits for the kernel to finish; record `stop` after it


	// Copy output (results) from GPU buffer to host (CPU) memory.
	cudaMemcpy(imgDst, imgDstGPU, IMAGESIZE, cudaMemcpyDeviceToHost);

	// Write the flipped image back to disk
	WriteBMPlin(imgDst, argv[2]);

	printf("\nKernel elapsed time %f sec \n\n", stop - start);

	// Deallocate CPU, GPU memory and destroy events.


	// TODO: cuda free vars

	free(imgSrc);
	free(imgDst);
	return (EXIT_SUCCESS);
}



In [None]:
!nvcc -arch=sm_75 src/BMP/ImgFlipGPU.cu -o imfpliGPU

In [None]:
# vertical flip of julia_jet.bmp: the display cell below shows julia_jetV.bmp next to the original julia image
!./imfpliGPU images/julia_jet.bmp julia_jetV.bmp V

In [None]:
!./imfpliGPU images/julia_jet.bmp julia_jetH.bmp H

In [None]:
# cv.imread gives a NumPy array: row (height) x column (width) x color (3)
# (cv and plt are expected to be imported by an earlier cell)
julia_jet = cv.imread('images/julia_jet.bmp')
print('Image size: ', julia_jet.shape)

# OpenCV loads the channels as BGR, matplotlib expects RGB
julia_jet = cv.cvtColor(julia_jet, cv.COLOR_BGR2RGB)
julia_jetV = cv.cvtColor(cv.imread('julia_jetV.bmp'), cv.COLOR_BGR2RGB)
julia_jetH = cv.cvtColor(cv.imread('julia_jetH.bmp'), cv.COLOR_BGR2RGB)

# original, vertical flip, horizontal flip
for picture in (julia_jet, julia_jetV, julia_jetH):
    plt.imshow(picture)
    plt.show()

In [None]:
!nvprof ./imfpliGPU images/julia_jet.bmp julia_jetV.bmp V

In [None]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

# NOTE(review): the first path is on a mounted Google Drive and the second file is
# looked up in the current directory; neither is created by the cells of this
# notebook - confirm that both exist before running.

# cv.imread gives a NumPy array: row (height) x column (width) x color (3)
img1 = cv.imread('/content/drive/MyDrive/images/julia_jet.bmp')
print('Image size: ', img1.shape)

# OpenCV loads the channels as BGR, matplotlib expects RGB
img1 = cv.cvtColor(img1, cv.COLOR_BGR2RGB)
img2 = cv.cvtColor(cv.imread('julia_jet.bmp'), cv.COLOR_BGR2RGB)

for picture in (img1, img2):
    plt.imshow(picture)
    plt.show()

### ↘️ *`TODO...`*

Individuare la grid che dà la prestazione migliore in termini di tempo impiegato.

```
best grid = (#blocks, #threads)
```