---
# **LAB 4 - Shared memory (SMEM)**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

GPU computing notebooks download (from github)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

# ✅ Warp shuffle

In [None]:
#include <stdio.h>

/*
* __shfl_sync(mask, var, srcLane) returns the value of var from the thread in srcLane within the warp.
* __shfl_down_sync(mask, var, delta) returns the value of var from the lane whose ID is (current lane ID + delta), i.e. data moves down towards lower lane IDs.
* __shfl_up_sync(mask, var, delta) returns the value of var from the lane whose ID is (current lane ID - delta), i.e. data moves up towards higher lane IDs.
* __shfl_xor_sync(mask, var, laneMask) returns the value of var from the thread whose lane ID is the XOR of the current thread’s lane ID and laneMask.
*/

/*
* bcast: broadcast the value held by lane 0 of each warp to all lanes of that warp.
*   arg: value that lane 0 publishes
* Every thread prints the value it received.
* The shuffle uses the full mask 0xffffffff, so blockDim.x is expected to be a
* multiple of warpSize (all 32 lanes of each warp must take part).
*/
__global__ void bcast(int arg) {
   // lane index inside the warp (equals threadIdx.x only for single-warp blocks)
   int laneId = threadIdx.x % warpSize;

   // initialised on every lane so no indeterminate value is handed to the shuffle
   int value = 0;
   if (laneId == 0)  // only lane 0 holds the value to broadcast
      value = arg;

   // Synchronize all threads in warp, and get "value" from lane 0
   value = __shfl_sync(0xffffffff, value, 0);
   printf("value[%d] = %d\n", laneId, value);
}

/*
* warpReduce: tree reduction inside one warp using the DOWN shuffle.
* Each thread starts from 1 and repeatedly adds the value shuffled in from
* `offset` lanes away, halving the offset at every step.
* Lane 0 ends with the sum over the whole warp; the other lanes print the
* partial values they are left with. The parameter `arg` is not used.
*/
__global__ void warpReduce(int arg) {
   const unsigned fullMask = 0xffffffff;  // all 32 lanes take part
   int laneId = threadIdx.x;
   int value = 1;                         // every thread contributes 1

   // halve the shuffle distance until it reaches zero
   int offset = warpSize / 2;
   while (offset > 0) {
      value += __shfl_down_sync(fullMask, value, offset);
      offset /= 2;
   }

   // report what each lane holds after the last step
   printf("Thread %d final value = %d\n", laneId, value);
}

/*
* Main function: launches both demo kernels on a single warp and reports any
* CUDA error (launch configuration or execution) instead of ignoring it.
* Returns 0 on success, 1 if a launch or the final synchronization fails.
*/
int main() {
   cudaError_t err;

   bcast<<< 1, 32 >>>(1);     // 1 block, 32 threads, 1 warp
   err = cudaGetLastError();  // launch errors are not returned by the launch itself
   if (err != cudaSuccess) {
      fprintf(stderr, "bcast launch failed: %s\n", cudaGetErrorString(err));
      return 1;
   }

   warpReduce<<< 1, 32 >>>(1); // 1 block, 32 threads, 1 warp
   err = cudaGetLastError();
   if (err != cudaSuccess) {
      fprintf(stderr, "warpReduce launch failed: %s\n", cudaGetErrorString(err));
      return 1;
   }

   // wait for both kernels; execution errors surface here
   err = cudaDeviceSynchronize();
   if (err != cudaSuccess) {
      fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
      return 1;
   }
   return 0;
}

# ✅ Parallel reduction con SMEM e warp shuffle


↘️ **TODO...**

In [None]:
%%cuda_group_save --name "pred.cu" --group "lez4"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "../../GPUcomputing/utils/common.h"

#define SMEM_DIM 1024

/*
*  Block by block parallel implementation with divergence (sequential schema)
*    in:  input vector, reduced IN PLACE block by block (contents are destroyed)
*    out: one partial sum per block, out[blockIdx.x]
*    n:   number of valid elements in `in`
*  The last block may be only partially covered by the data: elements past n
*  are never read, and no thread leaves before the barriers of the loop.
*/
__global__ void blockParReduce1(int *in, int *out, ulong n) {

	uint tid = threadIdx.x;

	// 64-bit offset of this block: blockIdx.x * blockDim.x can exceed 32 bits
	size_t base = (size_t) blockIdx.x * blockDim.x;

	// the whole block is past the data: every thread exits together,
	// so no barrier below can be left waiting
	if (base >= n)
		return;

	// number of valid elements owned by this block (smaller for the last block)
	size_t remaining = n - base;
	uint valid = remaining < blockDim.x ? (uint) remaining : blockDim.x;

	// convert global data pointer to the local pointer of this block
	int *thisBlock = in + base;

	// in-place reduction in global memory
	for (uint stride = 1; stride < blockDim.x; stride *= 2) {
		// the second test keeps the read inside the valid part of the block
		if ((tid % (2 * stride)) == 0 && tid + stride < valid)
			thisBlock[tid] += thisBlock[tid + stride];

		// synchronize within threadblock (reached by all threads of the block)
		__syncthreads();
	}

	// write result for this block to global mem
	if (tid == 0)
		out[blockIdx.x] = thisBlock[0];
}

/*
*  Block by block parallel implementation without divergence (interleaved schema)
*    in:  input vector, reduced IN PLACE block by block (contents are destroyed)
*    out: one partial sum per block, out[blockIdx.x]
*    n:   number of valid elements in `in`
*  At every step the active range [0, len) is folded onto its lower half.
*  For a full block whose size is a power of two this is exactly the
*  stride = blockDim.x/2, blockDim.x/4, ... sequence; for other sizes
*  (or a partial last block) no element is dropped.
*/
__global__ void blockParReduce2(int *in, int *out, ulong n) {

	uint tid = threadIdx.x;

	// 64-bit offset of this block: blockIdx.x * blockDim.x can exceed 32 bits
	size_t base = (size_t) blockIdx.x * blockDim.x;

	// the whole block is past the data: every thread exits together
	if (base >= n)
		return;

	// number of valid elements owned by this block (smaller for the last block)
	size_t remaining = n - base;
	uint len = remaining < blockDim.x ? (uint) remaining : blockDim.x;

	// convert global data pointer to the local pointer of this block
	int *thisBlock = in + base;

	// in-place reduction in global memory; len is the same for all threads of
	// the block, so every thread executes the same number of barriers
	while (len > 1) {
		uint half = (len + 1) / 2;   // size of the range kept after this step

		// threads write [0, len - half) and read [half, len): disjoint ranges
		if (tid < len - half)
			thisBlock[tid] += thisBlock[tid + half];

		// synchronize within threadblock
		__syncthreads();
		len = half;
	}

	// write result for this block to global mem
	if (tid == 0)
		out[blockIdx.x] = thisBlock[0];
}

/*
* Using shared memory (no divergence nor bank conflicts)
*
* Exercise skeleton: the body is intentionally empty and the kernel currently
* does nothing. The comments inside list the steps the implementation is
* expected to perform, in order. Parameters mirror the other reduction kernels
* in this file (in: input vector, out: one value per block, n: element count).
* NOTE(review): SMEM_DIM (1024) is defined above, presumably as the size of the
* shared buffer - confirm when implementing.
*/
__global__ void blockParReduce3(int *in, int *out, ulong n) {

	// step 1: declare the shared mem buffer

	// step 2: load shared mem from global memory

	// step 3: synchronize within threadblock so the whole buffer is visible

	// step 4: do reduction in shared mem

	// step 5: write result for this block to global mem

}

/*
*  Block by block parallel implementation using warp reduction
*
*  Exercise skeleton: the body is intentionally empty and the kernel currently
*  does nothing. Parameters mirror the other reduction kernels in this file
*  (in: input vector, out: one value per block, n: element count).
*/
__global__ void blockParReduce4(int *in, int *out, ulong n) {

	// TODO: implement the reduction using warp-level primitives

}


/*
* MAIN: test on parallel reduction
*
* Builds a vector of n = blockSize * numBlock ones, sums it on the CPU and then
* with each GPU reduction kernel, timing every run and checking that the
* per-block partial sums add up to n. The kernels reduce in place, so the
* input is copied to the device again before each launch.
* Returns EXIT_FAILURE if the host buffers cannot be allocated, 0 otherwise.
*/
int main() {
	int *a, *b, *d_a, *d_b;
	int blockSize = 1024;             // block dim 1D
	size_t numBlock = 1024*1024;      // grid dim 1D
	size_t n = blockSize * numBlock;  // array dims
	size_t sum_CPU = 0, sum_GPU = 0;
	size_t nByte = n*sizeof(int);
	size_t mByte = numBlock * sizeof(int);
	double start, stopGPU, stopCPU, speedup;

	printf("\n****  test on parallel reduction  ****\n");

	// init: the input alone takes nByte bytes, so the allocation can really fail
	a = (int *) malloc(nByte);
	b = (int *) malloc(mByte);
	if (a == NULL || b == NULL) {
		fprintf(stderr, "host allocation failed (%zu + %zu bytes)\n", nByte, mByte);
		free(a);
		free(b);
		return EXIT_FAILURE;
	}
	for (size_t i = 0; i < n; i++) a[i] = 1;  // initialize a[] = 1

	CHECK(cudaMalloc(&d_a, nByte));
	CHECK(cudaMemcpy(d_a, a, nByte, cudaMemcpyHostToDevice));
	CHECK(cudaMalloc(&d_b, mByte));
	CHECK(cudaMemset(d_b, 0, mByte));

	/***********************************************************/
	/*                     CPU reduction                       */
	/***********************************************************/
	// size in MB is computed from the byte count, not from the element count
	printf("  Vector length: %.2f MB\n", nByte/(1024.0*1024.0));
	printf("\n  CPU procedure...\n");
	start = seconds();
	for (size_t i = 0; i < n; i++)
		sum_CPU += a[i];
	stopCPU = seconds() - start;
	printf("    Elapsed time: %f (sec) \n", stopCPU);
	printf("    sum: %zu\n", sum_CPU);

	printf("\n  GPU kernels (mem required %zu bytes)\n", nByte);

	/***********************************************************/
	/*         KERNEL blockParReduce1 (divergent)              */
	/***********************************************************/
	// block by block parallel implementation with divergence
	printf("\n  Launch kernel: blockParReduce1...\n");
	start = seconds();
	blockParReduce1<<<numBlock, blockSize>>>(d_a, d_b, n);
	CHECK(cudaGetLastError());          // launch configuration errors
	CHECK(cudaDeviceSynchronize());     // execution errors
	stopGPU = seconds() - start;
	speedup = stopCPU/stopGPU;
	printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU,speedup);

	// memcopy D2H
	CHECK(cudaMemcpy(b, d_b, mByte, cudaMemcpyDeviceToHost));

	// check result: the partial sums of all blocks must add up to n
	sum_GPU = 0;
	for (size_t i = 0; i < numBlock; i++) sum_GPU += b[i];
	assert(sum_GPU == n);

	// copy and reset vectors on GPU (the kernel overwrote d_a)
	CHECK(cudaMemcpy(d_a, a, nByte, cudaMemcpyHostToDevice));
	CHECK(cudaMemset(d_b, 0, mByte));

	/***********************************************************/
	/*        KERNEL blockParReduce2  (non divergent)          */
	/***********************************************************/
	// block by block parallel implementation without divergence
	printf("\n  Launch kernel: blockParReduce2...\n");
	start = seconds();
	blockParReduce2<<<numBlock, blockSize>>>(d_a, d_b, n);
	CHECK(cudaGetLastError());          // checked right after the launch, as for kernel 1
	CHECK(cudaDeviceSynchronize());
	stopGPU = seconds() - start;
	speedup = stopCPU/stopGPU;
	printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU,speedup);

	// memcopy D2H
	CHECK(cudaMemcpy(b, d_b, mByte, cudaMemcpyDeviceToHost));

	// check result
	sum_GPU = 0;
	for (size_t i = 0; i < numBlock; i++) sum_GPU += b[i];
	assert(sum_GPU == n);

	// copy and reset vectors on GPU
	CHECK(cudaMemcpy(d_a, a, nByte, cudaMemcpyHostToDevice));
	CHECK(cudaMemset(d_b, 0, mByte));

	/***********************************************************/
	/*           KERNEL blockParReduce3 (with smem)            */
	/***********************************************************/
	// block by block parallel implementation with smem
	printf("\n  Launch kernel: blockParReduce3...\n");

	// TODO

	// copy and reset vectors on GPU
	CHECK(cudaMemcpy(d_a, a, nByte, cudaMemcpyHostToDevice));
	CHECK(cudaMemset(d_b, 0, mByte));

	/***********************************************************/
	/*        KERNEL blockParReduce4  (warp reduction)         */
	/***********************************************************/
	// block by block parallel implementation using warp reduction
	printf("\n  Launch kernel: blockParReduce4...\n");

	// TODO

	// release device and host buffers
	CHECK(cudaFree(d_a));
	CHECK(cudaFree(d_b));
	free(a);
	free(b);
	return 0;
}


↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez4/pred.cu -o pred
!./pred

# ✅ Moltiplicazione matriciale con SMEM


↘️ **TODO...**


Scrivere un programma CUDA per prodotto matrici $C = A*B$ che usi la SMEM e riduca così il 'traffico' in global mem

**passi:**
1. Definire la SMEM per ogni blocco della matrice $C$
2. Svolgere un ciclo sui blocchi per caricare la SMEM da global mem
3. Sincronizzare -1-
4. Nel ciclo effettuare localmente all’interno di ogni blocco il calcolo del prodotto riga-colonna e caricare su registro
5. sincronizzare -2-
6. Scrivere il risultato finale su matrice prodotto in global mem


↘️ **TODO...**

In [None]:
%%cuda_group_save --name "matmul.cu" --group "lez4"
#include <stdio.h>
#include <stdlib.h>
#include "../../GPUcomputing/utils/common.h"

// Row-major linear index of element (i,j) in a matrix with n columns.
// Arguments are parenthesized so expressions such as IDX(r + 1, c, n) expand correctly.
#define IDX(i,j,n) ((i)*(n)+(j))
// Absolute difference |x - y|; arguments are parenthesized for the same reason.
#define ABS(x,y) (((x)-(y))>=0?((x)-(y)):((y)-(x)))
// Problem sizes used below: A is N x P, B is P x M, C is N x M
#define N 2048
#define P 2048
#define M 1024
// Side of the square thread block used for the launches
#define BLOCK_SIZE 16


/*
 * Kernel for matrix product with static SMEM
 *      C  =  A  *  B
 *    (NxM) (NxP) (PxM)
 *
 * Exercise skeleton: the body is intentionally empty and the kernel currently
 * does nothing. The comments inside list, in order, the phases the tiled
 * product is expected to go through.
 */
__global__ void matmulSMEMstatic(float* A, float* B, float* C) {

	// phase 1: declare the static shared memory tiles

	// phase 2: loop over blocks from block row of matrix A and block column of matrix B

	// phase 2a: copy block from matrix to shared memory

	// phase 2b: BARRIER SYNC on SMEM loading

	// phase 2c: length of this part of row-column product is BLOCK_SIZE except for last block when it may be smaller

	// phase 2d: compute this part of row-column product

	// phase 2e: BARRIER SYNC on prod over blocks

	// phase 3: store computed element in matrix C

}

/*
 * Kernel for matrix product using dynamic SMEM
 *
 * Exercise skeleton: the body is intentionally empty and the kernel currently
 * does nothing.
 * NOTE(review): SMEMsize is presumably the side of the shared tile chosen at
 * launch time - confirm when implementing.
 */
__global__ void matmulSMEMdynamic(float* A, float* B, float* C, const uint SMEMsize) {

	// TODO: implement the tiled product with dynamically sized shared memory

}

// functions definition
__global__ void matmul_naive(float*, float*, float*);
void matmulCPU(float*, float*, float*);
void checkResult(float*, float*);


/*
 * MAIN
 *
 * Fills A (N x P) and B (P x M) with ones, computes C = A * B on the CPU as
 * reference, runs the naive GPU kernel, compares the two results and prints
 * timings. The shared-memory variants are left as TODO sections.
 * Returns EXIT_FAILURE if the host buffers cannot be allocated.
 */
int main(void) {
	 // Kernels for matrix product
	 //      C  =  A  *  B
	 //    (NxM) (NxP) (PxM)
	printf("N = %d, M = %d, P = %d\n", N, M, P);
	uint rowA = N, rowB = P;
	uint colA = P, colB = M;
	uint rowC = N, colC = M;
	float *A, *B, *C, *C1;
	float *dev_A, *dev_B, *dev_C;

	// element counts and byte sizes (computed in size_t to avoid 32-bit overflow)
	size_t Alen = (size_t) rowA * colA;
	size_t Blen = (size_t) rowB * colB;
	size_t Clen = (size_t) rowC * colC;
	size_t Asize = Alen * sizeof(float);
	size_t Bsize = Blen * sizeof(float);
	size_t Csize = Clen * sizeof(float);

	// malloc host memory
	A = (float*) malloc(Asize);
	B = (float*) malloc(Bsize);
	C = (float*) malloc(Csize);
	C1 = (float*) malloc(Csize);
	if (A == NULL || B == NULL || C == NULL || C1 == NULL) {
		fprintf(stderr, "host allocation failed\n");
		free(A);
		free(B);
		free(C);
		free(C1);
		return EXIT_FAILURE;
	}

	// fill the matrices A and B
	for (size_t i = 0; i < Alen; i++) A[i] = 1.0f;
	for (size_t i = 0; i < Blen; i++) B[i] = 1.0f;

	// malloc device memory
	CHECK(cudaMalloc(&dev_A, Asize));
	CHECK(cudaMalloc(&dev_B, Bsize));
	CHECK(cudaMalloc(&dev_C, Csize));
	printf("Total amount of allocated memory on GPU %.2f MB\n\n", (float)(Asize + Bsize + Csize)/(1024.0*1024.0));

	// copy matrices A and B to the GPU
	CHECK(cudaMemcpy(dev_A, A, Asize, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(dev_B, B, Bsize, cudaMemcpyHostToDevice));

	/***********************************************************/
	/*                       CPU matmul                       */
	/***********************************************************/
	printf("\n  CPU procedure...\n");
	double start = seconds();
	matmulCPU(A, B, C);
	double stopCPU = seconds() - start;
	printf("    Elapsed time: %f (sec) \n", stopCPU);

	/***********************************************************/
	/*                    GPU naive matmul                     */
	/***********************************************************/
	// grid block dims = smem dims = BLOCK_SIZE
	printf("\n  Launch kernel: naive matmul...\n");
	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);
	start = seconds();
	matmul_naive<<<grid, block>>>(dev_A, dev_B, dev_C);
	CHECK(cudaGetLastError());          // launch configuration errors
	CHECK(cudaDeviceSynchronize());     // execution errors
	double stopGPU = seconds() - start;
	printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU, stopCPU / stopGPU);

	// copy the array 'C' back from the GPU to the CPU
	CHECK(cudaMemcpy(C1, dev_C, Csize, cudaMemcpyDeviceToHost));
	checkResult(C, C1);
	CHECK(cudaMemset(dev_C, 0, Csize));

	/***********************************************************/
	/*              GPU matmulSMEM static SMEM                 */
	/***********************************************************/
	// grid block dims = shared mem dims = BLOCK_SIZE
	printf("\n  Launch kernel: matmul with static smem...\n");

	// TODO

	/***********************************************************/
	/*            GPU matmulSMEMD dynamic SMEM                */
	/***********************************************************/
	// set cache size
	CHECK(cudaDeviceSetCacheConfig (cudaFuncCachePreferShared));
	printf("\n  Launch kernel: matmul with dynamic smem...\n");

	// TODO

	// free the memory allocated on the GPU
	CHECK(cudaFree(dev_A));
	CHECK(cudaFree(dev_B));
	CHECK(cudaFree(dev_C));

	// free the host buffers
	free(A);
	free(B);
	free(C);
	free(C1);

	CHECK(cudaDeviceReset());
	return EXIT_SUCCESS;
}

// Naive matrix product kernel: one thread per entry of C (N x M), with
// A stored row-major as N x P and B as P x M. Threads that fall outside
// C do nothing.
__global__ void matmul_naive(float* A, float* B, float* C) {
	// coordinates of the output entry owned by this thread
	const int r = blockIdx.y * blockDim.y + threadIdx.y;
	const int c = blockIdx.x * blockDim.x + threadIdx.x;

	// threads outside the matrix have nothing to compute
	if (r >= N || c >= M)
		return;

	// dot product between row r of A and column c of B
	float acc = 0;
	int k = 0;
	while (k < P) {
		acc += A[r * P + k] * B[k * M + c];
		k++;
	}
	C[r * M + c] = acc;
}

// Reference matrix product on the host: C (N x M) = A (N x P) * B (P x M),
// all matrices stored row-major.
void matmulCPU(float* A, float* B, float* C) {
	for (int r = 0; r < N; r++) {
		const float* aRow = A + r * P;   // start of row r of A
		float* cRow = C + r * M;         // start of row r of C
		for (int c = 0; c < M; c++) {
			float acc = 0;
			// walk row r of A and column c of B together
			for (int k = 0; k < P; k++)
				acc += aRow[k] * B[k * M + c];
			cRow[c] = acc;
		}
	}
}

// Element-wise comparison between two N x M matrices.
// Prints a single message, with the position and the two values, for the first
// entry whose absolute difference exceeds epsilon (a NaN difference counts as
// a mismatch). Prints nothing when the matrices agree.
void checkResult(float *A, float *B) {
	const double epsilon = 1.0E-8;
	for (int i = 0; i < N*M; i++) {
		double diff = (double) A[i] - (double) B[i];
		if (diff < 0)
			diff = -diff;
		// written as !(diff <= eps) so that NaN is reported instead of ignored
		if (!(diff <= epsilon)) {
			printf("   Arrays do not match: entry %d (%f vs %f)\n\n", i, A[i], B[i]);
			return;
		}
	}
}

↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez4/matmul.cu -o matmul
!./matmul

# ✅ Convoluzione con SMEM

## 1D Convolution

In [None]:
%%cuda_group_save --name "conv1D.cu" --group "lez4"

#include "/content/GPUcomputing/utils/common.h"

// Number of mask taps on each side of the centre element
#define MASK_RADIUS  500
// Total number of mask taps; parenthesized so it can be used inside larger expressions
#define MASK_SIZE    (2 * MASK_RADIUS + 1)
// Threads per block used for the launches
#define BLOCK_SIZE   1024
// Elements held in shared memory per block: the block itself plus both halos
#define TILE_SIZE    (BLOCK_SIZE + MASK_SIZE - 1)


__device__ __constant__ float d_mask[MASK_SIZE];

// functions definition
void initialData(float*, int);
void movingAverage(float*, int n);
void printData(float*, const int);
void convolutionHost(float*, float*, float*, const int);
void checkResult(float*, float*, int);

/*
 * kernel for 1D convolution using a shared-memory tile
 *   result: output vector (n elements)
 *   data:   input vector (n elements)
 *   n:      number of elements
 * Preconditions: blockDim.x == BLOCK_SIZE (the tile is sized from it) and
 * MASK_RADIUS <= blockDim.x (each halo element is loaded by one thread).
 * Elements outside [0, n) are treated as zero, element by element, so n does
 * not have to be a multiple of the block size.
 */
__global__ void conv1D(float *result, float *data, int n) {
	const int tid = threadIdx.x;

	// signed 64-bit indices: the left halo of block 0 starts at a negative position
	const long long base = (long long) blockIdx.x * blockDim.x;
	const long long gid = base + tid;

	// shared memory size = BLOCK_SIZE + MASK
	__shared__ float tile[TILE_SIZE];

	// left halo: the MASK_RADIUS elements just before this block
	if (tid < MASK_RADIUS) {
		const long long src = base - MASK_RADIUS + tid;
		tile[tid] = (src >= 0 && src < n) ? data[src] : 0.0f;
	}

	// center: the element owned by this thread
	tile[tid + MASK_RADIUS] = (gid < n) ? data[gid] : 0.0f;

	// right halo: the MASK_RADIUS elements just after this block,
	// loaded by the last MASK_RADIUS threads
	if (tid >= (int) blockDim.x - MASK_RADIUS) {
		const long long src = gid + MASK_RADIUS;
		tile[tid + 2 * MASK_RADIUS] = (src < n) ? data[src] : 0.0f;
	}

	// every thread reaches the barrier, including those past the end of the data
	__syncthreads();

	if (gid >= n)
		return;

	// convolution: tile * mask (tile[tid + k] is the input at gid - MASK_RADIUS + k)
	float sum = 0;
	for (int k = 0; k <= 2 * MASK_RADIUS; k++)
		sum += tile[tid + k] * d_mask[k];

	// final result
	result[gid] = sum;
}

/*
 * Basic kernel for 1D convolution (reads the input directly from global memory)
 *   result: output vector (n elements)
 *   data:   input vector (n elements)
 *   n:      number of elements
 * Input positions outside [0, n) contribute zero. Threads past the end of the
 * data do nothing, so n does not have to be a multiple of the block size.
 */
__global__ void conv1D_basic(float *result, float *data, int n) {

	unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;

	// boundary check: the grid is rounded up to whole blocks
	if (i >= (unsigned int) n)
		return;

	float sum = 0;

	// convolution over the mask window centred on i
	int start_point = (int) i - MASK_RADIUS;
	for (int j = 0; j <= 2 * MASK_RADIUS; j++) {
		int idx = start_point + j;
		if (idx >= 0 && idx < n)
			sum += data[idx] * d_mask[j];
	}

	// final result
	result[i] = sum;
}


/*
 * MAIN: convolution 1D host & device
 *
 * Convolves a vector of 2^25 ones with a moving-average mask on the host, then
 * with the shared-memory kernel and with the basic kernel, checks both GPU
 * results against the host one and prints the timings.
 * Command-line arguments are not used.
 * Returns EXIT_FAILURE if the host buffers cannot be allocated.
 */
int main(int argc, char **argv) {
	(void) argc;
	(void) argv;

	// set up array size
	int n = 1 << 25;
	int N = MASK_SIZE;

	printf("Array of size = %.1f MB\n", n/(1024.0*1024.0));
	printf("Mask size     = %d elements\n\n", N);

	// mem sizes
	size_t nBytes = n * sizeof(float);
	size_t nBytes_mask = N * sizeof(float);

	// grid configuration
	dim3 block(BLOCK_SIZE);
	dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE);

	// allocate host memory
	float *h_data = (float *) malloc(nBytes);
	float *h_result = (float *) malloc(nBytes);
	float *h_result_basic = (float *) malloc(nBytes);
	float *result = (float *) malloc(nBytes);
	float *h_mask = (float *) malloc(nBytes_mask);
	if (h_data == NULL || h_result == NULL || h_result_basic == NULL ||
	    result == NULL || h_mask == NULL) {
		fprintf(stderr, "host allocation failed\n");
		free(h_data);
		free(h_result);
		free(h_result_basic);
		free(result);
		free(h_mask);
		return EXIT_FAILURE;
	}

	//  initialize host array
	movingAverage(h_mask, N);
	initialData(h_data, n);

	/***********************************************************/
	/*               convolution on host                       */
	/***********************************************************/
	double start = seconds();
	convolutionHost(h_data, result, h_mask, n);
	double hostElaps = seconds() - start;

	/***********************************************************/
	/*               convolution on device                     */
	/***********************************************************/
	// allocate device memory
	float *d_data, *d_result;
	CHECK(cudaMalloc((void**)&d_data, nBytes));
	CHECK(cudaMalloc((void**)&d_result, nBytes));

	// copy data from host to device (the mask goes to constant memory)
	CHECK(cudaMemcpy(d_data, h_data, nBytes, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpyToSymbol(d_mask, h_mask, nBytes_mask));

	start = seconds();
	conv1D<<<grid, block>>>(d_result, d_data, n);
	CHECK(cudaGetLastError());          // launch configuration errors
	CHECK(cudaDeviceSynchronize());     // execution errors
	double devElaps = seconds() - start;

	// check result
	CHECK(cudaMemcpy(h_result, d_result, nBytes, cudaMemcpyDeviceToHost));
	checkResult(h_result, result, n);

	/***********************************************************/
	/*            convolution on device basic                  */
	/***********************************************************/
	start = seconds();
	conv1D_basic<<<grid, block>>>(d_result, d_data, n);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	double devElaps1 = seconds() - start;

	// check result
	CHECK(cudaMemcpy(h_result_basic, d_result, nBytes, cudaMemcpyDeviceToHost));
	checkResult(h_result_basic, result, n);

	// print exec times
	printf("Times:\n");
	printf("   - CPU elapsed time         = %f\n", hostElaps);
	printf("   - GPU elapsed time (SMEM)  = %f\n", devElaps);
	printf("   - GPU elapsed time (basic) = %f\n", devElaps1);
	printf("   - Speed-up (H/SMEM)        = %f\n", hostElaps / devElaps);
	printf("   - Speed-up (basic/SMEM)    = %f\n", devElaps1 / devElaps);

	// free host and device memory
	CHECK(cudaFree(d_result));
	CHECK(cudaFree(d_data));
	free(h_data);
	free(h_result);
	free(h_result_basic);
	free(result);
	free(h_mask);

	return EXIT_SUCCESS;
}

// Sets the first n entries of h_data to 1; does nothing when n <= 0.
void initialData(float *h_data, int n) {
	// fill from the back; the final contents do not depend on the order
	for (int remaining = n; remaining > 0; --remaining)
		h_data[remaining - 1] = 1.0f;
}

// Fills the first n entries of h_mask with the moving-average weight 1/n.
void movingAverage(float *h_mask, int n) {
	// the weight is the same for every tap, so it is computed once
	const float weight = 1.0 / ((float) n);
	int i = 0;
	while (i < n) {
		h_mask[i] = weight;
		++i;
	}
}

// Prints the first `size` entries of a on one line, two decimals each,
// preceded and followed by a newline.
void printData(float *a, const int size) {
	printf("\n");
	int i = 0;
	while (i < size) {
		printf("%.2f ", a[i]);
		++i;
	}
	printf("\n");
}

// Reference 1D convolution on the host.
//   data:   input vector (n elements)
//   result: output vector (n elements)
//   mask:   mask weights, centred on the output element
//   n:      number of elements
// Input positions outside [0, n) contribute nothing to the sum.
void convolutionHost(float *data, float *result, float *mask, const int n) {
	const int taps = 2 * MASK_RADIUS + 1;   // same value as MASK_SIZE

	for (int out = 0; out < n; out++) {
		// range of mask taps whose input position out - MASK_RADIUS + j lies in [0, n)
		int first = MASK_RADIUS - out;
		if (first < 0)
			first = 0;
		int last = n - out + MASK_RADIUS;
		if (last > taps)
			last = taps;

		// taps are accumulated in increasing order
		float acc = 0;
		for (int j = first; j < last; j++)
			acc += data[out - MASK_RADIUS + j] * mask[j];

		result[out] = acc;
	}
}

// Compares the device result with the host reference, entry by entry.
//   d_result: values computed on the device (already copied to the host)
//   h_result: reference values computed on the host
//   n:        number of entries
// Two entries agree when |h - d| <= atol + rtol * |h|; a purely absolute
// tolerance of 1e-8 is below the resolution of single precision for values
// around 1. The first disagreement (or NaN) is printed with both values and
// the scan stops; nothing is printed when all entries agree.
void checkResult(float *d_result, float *h_result, int n) {
	const double atol = 1.0E-6;
	const double rtol = 1.0E-5;

	for (int i = 0; i < n; i++) {
		const double ref = h_result[i];
		// absolute values are computed by hand so the result does not depend on
		// which abs() overload happens to be visible
		double diff = ref - (double) d_result[i];
		if (diff < 0)
			diff = -diff;
		const double mag = ref < 0 ? -ref : ref;

		// written as !(diff <= tol) so that NaN is reported instead of ignored
		if (!(diff <= atol + rtol * mag)) {
			printf("different on entry (%d): host %f, device %f (|diff| = %e)\n",
			       i, h_result[i], d_result[i], diff);
			break;
		}
	}
}



↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez4/conv1D.cu -o conv1D
!./conv1D

## 2D Convolution...

↘️ **TODO...**

In [None]:
%%cuda_group_save --name "conv2D.cu" --group "lez4"
#include "/content/GPUcomputing/utils/common.h"

#define BLOCK_SIZE   16
#define MASK_SIZE    21
#define TILE_SIZE    (BLOCK_SIZE + MASK_SIZE - 1)

// Dense matrix descriptor passed by value to host functions and kernels.
// The code in this file indexes elements row-major: elements[row * width + col].
struct Matrix {
   int width;         // number of columns
   int height;        // number of rows
   float* elements;   // pointer to the width * height values
};

// Function declarations
 void conv2D_host(Matrix A, Matrix B, Matrix M);
__global__ void conv2D_basic(Matrix A, Matrix B, Matrix M);

 /*
  * 2D convolution using shared memory
  *   A: input matrix
  *   B: output matrix
  *   M: convolution mask matrix
  *
  * Exercise skeleton: the body is intentionally empty and the kernel currently
  * does nothing. The comments inside list the steps the implementation is
  * expected to perform, in order. TILE_SIZE (defined above as
  * BLOCK_SIZE + MASK_SIZE - 1) is presumably the side of the shared tile -
  * confirm when implementing.
 */
__global__ void conv2D(Matrix A, Matrix B, Matrix M) {

   // step 1: allocate shared memory

   // step 2: load data into shared memory

   // step 3: synchronize threads

   // step 4: apply convolution

   // step 5: write output

}

/*
 * Main function
 *
 * Builds a (256*BLOCK_SIZE)^2 matrix of ones and a MASK_SIZE^2 mask of ones,
 * convolves them on the host and with the naive GPU kernel, compares the two
 * results and prints the timings. The shared-memory variant is left as TODO.
 * Returns 1 if the host buffers cannot be allocated, 0 otherwise.
 */
int main(void) {
   // define matrices and params
   int block_size = BLOCK_SIZE;
   int mask_size = MASK_SIZE;
   int width = 256* block_size, height = 256* block_size;

   // byte sizes, computed once in size_t
   size_t dataBytes = (size_t) width * height * sizeof(float);
   size_t maskBytes = (size_t) mask_size * mask_size * sizeof(float);

   Matrix A, B, H, M;
   A.width = width; A.height = height;
   B.width = width; B.height = height;
   M.width = mask_size; M.height = mask_size;
   H.width = width; H.height = height;
   A.elements = (float *)malloc(dataBytes);
   B.elements = (float *)malloc(dataBytes);
   M.elements = (float *)malloc(maskBytes);
   H.elements = (float *)malloc(dataBytes);
   if (A.elements == NULL || B.elements == NULL || M.elements == NULL || H.elements == NULL) {
      fprintf(stderr, "host allocation failed\n");
      free(A.elements);
      free(B.elements);
      free(M.elements);
      free(H.elements);
      return 1;
   }

   // Initialize A, B, M
   // print data sizes
   printf("Data matrix A: %d x %d\n", width, height);
   printf("Mask matrix M: %d x %d\n", mask_size, mask_size);
   for (int i = 0; i < width * height; i++) {
      A.elements[i] = 1.0f;
      B.elements[i] = 0.0f;
   }
   for (int i = 0; i < mask_size * mask_size; i++) {
      M.elements[i] = 1.0f;
   }

   // Allocate device memory
   Matrix d_A, d_B, d_M;
   d_A.width = A.width; d_A.height = A.height;
   d_B.width = B.width; d_B.height = B.height;
   d_M.width = M.width; d_M.height = M.height;
   CHECK(cudaMalloc(&d_A.elements, dataBytes));
   CHECK(cudaMalloc(&d_B.elements, dataBytes));
   CHECK(cudaMalloc(&d_M.elements, maskBytes));

   // Copy data to device
   CHECK(cudaMemcpy(d_A.elements, A.elements, dataBytes, cudaMemcpyHostToDevice));
   CHECK(cudaMemcpy(d_M.elements, M.elements, maskBytes, cudaMemcpyHostToDevice));

   /***********************************************************/
   /*                    conv2D on host                       */
   /***********************************************************/
   printf("\nCPU procedure...\n");
   double start = seconds();
   conv2D_host(A, H, M);
   double stopCPU = seconds() - start;
   printf("   Host elapsed time: %f\n", stopCPU);

   /***********************************************************/
   /*                    GPU naive conv2D                     */
   /***********************************************************/
   printf("\nGPU naive conv2D...\n");
   dim3 dimBlock(block_size, block_size);
   dim3 dimGrid((width + block_size - 1) / block_size, (height + block_size - 1) / block_size);
   start = seconds();
   conv2D_basic<<<dimGrid, dimBlock>>>(d_A, d_B, d_M);
   CHECK(cudaGetLastError());          // launch configuration errors
   CHECK(cudaDeviceSynchronize());     // execution errors
   double stopGPU = seconds() - start;
   printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU, stopCPU / stopGPU);

   // Copy data back to host
   CHECK(cudaMemcpy(B.elements, d_B.elements, dataBytes, cudaMemcpyDeviceToHost));

   // check results against the host reference: rows are walked in memory order
   // and only the first few mismatches are printed, followed by the total count
   const int maxReported = 10;
   size_t mismatches = 0;
   for (int j = 0; j < height; j++) {
      for (int i = 0; i < width; i++) {
         float ref = H.elements[j * width + i];
         float got = B.elements[j * width + i];
         float diff = got - ref;
         if (diff < 0) diff = -diff;
         float mag = ref < 0 ? -ref : ref;
         // tolerance relative to the reference; NaN counts as a mismatch
         if (!(diff <= 1.0e-5f * (1.0f + mag))) {
            if (mismatches < (size_t) maxReported)
               printf("Error at B[%d][%d] = %f\n", i, j, got);
            mismatches++;
         }
      }
   }
   if (mismatches > 0)
      printf("Total mismatching entries: %zu\n", mismatches);

   // zero out B in device
   CHECK(cudaMemset(d_B.elements, 0, dataBytes));

   /***********************************************************/
   /*                  GPU conv2D with smem                   */
   /***********************************************************/
   printf("\nGPU conv2D with smem...\n");

   // TODO

   // release device and host buffers
   CHECK(cudaFree(d_A.elements));
   CHECK(cudaFree(d_B.elements));
   CHECK(cudaFree(d_M.elements));
   free(A.elements);
   free(B.elements);
   free(M.elements);
   free(H.elements);

   return 0;
}

/*
 * 2D convolution on host (reference implementation)
 *   A: input matrix
 *   B: output matrix, same size as A
 *   M: MASK_SIZE x MASK_SIZE mask, centred on the output element
 * Input positions outside A contribute nothing. The mask is indexed as
 * M[i][j] with i the row offset and j the column offset, the same convention
 * used by conv2D_basic, so the two agree for non-symmetric masks as well.
 */
void conv2D_host(Matrix A, Matrix B, Matrix M) {

   int radius = MASK_SIZE / 2;

   // loop through all elements in the output array
   for (int y = 0; y < A.height; y++) {
      for (int x = 0; x < A.width; x++) {
         float sum = 0.0f;

         // compute convolution
         for (int i = 0; i < MASK_SIZE; i++) {
            for (int j = 0; j < MASK_SIZE; j++) {
               int r = y - radius + i;   // input row
               int c = x - radius + j;   // input column

               // boundary check
               if ((c >= 0) && (c < A.width) && (r >= 0) && (r < A.height)) {
                  sum += A.elements[(r * A.width) + c] * M.elements[(i * MASK_SIZE) + j];
               }
            }
         }

         // store final value
         B.elements[y * B.width + x] = sum;
      }
   }
}

/*
 * Basic kernel for 2D convolution (reads everything from global memory)
 *   A: input matrix
 *   B: output matrix, same size as A
 *   M: MASK_SIZE x MASK_SIZE mask, centred on the output element
 * One thread per output element; threads outside the matrix on EITHER axis
 * do nothing. Input positions outside A contribute nothing.
 */
 __global__ void conv2D_basic(Matrix A, Matrix B, Matrix M) {

   // index computation
   int x = blockIdx.x * blockDim.x + threadIdx.x;
   int y = blockIdx.y * blockDim.y + threadIdx.y;
   int radius = MASK_SIZE / 2;

   // boundary check: being out of range on a single axis is enough to skip,
   // otherwise the final store would land outside B
   if (x >= A.width || y >= A.height)  return;

   // Apply convolution
   float sum = 0.0f;
   for (int i = 0; i < MASK_SIZE; i++) {
      for (int j = 0; j < MASK_SIZE; j++) {
         int r = y - radius + i;   // input row
         int c = x - radius + j;   // input column
         if (r >= 0 && r < A.height && c >= 0 && c < A.width) {
            sum += A.elements[r * A.width + c] * M.elements[i * MASK_SIZE + j];
         }
      }
   }

   // store final value
   B.elements[y * B.width + x] = sum;
}

↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez4/conv2D.cu -o conv2D
!./conv2D

## PPM Gaussian filter

Utilizzare la convoluzione `conv2D` per il filtraggio gaussiano su immagini PPM
- Caricare immagine PPM (`ppm_load(path)`)
- estrarre i canali RGB (`m_extract_channel(img, c)`)
- definire mask gaussiana (`gaussMask(MASK_SIZE, SIGMA)`)
- applicare il filtraggio all'immagine con mask gaussiana (separatamente su ogni canale)
- ricostruire l'immagine filtrata dai singoli canali filtrati (`ppm_combine_channels(r, g, b, WIDTH, HEIGHT)`)

↘️ **TODO...**

In [None]:
%%cuda_group_save --name "ppm_conv2D.cu" --group "lez4"
#include "/content/GPUcomputing/utils/common.h"
#include "ppm.h"

#define BLOCK_SIZE   32
#define MASK_SIZE    21
#define TILE_SIZE    (BLOCK_SIZE + MASK_SIZE - 1)

// Dense matrix descriptor passed by value to kernels.
// The code in this file indexes elements row-major: elements[row * width + col].
struct Matrix {
   int width;         // number of columns
   int height;        // number of rows
   float* elements;   // pointer to the width * height values
};

 /*
  * 2D convolution using shared memory
  *   A: input matrix
  *   B: output matrix
  *   M: convolution mask matrix (MASK_SIZE x MASK_SIZE, read from global memory)
  * Each block stages a TILE_SIZE x TILE_SIZE window of A (its own elements plus
  * a halo of MASK_SIZE/2 on every side, zero outside A) in shared memory, then
  * every thread accumulates its output from that window.
 */
__global__ void conv2D(Matrix A, Matrix B, Matrix M) {

   const int tx = threadIdx.x;
   const int ty = threadIdx.y;
   const int x = blockIdx.x * blockDim.x + tx; // Column index of matrix A
   const int y = blockIdx.y * blockDim.y + ty; // Row index of matrix A

   const int tile_size = BLOCK_SIZE + MASK_SIZE - 1;
   const int radius = MASK_SIZE / 2;

   // position in A of shared-memory element [0][0] for this block
   const int rowOrigin = y - ty - radius;
   const int colOrigin = x - tx - radius;

   // Allocate shared memory
   __shared__ float smem[TILE_SIZE][TILE_SIZE];

   // Load data into shared memory: each thread walks the tile with a step of
   // one block in each direction, starting from its own position
   for (int sRow = ty; sRow < tile_size; sRow += blockDim.y) {
      const int aRow = rowOrigin + sRow;                 // input data index row
      const bool rowInside = (aRow >= 0 && aRow < A.height);

      for (int sCol = tx; sCol < tile_size; sCol += blockDim.x) {
         const int aCol = colOrigin + sCol;              // input data index column
         const bool inside = rowInside && aCol >= 0 && aCol < A.width;

         // positions outside A are stored as zero
         smem[sRow][sCol] = inside ? A.elements[aRow * A.width + aCol] : 0.0f;
      }
   }

   // Synchronize threads
   __syncthreads();

   // Apply convolution: mask element (i, j) meets tile element (ty + i, tx + j)
   float acc = 0.0f;
   for (int i = 0; i < MASK_SIZE; i++) {
      const int r = ty + i;
      if (r >= tile_size)
         continue;                                       // row outside the tile
      for (int j = 0; j < MASK_SIZE; j++) {
         const int c = tx + j;
         if (c < tile_size)
            acc += smem[r][c] * M.elements[i * MASK_SIZE + j];
      }
   }

   // Write output
   if (x < A.width && y < A.height)
      B.elements[y * B.width + x] = acc;
}

/*
 * Main function
 *
 * Loads the PPM image, applies the Gaussian filter on the host (timing only
 * the filter itself) and writes the result to output_gaussian.ppm. The GPU
 * part is left as TODO sections.
 * Returns 1 if the image cannot be loaded or the output image cannot be created.
 */
int main(void) {
   // Load image
   char path[] = "GPUcomputing/images/dog.ppm";
   PPM *img = ppm_load(path);
   // NOTE(review): assumes ppm_load signals failure by returning NULL - confirm against ppm.h
   if (img == NULL) {
      fprintf(stderr, "cannot load PPM image '%s'\n", path);
      return 1;
   }
   int WIDTH = img->width;
   int HEIGHT = img->height;
   printf("PPM image size (w x h): %d x %d\n", WIDTH, HEIGHT);

   // extract channels and set matrices

   // get gaussian filter mask

   // Allocate device memory

   // Copy data to device


   /***********************************************************/
   /*                    conv2D on host                       */
   /***********************************************************/
   printf("\nCPU procedure...\n");
   PPM *img_filtered = ppm_make(WIDTH, HEIGHT, (pel) {0,0,0}); // create a new image
   // NOTE(review): assumes ppm_make returns NULL on failure - confirm against ppm.h
   if (img_filtered == NULL) {
      fprintf(stderr, "cannot create the output image\n");
      return 1;
   }

   // only the filter is timed: image creation and file output are kept out of
   // the figure that is later compared with the GPU time
   double start = seconds();
   ppm_gaussFilter(img, img_filtered, MASK_SIZE, SIGMA);
   double stopCPU = seconds() - start;
   ppm_write(img_filtered, "output_gaussian.ppm");
   printf("   Host elapsed time: %f\n", stopCPU);

   /***********************************************************/
   /*                  GPU conv2D with smem                   */
   /***********************************************************/
   printf("\nGPU conv2D with smem...\n");

   // TODO

   // check results

   return 0;
}


↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez4/ppm_conv2D.cu -o conv2D -I GPUcomputing/utils/PPM GPUcomputing/utils/PPM/ppm.cpp
!./conv2D