---
# **LAB 4 - Shared memory (SMEM)**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

Download the GPU computing notebooks (from GitHub)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

# ✅ Parallel reduction con SMEM


In [None]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "/content/GPUcomputing/utils/common.h"

#define SMEM_DIM 1024


/*
 *  Block by block parallel reduction performed in place in global memory,
 *  neighbouring-pairs schema (divergent: the active threads are scattered
 *  across each warp).
 *
 *    in   input vector of n ints; it is OVERWRITTEN with partial sums
 *    out  one partial sum per block, stored in out[blockIdx.x]
 *    n    number of valid elements of in
 *
 *  Launch layout: 1D grid of 1D blocks; block b owns the elements
 *  [b * blockDim.x, (b + 1) * blockDim.x). A block that starts at or past n
 *  leaves its entry of out untouched.
 */
__global__ void blockParReduce1(int *in, int *out, ulong n) {

	uint tid = threadIdx.x;

	// first element owned by this block (64-bit product: no 32-bit wrap-around)
	ulong base = (ulong) blockIdx.x * blockDim.x;

	// convert global data pointer to the local pointer of this block
	int *thisBlock = in + base;

	// in-place reduction in global memory
	for (uint stride = 1; stride < blockDim.x; stride *= 2) {
		// the partner element must belong both to this block and to the valid range
		if ((tid % (2 * stride)) == 0 &&
		    tid + stride < blockDim.x &&
		    base + tid + stride < n)
			thisBlock[tid] += thisBlock[tid + stride];

		// every thread of the block reaches the barrier: out-of-range threads
		// are masked by the condition above instead of returning early
		__syncthreads();
	}

	// write result for this block to global mem
	if (tid == 0 && base < n)
		out[blockIdx.x] = thisBlock[0];
}

/*
 *  Block by block parallel reduction performed in place in global memory,
 *  interleaved schema (the active threads are the first `stride` threads of
 *  the block, so whole warps retire together).
 *
 *    in   input vector of n ints; it is OVERWRITTEN with partial sums
 *    out  one partial sum per block, stored in out[blockIdx.x]
 *    n    number of valid elements of in
 *
 *  Launch layout: 1D grid of 1D blocks; blockDim.x must be a power of two
 *  (the stride is halved at every step). A block that starts at or past n
 *  leaves its entry of out untouched.
 */
__global__ void blockParReduce2(int *in, int *out, ulong n) {

	uint tid = threadIdx.x;

	// first element owned by this block (64-bit product: no 32-bit wrap-around)
	ulong base = (ulong) blockIdx.x * blockDim.x;

	// convert global data pointer to the local pointer of this block
	int *thisBlock = in + base;

	// in-place reduction in global memory
	for (uint stride = blockDim.x / 2; stride > 0; stride >>= 1)  {
		// skip partners that fall past the end of the input vector
		if (tid < stride && base + tid + stride < n)
			thisBlock[tid] += thisBlock[tid + stride];

		// every thread of the block reaches the barrier: out-of-range threads
		// are masked by the condition above instead of returning early
		__syncthreads();
	}

	// write result for this block to global mem
	if (tid == 0 && base < n)
		out[blockIdx.x] = thisBlock[0];
}

/*
 *  Block by block parallel reduction in shared memory, interleaved schema:
 *  thread tid adds smem[tid + stride] to smem[tid], so the threads of a warp
 *  touch consecutive shared-memory words.
 *
 *    in   input vector of n ints (left unchanged)
 *    out  one partial sum per block, stored in out[blockIdx.x]
 *    n    number of valid elements of in
 *
 *  Launch layout: 1D grid of 1D blocks with blockDim.x a power of two and
 *  blockDim.x <= SMEM_DIM. Elements past n are treated as 0.
 */
__global__ void blockParReduce_SMEM(int *in, int *out, ulong n) {

	// shared mem
	__shared__ int smem[SMEM_DIM];

	// one shared slot per thread: a larger block would overflow smem
	assert(blockDim.x <= SMEM_DIM);

	unsigned int tid = threadIdx.x;

	// 64-bit product: no 32-bit wrap-around for very long vectors
	ulong idx = (ulong) blockIdx.x * blockDim.x + threadIdx.x;

	// load shared mem (zero padding past the end of the input)
	smem[tid] = (idx < n) ? in[idx] : 0;

	// synchronize within threadblock: the tile must be complete before reducing
	__syncthreads();

	// do reduction in shared mem
	for (uint stride = blockDim.x / 2; stride > 0; stride >>= 1) {
		if (tid < stride)
			smem[tid] += smem[tid + stride];

		// synchronize within threadblock
		__syncthreads();
	}

	// write result for this block to global mem
	if (tid == 0)
		out[blockIdx.x] = smem[0];
}


/*
 * Sum on the host the per-block partial results written by a reduction kernel.
 * The accumulator is a 64-bit integer so that the total is exact.
 */
static ulong sumPartials(const int *partial, ulong numBlock) {
	ulong total = 0;
	for (ulong i = 0; i < numBlock; i++)
		total += partial[i];
	return total;
}

/*
 * MAIN: test on parallel reduction
 *
 * Fills a vector of n = blockSize * numBlock ones, reduces it on the CPU and
 * with the three kernels, and prints the elapsed times. Every total is
 * compared with n; a mismatch is reported with an ERROR line.
 * CHECK and seconds() come from common.h.
 */
int main(void) {
	int *a, *b, *d_a, *d_b;
	int blockSize = 1024;            // block dim 1D
	ulong numBlock = 1024*1024;      // grid dim 1D
	ulong n = blockSize * numBlock;  // array dim
	// integer accumulators: a float stops growing at 2^24 when adding ones
	ulong sum_CPU = 0, sum_GPU = 0;
	size_t nByte = n*sizeof(int), mByte = numBlock * sizeof(int);
	double start, stopGPU, stopCPU, speedup;

	printf("\n****  test on parallel reduction  ****\n");

	// init
	a = (int *) malloc(nByte);
	b = (int *) malloc(mByte);
	if (a == NULL || b == NULL) {
		fprintf(stderr, "host memory allocation failed\n");
		free(a);
		free(b);
		return EXIT_FAILURE;
	}
	for (ulong i = 0; i < n; i++) a[i] = 1;

	CHECK(cudaMalloc((void **) &d_a, nByte));
	CHECK(cudaMalloc((void **) &d_b, mByte));
	CHECK(cudaMemset((void *) d_b, 0, mByte));

	/***********************************************************/
	/*                     CPU reduction                       */
	/***********************************************************/
	printf("  Vector length: %.2f MB\n",n/(1024.0*1024.0));
	printf("\n  CPU procedure...\n");
	start = seconds();
	for (ulong i = 0; i < n; i++)
		sum_CPU += a[i];
	stopCPU = seconds() - start;
	printf("    Elapsed time: %f (sec) \n", stopCPU);
	if (sum_CPU != n)
		printf("    ERROR: %lu\n", sum_CPU);

	printf("\n  GPU kernels (mem required %lu bytes)\n", (unsigned long) nByte);

	// load input vector on GPU
	CHECK(cudaMemcpy(d_a, a, nByte, cudaMemcpyHostToDevice));

	/***********************************************************/
	/*         KERNEL blockParReduce1 (divergent)              */
	/***********************************************************/
	// block by block parallel implementation with divergence
	printf("\n  Launch kernel: blockParReduce1...\n");
	start = seconds();
	blockParReduce1<<<numBlock, blockSize>>>(d_a, d_b, n);
	CHECK(cudaGetLastError());          // launch-configuration errors
	CHECK(cudaDeviceSynchronize());     // execution errors
	stopGPU = seconds() - start;
	speedup = stopCPU/stopGPU;
	printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU,speedup);

	// memcopy D2H
	CHECK(cudaMemcpy(b, d_b, mByte, cudaMemcpyDeviceToHost));

	// check result
	sum_GPU = sumPartials(b, numBlock);
	if (sum_GPU != n)
		printf("    ERROR: %lu\n", sum_GPU);

	// reset input vector on GPU: the kernel reduced d_a in place, while the
	// host copy `a` is still intact, so a plain re-upload is enough
	CHECK(cudaMemcpy(d_a, a, nByte, cudaMemcpyHostToDevice));

	/***********************************************************/
	/*        KERNEL blockParReduce2  (non divergent)          */
	/***********************************************************/
	// block by block parallel implementation without divergence
	printf("\n  Launch kernel: blockParReduce2...\n");
	start = seconds();
	blockParReduce2<<<numBlock, blockSize>>>(d_a, d_b, n);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	stopGPU = seconds() - start;
	speedup = stopCPU/stopGPU;
	printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU,speedup);

	// memcopy D2H
	CHECK(cudaMemcpy(b, d_b, mByte, cudaMemcpyDeviceToHost));

	// check result
	sum_GPU = sumPartials(b, numBlock);
	if (sum_GPU != n)
		printf("    ERROR: %lu\n", sum_GPU);

	// reset input vector on GPU (see above)
	CHECK(cudaMemcpy(d_a, a, nByte, cudaMemcpyHostToDevice));

	/***********************************************************/
	/*              KERNEL blockParReduce_SMEM                 */
	/***********************************************************/
	// block by block parallel implementation in shared memory
	printf("\n  Launch kernel: blockParReduce_SMEM...\n");
	start = seconds();
	blockParReduce_SMEM<<<numBlock, blockSize>>>(d_a, d_b, n);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	stopGPU = seconds() - start;
	speedup = stopCPU/stopGPU;
	printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU, speedup);

	// memcopy D2H
	CHECK(cudaMemcpy(b, d_b, mByte, cudaMemcpyDeviceToHost));

	// check result
	sum_GPU = sumPartials(b, numBlock);
	if (sum_GPU != n)
		printf("    ERROR: %lu\n", sum_GPU);

	// release device and host memory
	CHECK(cudaFree(d_a));
	CHECK(cudaFree(d_b));
	free(a);
	free(b);

	return 0;
}


# ✅ Moltiplicazione matriciale con SMEM


**Prodotto di matrici con SMEM**

Scrivere un programma CUDA per prodotto matrici $C = A*B$ che usi la SMEM e riduca così il 'traffico' in global mem

**passi:**
1. Definire la SMEM per ogni blocco della matrice $C$
2. Svolgere un ciclo sui blocchi per caricare la SMEM da global mem
3. Sincronizzare -1-
4. Nel ciclo effettuare localmente all’interno di ogni blocco il calcolo del prodotto riga-colonna e caricare su registro
5. Sincronizzare -2-
6. Scrivere il risultato finale su matrice prodotto in global mem


### ↘️ *`TODO...`*

In [None]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include "/content/GPUcomputing/utils/common.h"

// Row-major linear index of element (i,j) of a matrix with n columns.
// Arguments are parenthesized so that expressions such as IDX(r+1, c, n) expand correctly.
#define IDX(i,j,n) ((i)*(n)+(j))
// Absolute value of the difference x-y (arguments parenthesized for the same reason).
#define ABS(x,y) (((x)-(y))>=0?((x)-(y)):((y)-(x)))
// Matrix sizes: A is N x P, B is P x M, C = A*B is N x M.
#define N 2048
#define P 2048
#define M 1024
// Edge of the square thread block / shared-memory tile.
#define BLOCK_SIZE 16


/*
 * Kernel for matrix product with static SMEM
 *      C  =  A  *  B
 *    (NxM) (NxP) (PxM)
 *
 * Each thread computes one element of C. The block walks along the shared
 * dimension P in steps of BLOCK_SIZE, staging one BLOCK_SIZE x BLOCK_SIZE tile
 * of A and one of B in shared memory at every step.
 *
 * Launch layout: 2D grid of BLOCK_SIZE x BLOCK_SIZE blocks covering C
 * (x -> columns, y -> rows). Tile cells that fall outside A or B are filled
 * with 0, so N, M and P need not be multiples of BLOCK_SIZE.
 */
__global__ void matmulSMEMstatic(float* A, float* B, float* C) {
	// indexes
	uint row = blockIdx.y * blockDim.y + threadIdx.y;
	uint col = blockIdx.x * blockDim.x + threadIdx.x;

	// target: compute the right sum for the given row and col
	float sum = 0.0f;

	// static shared memory
	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

	/*	loop over blocks from block row of matrix A
	  	and block column of matrix B  */
	uint numBlocks = (P + BLOCK_SIZE - 1) / BLOCK_SIZE;
	for (uint m = 0; m < numBlocks; m++) {

		// copy block from matrix to shared memory; cells outside the
		// matrices are zero-filled so they add nothing to the product
		uint r = m * BLOCK_SIZE + threadIdx.y;
		uint c = m * BLOCK_SIZE + threadIdx.x;
		As[threadIdx.y][threadIdx.x] = (row < N && c < P) ? A[IDX(row, c, P)] : 0.0f;
		Bs[threadIdx.y][threadIdx.x] = (r < P && col < M) ? B[IDX(r, col, M)] : 0.0f;

		//---------------------------------------------------------------
		__syncthreads();  //  BARRIER SYNC on SMEM loading
		//---------------------------------------------------------------

		// compute this part of row-column product; thanks to the zero
		// padding the last tile needs no shorter trip count
		for (uint k = 0; k < BLOCK_SIZE; k++)
			sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];

		//---------------------------------------------------------------
		__syncthreads();  //  BARRIER SYNC: tiles are overwritten at the next step
		//---------------------------------------------------------------
	}

	// store computed element in matrix C
	if (row < N && col < M)
		C[IDX(row, col, M)] = sum;
}

/*
 * Kernel for matrix product using dynamic SMEM
 *      C  =  A  *  B
 *    (NxM) (NxP) (PxM)
 *
 *   SMEMsize  number of floats reserved for each of the two tiles
 *
 * Launch layout: 2D grid of SQUARE blocks (blockDim.x == blockDim.y) covering
 * C, with SMEMsize == blockDim.x * blockDim.y and a dynamic shared-memory size
 * of 2 * SMEMsize * sizeof(float) bytes passed as third launch argument.
 * Tile cells that fall outside A or B are filled with 0.
 */
__global__ void matmulSMEMdynamic(float* A, float* B, float* C, const uint SMEMsize) {
	// indexes
	uint row = blockIdx.y * blockDim.y + threadIdx.y;
	uint col = blockIdx.x * blockDim.x + threadIdx.x;

	// dynamic shared memory (inside or outside kernel)
	extern __shared__ float smem[];

	// Var As is manually set at beginning of shared
	float *As = smem;
	// Var Bs is manually set at the end of As
	float *Bs = &smem[SMEMsize];

	// edge of the square tile, used as the only row stride of As and Bs
	const uint tile = blockDim.x;
	// slot of this thread inside each tile
	const uint slot = threadIdx.y * tile + threadIdx.x;

	// loop over blocks from block row of matrix A
	// and block column of matrix B
	float sum = 0.0f;
	uint numBlocks = (P + tile - 1) / tile;
	for (uint m = 0; m < numBlocks; m++) {

		// copy block from matrix to shared memory; cells outside the
		// matrices are zero-filled so they add nothing to the product
		uint c = m * tile + threadIdx.x;
		uint r = m * tile + threadIdx.y;
		As[slot] = (row < N && c < P) ? A[IDX(row, c, P)] : 0.0f;
		Bs[slot] = (r < P && col < M) ? B[IDX(r, col, M)] : 0.0f;

		//---------------------------------------------------------------
		__syncthreads();  // tiles fully loaded
		//---------------------------------------------------------------

		// compute this part of row-column product
		for (uint k = 0; k < tile; k++)
			sum += As[threadIdx.y * tile + k] * Bs[k * tile + threadIdx.x];

		//---------------------------------------------------------------
		__syncthreads();  // tiles are overwritten at the next step
		//---------------------------------------------------------------
	}

	// store computed element in matrix C
	if (row < N && col < M)
		C[IDX(row, col, M)] = sum;
}

// forward declarations: these functions are defined after main
__global__ void matmul(float*, float*, float*);   // naive GPU product
void matmulCPU(float*, float*, float*);           // reference product on the host
void checkResult(float*, float*);                 // elementwise comparison of two results


/*
 * MAIN
 *
 * Computes C = A * B on the host, then with the naive kernel, the static-SMEM
 * kernel and the dynamic-SMEM kernel (tile edges 8, 16, 32). Every GPU result
 * is compared with the host one through checkResult and the elapsed times are
 * printed. CHECK and seconds() come from common.h.
 */
int main(void) {
	 // Kernels for matrix product
	 //      C  =  A  *  B
	 //    (NxM) (NxP) (PxM)
	printf("N = %d, M = %d, P = %d\n", N, M, P);
	uint rowA = N, rowB = P;
	uint colA = P, colB = M;
	uint rowC = N, colC = M;
	float *A, *B, *C, *C1;
	float *dev_A, *dev_B, *dev_C;

	// dims (products carried out in size_t to avoid 32-bit overflow)
	size_t Asize = (size_t) rowA * colA * sizeof(float);
	size_t Bsize = (size_t) rowB * colB * sizeof(float);
	size_t Csize = (size_t) rowC * colC * sizeof(float);

	// malloc host memory
	A = (float*) malloc(Asize);
	B = (float*) malloc(Bsize);
	C = (float*) malloc(Csize);    // host reference result
	C1 = (float*) malloc(Csize);   // copy of the device result
	if (A == NULL || B == NULL || C == NULL || C1 == NULL) {
		fprintf(stderr, "host memory allocation failed\n");
		free(A);
		free(B);
		free(C);
		free(C1);
		return EXIT_FAILURE;
	}

	// malloc device memory
	CHECK(cudaMalloc((void** )&dev_A, Asize));
	CHECK(cudaMalloc((void** )&dev_B, Bsize));
	CHECK(cudaMalloc((void** )&dev_C, Csize));
	printf("Total amount of allocated memory on GPU %.2f MB\n\n", (float)(Asize + Bsize + Csize)/(1024.0*1024.0));

	// fill the matrices A and B
	for (int i = 0; i < N * P; i++) A[i] = 1.0f;
	for (int i = 0; i < P * M; i++) B[i] = 1.0f;

	/***********************************************************/
	/*                       CPU matmul                       */
	/***********************************************************/
	printf("\n   *** CPU & NAIVE KERNEL ***\n\n");
	double start = seconds();
	matmulCPU(A, B, C);
	double cpu_time = seconds() - start;
	printf("   matmul elapsed time CPU = %f\n\n", cpu_time);


	// copy matrices A and B to the GPU
	CHECK(cudaMemcpy(dev_A, A, Asize, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(dev_B, B, Bsize, cudaMemcpyHostToDevice));

	/***********************************************************/
	/*                    GPU naive matmul                     */
	/***********************************************************/
	// grid block dims = shared mem dims = BLOCK_SIZE
	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);
	start = seconds();
	matmul<<<grid, block>>>(dev_A, dev_B, dev_C);
	CHECK(cudaGetLastError());          // launch-configuration errors
	CHECK(cudaDeviceSynchronize());     // execution errors
	double gpu_time1 = seconds() - start;
	printf("   Kernel naive matmul elapsed time GPU = %f\n", gpu_time1);
	printf("   - Speed-up                           = %f\n", cpu_time / gpu_time1);

	// copy the array 'C' back from the GPU to the CPU
	CHECK(cudaMemcpy(C1, dev_C, Csize, cudaMemcpyDeviceToHost));
	checkResult(C,C1);
	// clear the device result so a stale product cannot pass the next check
	CHECK(cudaMemset((void *) dev_C, 0, Csize));

	/***********************************************************/
	/*              GPU matmulSMEM static SMEM                 */
	/***********************************************************/
	// grid block dims = shared mem dims = BLOCK_SIZE
	printf("\n   *** USING STATIC SMEM ***\n\n");
	start = seconds();
	matmulSMEMstatic<<<grid, block>>>(dev_A, dev_B, dev_C);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	double gpu_time2 = seconds() - start;
	printf("   Kernel matmulSMEM static elapsed time GPU = %f\n", gpu_time2);
	printf("   - Speed-up                                = %f\n", gpu_time1 / gpu_time2);

	// copy the array 'C' back from the GPU to the CPU
	CHECK(cudaMemcpy(C1, dev_C, Csize, cudaMemcpyDeviceToHost));
	checkResult(C,C1);
	CHECK(cudaMemset((void *) dev_C, 0, Csize));

	/***********************************************************/
	/*            GPU matmulSMEMD dynamic SMEM                */
	/***********************************************************/
	// set cache size
	CHECK(cudaDeviceSetCacheConfig (cudaFuncCachePreferShared));
	printf("\n   *** USING DYNAMIC SMEM ***\n\n");

	// try with various SMEM sizes
	uint sizes[] = {8, 16, 32};
	for (int i = 0; i < 3; i++) {
		uint blockSize = sizes[i];
		block.x = blockSize;
		block.y = blockSize;
		grid.x = (M + block.x - 1) / block.x;
		grid.y = (N + block.y - 1) / block.y;
		uint SMEMsize = blockSize * blockSize;           // floats per tile
		uint SMEMbyte = 2 * SMEMsize * sizeof(float);    // two tiles: As and Bs
		start = seconds();
		matmulSMEMdynamic<<< grid, block, SMEMbyte >>>(dev_A, dev_B, dev_C, SMEMsize);
		CHECK(cudaGetLastError());
		CHECK(cudaDeviceSynchronize());
		double gpu_time3 = seconds() - start;
		printf("   Kernel matmulSMEM dynamic (SMEM size %u) elapsed time GPU = %f\n", blockSize, gpu_time3);

		// amount of SMEM used
		//printf("   Total amount of shared memory required per block %.1f KB\n", (float) SMEMbyte / (float) 1024);

		// copy the array 'C' back from the GPU to the CPU
		CHECK(cudaMemcpy(C1, dev_C, Csize, cudaMemcpyDeviceToHost));
		checkResult(C,C1);
		CHECK(cudaMemset((void *) dev_C, 0, Csize));
	}

	// free the memory allocated on the GPU
	CHECK(cudaFree(dev_A));
	CHECK(cudaFree(dev_B));
	CHECK(cudaFree(dev_C));

	// free the memory allocated on the host
	free(A);
	free(B);
	free(C);
	free(C1);

	CHECK(cudaDeviceReset());
	return EXIT_SUCCESS;
}

// Naive matrix product kernel: one thread per element of C (N x M), with the
// operands A (N x P) and B (P x M) read directly from global memory.
__global__ void matmul(float* A, float* B, float* C) {
	// position of this thread inside C
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	// threads mapped outside C have nothing to compute
	if (row >= N || col >= M)
		return;

	// dot product between row `row` of A and column `col` of B
	float acc = 0;
	int k = 0;
	while (k < P) {
		acc += A[IDX(row, k, P)] * B[IDX(k, col, M)];
		k++;
	}
	C[IDX(row, col, M)] = acc;
}

// Reference matrix product on the host: C (N x M) = A (N x P) * B (P x M),
// all matrices stored row-major.
void matmulCPU(float* A, float* B, float* C) {
	for (int r = 0; r < N; r++) {
		for (int c = 0; c < M; c++) {
			// dot product between row r of A and column c of B
			float acc = 0;
			int k = 0;
			while (k < P) {
				acc += A[IDX(r, k, P)] * B[IDX(k, c, M)];
				k++;
			}
			C[IDX(r, c, M)] = acc;
		}
	}
}

// Elementwise comparison between two N x M matrices stored as flat arrays.
// Prints a single diagnostic for the first pair of entries whose absolute
// difference exceeds epsilon; prints nothing when the arrays match.
void checkResult(float *A, float *B) {
	const double epsilon = 1.0E-8;
	for (int i = 0; i < N*M; i++)
		if (ABS(A[i], B[i]) > epsilon) {
			printf("   Arrays do not match (first difference at index %d: %f vs %f)\n\n", i, A[i], B[i]);
			return;
		}
}

In [None]:
# Compile and run

!nvcc -arch=sm_75 src/matmulSMEM.cu  -o matmulSMEM
!./matmulSMEM

In [None]:
!ls -la

# ✅ Convoluzione con SMEM

In [None]:
%%cuda_group_save --name "conv1D.cu" --group "SMEM"
#include <stdlib.h>
#include <stdio.h>
#include "/content/GPUcomputing/utils/common.h"

// Number of mask taps on each side of the centre.
#define MASK_RADIUS  500
// Total number of mask taps (parenthesized so it is safe inside larger expressions).
#define MASK_SIZE    (2 * MASK_RADIUS + 1)
// Threads per block.
#define BLOCK_SIZE   1024
// Shared-memory tile: one block of data plus a halo of MASK_RADIUS on each side.
#define TILE_SIZE    (BLOCK_SIZE + MASK_SIZE - 1)


// convolution mask in constant memory: filled from the host with
// cudaMemcpyToSymbol in main and read by both kernels
__device__ __constant__ float d_mask[MASK_SIZE];

// forward declarations: these functions are defined after main
void initialData(float*, int);
void movingAverage(float*, int n);
void printData(float*, const int);
void convolutionHost(float*, float*, float*, const int);
void checkResult(float*, float*, int);

/*
 * Kernel for 1D convolution with a shared-memory tile.
 *
 *   result  output vector, n floats
 *   data    input vector, n floats
 *   n       number of elements
 *
 * Each block stages in shared memory the blockDim.x elements it owns plus a
 * halo of MASK_RADIUS elements on each side; positions outside [0, n) are
 * loaded as 0. The mask is read from d_mask.
 *
 * Launch layout: 1D grid of 1D blocks with blockDim.x <= BLOCK_SIZE, so that
 * the tile (blockDim.x + 2 * MASK_RADIUS floats) fits in TILE_SIZE.
 */
__global__ void conv1D(float *result, float *data, int n) {
	// shared memory size = BLOCK_SIZE + 2 * MASK_RADIUS
	__shared__ float tile[TILE_SIZE];

	const int blockStart = blockIdx.x * blockDim.x;       // first element owned by the block
	const int i = blockStart + threadIdx.x;               // element owned by this thread
	const int tileLen = blockDim.x + 2 * MASK_RADIUS;     // block plus the two halos

	// cooperative load: the threads sweep the whole tile (left halo, centre,
	// right halo) in steps of blockDim.x; every element is bounds-checked
	for (int t = threadIdx.x; t < tileLen; t += blockDim.x) {
		const int g = blockStart - MASK_RADIUS + t;   // global index of tile[t]
		tile[t] = (g >= 0 && g < n) ? data[g] : 0.0f;
	}

	// the tile must be complete before any thread reads its neighbours
	__syncthreads();

	// threads past the end only helped with the load (exit after the barrier)
	if (i >= n)
		return;

	// convolution: tile * mask; tile[threadIdx.x + k] is data[i - MASK_RADIUS + k]
	float sum = 0;
	for (int k = 0; k <= 2 * MASK_RADIUS; k++)
		sum += tile[threadIdx.x + k] * d_mask[k];

	// final result
	result[i] = sum;
}

/*
 * Basic kernel for 1D convolution: every thread reads its MASK_SIZE input
 * elements directly from global memory and the mask from d_mask.
 *
 *   result  output vector, n floats
 *   data    input vector, n floats
 *   n       number of elements
 *
 * Launch layout: 1D grid of 1D blocks covering at least n threads. Input
 * positions outside [0, n) contribute nothing to the sum.
 */
__global__ void conv1D_basic(float *result, float *data, int n) {

	const int i = blockDim.x * blockIdx.x + threadIdx.x;

	// threads past the end of the vector must not write
	if (i >= n)
		return;

	float sum = 0;

	// input element paired with d_mask[0]
	const int start_point = i - MASK_RADIUS;
	for (int j = 0; j < MASK_SIZE; j++) {
		if (start_point + j >= 0 && start_point + j < n)
			sum += data[start_point + j] * d_mask[j];
	}

	// final result
	result[i] = sum;
}


/*
 * MAIN: convolution 1D host & device
 *
 * Convolves a vector of 2^25 floats with a moving-average mask on the host,
 * with the shared-memory kernel and with the basic kernel; both device results
 * are compared with the host one and the elapsed times are printed.
 * CHECK and seconds() come from common.h.
 */
int main(int argc, char **argv) {

	// set up array size
	int n = 1 << 25;
	int N = MASK_SIZE;

	// mem sizes
	size_t nBytes = n * sizeof(float);
	size_t nBytes_mask = N * sizeof(float);

	printf("Array of size = %.1f MB\n", nBytes/(1024.0*1024.0));
	printf("Mask size     = %d elements\n\n", N);

	// grid configuration
	dim3 block(BLOCK_SIZE);
	dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE);

	// allocate host memory
	float *h_data = (float *) malloc(nBytes);
	float *h_result = (float *) malloc(nBytes);         // device result (SMEM kernel)
	float *h_result_basic = (float *) malloc(nBytes);   // device result (basic kernel)
	float *result = (float *) malloc(nBytes);           // host reference result
	float *h_mask = (float *) malloc(nBytes_mask);
	if (h_data == NULL || h_result == NULL || h_result_basic == NULL ||
	    result == NULL || h_mask == NULL) {
		fprintf(stderr, "host memory allocation failed\n");
		free(h_data);
		free(h_result);
		free(h_result_basic);
		free(result);
		free(h_mask);
		return EXIT_FAILURE;
	}

	//  initialize host array
	movingAverage(h_mask, N);
	initialData(h_data, n);

	/***********************************************************/
	/*               convolution on host                       */
	/***********************************************************/
	double start = seconds();
	convolutionHost(h_data, result, h_mask, n);
	double hostElaps = seconds() - start;

	/***********************************************************/
	/*               convolution on device                     */
	/***********************************************************/
	// allocate device memory
	float *d_data, *d_result;
	CHECK(cudaMalloc((void**)&d_data, nBytes));
	CHECK(cudaMalloc((void**)&d_result, nBytes));

	// copy data from host to device
	CHECK(cudaMemcpy(d_data, h_data, nBytes, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpyToSymbol(d_mask, h_mask, nBytes_mask));

	start = seconds();
	conv1D<<<grid, block>>>(d_result, d_data, n);
	CHECK(cudaGetLastError());          // launch-configuration errors
	CHECK(cudaDeviceSynchronize());     // execution errors
	double devElaps = seconds() - start;

	// check result
	CHECK(cudaMemcpy(h_result, d_result, nBytes, cudaMemcpyDeviceToHost));
	checkResult(h_result, result, n);

	/***********************************************************/
	/*            convolution on device basic                  */
	/***********************************************************/
	start = seconds();
	conv1D_basic<<<grid, block>>>(d_result, d_data, n);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	double devElaps1 = seconds() - start;

	// check result
	CHECK(cudaMemcpy(h_result_basic, d_result, nBytes, cudaMemcpyDeviceToHost));
	checkResult(h_result_basic, result, n);

	// print exec times
	printf("Times:\n");
	printf("   - CPU elapsed time         = %f\n", hostElaps);
	printf("   - GPU elapsed time (SMEM)  = %f\n", devElaps);
	printf("   - GPU elapsed time (basic) = %f\n", devElaps1);
	printf("   - Speed-up (H/SMEM)        = %f\n", hostElaps / devElaps);
	printf("   - Speed-up (basic/SMEM)    = %f\n", devElaps1 / devElaps);


	// free host and device memory
	CHECK(cudaFree(d_result));
	CHECK(cudaFree(d_data));
	free(h_data);
	free(h_result);
	free(h_result_basic);
	free(result);
	free(h_mask);

	return EXIT_SUCCESS;
}

// Set the first n entries of h_data to 1.
void initialData(float *h_data, int n) {
	int k = 0;
	while (k < n) {
		h_data[k] = 1.0;
		k++;
	}
}

// Fill the n-tap mask with the moving-average weight 1/n.
void movingAverage(float *h_mask, int n) {
	// nothing to fill (and no weight to compute) for an empty mask
	if (n <= 0)
		return;

	// same weight for every tap
	const float weight = 1.0 / ((float) n);
	for (float *tap = h_mask; tap != h_mask + n; ++tap)
		*tap = weight;
}

// Print the first `size` entries of a on one line (two decimals each),
// preceded and followed by a newline.
void printData(float *a, const int size) {
	printf("\n");
	int k = 0;
	while (k < size) {
		printf("%.2f ", a[k]);
		++k;
	}
	printf("\n");
}

// Reference 1D convolution on the host: result[i] is the sum over the
// MASK_SIZE taps of data[i - MASK_RADIUS + j] * mask[j], skipping the taps
// that fall outside [0, n).
void convolutionHost(float *data, float *result, float *mask, const int n) {
	for (int out = 0; out < n; out++) {
		// input element paired with mask[0]
		const int first = out - MASK_RADIUS;
		float acc = 0;
		int tap = 0;
		while (tap < MASK_SIZE) {
			const int src = first + tap;
			if (!(src < 0 || src >= n))
				acc += data[src] * mask[tap];
			tap++;
		}
		result[out] = acc;
	}
}

// Compare two result vectors of n floats and report the first entry whose
// absolute difference exceeds epsilon; prints nothing when they match.
void checkResult(float *d_result, float *h_result, int n) {
	double epsilon = 1.0E-8;

	for (int i = 0; i < n; i++) {
		// explicit floating-point absolute value: does not depend on which
		// overload of abs() happens to be visible
		double diff = (double) h_result[i] - (double) d_result[i];
		if (diff < 0)
			diff = -diff;

		if (diff > epsilon) {
			// %e: with %f the threshold 1e-8 would be printed as 0.000000
			printf("different on entry (%d) |h_result - d_result| >  %e\n", i, epsilon);
			break;
		}
	}
}



In [None]:
# Compile and run
!nvcc -arch=sm_75  src/SMEM/conv1D.cu -o conv1D
!./conv1D

Convoluzione 2D...

### ↘️ *`TODO...`*

In [None]:
%%cuda
#include <stdlib.h>
#include <string.h>
#include "/content/GPUcomputing/utils/common.h"

// image size in elements
#define DATA_WIDTH   (5*1024)
#define DATA_HEIGHT  (5*1024)
// edge of the square thread block
#define BLOCK_SIZE   32
// number of mask taps on each side of the centre
#define MASK_RADIUS  10
// edge of the square mask
#define MASK_WIDTH   (2 * MASK_RADIUS + 1)
// edge of the shared-memory tile: one block plus a halo of MASK_RADIUS on each side
#define TILE_WIDTH   (BLOCK_SIZE + MASK_WIDTH - 1)
// set to 1 to print the matrices in main
#define DEBUG 0

// convolution mask in constant memory: filled from the host with
// cudaMemcpyToSymbol in main and read by both kernels
__constant__ float M_dev[MASK_WIDTH * MASK_WIDTH];

/*
 * Basic kernel for 2D convolution: every thread reads its MASK_WIDTH x
 * MASK_WIDTH neighbourhood directly from global memory and the mask from M_dev.
 *
 *   A  input image,  DATA_HEIGHT x DATA_WIDTH floats, row-major
 *   B  output image, same layout
 *
 * Launch layout: 2D grid of 2D blocks covering the image (x -> columns,
 * y -> rows). Neighbours outside the image contribute nothing to the sum.
 */
__global__ void conv2D_basic(float* A, float* B) {

	//index computation
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;

	// rows are bounded by the image height, columns by its width
	if(x < DATA_WIDTH && y < DATA_HEIGHT){

		float result = 0.0f;

		//start index on the input array
		int startX = x - MASK_RADIUS;
		int startY = y - MASK_RADIUS;

		//compute convolution
		for (int i = 0; i < MASK_WIDTH; ++i) {
			for(int j = 0; j < MASK_WIDTH; j++ ) {

				//boundary check
				if((startY + i >= 0) && (startY + i < DATA_HEIGHT) && (startX + j >= 0) && (startX + j < DATA_WIDTH))
					result += A[((startY + i) * DATA_WIDTH) + (startX + j)] * M_dev[(i * MASK_WIDTH) + j];
			}
		}
		//store final value
		B[(y * DATA_WIDTH) + x] = result;
	}
}

/*
 * Kernel for 2D convolution with a shared-memory tile.
 *
 *   A  input image,  DATA_HEIGHT x DATA_WIDTH floats, row-major
 *   B  output image, same layout
 *
 * Each block stages in shared memory the pixels it owns plus a halo of
 * MASK_RADIUS pixels on every side; positions outside the image are loaded
 * as 0. The mask is read from M_dev.
 *
 * Launch layout: 2D grid of 2D blocks covering the image (x -> columns,
 * y -> rows) with blockDim.x <= BLOCK_SIZE and blockDim.y <= BLOCK_SIZE, so
 * that the tile fits in TILE_WIDTH x TILE_WIDTH.
 */
__global__ void conv2D_SMEM(float *A, float *B) {
	const int RAD = MASK_RADIUS;
	const int W = DATA_WIDTH;
	const int H = DATA_HEIGHT;
	const int m = MASK_WIDTH;

	// pixel owned by this thread
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	// shared mem
	__shared__ float A_s[TILE_WIDTH][TILE_WIDTH];

	// image coordinates of tile cell (0,0) and tile extent (block + halos)
	const int tileX0 = (int) (blockIdx.x * blockDim.x) - RAD;
	const int tileY0 = (int) (blockIdx.y * blockDim.y) - RAD;
	const int tileW = blockDim.x + 2 * RAD;
	const int tileH = blockDim.y + 2 * RAD;

	// cooperative load: the threads sweep the whole tile (halos, corners and
	// centre) in steps of the block size; every cell is written, and cells
	// outside the image are set to 0
	for (int ty = threadIdx.y; ty < tileH; ty += blockDim.y) {
		const int gy = tileY0 + ty;
		for (int tx = threadIdx.x; tx < tileW; tx += blockDim.x) {
			const int gx = tileX0 + tx;
			A_s[ty][tx] = (gx >= 0 && gx < W && gy >= 0 && gy < H) ? A[gy * W + gx] : 0.0f;
		}
	}

	// the tile must be complete before any thread reads its neighbourhood
	__syncthreads();

	// threads outside the image only helped with the load (exit after the barrier)
	if (x >= W || y >= H)
		return;

	// A_s[threadIdx.y + i][threadIdx.x + j] is pixel (y - RAD + i, x - RAD + j)
	float conv_sum = 0.0f;
	for (int i = 0; i < m; i++)
		for (int j = 0; j < m; j++)
			conv_sum += A_s[threadIdx.y+i][threadIdx.x+j] * M_dev[i*m + j];

	// store conv result (row-major: row y, column x)
	B[y*W+x] = conv_sum;
}

// forward declarations: these functions are defined after main
void conv2D_host(float*, float*, const float*);   // reference convolution on the host
void Avg_mask(float*);                            // fills the mask with an average filter


/*
 * main
 *
 * Convolves a DATA_HEIGHT x DATA_WIDTH image of ones with an average mask on
 * the host, with the basic kernel and with the shared-memory kernel, then
 * prints the elapsed times. With DEBUG set the matrices are printed too.
 * CHECK and seconds() come from common.h.
 */
int main(void) {


	int nW = DATA_WIDTH;
	int nH = DATA_HEIGHT;
	int b = BLOCK_SIZE;

	float M[MASK_WIDTH * MASK_WIDTH]; // const size
	float *A, *B, *A_dev, *B_dev;
	// byte sizes kept in size_t: an int overflows for larger images
	size_t datasize = (size_t) nW * nH * sizeof(float);
	size_t masksize = MASK_WIDTH * MASK_WIDTH * sizeof(float);

	printf("Data size: %.2f (MB)\n", (float)datasize/(1024.0*1024.0));
	printf("Initializing data...\n");
	A = (float *) malloc(datasize);
	B = (float *) malloc(datasize);
	if (A == NULL || B == NULL) {
		printf("host memory allocation failed\n");
		free(A);
		free(B);
		return EXIT_FAILURE;
	}

	// initialize data
	for (int i = 0; i < nH; i++)
		for (int j = 0; j < nW; j++)
			A[i*nW+j] = 1.0f; //rand()%10;

	// initialize mask
	Avg_mask(M);

	/***********************************************************/
	/*               convolution on host                       */
	/***********************************************************/
	double start = seconds();
	conv2D_host(A, B, M);
	double hostElaps = seconds() - start;


#if DEBUG
	// print data
	printf("Print matrix A...\n");
	for (int i = 0; i < nH; i++) {
		if (i%8 == 0 && i>0)
			printf("\n");

		for (int j = 0; j < nW; j++)
			if (j%8 == 0 && j>0)
				printf(" %0.0f ", A[i*nW+j]);
			else
				printf("%0.0f ", A[i*nW+j]);
		printf("\n");
	}

	printf("Print matrix M ...\n");
	for (int i = 0; i < MASK_WIDTH; i++) {
		for (int j = 0; j < MASK_WIDTH; j++)
			printf(" %1.2f ", M[i * MASK_WIDTH + j]);
		printf("\n");
	}

	// print out data
	printf("Print results...\n");
	for (int i = 0; i < nH; i++) {
		if (i%8 == 0 && i>0)
			printf("\n");
		for (int j = 0; j < nW; j++)
			if (j%8 == 0 && j>0)
				printf(" %0.2f ", B[i*nW+j]);
			else
				printf("%0.2f ", B[i*nW+j]);
		printf("\n");
	}
#endif


	/***********************************************************/
	/*             convolution on device basic                 */
	/***********************************************************/

	// cuda allocation
	CHECK(cudaMemcpyToSymbol(M_dev, M, masksize));
	CHECK(cudaMalloc((void **) &A_dev, datasize));
	CHECK(cudaMalloc((void **) &B_dev, datasize));
	CHECK(cudaMemcpy(A_dev, A, datasize, cudaMemcpyHostToDevice));

	// block, grid dims, kernel
	dim3 block(b, b);
	dim3 grid((nW+b-1)/b, (nH+b-1)/b);

	start = seconds();
	conv2D_basic<<<grid, block>>>(A_dev, B_dev);
	CHECK(cudaGetLastError());          // launch-configuration errors
	CHECK(cudaDeviceSynchronize());     // execution errors
	double devElaps = seconds() - start;
	printf("\nconv2D<<<(%d,%d), (%d,%d)>>> \n\n", grid.x, grid.y, block.x, block.y);

	/***********************************************************/
	/*             convolution on device SMEM                  */
	/***********************************************************/

	start = seconds();
	conv2D_SMEM<<<grid, block>>>(A_dev, B_dev);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	double devElaps1 = seconds() - start;
	printf("\nconv2D<<<(%d,%d), (%d,%d)>>> \n\n", grid.x, grid.y, block.x, block.y);

	// print exec times (devElaps = basic kernel, devElaps1 = SMEM kernel)
	printf("Times:\n");
	printf("   - CPU elapsed time         = %f\n", hostElaps);
	printf("   - GPU elapsed time (SMEM)  = %f\n", devElaps1);
	printf("   - GPU elapsed time (basic) = %f\n", devElaps);
	printf("   - Speed-up (H/SMEM)        = %f\n", hostElaps / devElaps1);
	printf("   - Speed-up (basic/SMEM)    = %f\n", devElaps / devElaps1);

	// B now holds the result of the last kernel (SMEM)
	CHECK(cudaMemcpy(B, B_dev, datasize, cudaMemcpyDeviceToHost));

#if DEBUG
	// print out data
	printf("Print results...\n");
	for (int i = 0; i < nH; i++) {
		if (i%8 == 0 && i>0)
			printf("\n");
		for (int j = 0; j < nW; j++)
			if (j%8 == 0 && j>0)
				printf(" %0.2f ", B[i*nW+j]);
			else
				printf("%0.2f ", B[i*nW+j]);
		printf("\n");
	}
#endif

	CHECK(cudaFree(A_dev));
	CHECK(cudaFree(B_dev));
	CHECK(cudaDeviceReset());
	free(A);
	free(B);
	return 0;
}


// Reference 2D convolution on the host.
//   A  input image,  DATA_HEIGHT x DATA_WIDTH floats, row-major
//   B  output image, same layout
//   M  mask, MASK_WIDTH x MASK_WIDTH floats, row-major
// Neighbours outside the image contribute nothing to the sum.
void conv2D_host(float* A, float* B, const float* M) {
	// visit the image row by row, matching its row-major layout
	for (int y = 0; y < DATA_HEIGHT; y++)            // rows
		for (int x = 0; x < DATA_WIDTH; x++) {       // columns

			float result = 0.0f;

			// start index on the input array
			int startX = x - MASK_RADIUS;
			int startY = y - MASK_RADIUS;

			// compute convolution (i moves along columns, j along rows)
			for (int i = 0; i < MASK_WIDTH; ++i)
				for(int j = 0; j < MASK_WIDTH; j++ ) {
					int X = startX + i;
					int Y = startY + j;

					//boundary check
					if((X >= 0) && (X < DATA_WIDTH) && (Y >= 0) && (Y < DATA_HEIGHT)) {
						result += A[(Y * DATA_WIDTH) + X] * M[(j * MASK_WIDTH) + i];
					}
				}

			//store final value
			B[(y * DATA_WIDTH) + x] = result;
		}
}

// Average filter: every one of the MASK_WIDTH x MASK_WIDTH taps gets the
// same weight, 1 / (number of taps).
void Avg_mask(float *mask) {
	const int side = MASK_WIDTH;
	const int taps = side * side;
	int k = 0;
	while (k < taps) {
		mask[k] = (float) 1.0f / (side * side);
		k++;
	}
}